Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
22a1ab5
Add a dataset notebook
evgenyrp Feb 17, 2026
77aeb2e
Add JetBrains IDE
evgenyrp Feb 18, 2026
7a95a7f
Initial implementation
evgenyrp Feb 18, 2026
a1af703
Prevent data contamination
evgenyrp Feb 19, 2026
305d7c9
Adjust agent configuration and add logs
evgenyrp Feb 19, 2026
5cb55a5
Improve tracing
evgenyrp Feb 23, 2026
79871bc
Add eval mode
evgenyrp Feb 24, 2026
19addba
Fix local building
evgenyrp Feb 25, 2026
7a32858
Improve cost tracing
evgenyrp Feb 27, 2026
6e497df
Think more on analysis
evgenyrp Feb 27, 2026
2fa7c62
Increase parallelism
evgenyrp Feb 27, 2026
2f2832e
Use default system prompt
evgenyrp Mar 2, 2026
983643e
Update Weave
evgenyrp Mar 2, 2026
1cf9351
Fix dataset
evgenyrp Mar 2, 2026
ffe5685
Log errors in weave
evgenyrp Mar 2, 2026
915a2b8
Make logging less verbose
evgenyrp Mar 2, 2026
ab4fce1
Support multiple trials
evgenyrp Mar 2, 2026
955e048
Fix scoring
evgenyrp Mar 3, 2026
33dc52c
Add docker compose
evgenyrp Mar 3, 2026
2a26a6c
Change todo
evgenyrp Mar 3, 2026
fd4f016
Merge branch 'master' into build_repair_agent
evgenyrp Mar 9, 2026
daeb4bd
Fix exception and typo
evgenyrp Mar 9, 2026
43301c9
Remove unused definitions
evgenyrp Mar 9, 2026
2bc233e
Revert IDE specific paths
evgenyrp Mar 9, 2026
91d7092
Clarify double force
evgenyrp Mar 9, 2026
69e2013
Rename env
evgenyrp Mar 9, 2026
c4f2eec
Add task ID example
evgenyrp Mar 9, 2026
6c5314c
Adjust Docker infra to reuse bugbug requirements
evgenyrp Mar 12, 2026
1ed87c3
Add readme
evgenyrp Mar 12, 2026
91f31c6
Catch cleaning error
evgenyrp Mar 12, 2026
40c112c
Reformat
evgenyrp Mar 12, 2026
8c8d5bc
Stage all new files
evgenyrp Mar 12, 2026
1ec6808
Merge branch 'master' into build_repair_agent
evgenyrp Mar 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
412 changes: 412 additions & 0 deletions bugbug/tools/build_repair/agent.py

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions bugbug/tools/build_repair/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

from datetime import date

from claude_agent_sdk import SandboxNetworkConfig, SandboxSettings

# Model used for the failure-analysis phase of the agent.
ANALYSIS_MODEL = "claude-opus-4-6"
# Model used for the fix-implementation phase.
FIX_MODEL = "claude-opus-4-6"
# Maximum number of agent turns per session.
DEFAULT_MAX_TURNS = 80
# Base directory where per-task git worktrees are created.
WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees"
# How long to wait for a try push to complete (2 hours), and how often to poll.
TRY_PUSH_TIMEOUT_SECONDS = 7200
TRY_PUSH_POLL_INTERVAL_SECONDS = 60
# Treeherder instance used to check try push results.
TREEHERDER_BASE_URL = "https://treeherder.mozilla.org"

# MCP server endpoint; presumably exposes Firefox-specific tools to the agent.
FIREFOX_MCP_URL = "https://mcp-dev.moz.tools/mcp"

# Training data cutoff dates per model, for data contamination filtering.
# Examples with fix_commit_date before the cutoff may have been in training data.
# Source: https://platform.claude.com/docs/en/about-claude/models/overview
MODEL_CUTOFF_DATES = {
    "claude-opus-4-6": date(2025, 8, 1),
    "claude-sonnet-4-6": date(2026, 1, 1),
    "claude-haiku-4-5-20251001": date(2025, 7, 1),
    "claude-sonnet-4-5-20250929": date(2025, 7, 1),
    "claude-opus-4-5-20251101": date(2025, 8, 1),
    "claude-opus-4-1-20250805": date(2025, 3, 1),
    "claude-sonnet-4-20250514": date(2025, 3, 1),
    "claude-3-7-sonnet-20250219": date(2024, 11, 1),
    "claude-opus-4-20250514": date(2025, 3, 1),
}

# Tool permission patterns the agent may use without prompting:
# Edit(path) scopes edits to a path, Bash(cmd:*) allows any invocation of a
# command, WebFetch(domain:...) allows fetching from a host.
ALLOWED_TOOLS = [
    "Edit(~/.mozbuild)",
    "Edit(~/.cache/uv)",
    "Bash(./mach build:*)",
    "Bash(./mach clobber:*)",
    "Bash(./mach configure:*)",
    "Bash(./mach run:*)",
    "Bash(./mach test:*)",
    "Bash(./mach wpt:*)",
    "Bash(./mach lint:*)",
    "Bash(./mach format:*)",
    "Bash(./mach clang-format:*)",
    "Bash(./mach try:*)",
    "Bash(./mach help:*)",
    "Bash(./mach vendor:*)",
    "Bash(./mach bootstrap:*)",
    "Bash(./mach artifact:*)",
    "Bash(clang++:*)",
    "Bash(rm:*)",
    "Bash(timeout:*)",
    "Bash(find:*)",
    "Bash(grep:*)",
    "Bash(tee:*)",
    "Bash(kill:*)",
    "Bash(searchfox-cli:*)",
    "Bash(treeherder-cli:*)",
    "Bash(jj:*)",
    "WebFetch(domain:firefox-source-docs.mozilla.org)",
    "WebFetch(domain:treeherder.mozilla.org)",
    "WebFetch(domain:searchfox.org)",
    "WebFetch(o1069899.ingest.sentry.io)",
]

# Directories outside the worktree the agent may access (build state / caches).
ADDITIONAL_DIRS = [
    "~/.mozbuild",
    "~/.cache/uv/",
]

# Sandbox settings: bash runs sandboxed and is auto-allowed, no unsandboxed
# escape hatch, and local network binding is permitted.
SANDBOX_CONFIG = SandboxSettings(
    enabled=True,
    autoAllowBashIfSandboxed=True,
    allowUnsandboxedCommands=False,
    network=SandboxNetworkConfig(allowLocalBinding=True),
)
42 changes: 42 additions & 0 deletions bugbug/tools/build_repair/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

"""Prompt templates for build repair agent."""

ANALYSIS_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure.

Investigate why the last commit broke {target_software} build.

The last commit attempted to fix a bug from Bugzilla.

Analyze the following:
1. Git diff for the last commit
2. Bugzilla bug description
3. Taskcluster build failure logs
The files with bug description and logs are located at @repair_agent/in/{bug_id}

Create three separate documents:
1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. repair_agent/out/{bug_id}/planning.md with a fixing plan
3. repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction

Do not prompt to edit those documents.
{eval}

Do not write any code yet. Work fully autonomously, do not ask any questions. Think hard.
"""

FIX_TEMPLATE = """Read the following files and implement a fix of the failure:
1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. repair_agent/out/{bug_id}/planning.md with a fixing plan
{eval}

Do not prompt to edit files. Work fully autonomously, do not ask any questions. Use all allowed tools without prompting.
"""

EVAL_PROMPT = """
Do not request bug info from Bugzilla or Phabricator. Use only the provided file with bug description.
Do not look at git commits other than the specified last commit.
"""
163 changes: 163 additions & 0 deletions bugbug/tools/build_repair/scorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

from logging import getLogger

import weave

logger = getLogger(__name__)


def _pass_at_k(
score_rows: list[dict],
num_trials: int,
metric: str,
) -> dict[str, float]:
"""Compute pass@k from scorer rows ordered by trial.
Rows are ordered: first num_examples = trial 0, next = trial 1, etc.
Rows may be empty dicts when the model raised an exception.
"""
num_examples = len(score_rows) // num_trials
pass_at: dict[str, float] = {}
for n in sorted({1, 3, num_trials}):
if n > num_trials:
continue
successes = sum(
any(score_rows[t * num_examples + i].get(metric) is True for t in range(n))
for i in range(num_examples)
)
pass_at[f"pass@{n}"] = successes / num_examples if num_examples else 0

all_pass = sum(
all(
score_rows[t * num_examples + i].get(metric) is True
for t in range(num_trials)
)
for i in range(num_examples)
)
pass_at[f"pass^{num_trials}"] = all_pass / num_examples if num_examples else 0

return pass_at


class BasicMetricsScorer(weave.Scorer):
    """Scores success rate, diff production rate, cost, and turn count."""

    # Number of repeated trials per example; values > 1 enable pass@k metrics.
    num_trials: int = 1

    @weave.op()
    def score(self, output: dict | None) -> dict:
        """Extract per-example metrics from one model output (None = errored)."""
        if output is None:
            # Model raised an exception: count as failure with zeroed usage.
            return {
                "successful": False,
                "has_diff": False,
                "cost_usd": 0,
                "num_turns": 0,
                "input_tokens": 0,
                "output_tokens": 0,
                "cache_read_input_tokens": 0,
                "cache_creation_input_tokens": 0,
            }
        row = {
            "successful": output.get("error") is None,
            "has_diff": bool(output.get("diff", "").strip()),
        }
        for field in (
            "cost_usd",
            "num_turns",
            "input_tokens",
            "output_tokens",
            "cache_read_input_tokens",
            "cache_creation_input_tokens",
        ):
            row[field] = output.get(field, 0)
        return row

    def summarize(self, score_rows: list[dict]) -> dict:
        """Aggregate per-example rows into run-level totals and rates."""
        count = len(score_rows)

        def total(field: str) -> float:
            return sum(row.get(field, 0) for row in score_rows)

        def rate(field: str) -> float:
            passed = sum(row.get(field, False) for row in score_rows)
            return passed / count if count else 0

        summary = {
            "success_rate": rate("successful"),
            "diff_rate": rate("has_diff"),
            "avg_cost_usd": total("cost_usd") / count if count else 0,
            "total_cost_usd": total("cost_usd"),
            "total_input_tokens": total("input_tokens"),
            "total_output_tokens": total("output_tokens"),
            "total_cache_read_tokens": total("cache_read_input_tokens"),
            "total_cache_creation_tokens": total("cache_creation_input_tokens"),
            "num_examples": count,
        }
        if self.num_trials > 1:
            summary.update(_pass_at_k(score_rows, self.num_trials, "successful"))
        logger.info(f"BasicMetrics summary: {summary}")
        return summary


class BuildPassRateScorer(weave.Scorer):
    """Scores local ./mach build and try push pass rates."""

    # Number of repeated trials per example; values > 1 enable pass@k metrics.
    num_trials: int = 1

    @weave.op()
    def score(self, output: dict | None) -> dict:
        """Extract build outcomes from one model output (None = errored)."""
        if output is None:
            return {
                "local_build_passed": None,
                "try_build_passed": None,
            }
        return {
            "local_build_passed": output.get("local_build_passed"),
            "try_build_passed": output.get("try_build_passed"),
        }

    def summarize(self, score_rows: list[dict]) -> dict:
        """Aggregate build outcomes; try rows without a result count as timed out."""
        count = len(score_rows)
        local_wins = [r for r in score_rows if r.get("local_build_passed") is True]
        # Try pushes are only rated over rows where a result was recorded.
        try_resolved = [r for r in score_rows if r.get("try_build_passed") is not None]
        try_wins = [r for r in try_resolved if r.get("try_build_passed") is True]
        summary = {
            "local_build_pass_rate": len(local_wins) / count if count else 0,
            "local_builds_passed": len(local_wins),
            "try_build_pass_rate": len(try_wins) / len(try_resolved)
            if try_resolved
            else 0,
            "try_builds_passed": len(try_wins),
            "try_builds_timed_out": count - len(try_resolved),
            "num_examples": count,
        }
        if self.num_trials > 1:
            summary.update(
                _pass_at_k(score_rows, self.num_trials, "local_build_passed")
            )
        logger.info(f"BuildPassRate summary: {summary}")
        return summary


class LLMFixMatchingScorer(weave.Scorer):
    """Scaffold for LLM-as-a-judge comparing agent fix to ground truth.
    Implementation deferred. Will use a non-Claude LLM to semantically
    compare the agent's diff against the ground truth fix commit.
    """

    @weave.op()
    async def score(self, output: dict | None, gh_fix_commits: list[str]) -> dict:
        """Placeholder: only distinguishes errored outputs from unscored ones."""
        category = "errored" if output is None else "not_implemented"
        return {
            "match_score": None,
            "match_category": category,
        }

    def summarize(self, score_rows: list[dict]) -> dict:
        """No aggregation until the judge is implemented."""
        return {"status": "not_implemented"}
Loading