-
Notifications
You must be signed in to change notification settings - Fork 326
Build Repair Agent: initial evaluation harness #5762
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+1,817
−0
Merged
Changes from all commits
Commits
Show all changes
33 commits
Select commit
Hold shift + click to select a range
22a1ab5
Add a dataset notebook
evgenyrp 77aeb2e
Add JetBrains IDE
evgenyrp 7a95a7f
Initial implementation
evgenyrp a1af703
Prevent data contamination
evgenyrp 305d7c9
Adjust agent configuration and add logs
evgenyrp 5cb55a5
Improve tracing
evgenyrp 79871bc
Add eval mode
evgenyrp 19addba
Fix local building
evgenyrp 7a32858
Improve cost tracing
evgenyrp 6e497df
Think more on analysis
evgenyrp 2fa7c62
Increase parallelizm
evgenyrp 2f2832e
Use default system prompt
evgenyrp 983643e
Update Weave
evgenyrp 1cf9351
Fix dataset
evgenyrp ffe5685
Log errors in weave
evgenyrp 915a2b8
Make logging less verbose
evgenyrp ab4fce1
Support multiple trials
evgenyrp 955e048
Fix scoring
evgenyrp 33dc52c
Add docker compose
evgenyrp 2a26a6c
Change todo
evgenyrp fd4f016
Merge branch 'master' into build_repair_agent
evgenyrp daeb4bd
Fix exception and typo
evgenyrp 43301c9
Remove unused definitions
evgenyrp 2bc233e
Revert IDE specific paths
evgenyrp 91d7092
Clarify double force
evgenyrp 69e2013
Rename env
evgenyrp c4f2eec
Add task ID example
evgenyrp 6c5314c
Adjust Docker infra to reuse bugbug requirements
evgenyrp 1ed87c3
Add readme
evgenyrp 91f31c6
Catch cleaning error
evgenyrp 40c112c
Reformat
evgenyrp 8c8d5bc
Stage all new files
evgenyrp 1ec6808
Merge branch 'master' into build_repair_agent
evgenyrp File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Empty file.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| # -*- coding: utf-8 -*- | ||
| # This Source Code Form is subject to the terms of the Mozilla Public | ||
| # License, v. 2.0. If a copy of the MPL was not distributed with this file, | ||
| # You can obtain one at http://mozilla.org/MPL/2.0/. | ||
|
|
||
| from datetime import date | ||
|
|
||
| from claude_agent_sdk import SandboxNetworkConfig, SandboxSettings | ||
|
|
||
| ANALYSIS_MODEL = "claude-opus-4-6" | ||
| FIX_MODEL = "claude-opus-4-6" | ||
| DEFAULT_MAX_TURNS = 80 | ||
| WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees" | ||
| TRY_PUSH_TIMEOUT_SECONDS = 7200 | ||
| TRY_PUSH_POLL_INTERVAL_SECONDS = 60 | ||
| TREEHERDER_BASE_URL = "https://treeherder.mozilla.org" | ||
|
|
||
| FIREFOX_MCP_URL = "https://mcp-dev.moz.tools/mcp" | ||
|
|
||
| # Training data cutoff dates per model, for data contamination filtering. | ||
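| # Examples with fix_commit_date before the cutoff may have been in training data. | ||
| # NOTE(review): every cutoff below is encoded as the first day of a month; confirm the | ||
| # filter also treats fixes landed later in that same month as potentially contaminated. | ||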
| # Source: https://platform.claude.com/docs/en/about-claude/models/overview | ||
| MODEL_CUTOFF_DATES = { | ||
| "claude-opus-4-6": date(2025, 8, 1), | ||
| "claude-sonnet-4-6": date(2026, 1, 1), | ||
| "claude-haiku-4-5-20251001": date(2025, 7, 1), | ||
| "claude-sonnet-4-5-20250929": date(2025, 7, 1), | ||
| "claude-opus-4-5-20251101": date(2025, 8, 1), | ||
| "claude-opus-4-1-20250805": date(2025, 3, 1), | ||
| "claude-sonnet-4-20250514": date(2025, 3, 1), | ||
| "claude-3-7-sonnet-20250219": date(2024, 11, 1), | ||
| "claude-opus-4-20250514": date(2025, 3, 1), | ||
| } | ||
|
|
||
# Permission rules, written as ``Tool(specifier)``:
#   * Edit(<path>)            - paths outside the worktree that may be edited
#   * Bash(<command>:*)       - commands allowed with any arguments
#   * WebFetch(domain:<host>) - hosts that may be fetched
# Every WebFetch rule uses the ``domain:`` specifier; a bare host name is not
# in the documented rule form, so the Sentry ingest host is spelled the same way.
ALLOWED_TOOLS = [
    # Writable caches outside the source checkout.
    "Edit(~/.mozbuild)",
    "Edit(~/.cache/uv)",
    # mach sub-commands.
    "Bash(./mach build:*)",
    "Bash(./mach clobber:*)",
    "Bash(./mach configure:*)",
    "Bash(./mach run:*)",
    "Bash(./mach test:*)",
    "Bash(./mach wpt:*)",
    "Bash(./mach lint:*)",
    "Bash(./mach format:*)",
    "Bash(./mach clang-format:*)",
    "Bash(./mach try:*)",
    "Bash(./mach help:*)",
    "Bash(./mach vendor:*)",
    "Bash(./mach bootstrap:*)",
    "Bash(./mach artifact:*)",
    # General shell utilities and project CLIs.
    "Bash(clang++:*)",
    "Bash(rm:*)",
    "Bash(timeout:*)",
    "Bash(find:*)",
    "Bash(grep:*)",
    "Bash(tee:*)",
    "Bash(kill:*)",
    "Bash(searchfox-cli:*)",
    "Bash(treeherder-cli:*)",
    "Bash(jj:*)",
    # Hosts that may be fetched.
    "WebFetch(domain:firefox-source-docs.mozilla.org)",
    "WebFetch(domain:treeherder.mozilla.org)",
    "WebFetch(domain:searchfox.org)",
    "WebFetch(domain:o1069899.ingest.sentry.io)",
]
|
|
||
| ADDITIONAL_DIRS = [ | ||
| "~/.mozbuild", | ||
| "~/.cache/uv/", | ||
| ] | ||
|
|
||
| SANDBOX_CONFIG = SandboxSettings( | ||
| enabled=True, | ||
| autoAllowBashIfSandboxed=True, | ||
| allowUnsandboxedCommands=False, | ||
| network=SandboxNetworkConfig(allowLocalBinding=True), | ||
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| # -*- coding: utf-8 -*- | ||
| # This Source Code Form is subject to the terms of the Mozilla Public | ||
| # License, v. 2.0. If a copy of the MPL was not distributed with this file, | ||
| # You can obtain one at http://mozilla.org/MPL/2.0/. | ||
|
|
||
| """Prompt templates for build repair agent.""" | ||
|
|
||
| ANALYSIS_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure. | ||
|
|
||
| Investigate why the last commit broke {target_software} build. | ||
|
|
||
| The last commit attempted to fix a bug from Bugzilla. | ||
|
|
||
| Analyze the following: | ||
| 1. Git diff for the last commit | ||
| 2. Bugzilla bug description | ||
| 3. Taskcluster build failure logs | ||
| The files with bug description and logs are located at @repair_agent/in/{bug_id} | ||
|
|
||
| Create three separate documents: | ||
| 1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues | ||
| 2. repair_agent/out/{bug_id}/planning.md with a fixing plan | ||
| 3. repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction | ||
|
|
||
| Do not prompt to edit those documents. | ||
| {eval} | ||
|
|
||
| Do not write any code yet. Work fully autonomously, do not ask any questions. Think hard. | ||
| """ | ||
|
|
||
| FIX_TEMPLATE = """Read the following files and implement a fix of the failure: | ||
| 1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues | ||
| 2. repair_agent/out/{bug_id}/planning.md with a fixing plan | ||
| {eval} | ||
|
|
||
| Do not prompt to edit files. Work fully autonomously, do not ask any questions. Use all allowed tools without prompting. | ||
| """ | ||
|
|
||
| EVAL_PROMPT = """ | ||
| Do not request bug info from Bugzilla or Phabricator. Use only the provided file with bug description. | ||
| Do not look at git commits other than the specified last commit. | ||
| """ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,163 @@ | ||
| # -*- coding: utf-8 -*- | ||
| # This Source Code Form is subject to the terms of the Mozilla Public | ||
| # License, v. 2.0. If a copy of the MPL was not distributed with this file, | ||
| # You can obtain one at http://mozilla.org/MPL/2.0/. | ||
|
|
||
| from logging import getLogger | ||
|
|
||
| import weave | ||
|
|
||
| logger = getLogger(__name__) | ||
|
|
||
|
|
||
| def _pass_at_k( | ||
| score_rows: list[dict], | ||
| num_trials: int, | ||
| metric: str, | ||
| ) -> dict[str, float]: | ||
| """Compute pass@k from scorer rows ordered by trial. | ||
| Rows are ordered: first num_examples = trial 0, next = trial 1, etc. | ||
| Rows may be empty dicts when the model raised an exception. | ||
| """ | ||
| num_examples = len(score_rows) // num_trials | ||
| pass_at: dict[str, float] = {} | ||
| for n in sorted({1, 3, num_trials}): | ||
| if n > num_trials: | ||
| continue | ||
| successes = sum( | ||
| any(score_rows[t * num_examples + i].get(metric) is True for t in range(n)) | ||
| for i in range(num_examples) | ||
| ) | ||
| pass_at[f"pass@{n}"] = successes / num_examples if num_examples else 0 | ||
|
|
||
| all_pass = sum( | ||
| all( | ||
| score_rows[t * num_examples + i].get(metric) is True | ||
| for t in range(num_trials) | ||
| ) | ||
| for i in range(num_examples) | ||
| ) | ||
| pass_at[f"pass^{num_trials}"] = all_pass / num_examples if num_examples else 0 | ||
|
|
||
| return pass_at | ||
|
|
||
|
|
||
class BasicMetricsScorer(weave.Scorer):
    """Scores success rate, diff production rate, cost, and turn count.

    ``score`` turns one model output into a flat row of metrics and
    ``summarize`` aggregates those rows. When ``num_trials`` is greater than 1
    the summary also carries pass@k figures for the ``successful`` metric.
    """

    # Trials per example; values above 1 add pass@k metrics to the summary.
    num_trials: int = 1

    @weave.op()
    def score(self, output: dict | None) -> dict:
        """Build the metric row for a single model output.

        Args:
            output: The model's result dict, or None when no output exists.

        Returns:
            A dict with ``successful`` (no ``error`` recorded), ``has_diff``
            (a non-blank ``diff``), and the cost, turn and token counters
            copied from ``output`` (0 when absent). A None output yields
            False/0 for every field.
        """
        counters = (
            "cost_usd",
            "num_turns",
            "input_tokens",
            "output_tokens",
            "cache_read_input_tokens",
            "cache_creation_input_tokens",
        )
        if output is None:
            return {
                "successful": False,
                "has_diff": False,
                **dict.fromkeys(counters, 0),
            }
        return {
            "successful": output.get("error") is None,
            # `or ""` also covers a "diff" key that is present but None; a
            # plain .get default only covers a missing key.
            "has_diff": bool((output.get("diff") or "").strip()),
            **{name: output.get(name, 0) for name in counters},
        }

    def summarize(self, score_rows: list[dict]) -> dict:
        """Aggregate metric rows into rates, totals and optional pass@k.

        Args:
            score_rows: Rows produced by ``score``; missing keys count as
                0/False.

        Returns:
            Success and diff rates, average and total cost, token totals and
            the row count. Rates and the average are 0 for an empty input.
        """
        n = len(score_rows)

        def total(key: str):
            return sum(row.get(key, 0) for row in score_rows)

        def rate(key: str):
            return sum(row.get(key, False) for row in score_rows) / n if n else 0

        total_cost = total("cost_usd")
        summary = {
            "success_rate": rate("successful"),
            "diff_rate": rate("has_diff"),
            "avg_cost_usd": total_cost / n if n else 0,
            "total_cost_usd": total_cost,
            "total_input_tokens": total("input_tokens"),
            "total_output_tokens": total("output_tokens"),
            "total_cache_read_tokens": total("cache_read_input_tokens"),
            "total_cache_creation_tokens": total("cache_creation_input_tokens"),
            "num_examples": n,
        }
        if self.num_trials > 1:
            summary.update(_pass_at_k(score_rows, self.num_trials, "successful"))
        logger.info(f"BasicMetrics summary: {summary}")
        return summary
|
|
||
|
|
||
class BuildPassRateScorer(weave.Scorer):
    """Scores local ./mach build and try push pass rates.

    Each row carries ``local_build_passed`` and ``try_build_passed``. The
    summary counts a value as passed only when it ``is True``.
    """

    # Trials per example; values above 1 add pass@k for local builds.
    num_trials: int = 1

    @weave.op()
    def score(self, output: dict | None) -> dict:
        """Extract the two build outcomes from a model output.

        Both fields are None when ``output`` is None or lacks the key.
        """
        source = {} if output is None else output
        return {
            field: source.get(field)
            for field in ("local_build_passed", "try_build_passed")
        }

    def summarize(self, score_rows: list[dict]) -> dict:
        """Aggregate build outcomes into pass rates and counts.

        The local rate is taken over all rows. The try rate is taken only over
        rows whose ``try_build_passed`` is not None; the remaining rows are
        reported as ``try_builds_timed_out``.
        """
        total = len(score_rows)
        local_ok = sum(
            row.get("local_build_passed") is True for row in score_rows
        )

        try_outcomes = [row.get("try_build_passed") for row in score_rows]
        known_try = [outcome for outcome in try_outcomes if outcome is not None]
        try_ok = sum(outcome is True for outcome in known_try)

        summary = {
            "local_build_pass_rate": local_ok / total if total else 0,
            "local_builds_passed": local_ok,
            "try_build_pass_rate": try_ok / len(known_try) if known_try else 0,
            "try_builds_passed": try_ok,
            "try_builds_timed_out": total - len(known_try),
            "num_examples": total,
        }
        if self.num_trials > 1:
            extra = _pass_at_k(score_rows, self.num_trials, "local_build_passed")
            summary.update(extra)
        logger.info(f"BuildPassRate summary: {summary}")
        return summary
|
|
||
|
|
||
class LLMFixMatchingScorer(weave.Scorer):
    """Scaffold for LLM-as-a-judge comparing agent fix to ground truth.

    Implementation deferred. Will use a non-Claude LLM to semantically
    compare the agent's diff against the ground truth fix commit. For now
    no comparison is performed and every score is a placeholder.
    """

    @weave.op()
    async def score(self, output: dict | None, gh_fix_commits: list[str]) -> dict:
        """Return a placeholder row.

        ``match_category`` is "errored" when ``output`` is None and
        "not_implemented" otherwise. ``gh_fix_commits`` is not used yet.
        """
        category = "errored" if output is None else "not_implemented"
        return {"match_score": None, "match_category": category}

    def summarize(self, score_rows: list[dict]) -> dict:
        """Report that the scorer is not implemented, ignoring the rows."""
        return {"status": "not_implemented"}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.