Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
df06331
dask-dependencies
recursix Oct 29, 2024
b4231ca
minor
recursix Oct 29, 2024
765c746
replace with ray
recursix Oct 31, 2024
f3f9843
adjust tests and move a few things
recursix Oct 31, 2024
0227afa
markdown report
recursix Oct 31, 2024
3eab2ed
automatic relaunch
recursix Oct 31, 2024
98a8836
add dependencies
recursix Oct 31, 2024
ff1ce46
reformat
recursix Oct 31, 2024
3e2994c
fix unit-test
recursix Oct 31, 2024
bd45ae7
catch timeout
recursix Oct 31, 2024
ba3c4b8
Merge branch 'dask-dependencies' of github.com:ServiceNow/AgentLab in…
recursix Oct 31, 2024
76e0507
fixing bugs and making things work
recursix Nov 1, 2024
b0b92ba
address comments and black format
recursix Nov 2, 2024
af41a5b
new dependencies viewer
recursix Nov 2, 2024
b6295f6
Update benchmark to use visualwebarena instead of webarena
recursix Nov 2, 2024
351f30c
Fix import and uncomment code in get_ray_url.py
recursix Nov 2, 2024
0f4ca46
Add ignore_dependencies option to Study and _agents_on_benchmark func…
recursix Nov 2, 2024
6ea1772
Update load_most_recent method to include contains parameter
recursix Nov 2, 2024
418a05d
Update load_most_recent method to accept contains parameter and add w…
recursix Nov 2, 2024
b8580e6
Refactor backend preparation in Study class and improve logging for i…
recursix Nov 4, 2024
e910ae3
finally some results with claude on webarena
recursix Nov 4, 2024
2e28f2e
Add warnings for Windows timeouts and clarify parallel backend option…
recursix Nov 4, 2024
aece13b
black
recursix Nov 4, 2024
209c8d0
ensure timeout is int (For the 3rd time?)
recursix Nov 4, 2024
54c4681
Merge branch 'dev' into dask-dependencies
recursix Nov 4, 2024
729ec92
Refactor timeout handling in context manager; update test to reduce a…
recursix Nov 4, 2024
4f937f2
black
recursix Nov 4, 2024
366afbc
Change parallel backend from "joblib" to "ray" in run_experiments fun…
recursix Nov 4, 2024
7b1ecf1
Update src/agentlab/experiments/study.py
recursix Nov 4, 2024
8e8f07a
Update src/agentlab/analyze/inspect_results.py
recursix Nov 4, 2024
a922678
Refactor logging initialization and update layout configurations in d…
recursix Nov 4, 2024
c3b61ba
Merge branch 'dev' into dask-dependencies
recursix Nov 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
repository.
"""

import bgym
import logging
from agentlab.agents.generic_agent import (
RANDOM_SEARCH_AGENT,
Expand All @@ -26,7 +25,7 @@

# ## select the benchmark to run on
benchmark = "miniwob_tiny_test"
# benchmark = "miniwob_all"
# benchmark = "miniwob"
# benchmark = "workarena_l1"
# benchmark = "workarena_l2"
# benchmark = "workarena_l3"
Expand All @@ -53,13 +52,18 @@

if relaunch:
# relaunch an existing study
study = Study.load_most_recent()
study.find_incomplete(relaunch_mode="incomplete_or_error")
study = Study.load_most_recent(contains=None)
study.find_incomplete(include_errors=True)

else:
study = Study(agent_args, benchmark)

study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=reproducibility_mode)
study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING)

study.run(
n_jobs=n_jobs,
parallel_backend="ray",
strict_reproducibility=reproducibility_mode,
n_relaunch=3,
)

if reproducibility_mode:
study.append_to_journal(strict_reproducibility=True)
1 change: 1 addition & 0 deletions reproducibility_journal.csv
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ ThibaultLSDC,GenericAgent-gpt-4o,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-
ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.564,0.027,1,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724,
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.279,0.025,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724,
ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2024-10-23_22-30-06,2024-10-23_14-17-40,0.567,0.027,4,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,4cd1e2d4189ddfbeb94129f7b0c9a00c3400ebac,,0.9.0,f25bdcd6b946fc4a79cdbee5fbcad53548af8724,
recursix,GenericAgent-anthropic_claude-3.5-sonnet:beta,webarena,0.11.3,2024-11-02_23-50-17,22a9d3f5-9d86-455e-b451-3ea17690ce8a,0.329,0.016,0,812/812,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.3,418a05d90c74800cd66371b7846ef861185b8c47,,0.11.3,160167ff0d2631826f0131e8e30b92ef448d6881,
ThibaultLSDC,GenericAgent-gpt-4o-mini,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.013,0.007,2,235/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None,
ThibaultLSDC,GenericAgent-gpt-4o,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.085,0.018,3,233/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None,
ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,workarena_l2_agent_curriculum_eval,0.4.1,2024-10-24_17-08-53,2024-10-23_17-10-46,0.391,0.032,3,235/235,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,827d847995f19dc337f3899427340bdddbd81cd5,,0.10.0,None,
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ gradio>=5
gitpython # for the reproducibility script
requests
matplotlib
ray[default]
8 changes: 6 additions & 2 deletions src/agentlab/analyze/inspect_results.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,11 @@
"metadata": {},
"outputs": [],
"source": [
"print(inspect_results.error_report(result_df, max_stack_trace=1))"
"from IPython.display import Markdown, display\n",
"\n",
"report = inspect_results.error_report(result_df, max_stack_trace=2, use_log=True)\n",
"# display(Markdown(report))\n",
"print(report)"
]
},
{
Expand All @@ -166,7 +170,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "ui-copilot",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand Down
61 changes: 45 additions & 16 deletions src/agentlab/analyze/inspect_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,10 +581,12 @@ def set_wrap_style(df):
# ------------


def map_err_key(err_msg):
def map_err_key(err_msg: str):
if err_msg is None:
return err_msg

# remove logs from the message if any
err_msg = err_msg[: err_msg.find("=== logs ===")].rstrip()
regex_replacements = [
(
r"your messages resulted in \d+ tokens",
Expand All @@ -601,7 +603,7 @@ def map_err_key(err_msg):
return err_msg


def error_report(df: pd.DataFrame, max_stack_trace=10):
def error_report(df: pd.DataFrame, max_stack_trace=10, use_log=False):
"""Report the error message for each agent."""

if "err_key" not in df:
Expand All @@ -611,35 +613,62 @@ def error_report(df: pd.DataFrame, max_stack_trace=10):
report = []
for err_key, count in unique_counts.items():
report.append("-------------------")
report.append(f"{count}x : {err_key}\n")
report.append(f"## {count}x : " + err_key.replace("\n", "<br>") + "\n")

# find sub_df with this error message
sub_df = df[df["err_key"] == err_key]
idx = 0

exp_result_list = [get_exp_result(row.exp_dir) for _, row in sub_df.iterrows()]
task_names = [exp_result.exp_args.env_args.task_name for exp_result in exp_result_list]

# count unique using numpy
unique_task_names, counts = np.unique(task_names, return_counts=True)
task_and_count = sorted(zip(unique_task_names, counts), key=lambda x: x[1], reverse=True)
for task_name, count in task_and_count:
report.append(f"{count:2d} {task_name}")
exp_result_list = sorted(exp_result_list, key=lambda x: x.exp_args.env_args.task_name)
for exp_result in exp_result_list:
report.append(
f"* {exp_result.exp_args.env_args.task_name} seed: {exp_result.exp_args.env_args.task_seed}"
)

report.append(f"\nShowing Max {max_stack_trace} stack traces:\n")
for exp_result in exp_result_list:
if idx >= max_stack_trace:
break
# print task name and stack trace
stack_trace = exp_result.summary_info.get("stack_trace", "")
report.append(f"Task Name: {exp_result.exp_args.env_args.task_name}\n")
report.append(f"exp_dir: {exp_result.exp_dir}\n")
report.append(f"Stack Trace: \n {stack_trace}\n")
report.append("\n")

if not use_log:
# print task name and stack trace
stack_trace = exp_result.summary_info.get("stack_trace", "")
report.append(f"Task Name: {exp_result.exp_args.env_args.task_name}\n")
report.append(f"exp_dir: {exp_result.exp_dir}\n")
report.append(f"Stack Trace: \n {stack_trace}\n")
report.append("\n")
else:
report.append(f"```bash\n{_format_log(exp_result)}\n```")

idx += 1

return "\n".join(report)


def _format_log(exp_result: ExpResult, head_lines=10, tail_lines=50):
"""Extract head and tail of the log. Try to find the traceback."""
log = exp_result.logs
if log is None:
return "No log found"

log_lines = log.split("\n")
if len(log_lines) <= head_lines + tail_lines:
return log

# first 10 lines:
log_head = "\n".join(log_lines[:head_lines])

try:
traceback_idx = log.rindex("Traceback (most recent call last):")
tail_idx = log.rindex("action:", 0, traceback_idx)
log_tail = log[tail_idx:]
except ValueError:
log_tail = "\n".join(log_lines[-tail_lines:])

return log_head + "\n...\n...truncated middle of the log\n...\n" + log_tail


def categorize_error(row):
if pd.isna(row.get("err_msg", None)):
return None
Expand Down
160 changes: 146 additions & 14 deletions src/agentlab/experiments/exp_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
from tqdm import tqdm
import logging
from browsergym.experiments.loop import ExpArgs
from contextlib import contextmanager
import signal
import sys
from time import time, sleep

logger = logging.getLogger(__name__) # Get logger based on module name


# TODO move this to a more appropriate place
Expand All @@ -19,8 +25,148 @@
RESULTS_DIR.mkdir(parents=True, exist_ok=True)


def run_exp(exp_arg: "ExpArgs", *dependencies, avg_step_timeout=60):
    """Execute `exp_arg.run()` under an episode-level timeout.

    Args:
        exp_arg: the experiment to run.
        *dependencies: results of upstream tasks. Unused in the body;
            presumably they exist so the parallel backend schedules this task
            after its dependencies complete — confirm against the scheduler.
        avg_step_timeout: per-step budget used to derive the episode timeout.

    Returns:
        Whatever `exp_arg.run()` returns.
    """
    budget = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout)
    with timeout_manager(seconds=budget):
        return exp_arg.run()


def _episode_timeout(exp_arg: ExpArgs, avg_step_timeout=60):
"""Some logic to determine the episode timeout."""
max_steps = getattr(exp_arg.env_args, "max_steps", None)
if max_steps is None:
episode_timeout_global = 10 * 60 * 60 # 10 hours
else:
episode_timeout_global = exp_arg.env_args.max_steps * avg_step_timeout

episode_timeout_exp = getattr(exp_arg, "episode_timeout", episode_timeout_global)

return min(episode_timeout_global, episode_timeout_exp)


@contextmanager
def timeout_manager(seconds: int = None):
    """Context manager that aborts the enclosed block after `seconds` seconds.

    The timeout is implemented with SIGALRM, so it only works on POSIX. On
    Windows, or when `seconds` is None, the block runs without any time limit.

    Args:
        seconds: timeout in seconds. Floats are truncated to an int (minimum 1,
            since `signal.alarm` needs a positive integer). None disables the
            timeout entirely.

    Raises:
        TimeoutError: if the block does not finish within `seconds`.
    """
    if isinstance(seconds, float):
        seconds = max(1, int(seconds))  # make sure seconds is at least 1

    if seconds is None:
        # No timeout requested: run the block unguarded (and do not emit the
        # Windows warning, which would be misleading on POSIX).
        yield
        return

    if sys.platform == "win32":
        # SIGALRM does not exist on Windows, so no timeout can be enforced.
        logger.warning("Timeouts are not supported on Windows.")
        yield
        return

    def alarm_handler(signum, frame):
        logger.warning(
            f"Operation timed out after {seconds}s, sending SIGINT and raising TimeoutError."
        )
        # SIGINT first, to give the running code a chance to interrupt cleanly.
        os.kill(os.getpid(), signal.SIGINT)

        # Still raise TimeoutError for immediate handling
        raise TimeoutError(f"Operation timed out after {seconds} seconds")

    previous_handler = signal.signal(signal.SIGALRM, alarm_handler)
    signal.alarm(seconds)

    try:
        yield
    finally:
        # Always cancel the pending alarm and restore the previous handler so
        # the timeout cannot fire outside this context.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)
Comment on lines +77 to +81
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is black magic to me. I'll just trust you :)



def add_dependencies(
    exp_args_list: "list[ExpArgs]", task_dependencies: "dict[str, list[str]]" = None
):
    """Add dependencies to a list of ExpArgs.

    Args:
        exp_args_list: list[ExpArgs]
            A list of experiments to run.
        task_dependencies: dict
            A dictionary mapping task names to a list of task names that they
            depend on. If None or empty, no dependencies are added.

    Returns:
        list[ExpArgs]
            The modified exp_args_list: each element gets an `exp_id` and a
            `depends_on` tuple of the exp_ids it depends on.

    Raises:
        ValueError: if task names are not unique in `exp_args_list`, or if a
            task is missing from `task_dependencies`.
    """

    if task_dependencies is None or all(len(dep) == 0 for dep in task_dependencies.values()):
        # nothing to be done
        return exp_args_list

    for exp_args in exp_args_list:
        exp_args.make_id()  # makes sure there is an exp_id

    exp_args_map = {exp_args.env_args.task_name: exp_args for exp_args in exp_args_list}
    if len(exp_args_map) != len(exp_args_list):
        # duplicate task names would make the name -> exp_args mapping ambiguous
        raise ValueError(
            (
                "Task names are not unique in exp_args_map, "
                "you can't run multiple seeds with task dependencies."
            )
        )

    for task_name in exp_args_map.keys():
        if task_name not in task_dependencies:
            raise ValueError(f"Task {task_name} is missing from task_dependencies")

    # turn dependencies from task names to exp_ids
    for task_name, exp_args in exp_args_map.items():
        exp_args.depends_on = tuple(
            exp_args_map[dep_name].exp_id for dep_name in task_dependencies[task_name]
        )

    return exp_args_list


class MockedExpArgs:
    """Lightweight stand-in for the ExpArgs class, used in unit tests.

    Records start/end timestamps around `run()` so tests can check that
    dependent experiments were scheduled in the right order.
    """

    def __init__(self, exp_id, depends_on=None):
        self.exp_id = exp_id
        self.depends_on = depends_on or []
        self.start_time = None
        self.end_time = None
        self.env_args = None

    def run(self):
        """Simulate running an experiment: sleep briefly and record timestamps."""
        self.start_time = time()

        # # simulate playright code, (this was causing issues due to python async loop)
        # import playwright.sync_api

        # pw = playwright.sync_api.sync_playwright().start()
        # pw.selectors.set_test_id_attribute("mytestid")
        sleep(3)  # Simulate task execution time

        self.end_time = time()
        return self


def make_seeds(n, offset=42):
    """Deprecated: always raises. Used to build `n` sequential seeds."""
    raise DeprecationWarning("This function will be removed. Comment out this error if needed.")
    # Unreachable reference implementation, kept for anyone who re-enables it.
    return [offset + seed for seed in range(n)]


def order(exp_args_list: "list[ExpArgs]"):
    """Store the order of the list of experiments to be able to sort them back.

    This is important for progression or ablation studies.

    Deprecated: always raises; comment out the raise if the behavior is needed.
    """
    # NOTE: the docstring used to sit *after* the raise, making it unreachable
    # dead code (order.__doc__ was None). It now precedes the raise.
    raise DeprecationWarning("This function will be removed. Comment out this error if needed.")
    for i, exp_args in enumerate(exp_args_list):
        exp_args.order = i
    return exp_args_list


# This was an old function for filtering some issue with the experiments.
def hide_some_exp(base_dir, filter: callable, just_test):
"""Move all experiments that match the filter to a new name."""
raise DeprecationWarning("This function will be removed. Comment out this error if needed.")
exp_list = list(yield_all_exp_results(base_dir, progress_fn=None))

msg = f"Searching {len(exp_list)} experiments to move to _* expriments where `filter(exp_args)` is True."
Expand All @@ -38,17 +184,3 @@ def hide_some_exp(base_dir, filter: callable, just_test):
_move_old_exp(exp.exp_dir)
filtered_out.append(exp)
return filtered_out


def make_seeds(n, offset=42):
    """Return `n` deterministic seeds: offset, offset + 1, ..., offset + n - 1."""
    return [offset + i for i in range(n)]


def order(exp_args_list: "list[ExpArgs]"):
    """Store the order of the list of experiments to be able to sort them back.

    This is important for progression or ablation studies.
    """
    for position, exp_args in enumerate(exp_args_list):
        exp_args.order = position
    return exp_args_list
5 changes: 5 additions & 0 deletions src/agentlab/experiments/get_ray_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Connect to an already-running Ray cluster and print its context.
# Presumably used to retrieve the cluster/dashboard URL (per the file name
# get_ray_url.py) — confirm against how the output is consumed.
import ray

# address="auto" discovers the running cluster; ignore_reinit_error avoids
# failing if Ray was already initialized in this process.
context = ray.init(address="auto", ignore_reinit_error=True)

print(context)
Loading