Merged

Commits (29):
a032854: yet another way to kill timedout jobs (recursix, Nov 6, 2024)
ac1a461: Improve timeout handling in task polling logic (recursix, Nov 6, 2024)
6384b85: Merge branch 'dev' into clean-pipeline (recursix, Nov 6, 2024)
b0594ab: Merge branch 'dev' into clean-pipeline (recursix, Nov 7, 2024)
e9fffc2: Merge branch 'dev' into clean-pipeline (recursix, Nov 7, 2024)
290b88d: Add method to override max_steps in Study class (recursix, Nov 7, 2024)
3f05803: add support for tab visibility in observation flags and update relate… (recursix, Nov 8, 2024)
2fe585f: fix tests (recursix, Nov 8, 2024)
4a8cbb2: black (recursix, Nov 8, 2024)
17fc3d1: Improve timeout handling in task polling logic (recursix, Nov 6, 2024)
1e07d3e: yet another way to kill timedout jobs (#108) (recursix, Nov 6, 2024)
63d8deb: Add method to override max_steps in Study class (recursix, Nov 7, 2024)
b88a058: add support for tab visibility in observation flags and update relate… (recursix, Nov 8, 2024)
e97d023: fix tests (recursix, Nov 8, 2024)
ccd7b8b: black (recursix, Nov 8, 2024)
1aa4916: black (gasse, Nov 8, 2024)
c990e76: (recursix, Nov 8, 2024)
8de36e2: Fix sorting bug. (recursix, Nov 8, 2024)
c4e8acb: fix test (recursix, Nov 8, 2024)
c9f184c: black (recursix, Nov 8, 2024)
2ab4f3a: Merge branch 'fix-tabs' of github.com:ServiceNow/AgentLab into fix-tabs (recursix, Nov 9, 2024)
b465e63: Merge branch 'dev' into fix-gradio (recursix, Nov 9, 2024)
d0fcb39: Merge branch 'fix-tabs' into fix-gradio (recursix, Nov 9, 2024)
3a96d56: tmp (recursix, Nov 9, 2024)
2b4775b: Merge branch 'dev' into clean-pipeline (recursix, Nov 13, 2024)
a16aea0: add error report, add cum cost to summary and ray backend by default (recursix, Nov 13, 2024)
bd12318: from previous (recursix, Nov 15, 2024)
6a50756: Merge branch 'dev' into Study-to-multi-eval (recursix, Nov 15, 2024)
50d4571: sequential studies (recursix, Nov 15, 2024)
199 changes: 149 additions & 50 deletions src/agentlab/experiments/study.py
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
import gzip
import logging
import pickle
import re
import uuid
from dataclasses import dataclass
from datetime import datetime
@@ -13,7 +13,6 @@

from agentlab.agents.agent_args import AgentArgs
from agentlab.analyze import inspect_results
from agentlab.experiments import args
from agentlab.experiments import reproducibility_util as repro
from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies
from agentlab.experiments.launch_exp import (
@@ -22,11 +21,96 @@
run_experiments,
)


logger = logging.getLogger(__name__)


def make_study(
agent_args: list[AgentArgs],
benchmark: bgym.Benchmark,
logging_level_stdout=logging.WARNING,
suffix="",
comment=None,
ignore_dependencies=False,
):

    """Make a study from a list of agents and a benchmark."""
    if isinstance(benchmark, str):
        benchmark = bgym.DEFAULT_BENCHMARKS[benchmark]()

if "webarena" in benchmark.name and len(agent_args) > 1:
logger.warning(
"*WebArena* requires manual reset after each evaluation. Running through SequentialStudies."
)
studies = []
for agent in agent_args:
studies.append(
Study(
[agent],
benchmark,
logging_level=logging_level_stdout,
suffix=suffix,
comment=comment,
ignore_dependencies=ignore_dependencies,
)
)

return SequentialStudies(studies)
else:
return Study(
agent_args,
benchmark,
logging_level=logging_level_stdout,
suffix=suffix,
comment=comment,
ignore_dependencies=ignore_dependencies,
)
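
Editor's note: a minimal usage sketch of this new factory. The agent config
name is hypothetical and "miniwob" is assumed to be a valid key in
bgym.DEFAULT_BENCHMARKS; only make_study itself comes from this diff.

import logging
from agentlab.experiments.study import make_study

# my_agent_args is a hypothetical AgentArgs instance; real configs live
# elsewhere in AgentLab.
study = make_study(
    agent_args=[my_agent_args],
    benchmark="miniwob",  # assumed key; strings resolve via bgym.DEFAULT_BENCHMARKS
    suffix="smoke-test",
)
# For a WebArena benchmark with several agents, make_study returns a
# SequentialStudies instance instead; run() works the same either way.
study.run(n_jobs=4)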


class AbstractStudy(ABC):
dir: Path = None
suffix: str = ""

@abstractmethod
    def find_incomplete(self, include_errors=True):
        """Search for incomplete or errored experiments."""

@abstractmethod
def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3):
"""Run the study"""

def make_dir(self, exp_root=RESULTS_DIR):
if self.dir is None:
dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}"

self.dir = Path(exp_root) / dir_name
self.dir.mkdir(parents=True, exist_ok=True)

def save(self, exp_root=RESULTS_DIR):
"""Pickle the study to the directory"""
# TODO perhaps remove exp_args_list before pickling and when loading bring them from the individual directories

self.make_dir(exp_root=exp_root)
with gzip.open(self.dir / "study.pkl.gz", "wb") as f:
pickle.dump(self, f)

def get_results(self, suffix="", also_save=True):
"""Recursively load all results from the study directory and summarize them."""
result_df = inspect_results.load_result_df(self.dir)
error_report = inspect_results.error_report(result_df, max_stack_trace=3, use_log=True)
summary_df = inspect_results.summarize_study(result_df)

if also_save:
suffix = f"_{suffix}" if suffix else ""
result_df.to_csv(self.dir / f"result_df{suffix}.csv")
summary_df.to_csv(self.dir / f"summary_df{suffix}.csv")
(self.dir / f"error_report{suffix}.md").write_text(error_report)

return result_df, summary_df, error_report
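
Editor's note: since save() just gzip-pickles the study into its directory,
loading one back by hand is straightforward. A minimal sketch, assuming only
what the diff shows (the project also exposes Study.load and
Study.load_most_recent for this):

import gzip
import pickle
from pathlib import Path

def load_study(study_dir: Path):
    # Inverse of AbstractStudy.save(): the study object is stored as a
    # gzip-compressed pickle named study.pkl.gz.
    with gzip.open(study_dir / "study.pkl.gz", "rb") as f:
        return pickle.load(f)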


@dataclass
class Study:
class Study(AbstractStudy):
    """A study corresponds to one or multiple agents evaluated on a benchmark.

This is part of the high level API to help keep experiments organized and reproducible.
@@ -142,7 +226,7 @@ def run(
self._run(n_jobs, parallel_backend, strict_reproducibility)

suffix = f"trial_{i + 1}_of_{n_relaunch}"
_, summary_df, error_report = self.get_results(suffix=suffix)
_, summary_df, _ = self.get_results(suffix=suffix)
logger.info("\n" + str(summary_df))

n_incomplete, n_error = self.find_incomplete(include_errors=relaunch_errors)
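
Editor's note: the truncated hunk above sits inside a retry loop. A hedged
sketch of the control flow it implies, using only names visible in the diff
(the exact stopping condition is an assumption):

for i in range(n_relaunch):
    self._run(n_jobs, parallel_backend, strict_reproducibility)
    _, summary_df, _ = self.get_results(suffix=f"trial_{i + 1}_of_{n_relaunch}")
    logger.info("\n" + str(summary_df))
    n_incomplete, n_error = self.find_incomplete(include_errors=relaunch_errors)
    if n_incomplete == 0:
        break  # assumption: stop relaunching once nothing is left incomplete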
@@ -200,60 +284,17 @@ def append_to_journal(self, strict_reproducibility=True):
ValueError: If the reproducibility information is not compatible
with the report.
"""
_, summary_df, _ = self.get_results()
repro.append_to_journal(
self.reproducibility_info,
self.get_report(),
summary_df,
strict_reproducibility=strict_reproducibility,
)

def get_results(self, suffix="", also_save=True):
result_df = inspect_results.load_result_df(self.dir)
error_report = inspect_results.error_report(result_df, max_stack_trace=3, use_log=True)
summary_df = inspect_results.summarize_study(result_df)

if also_save:
suffix = f"_{suffix}" if suffix else ""
result_df.to_csv(self.dir / f"result_df{suffix}.csv")
summary_df.to_csv(self.dir / f"summary_df{suffix}.csv")
(self.dir / f"error_report{suffix}.md").write_text(error_report)

return result_df, summary_df, error_report

@property
def name(self):
agent_names = [a.agent_name for a in self.agent_args]
if len(agent_names) == 1:
study_name = f"{agent_names[0]}_on_{self.benchmark.name}"
else:
study_name = f"{len(agent_names)}_agents_on_{self.benchmark.name}"

study_name = slugify(study_name, max_length=100, allow_unicode=True)

if self.suffix:
study_name += f"_{self.suffix}"
return study_name

def make_dir(self, exp_root=RESULTS_DIR):
if self.dir is None:
dir_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{self.name}"

self.dir = Path(exp_root) / dir_name
self.dir.mkdir(parents=True, exist_ok=True)

def save(self):
"""Pickle the study to the directory"""

# TODO perhaps remove exp_args_list before pickling and when loading bring them from the individual directories

self.make_dir()

with gzip.open(self.dir / "study.pkl.gz", "wb") as f:
pickle.dump(self, f)

def get_report(self, ignore_cache=False, ignore_stale=False):
return inspect_results.get_study_summary(
self.dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale
)
return _make_study_name(agent_names, [self.benchmark.name], self.suffix)

def override_max_steps(self, max_steps):
for exp_args in self.exp_args_list:
@@ -288,6 +329,64 @@ def load_most_recent(root_dir: Path = None, contains=None) -> "Study":
return Study.load(get_most_recent_study(root_dir, contains=contains))


def _make_study_name(agent_names, benchmark_names, suffix=None):
"""Make a study name from the agent and benchmark names."""
if len(agent_names) == 1:
agent_name = agent_names[0]
else:
agent_name = f"{len(agent_names)}_agents"

if len(benchmark_names) == 1:
benchmark_name = benchmark_names[0]
else:
benchmark_name = f"{len(benchmark_names)}_benchmarks"

study_name = f"{agent_name}_on_{benchmark_name}_{suffix if suffix else ''}"

return slugify(study_name, max_length=200, allow_unicode=True)
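
Editor's note: two worked examples of this naming scheme. The inputs are
illustrative, and the outputs are approximate, since slugify normalizes case
and separators and strips trailing ones:

_make_study_name(["GenericAgent"], ["miniwob"], suffix="v2")
# -> something like "genericagent-on-miniwob-v2"
_make_study_name(["agent_a", "agent_b"], ["miniwob", "workarena"], suffix=None)
# -> something like "2-agents-on-2-benchmarks" (the trailing "_" left by an
#    empty suffix is stripped by slugify)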


@dataclass
class SequentialStudies(AbstractStudy):
"""
Sequential execution of multiple studies.

    This is needed for benchmarks such as WebArena, where the server must be reset between evaluations of each agent.
"""

studies: list[Study]

@property
def name(self):
"""The name of the study."""
agent_names = [a.agent_name for study in self.studies for a in study.agent_args]
benchmark_names = [study.benchmark.name for study in self.studies]
return _make_study_name(agent_names, benchmark_names, self.suffix)

def find_incomplete(self, include_errors=True):
for study in self.studies:
study.find_incomplete(include_errors=include_errors)

def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3):

self.save()

for study in self.studies:
study.make_dir(exp_root=self.dir)
study.run(n_jobs, parallel_backend, strict_reproducibility, n_relaunch)
_, summary_df, _ = self.get_results()
logger.info("\n" + str(summary_df))
logger.info(f"SequentialStudies {self.name} finished.")

def override_max_steps(self, max_steps):
for study in self.studies:
study.override_max_steps(max_steps)

def append_to_journal(self, strict_reproducibility=True):
for study in self.studies:
study.append_to_journal(strict_reproducibility=strict_reproducibility)
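
Editor's note: a hedged usage sketch. This mirrors what make_study() builds
automatically for WebArena with multiple agents; the two agent configs and
the benchmark object are hypothetical:

studies = [Study([agent], benchmark) for agent in (agent_a, agent_b)]
seq = SequentialStudies(studies)
seq.run(n_jobs=4)  # each sub-study finishes (with relaunches) before the next starts
seq.append_to_journal(strict_reproducibility=False)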


def get_most_recent_study(
root_dir: Path = None, date_format: str = "%Y-%m-%d_%H-%M-%S", contains=None
):
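
Editor's note: a closing sketch of the resume workflow this diff enables,
assuming Study.load_most_recent is exposed as the hunk header above suggests
(the contains filter is matched against the study directory name; the
substring is illustrative):

from agentlab.experiments.study import Study

# Load the most recent study under RESULTS_DIR whose directory name
# contains "miniwob", then keep running whatever is incomplete.
study = Study.load_most_recent(contains="miniwob")
study.run(n_jobs=4, n_relaunch=3)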