Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions reproducibility_journal.csv
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,8 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21
recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.8.1,2024-10-17_10-13-28,,0.557,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.7,1.39.0,0.2.2,7bba275c004f1f90dfd83eaaab963ab5066e2baf,,0.8.1,None,
ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.8.1,2024-10-17_10-50-53,,0.563,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.7,1.39.0,0.2.2,057b7d4a201cc1cd1ebd7bc884f6a91e104c479d,,0.8.1,None,
ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.4.1,2024-10-17_17-30-43,,0.258,0.024,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,7bba275c004f1f90dfd83eaaab963ab5066e2baf,,0.8.1,None,
ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.4.1,2024-10-17_18-30-28,,0.273,0.025,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,8b2b3f39a2bdb9efafad97791536a0b8cff4e708,,0.8.1,None,
Comment on lines +13 to +16
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comparison of before/after performance with gpt-4o-mini on miniwob and workarena.l1.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

awesome!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this with or without the new benchmark class with the new miniwob action space?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 1st and 3rd lines are without.
The 2nd and 4th lines are with.

ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_all,0.9.0,2024-10-20_01-54-16,2024-10-20_01-54-02,0.588,0.014,0,1250/1250,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,1770eba87fabfe1e32cdf6078d71032fe00db736,,0.9.0,None,
21 changes: 11 additions & 10 deletions src/agentlab/agents/dynamic_prompting.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html

from agentlab.llm.llm_utils import (
BaseMessage,
ParseError,
count_tokens,
extract_code_blocks,
Expand Down Expand Up @@ -122,7 +123,7 @@ def __init__(self, visible: bool = True) -> None:
self._visible = visible

@property
def prompt(self):
def prompt(self) -> str | BaseMessage:
"""Avoid overriding this method. Override _prompt instead."""
if self.is_visible:
return self._prompt
Expand Down Expand Up @@ -253,7 +254,14 @@ def fit_tokens(
if isinstance(prompt, str):
prompt_str = prompt
elif isinstance(prompt, list):
# warn deprecated
warn(
"Using list of prompts is deprecated. Use a Discussion object instead.",
DeprecationWarning,
)
prompt_str = "\n".join([p["text"] for p in prompt if p["type"] == "text"])
elif isinstance(prompt, BaseMessage):
prompt_str = str(prompt)
else:
raise ValueError(f"Unrecognized type for prompt: {type(prompt)}")
n_token = count_tokens(prompt_str, model=model_name)
Expand Down Expand Up @@ -405,21 +413,14 @@ def _prompt(self) -> str:

"""

def add_screenshot(self, prompt):
def add_screenshot(self, prompt: BaseMessage) -> BaseMessage:
if self.flags.use_screenshot:
if isinstance(prompt, str):
prompt = [{"type": "text", "text": prompt}]
if self.flags.use_som:
screenshot = self.obs["screenshot_som"]
else:
screenshot = self.obs["screenshot"]
img_url = image_to_jpg_base64_url(screenshot)
prompt.append(
{
"type": "image_url",
"image_url": {"url": img_url, "detail": self.flags.openai_vision_detail},
}
)
prompt.add_image(img_url, detail=self.flags.openai_vision_detail)
return prompt


Expand Down
11 changes: 4 additions & 7 deletions src/agentlab/agents/generic_agent/generic_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from agentlab.agents import dynamic_prompting as dp
from agentlab.agents.agent_args import AgentArgs
from agentlab.llm.chat_api import BaseModelArgs, make_system_message, make_user_message
from agentlab.llm.llm_utils import ParseError, retry
from agentlab.llm.llm_utils import Discussion, ParseError, SystemMessage, retry
from agentlab.llm.tracking import cost_tracker_decorator

from .generic_agent_prompt import GenericPromptFlags, MainPrompt
Expand Down Expand Up @@ -99,9 +99,9 @@ def get_action(self, obs):

max_prompt_tokens, max_trunc_itr = self._get_maxes()

system_prompt = dp.SystemPrompt().prompt
system_prompt = SystemMessage(dp.SystemPrompt().prompt)

prompt = dp.fit_tokens(
human_prompt = dp.fit_tokens(
shrinkable=main_prompt,
max_prompt_tokens=max_prompt_tokens,
model_name=self.chat_model_args.model_name,
Expand All @@ -112,10 +112,7 @@ def get_action(self, obs):
# TODO, we would need to further shrink the prompt if the retry
# cause it to be too long

chat_messages = [
make_system_message(system_prompt),
make_user_message(prompt),
]
chat_messages = Discussion([system_prompt, human_prompt])
ans_dict = retry(
self.chat_llm,
chat_messages,
Expand Down
20 changes: 14 additions & 6 deletions src/agentlab/agents/generic_agent/generic_agent_prompt.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from dataclasses import dataclass
import logging
from dataclasses import dataclass

from browsergym.core import action
from browsergym.core.action.base import AbstractActionSet

from agentlab.agents import dynamic_prompting as dp
from agentlab.llm.llm_utils import parse_html_tags_raise
from agentlab.llm.llm_utils import HumanMessage, parse_html_tags_raise


@dataclass
Expand Down Expand Up @@ -90,8 +92,9 @@ def time_for_caution():
self.memory = Memory(visible=lambda: flags.use_memory)

@property
def _prompt(self) -> str:
prompt = f"""\
def _prompt(self) -> HumanMessage:
prompt = HumanMessage(
f"""\
{self.instructions.prompt}\
{self.obs.prompt}\
{self.history.prompt}\
Expand All @@ -103,9 +106,11 @@ def _prompt(self) -> str:
{self.memory.prompt}\
{self.criticise.prompt}\
"""
)

if self.flags.use_abstract_example:
prompt += f"""
prompt.add_text(
f"""
# Abstract Example

Here is an abstract version of the answer with description of the content of
Expand All @@ -117,9 +122,11 @@ def _prompt(self) -> str:
{self.criticise.abstract_ex}\
{self.action_prompt.abstract_ex}\
"""
)

if self.flags.use_concrete_example:
prompt += f"""
prompt.add_text(
f"""
# Concrete Example

Here is a concrete example of how to format your answer.
Expand All @@ -130,6 +137,7 @@ def _prompt(self) -> str:
{self.criticise.concrete_ex}\
{self.action_prompt.concrete_ex}\
"""
)
return self.obs.add_screenshot(prompt)

def shrink(self):
Expand Down
8 changes: 5 additions & 3 deletions src/agentlab/agents/generic_agent/reproducibility_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from agentlab.agents.agent_args import AgentArgs
from agentlab.experiments.study import Study
from agentlab.llm.chat_api import make_assistant_message
from agentlab.llm.llm_utils import messages_to_dict
from agentlab.llm.llm_utils import Discussion, messages_to_dict

from .generic_agent import GenericAgent, GenericAgentArgs

Expand All @@ -43,7 +43,7 @@ def __init__(self, old_messages, delay=1) -> None:
self.old_messages = old_messages
self.delay = delay

def __call__(self, messages: list):
def __call__(self, messages: list | Discussion):
self.new_messages = copy(messages)

if len(messages) >= len(self.old_messages):
Expand Down Expand Up @@ -95,7 +95,7 @@ def get_action(self, obs):
# same answers
step = len(self.actions)
step_info = self.exp_result.get_step_info(step)
old_chat_messages = step_info.agent_info.get("chat_messages", None)
old_chat_messages = step_info.agent_info.get("chat_messages", None) # type: Discussion

if old_chat_messages is None:
err_msg = self.exp_result.summary_info["err_msg"]
Expand Down Expand Up @@ -135,6 +135,8 @@ def _make_agent_stats(action, agent_info, step_info, old_chat_messages, new_chat


def _format_messages(messages: list[dict]):
if isinstance(messages, Discussion):
return messages.to_string()
messages = messages_to_dict(messages)
return "\n".join(f"{m['role']} message:\n{m['content']}\n" for m in messages)

Expand Down
48 changes: 30 additions & 18 deletions src/agentlab/agents/most_basic_agent/most_basic_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,18 @@

import bgym

from agentlab.agents.agent_args import AgentArgs
from agentlab.llm.chat_api import make_system_message, make_user_message
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
from agentlab.llm.llm_utils import ParseError, extract_code_blocks, retry
from agentlab.llm.llm_utils import (
Discussion,
HumanMessage,
ParseError,
SystemMessage,
extract_code_blocks,
retry,
)
from agentlab.llm.tracking import cost_tracker_decorator
from agentlab.agents.agent_args import AgentArgs

if TYPE_CHECKING:
from agentlab.llm.chat_api import BaseModelArgs
Expand Down Expand Up @@ -51,25 +58,25 @@ def __init__(

@cost_tracker_decorator
def get_action(self, obs: Any) -> tuple[str, dict]:
system_prompt = f"""
You are a web assistant.
"""
prompt = f"""
messages = Discussion(SystemMessage("You are a web assistant."))
messages.append(
HumanMessage(
f"""
You are helping a user to accomplish the following goal on a website:

{obs["goal"]}

Here is the current state of the website, in the form of an html:

{obs["dom_txt"]}

To do so, you can interact with the environment using the following actions:

{self.action_set.describe(with_long_description=False)}

The inputs to those functions are the bids given in the html.

The action you provide must be in between triple ticks.
Here is the current state of the website, in the form of an html:

{obs["pruned_html"]}

The action you provide must be in between triple ticks and leverage the 'bid=' information provided in the html.
Here is an example of how to use the bid action:

```
Expand All @@ -79,15 +86,14 @@ def get_action(self, obs: Any) -> tuple[str, dict]:
Please provide a single action at a time and wait for the next observation. Provide only a single action per step.
Focus on the bid that are given in the html, and use them to perform the actions.
"""
)
)
if self.use_chain_of_thought:
prompt += f"""
messages.add_text(
f"""
Provide a chain of thoughts reasoning to decompose the task into smaller steps. And execute only the next step.
"""

messages = [
make_system_message(system_prompt),
make_user_message(prompt),
]
)

def parser(response: str) -> tuple[dict, bool, str]:
blocks = extract_code_blocks(response)
Expand All @@ -108,7 +114,7 @@ def parser(response: str) -> tuple[dict, bool, str]:
think=thought,
chat_messages=messages,
# put any stats that you care about as long as it is a number or a dict of numbers
stats={"prompt_length": len(prompt), "response_length": len(thought)},
stats={"prompt_length": len(messages), "response_length": len(thought)},
markdown_page="Add any txt information here, including base 64 images, to display in xray",
extra_info={"chat_model_args": asdict(self.chat_model_args)},
),
Expand Down Expand Up @@ -147,6 +153,12 @@ def parser(response: str) -> tuple[dict, bool, str]:
),
]

AGENT_4o_MINI = MostBasicAgentArgs(
temperature=0.3,
use_chain_of_thought=True,
chat_model_args=chat_model_args,
)


def experiment_config():
return exp_args
7 changes: 5 additions & 2 deletions src/agentlab/analyze/agent_xray.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@

from agentlab.analyze import inspect_results
from agentlab.experiments.exp_utils import RESULTS_DIR
from agentlab.llm.chat_api import make_system_message, make_user_message
from agentlab.experiments.study import get_most_recent_study
from agentlab.llm.chat_api import make_system_message, make_user_message
from agentlab.llm.llm_utils import Discussion

select_dir_instructions = "Select Experiment Directory"
AGENT_NAME_KEY = "agent.agent_name"
Expand Down Expand Up @@ -581,7 +582,9 @@ def update_chat_messages():
global info
agent_info = info.exp_result.steps_info[info.step].agent_info
chat_messages = agent_info.get("chat_messages", ["No Chat Messages"])
messages = []
if isinstance(chat_messages, Discussion):
return chat_messages.to_markdown()
messages = [] # TODO(ThibaultLSDC) remove this at some point
for i, m in enumerate(chat_messages):
if isinstance(m, BaseMessage): # TODO remove once langchain is deprecated
m = m.content
Comment on lines +585 to 590
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Discussion class deprecates a lot of code pieces, but I figured it might be safer to keep them for a while.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For backward compatibility? Perhaps we can wrap the backward-compatible code in some isolated function (no need to do it now).

As long as we're at least forward compatible :)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does that work with AgentInfo in browsergym? The type won't be Discussion since it's only defined in AgentLab.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't thought of that. I feel like it would be a weird use case to use only browsergym on traces that were made with AgentLab, though.

Expand Down
1 change: 1 addition & 0 deletions src/agentlab/experiments/reproducibility_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ def get_reproducibility_info(
"*/reproducibility_script.py",
"*reproducibility_journal.csv",
"*main.py",
"*inspect_results.ipynb",
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's just convenient

),
ignore_changes=False,
):
Expand Down
6 changes: 5 additions & 1 deletion src/agentlab/llm/chat_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import agentlab.llm.tracking as tracking
from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs
from agentlab.llm.huggingface_utils import HFBaseChatModel
from agentlab.llm.llm_utils import Discussion


def make_system_message(content: str) -> dict:
Expand All @@ -31,7 +32,10 @@ class CheatMiniWoBLLM(AbstractChatModel):
"""For unit-testing purposes only. It only work with miniwob.click-test task."""

def __call__(self, messages) -> str:
prompt = messages[-1]["content"]
if isinstance(messages, Discussion):
prompt = messages.to_string()
else:
prompt = messages[1].get("content", "")
match = re.search(r"^\s*\[(\d+)\].*button", prompt, re.MULTILINE | re.IGNORECASE)

if match:
Expand Down
Loading