ServiceNow · TLSDC · Oct 21, 2024 · Oct 16, 2024 · Oct 16, 2024 · Oct 17, 2024
diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv
@@ -10,3 +10,8 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21
 recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
 recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
 recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
+ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.8.1,2024-10-17_10-13-28,,0.557,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.7,1.39.0,0.2.2,7bba275c004f1f90dfd83eaaab963ab5066e2baf,,0.8.1,None,
+ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.8.1,2024-10-17_10-50-53,,0.563,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.7,1.39.0,0.2.2,057b7d4a201cc1cd1ebd7bc884f6a91e104c479d,,0.8.1,None,
+ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.4.1,2024-10-17_17-30-43,,0.258,0.024,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,7bba275c004f1f90dfd83eaaab963ab5066e2baf,,0.8.1,None,
+ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.4.1,2024-10-17_18-30-28,,0.273,0.025,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,8b2b3f39a2bdb9efafad97791536a0b8cff4e708,,0.8.1,None,
+ThibaultLSDC,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_all,0.9.0,2024-10-20_01-54-16,2024-10-20_01-54-02,0.588,0.014,0,1250/1250,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.2,1770eba87fabfe1e32cdf6078d71032fe00db736,,0.9.0,None,
diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py
@@ -15,6 +15,7 @@
 from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html
 
 from agentlab.llm.llm_utils import (
+    BaseMessage,
     ParseError,
     count_tokens,
     extract_code_blocks,
@@ -122,7 +123,7 @@ def __init__(self, visible: bool = True) -> None:
         self._visible = visible
 
     @property
-    def prompt(self):
+    def prompt(self) -> str | BaseMessage:
         """Avoid overriding this method. Override _prompt instead."""
         if self.is_visible:
             return self._prompt
@@ -253,7 +254,14 @@ def fit_tokens(
         if isinstance(prompt, str):
             prompt_str = prompt
         elif isinstance(prompt, list):
+            warn(
+                "Using list of prompts is deprecated. Use a Discussion object instead.",
+                DeprecationWarning,
+                stacklevel=2,  # report fit_tokens' caller as the origin of the deprecated usage
+            )
             prompt_str = "\n".join([p["text"] for p in prompt if p["type"] == "text"])
+        elif isinstance(prompt, BaseMessage):
+            prompt_str = str(prompt)
         else:
             raise ValueError(f"Unrecognized type for prompt: {type(prompt)}")
         n_token = count_tokens(prompt_str, model=model_name)
@@ -405,21 +413,14 @@ def _prompt(self) -> str:
 
 """
 
-    def add_screenshot(self, prompt):
+    def add_screenshot(self, prompt: BaseMessage) -> BaseMessage:
         if self.flags.use_screenshot:
-            if isinstance(prompt, str):
-                prompt = [{"type": "text", "text": prompt}]
             if self.flags.use_som:
                 screenshot = self.obs["screenshot_som"]
             else:
                 screenshot = self.obs["screenshot"]
             img_url = image_to_jpg_base64_url(screenshot)
-            prompt.append(
-                {
-                    "type": "image_url",
-                    "image_url": {"url": img_url, "detail": self.flags.openai_vision_detail},
-                }
-            )
+            prompt.add_image(img_url, detail=self.flags.openai_vision_detail)
         return prompt
 
 

diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py
@@ -9,7 +9,7 @@
 from agentlab.agents import dynamic_prompting as dp
 from agentlab.agents.agent_args import AgentArgs
 from agentlab.llm.chat_api import BaseModelArgs, make_system_message, make_user_message
-from agentlab.llm.llm_utils import ParseError, retry
+from agentlab.llm.llm_utils import Discussion, ParseError, SystemMessage, retry
 from agentlab.llm.tracking import cost_tracker_decorator
 
 from .generic_agent_prompt import GenericPromptFlags, MainPrompt
@@ -99,9 +99,9 @@ def get_action(self, obs):
 
         max_prompt_tokens, max_trunc_itr = self._get_maxes()
 
-        system_prompt = dp.SystemPrompt().prompt
+        system_prompt = SystemMessage(dp.SystemPrompt().prompt)
 
-        prompt = dp.fit_tokens(
+        human_prompt = dp.fit_tokens(
             shrinkable=main_prompt,
             max_prompt_tokens=max_prompt_tokens,
             model_name=self.chat_model_args.model_name,
@@ -112,10 +112,7 @@ def get_action(self, obs):
             # TODO, we would need to further shrink the prompt if the retry
             # cause it to be too long
 
-            chat_messages = [
-                make_system_message(system_prompt),
-                make_user_message(prompt),
-            ]
+            chat_messages = Discussion([system_prompt, human_prompt])
             ans_dict = retry(
                 self.chat_llm,
                 chat_messages,

diff --git a/src/agentlab/agents/generic_agent/generic_agent_prompt.py b/src/agentlab/agents/generic_agent/generic_agent_prompt.py
@@ -1,9 +1,11 @@
-from dataclasses import dataclass
 import logging
+from dataclasses import dataclass
+
 from browsergym.core import action
 from browsergym.core.action.base import AbstractActionSet
+
 from agentlab.agents import dynamic_prompting as dp
-from agentlab.llm.llm_utils import parse_html_tags_raise
+from agentlab.llm.llm_utils import HumanMessage, parse_html_tags_raise
 
 
 @dataclass
@@ -90,8 +92,9 @@ def time_for_caution():
         self.memory = Memory(visible=lambda: flags.use_memory)
 
     @property
-    def _prompt(self) -> str:
-        prompt = f"""\
+    def _prompt(self) -> HumanMessage:
+        prompt = HumanMessage(
+            f"""\
 {self.instructions.prompt}\
 {self.obs.prompt}\
 {self.history.prompt}\
@@ -103,9 +106,11 @@ def _prompt(self) -> str:
 {self.memory.prompt}\
 {self.criticise.prompt}\
 """
+        )
 
         if self.flags.use_abstract_example:
-            prompt += f"""
+            prompt.add_text(
+                f"""
 # Abstract Example
 
 Here is an abstract version of the answer with description of the content of
@@ -117,9 +122,11 @@ def _prompt(self) -> str:
 {self.criticise.abstract_ex}\
 {self.action_prompt.abstract_ex}\
 """
+            )
 
         if self.flags.use_concrete_example:
-            prompt += f"""
+            prompt.add_text(
+                f"""
 # Concrete Example
 
 Here is a concrete example of how to format your answer.
@@ -130,6 +137,7 @@ def _prompt(self) -> str:
 {self.criticise.concrete_ex}\
 {self.action_prompt.concrete_ex}\
 """
+            )
         return self.obs.add_screenshot(prompt)
 
     def shrink(self):

diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py
@@ -26,7 +26,7 @@
 from agentlab.agents.agent_args import AgentArgs
 from agentlab.experiments.study import Study
 from agentlab.llm.chat_api import make_assistant_message
-from agentlab.llm.llm_utils import messages_to_dict
+from agentlab.llm.llm_utils import Discussion, messages_to_dict
 
 from .generic_agent import GenericAgent, GenericAgentArgs
 
@@ -43,7 +43,7 @@ def __init__(self, old_messages, delay=1) -> None:
         self.old_messages = old_messages
         self.delay = delay
 
-    def __call__(self, messages: list):
+    def __call__(self, messages: list | Discussion):
         self.new_messages = copy(messages)
 
         if len(messages) >= len(self.old_messages):
@@ -95,7 +95,7 @@ def get_action(self, obs):
         # same answers
         step = len(self.actions)
         step_info = self.exp_result.get_step_info(step)
-        old_chat_messages = step_info.agent_info.get("chat_messages", None)
+        old_chat_messages: Discussion | None = step_info.agent_info.get("chat_messages", None)
 
         if old_chat_messages is None:
             err_msg = self.exp_result.summary_info["err_msg"]
@@ -135,6 +135,9 @@ def _make_agent_stats(action, agent_info, step_info, old_chat_messages, new_chat
 
 
-def _format_messages(messages: list[dict]):
+def _format_messages(messages: list[dict] | Discussion):
+    # Discussion renders itself; legacy message lists still go through messages_to_dict.
+    if isinstance(messages, Discussion):
+        return messages.to_string()
     messages = messages_to_dict(messages)
     return "\n".join(f"{m['role']} message:\n{m['content']}\n" for m in messages)
 

diff --git a/src/agentlab/agents/most_basic_agent/most_basic_agent.py b/src/agentlab/agents/most_basic_agent/most_basic_agent.py
@@ -4,11 +4,18 @@
 
 import bgym
 
+from agentlab.agents.agent_args import AgentArgs
 from agentlab.llm.chat_api import make_system_message, make_user_message
 from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
-from agentlab.llm.llm_utils import ParseError, extract_code_blocks, retry
+from agentlab.llm.llm_utils import (
+    Discussion,
+    HumanMessage,
+    ParseError,
+    SystemMessage,
+    extract_code_blocks,
+    retry,
+)
 from agentlab.llm.tracking import cost_tracker_decorator
-from agentlab.agents.agent_args import AgentArgs
 
 if TYPE_CHECKING:
     from agentlab.llm.chat_api import BaseModelArgs
@@ -51,25 +58,25 @@ def __init__(
 
     @cost_tracker_decorator
     def get_action(self, obs: Any) -> tuple[str, dict]:
-        system_prompt = f"""
-You are a web assistant.
-"""
-        prompt = f"""
+        messages = Discussion(SystemMessage("You are a web assistant."))
+        messages.append(
+            HumanMessage(
+                f"""
 You are helping a user to accomplish the following goal on a website:
 
 {obs["goal"]}
 
-Here is the current state of the website, in the form of an html:
-
-{obs["dom_txt"]}
-
 To do so, you can interact with the environment using the following actions:
 
 {self.action_set.describe(with_long_description=False)}
 
 The inputs to those functions are the bids given in the html.
 
-The action you provide must be in between triple ticks.
+Here is the current state of the website, in the form of an html:
+
+{obs["pruned_html"]}
+
+The action you provide must be in between triple ticks and leverage the 'bid=' information provided in the html.
 Here is an example of how to use the bid action:
 
 ```
@@ -79,15 +86,14 @@ def get_action(self, obs: Any) -> tuple[str, dict]:
 Please provide a single action at a time and wait for the next observation. Provide only a single action per step. 
 Focus on the bid that are given in the html, and use them to perform the actions.
 """
+            )
+        )
         if self.use_chain_of_thought:
-            prompt += f"""
+            messages.add_text(
+                f"""
 Provide a chain of thoughts reasoning to decompose the task into smaller steps. And execute only the next step.
 """
-
-        messages = [
-            make_system_message(system_prompt),
-            make_user_message(prompt),
-        ]
+            )
 
         def parser(response: str) -> tuple[dict, bool, str]:
             blocks = extract_code_blocks(response)
@@ -108,7 +114,7 @@ def parser(response: str) -> tuple[dict, bool, str]:
                 think=thought,
                 chat_messages=messages,
                 # put any stats that you care about as long as it is a number or a dict of numbers
-                stats={"prompt_length": len(prompt), "response_length": len(thought)},
+                stats={"prompt_length": len(messages.to_string()), "response_length": len(thought)},
                 markdown_page="Add any txt information here, including base 64 images, to display in xray",
                 extra_info={"chat_model_args": asdict(self.chat_model_args)},
             ),
@@ -147,6 +153,12 @@ def parser(response: str) -> tuple[dict, bool, str]:
     ),
 ]
 
+AGENT_4o_MINI = MostBasicAgentArgs(
+    temperature=0.3,
+    use_chain_of_thought=True,
+    chat_model_args=chat_model_args,
+)
+
 
 def experiment_config():
     return exp_args
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
@@ -19,8 +19,9 @@
 
 from agentlab.analyze import inspect_results
 from agentlab.experiments.exp_utils import RESULTS_DIR
-from agentlab.llm.chat_api import make_system_message, make_user_message
 from agentlab.experiments.study import get_most_recent_study
+from agentlab.llm.chat_api import make_system_message, make_user_message
+from agentlab.llm.llm_utils import Discussion
 
 select_dir_instructions = "Select Experiment Directory"
 AGENT_NAME_KEY = "agent.agent_name"
@@ -581,7 +582,9 @@ def update_chat_messages():
     global info
     agent_info = info.exp_result.steps_info[info.step].agent_info
     chat_messages = agent_info.get("chat_messages", ["No Chat Messages"])
-    messages = []
+    if isinstance(chat_messages, Discussion):
+        return chat_messages.to_markdown()
+    messages = []  # TODO(ThibaultLSDC) remove this at some point
     for i, m in enumerate(chat_messages):
         if isinstance(m, BaseMessage):  # TODO remove once langchain is deprecated
             m = m.content

diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py
@@ -177,6 +177,7 @@ def get_reproducibility_info(
         "*/reproducibility_script.py",
         "*reproducibility_journal.csv",
         "*main.py",
+        "*inspect_results.ipynb",
     ),
     ignore_changes=False,
 ):

diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py
@@ -13,6 +13,7 @@
 import agentlab.llm.tracking as tracking
 from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs
 from agentlab.llm.huggingface_utils import HFBaseChatModel
+from agentlab.llm.llm_utils import Discussion
 
 
 def make_system_message(content: str) -> dict:
@@ -31,7 +32,10 @@ class CheatMiniWoBLLM(AbstractChatModel):
     """For unit-testing purposes only. It only work with miniwob.click-test task."""
 
     def __call__(self, messages) -> str:
-        prompt = messages[-1]["content"]
+        if isinstance(messages, Discussion):
+            prompt = messages.to_string()
+        else:
+            prompt = messages[1].get("content", "")
         match = re.search(r"^\s*\[(\d+)\].*button", prompt, re.MULTILINE | re.IGNORECASE)
 
         if match: