Skip to content

Commit

Permalink
Updated agents
Browse files Browse the repository at this point in the history
  • Loading branch information
Dany committed May 13, 2024
1 parent 8211c89 commit 96a81b0
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 28 deletions.
6 changes: 4 additions & 2 deletions src/lavague/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,11 @@ def run(self, objective, display=True):
for _ in range(N_ATTEMPTS):
try:
action = action_engine.action_from_context(context, query)
screenshot_with_highlight = get_highlighted_element(action, driver)
outputs = get_highlighted_element(action, driver)
image = outputs[-1]["image"]

if display:
display_screenshot(screenshot_with_highlight)
display_screenshot(image)

print("Showing the next element to interact with")
time.sleep(3)
Expand Down
10 changes: 2 additions & 8 deletions src/lavague/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,13 @@ def default_python_code_extractor(markdown_text: str) -> Optional[str]:

def default_get_selenium_driver() -> SeleniumDriver:
    """Build the default headless Chrome webdriver.

    Imports are function-local so selenium is only required when this
    default factory is actually used.

    NOTE(review): the annotation says ``SeleniumDriver`` but the function
    returns the raw ``webdriver.Chrome`` instance — confirm against callers
    and either update the annotation or re-wrap the driver.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # ensure GUI is off

    driver = webdriver.Chrome(options=chrome_options)
    return driver

def evaluation_get_selenium_driver() -> SeleniumDriver:
"""Extra options to make the driver more static for evaluation purposes."""
Expand Down
1 change: 1 addition & 0 deletions src/lavague/format_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def extract_code_from_funct(funct: Callable) -> List[str]:
line[nident:] for line in source_code_lines[:-1]
] # every line except the return


def extract_imports_from_lines(lines: List[str]) -> str:
"""Only keep import lines from python code lines and join them"""
return "\n".join(
Expand Down
49 changes: 44 additions & 5 deletions src/lavague/prompts.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,50 @@
from string import Template

# World-model prompt for the text-only pipeline: the model receives an
# objective plus a cleaned textual description of the page (no screenshot)
# and must emit bullet-point thoughts followed by a single next-step
# instruction. Fixes vs. previous version: typos ("detailling", "poposed"),
# grammar ("Your output are"), the inconsistent "Thoughts:" few-shot header,
# and a stray reference to a screenshot in this screenshot-less variant.
TEXT_WORLD_MODEL_PROMPT_TEMPLATE = Template("""
You are an AI system specialized in high level reasoning. Your goal is to generate instructions for other specialized AIs to perform web actions to reach objectives given by humans.
Your inputs are an objective in natural language, as well as a cleaned textual description of the current page extracted from the raw HTML.
Your output is a list of thoughts in bullet points detailing your reasoning, followed by your conclusion on what the next step should be in the form of an instruction.
You can assume the instruction is used by another AI to generate the action code to select the element to be interacted with and perform the action demanded by the human.
The instruction should be as detailed as possible and only contain the next step.
Do not make assumptions about elements you do not see.
If the objective is already achieved in the current page description, provide the instruction 'STOP'.
Here are previous examples:
Objective: Ask the AI model 'Command R plus' 'What is love'
Thought:
- I am on the Hugging Face website.
- Hugging Face is a company that hosts AI models, and allows users to interact with models on them through the chat.
- Therefore, to answer the objective of asking the AI model 'Command R Plus' 'What is love', we need first to find the model page.
- Given the current textual description, the fastest way to find the model page seems to be to use the search bar.
Instruction: Type 'Command R plus' on the search bar with placeholder "Search ..." and click on the first result
Objective: Explore the latest updates on the model 'Meta-Llama-3-8B'
Thought:
- I am currently viewing the main page of Hugging Face, a hub for AI models and datasets.
- On this platform, users can explore and interact with a variety of AI models.
- From the current textual description, I see that the model 'Meta-Llama-3-8B' is displayed in the "Trending" section.
- To investigate the updates made to 'Meta-Llama-3-8B', the best approach is to go directly to the model's specific page where information will be available
Instruction: Click on 'Meta-Llama-3-8B'
Objective: Print the installation guide for Transformers
Thought:
- The textual description of the current page seems to indicate we are on the installation page for Transformers
- Several installation modes are proposed in different sections, such as pip, from source, editable install, etc.
- Since the user did not mention which installation, we will go for pip as it is the fastest
Instruction: Print the text in the 'Install with pip' section
Objective: ${objective}
Thought:
""")

WORLD_MODEL_PROMPT_TEMPLATE = Template("""
You are an AI system whose goal is to generate training examples to teach other AIs to think and reach objectives given by humans and a screenshot of the current page.
The AIs to be taught have to write their thought process and propose an instruction to be performed.
Your answer should contain your thoughts in bullet points, and the instruction for the next step to be performed.
The instruction should be detailled as possible and only contain one step. Do not provide bullet points or multiple steps.
Leverage as much information from the screenshot to make it easy to identify the element, such as placeholders or text.
You are an AI system specialized in high level reasoning. Your goal is to generate instructions for other specialized AIs to perform web actions to reach objectives given by humans.
Your inputs are an objective in natural language, as well as a screenshot of the current page of the browser.
Your output are a list of thoughts in bullet points detailling your reasoning, followed by your conclusion on what the next step should be in the form of an instruction.
You can assume the instruction is used by another AI to generate the action code to select the element to be interacted with and perform the action demanded by the human.
The instruction should be detailled as possible and only contain the next step.
Do not make assumptions about elements you do not see.
If the objective is already achieved in the screenshot, provide the instruction 'STOP'.
Expand Down
77 changes: 64 additions & 13 deletions src/lavague/web_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@
from PIL.PngImagePlugin import PngImageFile
import base64

def resize_driver(driver, width, targeted_height):
    """Resize the browser window so the *viewport* reaches the targeted height.

    Selenium can only set the outer window size, not the viewport size, so we
    first set the window, measure the resulting viewport, and then grow the
    window by the measured chrome overhead (toolbars, scrollbars, etc.).
    """
    driver.set_window_size(width, targeted_height)

    inner_height = driver.execute_script("return window.innerHeight;")
    chrome_overhead = targeted_height - inner_height

    driver.set_window_size(width, targeted_height + chrome_overhead)


# Function to encode the image
def encode_image(image_path):
with open(image_path, "rb") as image_file:
Expand All @@ -18,34 +30,73 @@ def display_screenshot(img: PngImageFile):
# Display the image directly in the notebook
display(img)

def get_highlighted_element(generated_code, driver) -> list:
    """Run the generated element-selection code and capture one annotated
    screenshot per selected WebElement.

    Args:
        generated_code: Python source produced by the action engine; its
            assignment statements are expected to bind selenium WebElements.
        driver: a live selenium webdriver controlling the target page.

    Returns:
        A list of dicts, one per WebElement found, each with keys:
          - "image": PIL image of the page with the element outlined in red
          - "bounding_box": page coordinates {"x1", "y1", "x2", "y2"}
          - "viewport_size": {"width", "height"} of the browser viewport

    Raises:
        ValueError: if the generated code assigns no WebElement.
    """
    # Keep only the assignment statements from the generated action code so
    # we locate the elements without performing the action itself.
    assignment_code = keep_assignments(generated_code)

    # Prepend the selenium imports the generated snippet relies on.
    code = f"""
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
{assignment_code}
""".strip()

    local_scope = {"driver": driver}
    # NOTE(review): exec of model-generated code — only safe on trusted input.
    exec(code, local_scope, local_scope)

    # Pair each variable assigned by the generated code with its value and
    # keep only those that are WebElements.
    variable_names = return_assigned_variables(generated_code)

    elements = {}
    for variable_name in variable_names:
        var = local_scope[variable_name]
        # isinstance (not type() ==) so WebElement subclasses also match.
        if isinstance(var, WebElement):
            elements[variable_name] = var

    if not elements:
        # Plain string: the previous f-string had no placeholders.
        raise ValueError("No element found.")

    outputs = []
    for element_name, element in elements.items():

        local_scope = {"driver": driver,
                       element_name: element}

        code = f"""
element = {element_name}
driver.execute_script("arguments[0].setAttribute('style', arguments[1]);", element, "border: 2px solid red;")
driver.execute_script("arguments[0].scrollIntoView({{block: 'center'}});", element)
driver.save_screenshot("screenshot.png")
x1 = element.location['x']
y1 = element.location['y']
x2 = x1 + element.size['width']
y2 = y1 + element.size['height']
viewport_width = driver.execute_script("return window.innerWidth;")
viewport_height = driver.execute_script("return window.innerHeight;")
"""
        exec(code, globals(), local_scope)

        # Clear the highlight after the screenshot so the red border does not
        # accumulate across elements or remain visible on the live page.
        driver.execute_script("arguments[0].setAttribute('style', '');", element)

        bounding_box = {
            "x1": local_scope["x1"],
            "y1": local_scope["y1"],
            "x2": local_scope["x2"],
            "y2": local_scope["y2"],
        }
        viewport_size = {
            "width": local_scope["viewport_width"],
            "height": local_scope["viewport_height"],
        }
        image = Image.open("screenshot.png")
        output = {
            "image": image,
            "bounding_box": bounding_box,
            "viewport_size": viewport_size,
        }
        outputs.append(output)
    return outputs

0 comments on commit 96a81b0

Please sign in to comment.