Skip to content

Commit

Permalink
Updated agents
Browse files Browse the repository at this point in the history
  • Loading branch information
Dany committed May 13, 2024
1 parent 8211c89 commit 96a81b0
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 28 deletions.
6 changes: 4 additions & 2 deletions src/lavague/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,11 @@ def run(self, objective, display=True):
for _ in range(N_ATTEMPTS):
try:
action = action_engine.action_from_context(context, query)
screenshot_with_highlight = get_highlighted_element(action, driver)
outputs = get_highlighted_element(action, driver)
image = outputs[-1]["image"]

if display:
display_screenshot(screenshot_with_highlight)
display_screenshot(image)

print("Showing the next element to interact with")
time.sleep(3)
Expand Down
10 changes: 2 additions & 8 deletions src/lavague/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,13 @@ def default_python_code_extractor(markdown_text: str) -> Optional[str]:

def default_get_selenium_driver() -> SeleniumDriver:
    """Build the default headless Chrome webdriver.

    Imports are function-local so selenium is only required when this
    default factory is actually used.

    NOTE(review): the annotation says ``SeleniumDriver`` but the function
    returns the raw ``webdriver.Chrome`` instance — confirm against callers
    and either update the annotation or re-wrap the driver.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # ensure GUI is off

    driver = webdriver.Chrome(options=chrome_options)
    return driver

def evaluation_get_selenium_driver() -> SeleniumDriver:
"""Extra options to make the driver more static for evaluation purposes."""
Expand Down
1 change: 1 addition & 0 deletions src/lavague/format_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def extract_code_from_funct(funct: Callable) -> List[str]:
line[nident:] for line in source_code_lines[:-1]
] # every line except the return


def extract_imports_from_lines(lines: List[str]) -> str:
"""Only keep import lines from python code lines and join them"""
return "\n".join(
Expand Down
49 changes: 44 additions & 5 deletions src/lavague/prompts.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,50 @@
from string import Template

# World-model prompt for the text-only pipeline: the model receives an
# objective plus a cleaned textual description of the page (no screenshot)
# and must emit bullet-point thoughts followed by a single next-step
# instruction. Fixes vs. previous version: typos ("detailling", "poposed"),
# grammar ("Your output are"), the inconsistent "Thoughts:" few-shot header,
# and a stray reference to a screenshot in this screenshot-less variant.
TEXT_WORLD_MODEL_PROMPT_TEMPLATE = Template("""
You are an AI system specialized in high level reasoning. Your goal is to generate instructions for other specialized AIs to perform web actions to reach objectives given by humans.
Your inputs are an objective in natural language, as well as a cleaned textual description of the current page extracted from the raw HTML.
Your output is a list of thoughts in bullet points detailing your reasoning, followed by your conclusion on what the next step should be in the form of an instruction.
You can assume the instruction is used by another AI to generate the action code to select the element to be interacted with and perform the action demanded by the human.
The instruction should be as detailed as possible and only contain the next step.
Do not make assumptions about elements you do not see.
If the objective is already achieved in the current page description, provide the instruction 'STOP'.
Here are previous examples:
Objective: Ask the AI model 'Command R plus' 'What is love'
Thought:
- I am on the Hugging Face website.
- Hugging Face is a company that hosts AI models, and allows users to interact with models on them through the chat.
- Therefore, to answer the objective of asking the AI model 'Command R Plus' 'What is love', we need first to find the model page.
- Given the current textual description, the fastest way to find the model page seems to be to use the search bar.
Instruction: Type 'Command R plus' on the search bar with placeholder "Search ..." and click on the first result
Objective: Explore the latest updates on the model 'Meta-Llama-3-8B'
Thought:
- I am currently viewing the main page of Hugging Face, a hub for AI models and datasets.
- On this platform, users can explore and interact with a variety of AI models.
- From the current textual description, I see that the model 'Meta-Llama-3-8B' is displayed in the "Trending" section.
- To investigate the updates made to 'Meta-Llama-3-8B', the best approach is to go directly to the model's specific page where information will be available
Instruction: Click on 'Meta-Llama-3-8B'
Objective: Print the installation guide for Transformers
Thought:
- The textual description of the current page seems to indicate we are on the installation page for Transformers
- Several installation modes are proposed in different sections, such as pip, from source, editable install, etc.
- Since the user did not mention which installation, we will go for pip as it is the fastest
Instruction: Print the text in the 'Install with pip' section
Objective: ${objective}
Thought:
""")

WORLD_MODEL_PROMPT_TEMPLATE = Template("""
You are an AI system whose goal is to generate training examples to teach other AIs to think and reach objectives given by humans and a screenshot of the current page.
The AIs to be taught have to write their thought process and propose an instruction to be performed.
Your answer should contain your thoughts in bullet points, and the instruction for the next step to be performed.
The instruction should be detailled as possible and only contain one step. Do not provide bullet points or multiple steps.
Leverage as much information from the screenshot to make it easy to identify the element, such as placeholders or text.
You are an AI system specialized in high level reasoning. Your goal is to generate instructions for other specialized AIs to perform web actions to reach objectives given by humans.
Your inputs are an objective in natural language, as well as a screenshot of the current page of the browser.
Your output are a list of thoughts in bullet points detailling your reasoning, followed by your conclusion on what the next step should be in the form of an instruction.
You can assume the instruction is used by another AI to generate the action code to select the element to be interacted with and perform the action demanded by the human.
The instruction should be detailled as possible and only contain the next step.
Do not make assumptions about elements you do not see.
If the objective is already achieved in the screenshot, provide the instruction 'STOP'.
Expand Down
77 changes: 64 additions & 13 deletions src/lavague/web_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@
from PIL.PngImagePlugin import PngImageFile
import base64

def resize_driver(driver, width, targeted_height):
    """Resize the browser window so the *viewport* reaches the targeted height.

    Selenium can only set the outer window size, not the viewport size, so we
    first set the window, measure the resulting viewport, and then grow the
    window by the measured chrome overhead (toolbars, scrollbars, etc.).
    """
    driver.set_window_size(width, targeted_height)

    inner_height = driver.execute_script("return window.innerHeight;")
    chrome_overhead = targeted_height - inner_height

    driver.set_window_size(width, targeted_height + chrome_overhead)


# Function to encode the image
def encode_image(image_path):
with open(image_path, "rb") as image_file:
Expand All @@ -18,34 +30,73 @@ def display_screenshot(img: PngImageFile):
# Display the image directly in the notebook
display(img)

def get_highlighted_element(generated_code, driver) -> list:
    """Run the generated element-selection code and capture one annotated
    screenshot per selected WebElement.

    Args:
        generated_code: Python source produced by the action engine; its
            assignment statements are expected to bind selenium WebElements.
        driver: a live selenium webdriver controlling the target page.

    Returns:
        A list of dicts, one per WebElement found, each with keys:
          - "image": PIL image of the page with the element outlined in red
          - "bounding_box": page coordinates {"x1", "y1", "x2", "y2"}
          - "viewport_size": {"width", "height"} of the browser viewport

    Raises:
        ValueError: if the generated code assigns no WebElement.
    """
    # Keep only the assignment statements from the generated action code so
    # we locate the elements without performing the action itself.
    assignment_code = keep_assignments(generated_code)

    # Prepend the selenium imports the generated snippet relies on.
    code = f"""
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
{assignment_code}
""".strip()

    local_scope = {"driver": driver}
    # NOTE(review): exec of model-generated code — only safe on trusted input.
    exec(code, local_scope, local_scope)

    # Pair each variable assigned by the generated code with its value and
    # keep only those that are WebElements.
    variable_names = return_assigned_variables(generated_code)

    elements = {}
    for variable_name in variable_names:
        var = local_scope[variable_name]
        # isinstance (not type() ==) so WebElement subclasses also match.
        if isinstance(var, WebElement):
            elements[variable_name] = var

    if not elements:
        # Plain string: the previous f-string had no placeholders.
        raise ValueError("No element found.")

    outputs = []
    for element_name, element in elements.items():

        local_scope = {"driver": driver,
                       element_name: element}

        code = f"""
element = {element_name}
driver.execute_script("arguments[0].setAttribute('style', arguments[1]);", element, "border: 2px solid red;")
driver.execute_script("arguments[0].scrollIntoView({{block: 'center'}});", element)
driver.save_screenshot("screenshot.png")
x1 = element.location['x']
y1 = element.location['y']
x2 = x1 + element.size['width']
y2 = y1 + element.size['height']
viewport_width = driver.execute_script("return window.innerWidth;")
viewport_height = driver.execute_script("return window.innerHeight;")
"""
        exec(code, globals(), local_scope)

        # Clear the highlight after the screenshot so the red border does not
        # accumulate across elements or remain visible on the live page.
        driver.execute_script("arguments[0].setAttribute('style', '');", element)

        bounding_box = {
            "x1": local_scope["x1"],
            "y1": local_scope["y1"],
            "x2": local_scope["x2"],
            "y2": local_scope["y2"],
        }
        viewport_size = {
            "width": local_scope["viewport_width"],
            "height": local_scope["viewport_height"],
        }
        image = Image.open("screenshot.png")
        output = {
            "image": image,
            "bounding_box": bounding_box,
            "viewport_size": viewport_size,
        }
        outputs.append(output)
    return outputs

0 comments on commit 96a81b0

Please sign in to comment.