Skip to content

Commit

Permalink
feat: move actions execution to node impl
Browse files Browse the repository at this point in the history
  • Loading branch information
adeprez committed Oct 4, 2024
1 parent b2a94ba commit 753dee5
Show file tree
Hide file tree
Showing 8 changed files with 232 additions and 178 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from io import BytesIO
import json
import os
from PIL import Image
from typing import Callable, Optional, Any, Mapping, Dict, List
from playwright.sync_api import Page, Locator
from lavague.sdk.base_driver import BaseDriver
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Callable, Dict, List, Optional

from lavague.drivers.selenium.node import SeleniumNode
from lavague.sdk.action.navigation import NavigationOutput
from lavague.drivers.selenium.prompt import SELENIUM_PROMPT_TEMPLATE
from lavague.sdk.base_driver import BaseDriver
from lavague.sdk.base_driver.interaction import (
InteractionType,
Expand Down Expand Up @@ -36,7 +36,7 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumDriver(BaseDriver[SeleniumNode]):
Expand Down Expand Up @@ -80,33 +80,6 @@ def init(self) -> None:
{"source": JS_SETUP_GET_EVENTS},
)

def execute(self, action: NavigationOutput) -> None:
    """Run one navigation action against the element at ``action.xpath``.

    The target is resolved once through ``resolve_xpath`` and the branch is
    picked from ``action.navigation_command``. Commands other than CLICK,
    TYPE, HOVER and SCROLL fall through without doing anything.
    """
    with self.resolve_xpath(action.xpath) as node:
        command = action.navigation_command

        if command == InteractionType.CLICK:
            node.element.click()

        elif command == InteractionType.TYPE:
            text = action.value or ""
            # Inputs are emptied first so the new text replaces the old one.
            if node.element.tag_name == "input":
                node.element.clear()
            if node.element.tag_name == "select":
                dropdown = Select(node.element)
                try:
                    dropdown.select_by_value(text)
                except NoSuchElementException:
                    # No option has this value attribute; try its label.
                    dropdown.select_by_visible_text(text)
            else:
                node.element.send_keys(text)

        elif command == InteractionType.HOVER:
            pointer = ActionChains(self.driver)
            pointer.move_to_element(node.element)
            pointer.perform()

        elif command == InteractionType.SCROLL:
            # A missing value means "scroll down".
            direction = ScrollDirection.from_string(action.value or "DOWN")
            self.scroll(action.xpath, direction)

def destroy(self) -> None:
    """Shut down the wrapped driver by calling its ``quit`` method."""
    browser = self.driver
    browser.quit()
Expand Down Expand Up @@ -394,144 +367,3 @@ def switch_frame(self, xpath: str) -> None:

def switch_parent_frame(self) -> None:
    """Ask the driver to switch to the parent frame."""
    frame_switcher = self.driver.switch_to
    frame_switcher.parent_frame()


SELENIUM_PROMPT_TEMPLATE = """
You are a chrome extension and your goal is to interact with web pages. You have been given a series of HTML snippets and queries.
Your goal is to return a list of actions that should be done in order to execute the actions.
Always target elements by using the full XPATH. You can only use one of the Xpaths included in the HTML. Do not derive new Xpaths.
Your response must always be in the YAML format with the yaml markdown indicator and must include the main item "actions", which will contain the objects "action", each of which contains the string "name" of the tool of choice, and the necessary arguments ("args") if required by the tool.
There must be only ONE args sub-object, such as args (if the tool has multiple arguments).
You must always include the comments as well, describing your actions step by step, following strictly the format in the examples provided.
Provide high level explanations about why you think this element is the right one.
Your answer must be short and concise. Always include comments in the YAML before listing the actions.
The actions available are:
Name: click
Description: Click on an element with a specific xpath
Arguments:
- xpath (string)
Name: setValue
Description: Focus on and set the value of an input element with a specific xpath
Arguments:
- xpath (string)
- value (string)
Name: dropdownSelect
Description: Select an option from a dropdown menu by its value
Arguments:
- xpath (string)
- value (string)
Name: setValueAndEnter
Description: Like "setValue", except that it then presses ENTER. Use this tool to submit the form when there's no "submit" button.
Arguments:
- xpath (string)
- value (string)
Name: hover
Description: Move the mouse cursor over an element identified by the given xpath. It can be used to reveal tooltips or dropdown that appear on hover. It can also be used before scrolling to ensure the focus is in the correct container before performing the scroll action.
Arguments:
- xpath (string)
Name: scroll
Description: Scroll the container that holds the element identified by the given xpath
Arguments:
- xpath (string)
- value (string): UP or DOWN
Here are examples of previous answers:
HTML:
<div>Check in / Check out</div>
<div xpath="/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div/div/div/div/div/div/div[2]/div/div/div/div"><a aria-hidden="true" href="/rooms/48556008?adults=2&amp;search_mode=regular_search&amp;check_in=2024-08-15&amp;check_out=2024-08-22" rel="noopener noreferrer nofollow" tabindex="-1" target="listing_48556008"><div class="dir dir-ltr" xpath="/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div/div/div/div/div/div/div[2]/div/div/div/div/a/div">
<div xpath="/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div"><div xpath="/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div"><div aria-labelledby="title_48556008" data-testid="card-container" role="group" xpath="/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div/div"><a aria-labelledby="title_48556008" href="/rooms/48556008?adults=2&amp;search_mode=regular_search&amp;check_in=2024-08-15&amp;check_out=2024-08-22" rel="noopener noreferrer nofollow" target="listing_48556008"></a><div xpath="/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div/div/div">
Query: Click on 'Home in Ploubazlanec'
Authorized Xpaths: "{'/html/body/div[5]/div/div/div/div/div[3]/header/div/div/div/div/div/div[2]/div/div/span[2]', '/html/body/div[5]/div/div/div/div/div[3]/header/div/div/div/div/div/div[2]/div/div', '/html/body/div[5]/d iv/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div', '/html/body/div[5]/div/div/div/div/div[3]/header/div/div/div/div/div/div[2]/div/div/span[2]/button/div', '/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div', '/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div/div/div/div/div/div/div[2]/div/div/div/div/a/div', '/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div/div', '/html/body/div[5]/div/div/div/div/div[3]/header/div/div/div/div/div/div[2]/div/div/span[2]/button[2]', '/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div/div/div/div/div/div/div[2]/div/div/div/div', '/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div/div/div', '/html/body/div[5]/div/div/div/div/div[3]/header/div/div/div/div/div/div[2]/div/div/span[2]/button'}"
Completion:
```yaml
# Let's think through this step-by-step:
# 1. The query asks us to click on 'Home in Ploubazlanec'
# 2. In the HTML, we need to find an element that represents this listing
# 3. We can see a div with the text "Home in Ploubazlanec" in the title
# 4. The parent element of this div is an anchor tag, which is likely the clickable link for the listing
# 5. We should use the XPath of this anchor tag to perform the click action
- actions:
- action:
# Click on the anchor tag that contains the listing title
args:
xpath: "/html/body/div[5]/div/div/div/div/div[3]/div/main/div[2]/div/div[2]/div/div/div/div/div/div/div/div[2]/div/div/div/div/div/div/div/div/div[2]/div/div/div/div/a"
name: "click"
```
-----
HTML:
<div class="devsite-top-logo-row-middle" xpath="/html/body/section/devsite-header/div/div[1]/div/div/div[2]">
<div class="devsite-header-upper-tabs" xpath="/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]">
<devsite-tabs class="upper-tabs devsite-overflow-menu--open" connected="" xpath="/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs">
<a aria-label="Extended Navigation" class="devsite-icon devsite-icon-arrow-drop-down" href="#" style="border: 2px solid red;" xpath="/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/a"><!--?lit$8296333005$-->More</a>
<div class="devsite-tabs-overflow-menu" scrollbars="" xpath="/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/div">
<tab xpath="/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/div/tab[1]">
<a class="devsite-tabs-content gc-analytics-event" data-category="Site-Wide Custom Events" data-label="Tab: Gemma" href="https://ai.google.dev/gemma" track-metadata-eventdetail="https://ai.google.dev/gemma" track-metadata-module="primary nav" track-metadata-position="nav - gemma" track-name="gemma" track-type="nav" xpath="/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/div/tab[1]/a">
Authorized Xpaths: "{'/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs', '/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/a', '/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/div', '/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/div/tab[1]/a', '/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]', '/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/div/tab[1]', '/html/body/section/devsite-header/div/div[1]/div/div/div[2]'}"
Query: Click on "Gemma" under the "More" dropdown menu.
Completion:
```yaml
# Let's think step by step
# First, we notice that the query asks us to click on the "Gemma" option under the "More" dropdown menu.
# In the provided HTML, we see that the "More" dropdown menu is within a tab element with a specific class and role attribute.
# The "More" dropdown menu can be identified by its class 'devsite-overflow-tab' and contains a link element with the text 'More'.
# We need to interact with this dropdown menu to reveal the hidden options.
# Specifically, for the "More" dropdown menu, there is an anchor element within a tab element:
# /html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/a
- actions:
- action:
# We can use this XPATH to identify and click on the "More" dropdown menu:
args:
xpath: "/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/a"
value: ""
name: "click"
- action:
# After clicking the "More" dropdown, we need to select the "Gemma" option from the revealed menu.
# The "Gemma" option is located within the dropdown menu and can be identified by its anchor element with the corresponding text:
# /html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/div/tab[1]/a
# Thus, we use this XPATH to identify and click on the "Gemma" option:
args:
xpath: "/html/body/section/devsite-header/div/div[1]/div/div/div[2]/div[1]/devsite-tabs/nav/tab[2]/div/tab[1]/a"
value: ""
name: "click"
```
-----
HTML:
<select name="checkin_eta_hour" xpath="/html/body/div/main/form/section/div/select">
<option disabled="" selected="" value="">Please select</option>
<option value="-1">I don't know</option>
<option value="0">12:00 AM – 1:00 AM </option>
<option value="1">1:00 AM – 2:00 AM </option>
<option value="2">2:00 AM – 3:00 AM </option>
<option value="3">3:00 AM – 4:00 AM </option>
</select>
Authorized Xpaths: "{'/html/body/div/main/form/section/div/select'}"
Query: Select the 2:00 AM - 3:00 AM option from the dropdown menu
Completion:
```yaml
# Let's think step by step
# The query asks us to select the "2:00 AM - 3:00 AM" option from a dropdown menu.
# We need to identify the correct option within the dropdown menu based on its value attribute.
# The dropdown menu is specified by its XPATH, and the value of the option we need to select is "2".
# We can use the following "select" XPATH to locate the dropdown menu and the value "2" to select the appropriate option:
# /html/body/div/main/form/section/div/select
- actions:
- action:
# Select the "2:00 AM - 3:00 AM" option by targeting the dropdown menu with the specified XPATH.
args:
xpath: "/html/body/div/main/form/section/div/select"
value: "2"
name: "dropdownSelect"
```
"""
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,21 @@
from typing import Optional

from lavague.sdk.base_driver import DOMNode
from lavague.sdk.exceptions import NoElementException
from lavague.sdk.exceptions import ElementNotFoundException
from PIL import Image

from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import (
ElementClickInterceptedException,
NoSuchElementException,
WebDriverException,
)
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.shadowroot import ShadowRoot
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import Select


class SeleniumNode(DOMNode[WebElement]):
Expand All @@ -32,7 +39,7 @@ def element(self) -> WebElement:
print("WARN: DOMNode context manager missing")
self.__enter__()
if self._element is None:
raise NoElementException()
raise ElementNotFoundException(self.xpath)
return self._element

@property
Expand Down Expand Up @@ -60,6 +67,49 @@ def take_screenshot(self):
pass
return Image.new("RGB", (0, 0))

def click(self):
    """Click this node's element, with a pointer-based fallback.

    A native ``WebElement.click()`` is attempted first. If Selenium reports
    that the click was intercepted, the pointer is moved onto the element
    and a click is issued at that position instead.

    Raises:
        Exception: if the fallback click fails as well. The underlying
            error is attached as ``__cause__`` so its traceback is kept.
    """
    with self:
        try:
            self.element.click()
        except ElementClickInterceptedException:
            try:
                # Move to the element and click at its position
                ActionChains(self.driver).move_to_element(
                    self.element
                ).click().perform()
            except Exception as click_error:
                # Chain explicitly so the root cause stays visible to callers.
                raise Exception(
                    f"Failed to click at element coordinates of {self.xpath} : {str(click_error)}"
                ) from click_error

def set_value(self, value: str):
    """Set this node's element to ``value``.

    * ``<select>`` elements: the option is chosen by its ``value``
      attribute, falling back to its visible text when no option has
      that value.
    * Any other element: the element is clicked to give it focus, its
      current content is selected and deleted, then ``value`` is typed.

    ``<input>`` elements are additionally cleared up front; a failure of
    that clear is ignored because the select-all/delete sequence below
    empties the field anyway.

    Args:
        value: Text to type, or the option value / label to select.
    """
    with self:
        element = self.element
        tag_name = element.tag_name

        if tag_name == "input":
            try:
                element.clear()
            except WebDriverException:
                # Best effort only: the keystroke sequence below also clears.
                pass

        if tag_name == "select":
            select = Select(element)
            try:
                select.select_by_value(value)
            except NoSuchElementException:
                select.select_by_visible_text(value)
        else:
            # ActionChains.send_keys targets whichever element currently has
            # focus, so focus this element first; otherwise the keystrokes
            # may land on a different element.
            # NOTE(review): CONTROL+a is the select-all shortcut on
            # Windows/Linux; macOS browsers presumably need COMMAND — confirm.
            (
                ActionChains(self.driver)
                .click(element)
                .key_down(Keys.CONTROL)
                .send_keys("a")
                .key_up(Keys.CONTROL)
                .send_keys(Keys.DELETE)  # clear the input field
                .send_keys(value)
                .perform()
            )

def hover(self):
    """Move the mouse pointer onto this node's element."""
    with self:
        pointer = ActionChains(self.driver)
        pointer.move_to_element(self.element)
        pointer.perform()

def enter_context(self):
if hasattr(self, "_element"):
return
Expand Down
Loading

0 comments on commit 753dee5

Please sign in to comment.