Skip to content

Commit

Permalink
Add support for autoclick (#2313)
Browse files Browse the repository at this point in the history
Fixes #2259 

This PR brings backend and frontend support for the new autoclick
behavior, introduced in Browsertrix Crawler 1.5.0+.

On the backend, we introduce `min_autoclick_crawler_image` to
`values.yaml`, with a default value of
`"docker.io/webrecorder/browsertrix-crawler:1.5.0"`. If this is set and
the crawler version for a new crawl is less than this value, the
autoclick behavior is removed from the behaviors list in the configmap
created for the crawl.

The one caveat for this is that a crawler image tag like "latest" will
always be parsed as greater than `min_autoclick_crawler_image`, so there
is the potential for the crawler to run into issues if using a
non-numeric image tag with an older version of the crawler. For
production we use hardcoded specific versions of the crawler except for
the dev channel, which from here on out will include autoclick
support, so I think this should be okay (and is also true of the
existing implementation for checking `min_qa_crawler_image`).

On the frontend, I've added a checkbox (unchecked by default) in the
"Limits" section just below the current checkbox for autoscroll. We
might want to move these to a different section eventually - I'm not
sure Limits is the right place for them - but I wanted to be consistent
with things as they are.

---------

Co-authored-by: Ilya Kreymer <[email protected]>
  • Loading branch information
tw4l and ikreymer committed Jan 29, 2025
1 parent a21b2ff commit 1f25781
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 5 deletions.
33 changes: 32 additions & 1 deletion backend/btrixcloud/operator/crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,31 @@ def _load_redis(self, params, status: CrawlStatus, children):

return self.load_from_yaml("redis.yaml", params)

def _filter_autoclick_behavior(
    self, behaviors: Optional[str], crawler_image: str
) -> Optional[str]:
    """Remove autoclick behavior if crawler version doesn't support it

    The minimum supported image is read from the MIN_AUTOCLICK_CRAWLER_IMAGE
    environment variable. If it is unset or empty, or if `behaviors` does not
    mention autoclick, `behaviors` is returned unchanged.

    Image tags that look like dotted numeric versions (e.g. "1.10.0",
    optionally followed by a "-suffix") are compared numerically, so that
    "1.10.0" is correctly treated as newer than "1.5.0". If either image
    does not carry such a tag (e.g. "latest"), the full image strings are
    compared as plain strings instead.

    :param behaviors: comma-separated list of behavior names, or None
    :param crawler_image: image name (including tag) the crawl will run with
    :return: `behaviors`, with any "autoclick" entry removed when the
        crawler image is older than the configured minimum
    """
    min_autoclick_crawler_image = os.environ.get("MIN_AUTOCLICK_CRAWLER_IMAGE")

    if not (
        min_autoclick_crawler_image
        and behaviors
        and "autoclick" in behaviors
        and crawler_image
    ):
        return behaviors

    def numeric_version(image: str):
        """Return the image tag as a tuple of ints, or None if not numeric"""
        # ignore a trailing digest ("name:tag@sha256:...")
        name = image.split("@", 1)[0]
        _, sep, tag = name.rpartition(":")
        # a "/" after the last ":" means the colon was a registry port, not a tag
        if not sep or "/" in tag:
            return None
        # drop a pre-release / build suffix such as "-beta.1"
        parts = tag.split("-", 1)[0].split(".")
        if not all(part.isascii() and part.isdigit() for part in parts):
            return None
        return tuple(int(part) for part in parts)

    crawler_version = numeric_version(crawler_image)
    min_version = numeric_version(min_autoclick_crawler_image)

    if crawler_version is not None and min_version is not None:
        # pad with zeros so that "1.5" and "1.5.0" compare as equal
        width = max(len(crawler_version), len(min_version))
        crawler_version += (0,) * (width - len(crawler_version))
        min_version += (0,) * (width - len(min_version))
        too_old = crawler_version < min_version
    else:
        # non-numeric tag (e.g. "latest"): plain string comparison
        too_old = crawler_image < min_autoclick_crawler_image

    if not too_old:
        return behaviors

    print(
        "Crawler version < min_autoclick_crawler_image, removing autoclick behavior",
        flush=True,
    )
    # compare stripped names so entries like " autoclick" are also removed
    return ",".join(
        behavior
        for behavior in behaviors.split(",")
        if behavior.strip() != "autoclick"
    )

async def _load_crawl_configmap(self, crawl: CrawlSpec, children, params):
name = f"crawl-config-{crawl.id}"

Expand All @@ -357,7 +382,13 @@ async def _load_crawl_configmap(self, crawl: CrawlSpec, children, params):

crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid)

params["config"] = json.dumps(crawlconfig.get_raw_config())
raw_config = crawlconfig.get_raw_config()

raw_config["behaviors"] = self._filter_autoclick_behavior(
raw_config["behaviors"], params["crawler_image"]
)

params["config"] = json.dumps(raw_config)

return self.load_from_yaml("crawl_configmap.yaml", params)

Expand Down
2 changes: 2 additions & 0 deletions chart/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ data:

MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}"

MIN_AUTOCLICK_CRAWLER_IMAGE: "{{ .Values.min_autoclick_crawler_image }}"

NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}"

MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}"
Expand Down
3 changes: 3 additions & 0 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,9 @@ crawler_namespace: "crawlers"
# if set, will restrict QA to image names that are >= than this value
# min_qa_crawler_image: ""

# if set, the autoclick behavior is only used with crawler images >= this value
min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0"

# optional: enable to use a persist volume claim for all crawls
# can be enabled to use a multi-write shared filesystem
# crawler_pv_claim: "nfs-shared-crawls"
Expand Down
33 changes: 29 additions & 4 deletions frontend/src/features/crawl-workflows/workflow-editor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1304,6 +1304,20 @@ https://archiveweb.page/images/${"logo.svg"}`}
),
false,
)}
${inputCol(
html`<sl-checkbox
name="autoclickBehavior"
?checked=${this.formState.autoclickBehavior}
>
${msg("Autoclick behavior")}
</sl-checkbox>`,
)}
${this.renderHelpTextCol(
msg(
`When enabled the browser will automatically click on links that don't navigate to other pages.`,
),
false,
)}
${inputCol(html`
<sl-input
name="pageExtraDelaySeconds"
Expand Down Expand Up @@ -2156,10 +2170,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
lang: this.formState.lang || "",
blockAds: this.formState.blockAds,
exclude: trimArray(this.formState.exclusions),
behaviors: (this.formState.autoscrollBehavior
? DEFAULT_BEHAVIORS
: DEFAULT_BEHAVIORS.slice(1)
).join(","),
behaviors: this.setBehaviors(),
},
crawlerChannel: this.formState.crawlerChannel || "default",
proxyId: this.formState.proxyId,
Expand All @@ -2168,6 +2179,20 @@ https://archiveweb.page/images/${"logo.svg"}`}
return config;
}

/**
 * Build the comma-separated behaviors string for the crawl config.
 *
 * Uses DEFAULT_BEHAVIORS when autoscroll is enabled in the form state,
 * otherwise DEFAULT_BEHAVIORS without its first entry, and appends
 * "autoclick" when autoclick is enabled in the form state.
 */
private setBehaviors(): string {
  const { autoscrollBehavior, autoclickBehavior } = this.formState;

  const baseBehaviors = autoscrollBehavior
    ? DEFAULT_BEHAVIORS
    : DEFAULT_BEHAVIORS.slice(1);
  const joined = baseBehaviors.join(",");

  return autoclickBehavior ? `${joined},autoclick` : joined;
}

private parseUrlListConfig(): Pick<
NewCrawlConfigParams["config"],
"seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed"
Expand Down
5 changes: 5 additions & 0 deletions frontend/src/utils/workflow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ export type FormState = {
autoAddCollections: string[];
description: WorkflowParams["description"];
autoscrollBehavior: boolean;
autoclickBehavior: boolean;
userAgent: string | null;
crawlerChannel: string;
proxyId: string | null;
Expand Down Expand Up @@ -138,6 +139,7 @@ export const getDefaultFormState = (): FormState => ({
autoAddCollections: [],
description: null,
autoscrollBehavior: true,
autoclickBehavior: false,
userAgent: null,
crawlerChannel: "default",
proxyId: null,
Expand Down Expand Up @@ -286,6 +288,9 @@ export function getInitialFormState(params: {
autoscrollBehavior: params.initialWorkflow.config.behaviors
? params.initialWorkflow.config.behaviors.includes("autoscroll")
: defaultFormState.autoscrollBehavior,
autoclickBehavior: params.initialWorkflow.config.behaviors
? params.initialWorkflow.config.behaviors.includes("autoclick")
: defaultFormState.autoclickBehavior,
userAgent:
params.initialWorkflow.config.userAgent ?? defaultFormState.userAgent,
crawlerChannel:
Expand Down

0 comments on commit 1f25781

Please sign in to comment.