def _filter_autoclick_behavior(
    self, behaviors: Optional[str], crawler_image: str
) -> Optional[str]:
    """Remove autoclick behavior if crawler version doesn't support it

    The minimum supported image is read from the MIN_AUTOCLICK_CRAWLER_IMAGE
    environment variable. If that variable, ``behaviors`` or ``crawler_image``
    is empty, or ``behaviors`` has no ``autoclick`` entry, ``behaviors`` is
    returned unchanged.

    When both images name the same repository and both tags start with a
    dotted numeric version, the versions are compared numerically, so that
    e.g. ``1.10.0`` is correctly treated as newer than ``1.5.0``. In every
    other case (``latest``, digests, different repositories, ...) the image
    names are compared as plain strings.

    :param behaviors: comma-separated behavior names, or None
    :param crawler_image: image reference of the crawler that will be run
    :returns: ``behaviors`` without its ``autoclick`` entries if the crawler
        image is below the minimum, otherwise ``behaviors`` unchanged
    """
    # local import: only this method needs it
    import re

    min_autoclick_crawler_image = os.environ.get("MIN_AUTOCLICK_CRAWLER_IMAGE")

    if not (min_autoclick_crawler_image and behaviors and crawler_image):
        return behaviors

    behaviors_list = behaviors.split(",")

    # match whole entries (ignoring surrounding whitespace) rather than a
    # substring, so detection and removal always agree with each other
    if not any(behavior.strip() == "autoclick" for behavior in behaviors_list):
        return behaviors

    def parse_image(image):
        """Return (repository, release tuple), or None if tag isn't a version"""
        if "@" in image:
            # digest reference, carries no version information
            return None
        repository, sep, tag = image.rpartition(":")
        if not sep or "/" in tag:
            # no tag at all; the last ':' belonged to a registry port
            return None
        matched = re.fullmatch(r"v?(\d+(?:\.\d+)*)(?:[-+._].*)?", tag)
        if matched is None:
            return None
        release = [int(part) for part in matched.group(1).split(".")]
        # drop trailing zeros so that 1.5 and 1.5.0 compare equal
        while release and release[-1] == 0:
            release.pop()
        return repository, tuple(release)

    current = parse_image(crawler_image)
    minimum = parse_image(min_autoclick_crawler_image)

    if current is not None and minimum is not None and current[0] == minimum[0]:
        below_minimum = current[1] < minimum[1]
    else:
        # versions can't be compared reliably, keep plain string ordering
        below_minimum = crawler_image < min_autoclick_crawler_image

    if not below_minimum:
        return behaviors

    print(
        "Crawler version < min_autoclick_crawler_image, removing autoclick behavior",
        flush=True,
    )
    return ",".join(
        behavior for behavior in behaviors_list if behavior.strip() != "autoclick"
    )
.Values.min_autoclick_crawler_image }}" + NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}" MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}" diff --git a/chart/values.yaml b/chart/values.yaml index d422f60a73..6490ad28e9 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -250,6 +250,9 @@ crawler_namespace: "crawlers" # if set, will restrict QA to image names that are >= than this value # min_qa_crawler_image: "" +# if set, will restrict autoclick behavior to image names that are >= than this value +min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0" + # optional: enable to use a persist volume claim for all crawls # can be enabled to use a multi-write shared filesystem # crawler_pv_claim: "nfs-shared-crawls" diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 6ac6f4bcda..94be3248b0 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -1304,6 +1304,20 @@ https://archiveweb.page/images/${"logo.svg"}`} ), false, )} + ${inputCol( + html` + ${msg("Autoclick behavior")} + `, + )} + ${this.renderHelpTextCol( + msg( + `When enabled the browser will automatically click on links that don't navigate to other pages.`, + ), + false, + )} ${inputCol(html` ({ autoAddCollections: [], description: null, autoscrollBehavior: true, + autoclickBehavior: false, userAgent: null, crawlerChannel: "default", proxyId: null, @@ -286,6 +288,9 @@ export function getInitialFormState(params: { autoscrollBehavior: params.initialWorkflow.config.behaviors ? params.initialWorkflow.config.behaviors.includes("autoscroll") : defaultFormState.autoscrollBehavior, + autoclickBehavior: params.initialWorkflow.config.behaviors + ? params.initialWorkflow.config.behaviors.includes("autoclick") + : defaultFormState.autoclickBehavior, userAgent: params.initialWorkflow.config.userAgent ?? 
defaultFormState.userAgent, crawlerChannel: