From 68bb889040b86ee2d8c58118b8536367afeb9fcf Mon Sep 17 00:00:00 2001
From: Guido Schmutz
Date: Fri, 31 Jan 2025 21:31:33 +0100
Subject: [PATCH] add crawl4ai: #1015

---
Reviewer notes (text between the "---" marker and the first "diff --git"
line is not part of the commit):

* Purpose: registers a new optional `crawl4ai` service. It adds the
  port-mapping row, the `CRAWL4AI_version` default, the `CRAWL4AI_enable`
  reset, the docker-compose service template and the config.yml switches.
* Fix: the `com.platys.restapi.url` label pointed at port 28389, while the
  service is published on 11235 (see the `ports:` entry and the new
  port-mapping row). The label now uses 11235.
* NOTE(review): the template reads `CRAWL4AI_api_token`, but the config.yml
  hunk below adds no default for it. Consider adding
  `CRAWL4AI_api_token: ''` in a follow-up, or confirm it is declared
  elsewhere.
* NOTE(review): this patch was recovered from a whitespace-collapsed copy.
  Indentation of context lines and column padding in the Markdown table are
  reconstructed. If the patch does not apply cleanly, retry with
  `git apply --ignore-whitespace` and verify against the target files.

 documentation/port-mapping.md                 |  1 +
 .../generator-config/stack-config.yml         |  3 ++
 .../templates/docker-compose.yml.j2           | 39 +++++++++++++++++++
 .../generator-config/vars/config.yml          |  8 ++++
 4 files changed, 51 insertions(+)

diff --git a/documentation/port-mapping.md b/documentation/port-mapping.md
index 0bcdec82..053ecb1c 100644
--- a/documentation/port-mapping.md
+++ b/documentation/port-mapping.md
@@ -323,6 +323,7 @@ Container Port(s) | Internal Port(s) | Service (alternatives) |
 10099 | 10099 | kyuubi |
 11211 | 11211 | memcached |
 11212 | 11211 | ignite-1 |
+11235 | 11235 | crawl4ai |
 11434 | 11434 | ollama |
 11435 | 11434 | litellm |
 12222 | 2222 | risingwave |
diff --git a/modern-data-platform-stack/generator-config/stack-config.yml b/modern-data-platform-stack/generator-config/stack-config.yml
index bbbe0001..360dbd3e 100644
--- a/modern-data-platform-stack/generator-config/stack-config.yml
+++ b/modern-data-platform-stack/generator-config/stack-config.yml
@@ -129,6 +129,9 @@ vars:
 
       #SearXNG
       SEARXNG_version: latest
+
+      #Crawl4ai
+      CRAWL4AI_version: latest
 
       #hive
       HIVE_version: 3.1.2
diff --git a/modern-data-platform-stack/generator-config/templates/docker-compose.yml.j2 b/modern-data-platform-stack/generator-config/templates/docker-compose.yml.j2
index 79c1c29e..1be97f1d 100755
--- a/modern-data-platform-stack/generator-config/templates/docker-compose.yml.j2
+++ b/modern-data-platform-stack/generator-config/templates/docker-compose.yml.j2
@@ -111,6 +111,8 @@ networks:
 
 {% set __SEARXNG_version = SEARXNG_version | default('latest') -%}
 
+{% set __CRAWL4AI_version = CRAWL4AI_version | default('latest') -%}
+
 {% set __HIVE_version = HIVE_version | default('latest') -%}
 {% set __HIVE_version_suffix = HIVE_version_suffix | default('postgresql-metastore-s3') -%}
 {% set __HIVE_METASTORE_DB_version = HIVE_METASTORE_DB_version | default('latest') -%}
@@ -765,6 +767,7 @@ networks:
   {% set TIKA_enable = false -%}
   {% set NLM_INGESTOR_enable = false -%}
   {% set UNSTRUCTURED_enable = false -%}
+  {% set CRAWL4AI_enable = false -%}
   {% set HIVE_enable = false -%}
   {% set AVRO_TOOLS_enable = false -%}
   {% set OPENLDAP_enable = false -%}
@@ -6595,6 +6598,42 @@ services:
     command: valkey-server --save 30 1 --loglevel warning
 {% endif %} {# SEARXNG_enable #}
 
+{% if CRAWL4AI_enable | default(false) %}
+  # ================================== Crawl4AI ========================================== #
+  crawl4ai:
+    image: unclecode/crawl4ai:{{__CRAWL4AI_version}}
+    container_name: crawl4ai
+    hostname: crawl4ai
+    labels:
+      com.platys.name: 'crawl4ai'
+      com.platys.description: "LLM Web Crawler & Scraper"
+      com.platys.restapi.title: "Crawl4ai API"
+      com.platys.restapi.url: "http://dataplatform:11235"
+    ports:
+      - "11235:11235"
+    environment:
+      CRAWL4AI_API_TOKEN: {{CRAWL4AI_api_token if CRAWL4AI_api_token is defined and CRAWL4AI_api_token and CRAWL4AI_api_token | length else omit }}
+      MAX_CONCURRENT_TASKS: 5
+      # LLM Provider Keys
+      OPENAI_API_KEY: {{CRAWL4AI_openai_api_key if CRAWL4AI_openai_api_key is defined and CRAWL4AI_openai_api_key and CRAWL4AI_openai_api_key | length else omit }}
+      ANTHROPIC_API_KEY: {{CRAWL4AI_anthropic_api_key if CRAWL4AI_anthropic_api_key is defined and CRAWL4AI_anthropic_api_key and CRAWL4AI_anthropic_api_key | length else omit }}
+      GROQ_API_KEY: {{CRAWL4AI_groq_api_key if CRAWL4AI_groq_api_key is defined and CRAWL4AI_groq_api_key and CRAWL4AI_groq_api_key | length else omit }}
+      {%if use_timezone | default(false) %}
+      TZ: {{use_timezone}}
+      {% endif -%} {# use_timezone #}
+    volumes:
+      - ./data-transfer:/data-transfer
+      - /dev/shm:/dev/shm
+      {%if use_timezone | default(false) %}
+      - "./etc/timezone:/etc/timezone:ro"
+      - "./etc/localtime:/etc/localtime:ro"
+      {% endif -%} {# use_timezone #}
+    {%if logging_driver is defined and logging_driver and logging_driver in ('fluentd','loki','syslog','splunk') | default(false) %}
+    <<: *logging
+    {% endif -%} {# logging_driver is defined ... #}
+    restart: always
+{% endif %} {# CRAWL4AI_enable #}
+
 {% if HIVE_SERVER_enable | default(false) %}
   # ================================== Apache Hive Server ========================================== #
   hive-server:
diff --git a/modern-data-platform-stack/generator-config/vars/config.yml b/modern-data-platform-stack/generator-config/vars/config.yml
index 3696f5f5..b8e166f9 100644
--- a/modern-data-platform-stack/generator-config/vars/config.yml
+++ b/modern-data-platform-stack/generator-config/vars/config.yml
@@ -679,6 +679,14 @@
       #
       SEARXNG_enable: false
 
+      #
+      # ===== Crawl4AI ========
+      #
+      CRAWL4AI_enable: false
+      CRAWL4AI_openai_api_key: ''
+      CRAWL4AI_anthropic_api_key: ''
+      CRAWL4AI_groq_api_key: ''
+
       #
       # ===== Apache Hive ========
       #