Skip to content

Commit

Permalink
add crawl4ai: #1015
Browse files Browse the repository at this point in the history
  • Loading branch information
gschmutz committed Jan 31, 2025
1 parent dbaac63 commit 68bb889
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 0 deletions.
1 change: 1 addition & 0 deletions documentation/port-mapping.md
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ Container Port(s) | Internal Port(s) | Service (alternatives) |
10099 | 10099 | kyuubi |
11211 | 11211 | memcached |
11212 | 11211 | ignite-1 |
11235 | 11235 | crawl4ai |
11434 | 11434 | ollama |
11435 | 11434 | litellm |
12222 | 2222 | risingwave |
Expand Down
3 changes: 3 additions & 0 deletions modern-data-platform-stack/generator-config/stack-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ vars:

#SearXNG
SEARXNG_version: latest

#Crawl4ai
CRAWL4AI_version: latest

#hive
HIVE_version: 3.1.2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ networks:

{% set __SEARXNG_version = SEARXNG_version | default('latest') -%}

{% set __CRAWL4AI_version = CRAWL4AI_version | default('latest') -%}

{% set __HIVE_version = HIVE_version | default('latest') -%}
{% set __HIVE_version_suffix = HIVE_version_suffix | default('postgresql-metastore-s3') -%}
{% set __HIVE_METASTORE_DB_version = HIVE_METASTORE_DB_version | default('latest') -%}
Expand Down Expand Up @@ -765,6 +767,7 @@ networks:
{% set TIKA_enable = false -%}
{% set NLM_INGESTOR_enable = false -%}
{% set UNSTRUCTURED_enable = false -%}
{% set CRAWL4AI_enable = false -%}
{% set HIVE_enable = false -%}
{% set AVRO_TOOLS_enable = false -%}
{% set OPENLDAP_enable = false -%}
Expand Down Expand Up @@ -6595,6 +6598,42 @@ services:
command: valkey-server --save 30 1 --loglevel warning
{% endif %} {# SEARXNG_enable #}

{% if CRAWL4AI_enable | default(false) %}
  # ================================== Crawl4AI ========================================== #
  # Crawl4AI service ("LLM Web Crawler & Scraper"); rendered only when CRAWL4AI_enable is true.
  crawl4ai:
    image: unclecode/crawl4ai:{{__CRAWL4AI_version}}
    container_name: crawl4ai
    hostname: crawl4ai
    labels:
      com.platys.name: 'crawl4ai'
      com.platys.description: "LLM Web Crawler & Scraper"
      com.platys.restapi.title: "Crawl4ai API"
      # Must match the host port published under `ports` below.
      com.platys.restapi.url: "http://dataplatform:11235"
    ports:
      - "11235:11235"
    environment:
      # Optional values come from the matching CRAWL4AI_* config variable and fall back
      # to `omit` when that variable is undefined or empty.
      CRAWL4AI_API_TOKEN: {{CRAWL4AI_api_token if CRAWL4AI_api_token is defined and CRAWL4AI_api_token and CRAWL4AI_api_token | length else omit }}
      MAX_CONCURRENT_TASKS: 5
      # LLM Provider Keys
      OPENAI_API_KEY: {{CRAWL4AI_openai_api_key if CRAWL4AI_openai_api_key is defined and CRAWL4AI_openai_api_key and CRAWL4AI_openai_api_key | length else omit }}
      ANTHROPIC_API_KEY: {{CRAWL4AI_anthropic_api_key if CRAWL4AI_anthropic_api_key is defined and CRAWL4AI_anthropic_api_key and CRAWL4AI_anthropic_api_key | length else omit }}
      GROQ_API_KEY: {{CRAWL4AI_groq_api_key if CRAWL4AI_groq_api_key is defined and CRAWL4AI_groq_api_key and CRAWL4AI_groq_api_key | length else omit }}
      {%if use_timezone | default(false) %}
      TZ: {{use_timezone}}
      {% endif -%} {# use_timezone #}
    volumes:
      - ./data-transfer:/data-transfer
      # Host shared memory is mounted into the container
      # (presumably needed by the crawler's browser - TODO confirm).
      - /dev/shm:/dev/shm
      {%if use_timezone | default(false) %}
      - "./etc/timezone:/etc/timezone:ro"
      - "./etc/localtime:/etc/localtime:ro"
      {% endif -%} {# use_timezone #}
    {%if logging_driver is defined and logging_driver and logging_driver in ('fluentd','loki','syslog','splunk') | default(false) %}
    <<: *logging
    {% endif -%} {# logging_driver is defined ... #}
    restart: always
{% endif %} {# CRAWL4AI_enable #}

{% if HIVE_SERVER_enable | default(false) %}
# ================================== Apache Hive Server ========================================== #
hive-server:
Expand Down
8 changes: 8 additions & 0 deletions modern-data-platform-stack/generator-config/vars/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,14 @@
#
SEARXNG_enable: false

#
# ===== Crawl4AI ========
#
CRAWL4AI_enable: false
# Optional API token, forwarded to the container as CRAWL4AI_API_TOKEN; skipped when empty.
CRAWL4AI_api_token: ''
# Optional LLM provider keys, forwarded as OPENAI_API_KEY / ANTHROPIC_API_KEY / GROQ_API_KEY; skipped when empty.
CRAWL4AI_openai_api_key: ''
CRAWL4AI_anthropic_api_key: ''
CRAWL4AI_groq_api_key: ''

#
# ===== Apache Hive ========
#
Expand Down

0 comments on commit 68bb889

Please sign in to comment.