-
Notifications
You must be signed in to change notification settings - Fork 414
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create lambda infrastructure (#3830)
* Create test Lambda calling into QW to get its version * Switch to provided AL image * Revert Dockerfile changes * Add querying and indexing * Add querying and indexing * Rename querier to searcher to align with existing terminology * Fix failing CI tests * Use S3 store source instead of bundled * Refactor binaries to seprate bin dir * Fork the CLI local search and index methods * Create index if not found * Add flexible indexing and query inputs * Add instance of Quickwit lambda with mock data generation * Log end of indexing * Add benchmarck commands * wip: trying to setup tracing * Add root trace * Try fix trace flushing * Fix merge disabling * Cache and better tracing * Add pyaload to context span * Use API Gateway events in search * Log as json and extract logs from cloudwatch * Add histogram oneshot search example * Fix errors due to rebase * Improve example queries * Expose config to disable partial_request_cache_capacity * Improve benchmark script * Document the setup of an API Gateway * Add partial request cache to bench * Add packaging workflow * Address review regarding hdfs index config * Fix rebase errors * Fix CI errors * Add mypy linter * Improve release tag name * Try using package pip install in CI * Update lambda package version * Enable download using uploaded artifact * Add API Gateway construct and refactor cdk code * Add staging lifecycle rule * Fix SpawnPipeline field after rebase * Fix unused rust-toolchain.toml * Add tutorial * Final cleanup * Fix after rebase * Fix fmt in quickwit-lambda * Handle gzip file source. * Fix clippy. * Update github action. * Add telemetry. * Apply new versioning and skip hdfs decompression * Add comment in file source. * Add test on skip reader. * Take review comments into account. * Fix rebase. --------- Co-authored-by: fmassot <[email protected]>
- Loading branch information
Showing
44 changed files
with
4,036 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
name: Build and publish AWS Lambda packages | ||
|
||
on: | ||
push: | ||
tags: | ||
- "lambda-beta-*" | ||
|
||
jobs: | ||
build-lambdas: | ||
name: Build Quickwit Lambdas | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v4 | ||
- name: Install Ubuntu packages | ||
run: sudo apt-get -y install protobuf-compiler python3 python3-pip | ||
- name: Install rustup | ||
run: curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain none -y | ||
- name: Install python dependencies | ||
run: pip install ./distribution/lambda | ||
- name: Mypy lint | ||
run: mypy distribution/lambda/ | ||
|
||
- name: Extract asset version of release | ||
run: echo "QW_LAMBDA_VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_ENV | ||
if: ${{ github.event_name == 'push' }} | ||
- name: Retrieve and export commit date, hash, and tags | ||
run: | | ||
echo "QW_COMMIT_DATE=$(TZ=UTC0 git log -1 --format=%cd --date=format-local:%Y-%m-%dT%H:%M:%SZ)" >> $GITHUB_ENV | ||
echo "QW_COMMIT_HASH=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
echo "QW_COMMIT_TAGS=$(git tag --points-at HEAD | tr '\n' ',')" >> $GITHUB_ENV | ||
- name: Build Quickwit Lambdas | ||
run: make package | ||
env: | ||
QW_COMMIT_DATE: ${{ env.QW_COMMIT_DATE }} | ||
QW_COMMIT_HASH: ${{ env.QW_COMMIT_HASH }} | ||
QW_COMMIT_TAGS: ${{ env.QW_COMMIT_TAGS }} | ||
QW_LAMBDA_BUILD: 1 | ||
working-directory: ./distribution/lambda | ||
- name: Extract package locations | ||
run: | | ||
echo "SEARCHER_PACKAGE_LOCATION=./distribution/lambda/$(make searcher-package-path)" >> $GITHUB_ENV | ||
echo "INDEXER_PACKAGE_LOCATION=./distribution/lambda/$(make indexer-package-path)" >> $GITHUB_ENV | ||
working-directory: ./distribution/lambda | ||
- name: Upload Lambda archives | ||
uses: quickwit-inc/upload-to-github-release@v1 | ||
env: | ||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
with: | ||
file: ${{ env.SEARCHER_PACKAGE_LOCATION }};${{ env.INDEXER_PACKAGE_LOCATION }} | ||
overwrite: true | ||
draft: true | ||
tag_name: aws-${{ env.QW_LAMBDA_VERSION }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
*.swp | ||
package-lock.json | ||
__pycache__ | ||
.pytest_cache | ||
.venv | ||
*.egg-info | ||
build/ | ||
.mypy_cache | ||
|
||
# CDK asset staging directory | ||
.cdk.staging | ||
cdk.out | ||
|
||
# AWS SAM build directory | ||
.aws-sam | ||
|
||
# Benchmark output files | ||
*.log |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
.SILENT: | ||
.ONESHELL: | ||
SHELL := bash | ||
.SHELLFLAGS := -eu -o pipefail -c | ||
|
||
# Update this when cutting a new release | ||
QW_LAMBDA_VERSION?=beta-01 | ||
PACKAGE_BASE_URL=https://github.com/quickwit-oss/quickwit/releases/download/aws-lambda-$(QW_LAMBDA_VERSION)/ | ||
SEARCHER_PACKAGE_FILE=quickwit-lambda-searcher-$(QW_LAMBDA_VERSION)-x86_64.zip | ||
INDEXER_PACKAGE_FILE=quickwit-lambda-indexer-$(QW_LAMBDA_VERSION)-x86_64.zip | ||
export SEARCHER_PACKAGE_PATH=cdk.out/$(SEARCHER_PACKAGE_FILE) | ||
export INDEXER_PACKAGE_PATH=cdk.out/$(INDEXER_PACKAGE_FILE) | ||
|
||
check-env: | ||
ifndef CDK_ACCOUNT | ||
$(error CDK_ACCOUNT is undefined) | ||
endif | ||
ifndef CDK_REGION | ||
$(error CDK_REGION is undefined) | ||
endif | ||
|
||
# Build or download the packages from the release page | ||
# - Download by default, the version can be set with QW_LAMBDA_VERSION | ||
# - To build locally, set QW_LAMBDA_BUILD=1 | ||
package: | ||
mkdir -p cdk.out | ||
if [ "$${QW_LAMBDA_BUILD:-0}" = "1" ] | ||
then | ||
pushd ../../quickwit/ | ||
cargo lambda build \ | ||
-p quickwit-lambda \ | ||
--release \ | ||
--output-format zip \ | ||
--target x86_64-unknown-linux-gnu | ||
popd | ||
cp -u ../../quickwit/target/lambda/searcher/bootstrap.zip $(SEARCHER_PACKAGE_PATH) | ||
cp -u ../../quickwit/target/lambda/indexer/bootstrap.zip $(INDEXER_PACKAGE_PATH) | ||
else | ||
if ! [ -f $(SEARCHER_PACKAGE_PATH) ]; then | ||
echo "Downloading package $(PACKAGE_BASE_URL)$(SEARCHER_PACKAGE_FILE)" | ||
curl -C - -Ls -o $(SEARCHER_PACKAGE_PATH) $(PACKAGE_BASE_URL)$(SEARCHER_PACKAGE_FILE) | ||
else | ||
echo "Using cached package $(SEARCHER_PACKAGE_PATH)" | ||
fi | ||
if ! [ -f $(INDEXER_PACKAGE_PATH) ]; then | ||
echo "Downloading package $(PACKAGE_BASE_URL)$(INDEXER_PACKAGE_FILE)" | ||
curl -C - -Ls -o $(INDEXER_PACKAGE_PATH) $(PACKAGE_BASE_URL)$(INDEXER_PACKAGE_FILE) | ||
else | ||
echo "Using cached package $(INDEXER_PACKAGE_PATH)" | ||
fi | ||
fi | ||
|
||
indexer-package-path: | ||
echo -n $(INDEXER_PACKAGE_PATH) | ||
|
||
searcher-package-path: | ||
echo -n $(SEARCHER_PACKAGE_PATH) | ||
|
||
bootstrap: package check-env | ||
cdk bootstrap aws://$$CDK_ACCOUNT/$$CDK_REGION | ||
|
||
deploy-hdfs: package check-env | ||
cdk deploy -a cdk/app.py HdfsStack | ||
|
||
deploy-mock-data: package check-env | ||
cdk deploy -a cdk/app.py MockDataStack | ||
|
||
destroy-hdfs: | ||
cdk destroy -a cdk/app.py HdfsStack | ||
|
||
destroy-mock-data: | ||
cdk destroy -a cdk/app.py MockDataStack | ||
|
||
clean: | ||
rm -rf cdk.out | ||
|
||
## Invocation examples | ||
|
||
invoke-mock-data-searcher: check-env | ||
python -c 'from cdk import cli; cli.invoke_mock_data_searcher()' | ||
|
||
invoke-hdfs-indexer: check-env | ||
python -c 'from cdk import cli; cli.upload_hdfs_src_file()' | ||
python -c 'from cdk import cli; cli.invoke_hdfs_indexer()' | ||
|
||
invoke-hdfs-searcher-term: check-env | ||
python -c 'from cdk import cli; cli.invoke_hdfs_searcher("""{"query": "severity_text:ERROR", "max_hits": 10}""")' | ||
|
||
invoke-hdfs-searcher-histogram: check-env | ||
python -c 'from cdk import cli; cli.invoke_hdfs_searcher("""{ "query": "*", "max_hits": 0, "aggs": { "events": { "date_histogram": { "field": "timestamp", "fixed_interval": "1d" }, "aggs": { "log_level": { "terms": { "size": 10, "field": "severity_text", "order": { "_count": "desc" } } } } } } }""")' | ||
|
||
bench-index: | ||
mem_sizes=( 10240 8192 6144 4096 3072 2048 ) | ||
export QW_LAMBDA_DISABLE_MERGE=true | ||
for mem_size in "$${mem_sizes[@]}" | ||
do | ||
export INDEXER_MEMORY_SIZE=$${mem_size} | ||
$(MAKE) deploy-hdfs | ||
python -c 'from cdk import cli; cli.benchmark_hdfs_indexing()' | ||
done | ||
|
||
bench-search-term: | ||
mem_sizes=( 1024 2048 4096 8192 ) | ||
for mem_size in "$${mem_sizes[@]}" | ||
do | ||
export SEARCHER_MEMORY_SIZE=$${mem_size} | ||
$(MAKE) deploy-hdfs | ||
python -c 'from cdk import cli; cli.benchmark_hdfs_search("""{"query": "severity_text:ERROR", "max_hits": 10}""")' | ||
done | ||
|
||
bench-search-histogram: | ||
mem_sizes=( 1024 2048 4096 8192 ) | ||
for mem_size in "$${mem_sizes[@]}" | ||
do | ||
export SEARCHER_MEMORY_SIZE=$${mem_size} | ||
$(MAKE) deploy-hdfs | ||
python -c 'from cdk import cli; cli.benchmark_hdfs_search("""{ "query": "*", "max_hits": 0, "aggs": { "events": { "date_histogram": { "field": "timestamp", "fixed_interval": "1d" }, "aggs": { "log_level": { "terms": { "size": 10, "field": "severity_text", "order": { "_count": "desc" } } } } } } }""")' | ||
done | ||
|
||
bench-search: | ||
for run in {1..30} | ||
do | ||
export QW_LAMBDA_DISABLE_SEARCH_CACHE=true | ||
$(MAKE) bench-search-term | ||
$(MAKE) bench-search-histogram | ||
export QW_LAMBDA_DISABLE_SEARCH_CACHE=false | ||
export QW_LAMBDA_PARTIAL_REQUEST_CACHE_CAPACITY=0 | ||
$(MAKE) bench-search-term | ||
$(MAKE) bench-search-histogram | ||
export QW_LAMBDA_DISABLE_SEARCH_CACHE=false | ||
export QW_LAMBDA_PARTIAL_REQUEST_CACHE_CAPACITY=64MB | ||
$(MAKE) bench-search-term | ||
$(MAKE) bench-search-histogram | ||
done |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
|
||
# CDK template for running Quickwit on AWS Lambda | ||
|
||
## Prerequisites | ||
|
||
- Install AWS CDK Toolkit (cdk command) | ||
- `npm install -g aws-cdk ` | ||
- Ensure `curl` and `make` are installed | ||
- To run the invocation example `make` commands, you will also need Python 3.10 | ||
or later and `pip` installed (see [Python venv](#python-venv) below). | ||
|
||
## AWS Lambda service quotas | ||
|
||
For newly created AWS accounts, a conservative quota of 10 concurrent executions | ||
is applied to Lambda in each individual region. If that's the case, CDK won't be | ||
able to apply the reserved concurrency of the indexing Quickwit lambda. You can | ||
increase the quota without charge using the [Service Quotas | ||
console](https://console.aws.amazon.com/servicequotas/home/services/lambda/quotas). | ||
|
||
> **Note:** The request can take hours or even days to be processed. | ||
## Python venv | ||
|
||
This project is set up like a standard Python project. The initialization | ||
process also creates a virtualenv within this project, stored under the `.venv` | ||
directory. To create the virtualenv it assumes that there is a `python3` | ||
executable in your path with access to the `venv` package. If for any reason the | ||
automatic creation of the virtualenv fails, you can create the virtualenv | ||
manually. | ||
|
||
To manually create a virtualenv on MacOS and Linux: | ||
|
||
```bash | ||
python3 -m venv .venv | ||
``` | ||
|
||
After the init process completes and the virtualenv is created, you can use the following | ||
step to activate your virtualenv. | ||
|
||
```bash | ||
source .venv/bin/activate | ||
``` | ||
|
||
Once the virtualenv is activated, you can install the required dependencies. | ||
|
||
```bash | ||
pip install . | ||
``` | ||
|
||
If you prefer using Poetry, achieve the same by running: | ||
```bash | ||
poetry shell | ||
poetry install | ||
``` | ||
|
||
## Example stacks | ||
|
||
Provided demonstration setups: | ||
- HDFS example data: index the the [HDFS | ||
dataset](https://quickwit-datasets-public.s3.amazonaws.com/hdfs-logs-multitenants-10000.json) | ||
by triggering the Quickwit lambda manually. | ||
- Mock Data generator: start a mock data generator lambda that pushes mock JSON | ||
data every X minutes to S3. Those file trigger the Quickwit indexer lambda | ||
automatically. | ||
|
||
## Deploy and run | ||
|
||
The Makefile is a usefull entrypoint to show how the Lambda deployment can used. | ||
|
||
Configure your shell and AWS account: | ||
```bash | ||
# replace with you AWS account ID and preferred region | ||
export CDK_ACCOUNT=123456789 | ||
export CDK_REGION=us-east-1 | ||
make bootstrap | ||
``` | ||
|
||
Deploy, index and query the HDFS dataset: | ||
```bash | ||
make deploy-hdfs | ||
make invoke-hdfs-indexer | ||
make invoke-hdfs-searcher | ||
``` | ||
|
||
Deploy the mock data generator and query the indexed data: | ||
```bash | ||
make deploy-mock-data | ||
# wait a few minutes... | ||
make invoke-mock-data-searcher | ||
``` | ||
|
||
## Set up a search API | ||
|
||
You can configure an HTTP API endpoint around the Quickwit Searcher Lambda. The | ||
mock data example stack shows such a configuration. The API Gateway is enabled | ||
when the `SEARCHER_API_KEY` environment variable is set: | ||
|
||
```bash | ||
SEARCHER_API_KEY=my-at-least-20-char-long-key make deploy-mock-data | ||
``` | ||
|
||
> [!WARNING] | ||
> The API key is stored in plain text in the CDK stack. For a real world | ||
> deployment, the key should be fetched from something like [AWS Secrets | ||
> Manager](https://docs.aws.amazon.com/cdk/v2/guide/get_secrets_manager_value.html). | ||
Note that the response is always gzipped compressed, regardless the | ||
`Accept-Encoding` request header: | ||
|
||
```bash | ||
curl -d '{"query":"quantity:>5", "max_hits": 10}' -H "Content-Type: application/json" -H "x-api-key: my-at-least-20-char-long-key" -X POST https://{api_id}.execute-api.{region}.amazonaws.com/api/v1/mock-sales/search --compressed | ||
``` | ||
|
||
## Useful CDK commands | ||
|
||
* `cdk ls` list all stacks in the app | ||
* `cdk synth` emits the synthesized CloudFormation template | ||
* `cdk deploy` deploy this stack to your default AWS account/region | ||
* `cdk diff` compare deployed stack with current state | ||
* `cdk docs` open CDK documentation |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#!/usr/bin/env python3 | ||
import os | ||
from typing import Literal | ||
|
||
import aws_cdk as cdk | ||
|
||
from cdk.stacks.services.quickwit_service import DEFAULT_LAMBDA_MEMORY_SIZE | ||
from cdk.stacks.examples.hdfs_stack import HdfsStack | ||
from cdk.stacks.examples.mock_data_stack import MockDataStack | ||
|
||
HDFS_STACK_NAME = "HdfsStack" | ||
MOCK_DATA_STACK_NAME = "MockDataStack" | ||
|
||
|
||
def package_location_from_env(type: Literal["searcher"] | Literal["indexer"]) -> str: | ||
path_var = f"{type.upper()}_PACKAGE_PATH" | ||
if path_var in os.environ: | ||
return os.environ[path_var] | ||
else: | ||
print( | ||
f"Could not infer the {type} package location. Configure it using the {path_var} environment variable" | ||
) | ||
exit(1) | ||
|
||
|
||
app = cdk.App() | ||
|
||
HdfsStack( | ||
app, | ||
HDFS_STACK_NAME, | ||
env=cdk.Environment( | ||
account=os.getenv("CDK_ACCOUNT"), region=os.getenv("CDK_REGION") | ||
), | ||
indexer_memory_size=int( | ||
os.environ.get("INDEXER_MEMORY_SIZE", DEFAULT_LAMBDA_MEMORY_SIZE) | ||
), | ||
searcher_memory_size=int( | ||
os.environ.get("SEARCHER_MEMORY_SIZE", DEFAULT_LAMBDA_MEMORY_SIZE) | ||
), | ||
indexer_package_location=package_location_from_env("indexer"), | ||
searcher_package_location=package_location_from_env("searcher"), | ||
) | ||
|
||
MockDataStack( | ||
app, | ||
MOCK_DATA_STACK_NAME, | ||
env=cdk.Environment( | ||
account=os.getenv("CDK_ACCOUNT"), region=os.getenv("CDK_REGION") | ||
), | ||
indexer_package_location=package_location_from_env("indexer"), | ||
searcher_package_location=package_location_from_env("searcher"), | ||
search_api_key=os.getenv("SEARCHER_API_KEY", None), | ||
) | ||
|
||
app.synth() |
Oops, something went wrong.