Skip to content

Commit

Permalink
feat(duckdb): query lance (#22073)
Browse files Browse the repository at this point in the history
  • Loading branch information
hongbo-miao authored Dec 28, 2024
1 parent 467a0a6 commit 3389ef4
Show file tree
Hide file tree
Showing 9 changed files with 497 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/.static-type-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ jobs:
uv run poe static-type-check-python --package=data-storage.delta-lake.read-delta-lake-by-trino
uv run poe static-type-check-python --package=data-storage.delta-lake.write-to-delta-lake
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-duckdb
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-lance
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-parquet
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-protobuf
uv run poe static-type-check-python --package=data-storage.hm-lancedb
Expand Down
37 changes: 37 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ jobs:
hm-aws-parallelcluster: ${{ steps.filter.outputs.hm-aws-parallelcluster }}
hm-docling: ${{ steps.filter.outputs.hm-docling }}
hm-duckdb-query-duckdb: ${{ steps.filter.outputs.hm-duckdb-query-duckdb }}
hm-duckdb-query-lance: ${{ steps.filter.outputs.hm-duckdb-query-lance }}
hm-duckdb-query-parquet: ${{ steps.filter.outputs.hm-duckdb-query-parquet }}
hm-duckdb-query-protobuf: ${{ steps.filter.outputs.hm-duckdb-query-protobuf }}
hm-flax: ${{ steps.filter.outputs.hm-flax }}
Expand Down Expand Up @@ -253,6 +254,9 @@ jobs:
hm-duckdb-query-duckdb:
- '.github/workflows/test.yml'
- 'data-storage/hm-duckdb/query-duckdb/**'
hm-duckdb-query-lance:
- '.github/workflows/test.yml'
- 'data-storage/hm-duckdb/query-lance/**'
hm-duckdb-query-parquet:
- '.github/workflows/test.yml'
- 'data-storage/hm-duckdb/query-parquet/**'
Expand Down Expand Up @@ -1381,6 +1385,39 @@ jobs:
with:
directory: data-storage/hm-duckdb/query-duckdb

# CI job: runs the test suite with coverage for data-storage/hm-duckdb/query-lance.
duckdb-query-lance-test:
# This display name is the check name matched by the check-success / check-skipped
# conditions in .mergify.yml, so the two must be kept in sync.
name: DuckDB (query-lance) | Test
needs: detect-changes
# Run only when the detect-changes job reported a match for the
# hm-duckdb-query-lance path filter.
if: ${{ needs.detect-changes.outputs.hm-duckdb-query-lance == 'true' }}
runs-on: ubuntu-24.04
environment: test
timeout-minutes: 10
steps:
# NOTE(review): every "[email protected]" below looks like scrape-time obfuscation of an
# "<action>@<version>" reference — restore the pinned versions from the repository.
- name: Checkout
uses: actions/[email protected]
- name: Install uv
uses: astral-sh/[email protected]
with:
version: 0.5.11
enable-cache: true
# Cache is keyed on this project's own lock file.
cache-dependency-glob: data-storage/hm-duckdb/query-lance/uv.lock
- name: Set up Python
uses: actions/[email protected]
with:
# Python version is read from the project's pyproject.toml.
python-version-file: data-storage/hm-duckdb/query-lance/pyproject.toml
- name: Install dependencies
working-directory: data-storage/hm-duckdb/query-lance
run: |
uv sync --dev
- name: Test
working-directory: data-storage/hm-duckdb/query-lance
# "test-coverage" is the poe task defined in the project's pyproject.toml.
run: |
uv run poe test-coverage
- name: Upload coverage to Codecov
uses: codecov/[email protected]
with:
directory: data-storage/hm-duckdb/query-lance

duckdb-query-parquet-test:
name: DuckDB (query-parquet) | Test
needs: detect-changes
Expand Down
3 changes: 3 additions & 0 deletions .mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,9 @@ pull_request_rules:
- or:
- check-success=DuckDB (query-duckdb) | Test
- check-skipped=DuckDB (query-duckdb) | Test
- or:
- check-success=DuckDB (query-lance) | Test
- check-skipped=DuckDB (query-lance) | Test
- or:
- check-success=DuckDB (query-parquet) | Test
- check-skipped=DuckDB (query-parquet) | Test
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,7 @@ static-type-check-python:
uv run poe static-type-check-python --package=data-storage.delta-lake.read-delta-lake-by-trino
uv run poe static-type-check-python --package=data-storage.delta-lake.write-to-delta-lake
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-duckdb
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-lance
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-parquet
uv run poe static-type-check-python --package=data-storage.hm-duckdb.query-protobuf
uv run poe static-type-check-python --package=data-storage.hm-lancedb
Expand Down
13 changes: 13 additions & 0 deletions data-storage/hm-duckdb/query-lance/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# uv helpers: toolchain, lock file and dependency management.
uv-install-python:
	uv python install
uv-update-lock-file:
	uv lock
uv-install-dependencies:
	uv sync --dev

# Project tasks; each one delegates to a poe task declared in pyproject.toml.
uv-run-dev:
	uv run poe dev
uv-run-test:
	uv run poe test
uv-run-test-coverage:
	uv run poe test-coverage
24 changes: 24 additions & 0 deletions data-storage/hm-duckdb/query-lance/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Demo project: query a Lance dataset with DuckDB (entry point: src/main.py).
[project]
name = "query-lance"
version = "1.0.0"
requires-python = "~=3.13.0"
dependencies = [
  "duckdb==1.1.3",
  # Needed for the `.pl()` result conversion used in src/main.py.
  "polars==1.18.0",
  "lancedb==0.17.0",
]

[dependency-groups]
dev = [
  "poethepoet==0.32.0",
  "pytest==8.3.4",
  "pytest-cov==6.0.0",
]

[tool.uv]
package = false

# Tasks invoked as `uv run poe <task>` from the Makefile and the CI workflow.
[tool.poe.tasks]
dev = "python src/main.py"
test = "pytest --verbose --verbose"
test-coverage = "pytest --cov=. --cov-report=xml"
3 changes: 3 additions & 0 deletions data-storage/hm-duckdb/query-lance/src/dummy_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
class TestDummy:
    """Placeholder test case containing a single trivially true check."""

    def test_dummy(self):
        """Assert that one plus one equals two."""
        expected = 2
        assert 1 + 1 == expected
44 changes: 44 additions & 0 deletions data-storage/hm-duckdb/query-lance/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import logging
from pathlib import Path

import duckdb
import lancedb


def main(database_path: Path) -> None:
    """Store a small product catalog in LanceDB and query it through DuckDB.

    The sample rows are written to a ``product_catalog`` table (created with
    ``mode="overwrite"``) in the LanceDB database at ``database_path``. The
    table's Lance dataset is then registered on a DuckDB connection as
    ``product_table``, each product's average review score is computed with
    ``list_avg``, and the result, ordered by that average descending, is
    logged at INFO level as a Polars DataFrame.

    Args:
        database_path: Location of the LanceDB database to connect to.
    """
    led_bulb = {
        "product_name": "LED Bulb",
        "product_price": 10.0,
        "review_scores": [4.5, 4.0, 4.8, 5.0, 4.1],
    }
    power_bank = {
        "product_name": "Power Bank",
        "product_price": 20.0,
        "review_scores": [3.5, 4.0, 4.1],
    }
    catalog_rows = [led_bulb, power_bank]

    lance_db = lancedb.connect(str(database_path))
    catalog_table = lance_db.create_table(
        "product_catalog",
        data=catalog_rows,
        mode="overwrite",
    )
    lance_dataset = catalog_table.to_lance()

    # The name used in the FROM clause must match the name registered below.
    average_score_sql = """
        select
            product_name,
            product_price,
            list_avg(review_scores) as average_review_score
        from product_table
        order by average_review_score desc
    """

    with duckdb.connect() as conn:
        conn.register("product_table", lance_dataset)
        result_frame = conn.execute(average_score_sql).pl()
        logging.info(result_frame)


if __name__ == "__main__":
    # Configure the root logger first so main()'s INFO-level output is emitted.
    logging.basicConfig(level=logging.INFO)
    main(Path("/tmp/lancedb/products"))
Loading

0 comments on commit 3389ef4

Please sign in to comment.