Skip to content

Commit

Permalink
feat(lancedb): add lancedb (#21956)
Browse files Browse the repository at this point in the history
  • Loading branch information
hongbo-miao authored Dec 26, 2024
1 parent 16e99cb commit efa2452
Show file tree
Hide file tree
Showing 9 changed files with 977 additions and 2 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ jobs:
hm-jax: ${{ steps.filter.outputs.hm-jax }}
hm-kubeflow-calculate: ${{ steps.filter.outputs.hm-kubeflow-calculate }}
hm-kubeflow-classify-mnist: ${{ steps.filter.outputs.hm-kubeflow-classify-mnist }}
hm-lancedb: ${{ steps.filter.outputs.hm-lancedb }}
hm-langchain-chat-pdf: ${{ steps.filter.outputs.hm-langchain-chat-pdf }}
hm-langgraph-chat-pdf: ${{ steps.filter.outputs.hm-langgraph-chat-pdf }}
hm-llama-index-chat-pdf: ${{ steps.filter.outputs.hm-llama-index-chat-pdf }}
Expand Down Expand Up @@ -267,6 +268,9 @@ jobs:
hm-kubeflow-classify-mnist:
- '.github/workflows/test.yml'
- 'machine-learning/hm-kubeflow/pipelines/classify-mnist/**'
hm-lancedb:
- '.github/workflows/test.yml'
- 'data-storage/hm-lancedb/**'
hm-langchain-chat-pdf:
- '.github/workflows/test.yml'
- 'machine-learning/hm-langchain/applications/chat-pdf/**'
Expand Down Expand Up @@ -1398,6 +1402,39 @@ jobs:
with:
directory: data-storage/hm-duckdb/query-protobuf

lancedb-test:
name: LanceDB | Test
needs: detect-changes
if: ${{ needs.detect-changes.outputs.hm-lancedb == 'true' }}
runs-on: ubuntu-24.04
environment: test
timeout-minutes: 10
steps:
- name: Checkout
uses: actions/[email protected]
- name: Install uv
uses: astral-sh/[email protected]
with:
version: 0.5.11
enable-cache: true
cache-dependency-glob: data-storage/hm-lancedb/uv.lock
- name: Set up Python
uses: actions/[email protected]
with:
python-version-file: data-storage/hm-lancedb/pyproject.toml
- name: Install dependencies
working-directory: data-storage/hm-lancedb
run: |
uv sync --dev
- name: Test
working-directory: data-storage/hm-lancedb
run: |
uv run poe test-coverage
- name: Upload coverage to Codecov
uses: codecov/[email protected]
with:
directory: data-storage/hm-lancedb

protobuf-test:
name: Protobuf | Test
needs: detect-changes
Expand Down
3 changes: 3 additions & 0 deletions .mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ pull_request_rules:
- or:
- check-success=DuckDB (query-protobuf) | Test
- check-skipped=DuckDB (query-protobuf) | Test
- or:
- check-success=LanceDB | Test
- check-skipped=LanceDB | Test
- or:
- check-success=Protobuf | Test
- check-skipped=Protobuf | Test
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -244,12 +244,12 @@ The diagram illustrates the repository's architecture, which is considered overl
- **ClickHouse** - Column-oriented SQL database
- **YugabyteDB** - Distributed SQL database
- **TimescaleDB** - Time-series SQL database
- **InfluxDB** - Time-series database
- **InfluxDB Enterprise** - Distributed time-series database
- **Prometheus** - Time-series database
- **InfluxDB** - Distributed time-series database
- **Loki** - Log aggregation system
- **DuckDB** - Embedded analytical SQL database
- **Apache Cassandra** - Distributed wide-column NoSQL database
- **LanceDB** - Embedded vector database
- **Qdrant** - Distributed vector database
- **Chroma** - Distributed vector database
- **Dgraph** - Distributed graph database
Expand Down
13 changes: 13 additions & 0 deletions data-storage/hm-lancedb/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Every target below only runs commands and never produces a file of the
# same name, so declare them phony to keep make from skipping them.
.PHONY: uv-install-python uv-update-lock-file uv-install-dependencies \
	uv-run-dev uv-run-test uv-run-test-coverage

# Environment setup: interpreter, lock file, dependencies.
uv-install-python:
	uv python install
uv-update-lock-file:
	uv lock
uv-install-dependencies:
	uv sync --dev

# Thin wrappers around the poe tasks.
uv-run-dev:
	uv run poe dev
uv-run-test:
	uv run poe test
uv-run-test-coverage:
	uv run poe test-coverage
24 changes: 24 additions & 0 deletions data-storage/hm-lancedb/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Project metadata and exactly pinned runtime dependencies.
[project]
name = "hm-lancedb"
version = "1.0.0"
requires-python = "~=3.12.0"
dependencies = [
"lancedb==0.17.0",
"polars==1.18.0",
"sentence-transformers==3.3.1",
]

# Development-only tooling: task runner (poethepoet), test runner (pytest)
# and its coverage plugin (pytest-cov).
[dependency-groups]
dev = [
"poethepoet==0.31.1",
"pytest==8.3.4",
"pytest-cov==6.0.0",
]

# NOTE(review): presumably marks the project as a non-packaged application so
# uv does not build or install the project itself; confirm against the uv docs.
[tool.uv]
package = false

# Task aliases for poethepoet.
[tool.poe.tasks]
dev = "python src/main.py"
# `--verbose` is passed twice; presumably to request pytest's higher
# verbosity level. TODO confirm this is intentional.
test = "pytest --verbose --verbose"
# Measures coverage for the current directory and writes an XML report.
test-coverage = "pytest --cov=. --cov-report=xml"
6 changes: 6 additions & 0 deletions data-storage/hm-lancedb/rick_and_morty_quotes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,quote,author
1,"Wubba Lubba Dub Dub!",Rick Sanchez
2,"Nobody exists on purpose. Nobody belongs anywhere. We're all going to die. Come watch TV.",Morty Smith
3,"Sometimes science is more art than science.",Rick Sanchez
4,"I'm not a hero. I'm a high-functioning alcoholic.",Rick Sanchez
5,"Get your shit together, get it all together and put it in a backpack.",Morty Smith
3 changes: 3 additions & 0 deletions data-storage/hm-lancedb/src/dummy_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
class TestDummy:
    """Placeholder test case containing a single trivial check."""

    def test_dummy(self):
        """Verify that adding one and one yields two."""
        total = 1 + 1
        assert total == 2
45 changes: 45 additions & 0 deletions data-storage/hm-lancedb/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import logging

import lancedb
import polars as pl
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Fetch LanceDB's embedding-function registry, then create the
# "sentence-transformers" embedding function configured with the
# "all-MiniLM-L6-v2" model. `func` is used by the `Quotes` schema to declare
# its source and vector fields.
registry = get_registry()
func = registry.get("sentence-transformers").create(name="all-MiniLM-L6-v2")


class Quotes(LanceModel):
    """Schema of the "quotes" table.

    The embedding function `func` is attached to the schema through
    `SourceField` / `VectorField`: `line` is declared as the text to embed
    and `vector` as the column that receives the embedding.
    """

    # Plain string column (presumably the speaking character; verify against
    # the input CSV).
    name: str
    # Text column declared as the embedding function's source field.
    line: str = func.SourceField()
    # `Vector` is a factory that must be called with the dimensionality to
    # produce a fixed-size vector type; ask the embedding function for its
    # dimension so the column width always matches the model's output.
    vector: Vector(func.ndims()) = func.VectorField()


def create_and_populate_table(
    db: lancedb.DBConnection, df: pl.DataFrame
) -> lancedb.table.Table:
    """Create the "quotes" table from the `Quotes` schema and load `df` into it.

    The table is created with ``mode="overwrite"``, so a table of the same
    name already present in `db` is replaced rather than appended to.

    Args:
        db: Open LanceDB connection (the object returned by
            ``lancedb.connect``).
        df: Rows to insert; its columns are expected to match the non-vector
            fields of `Quotes`.

    Returns:
        The freshly created and populated table.
    """
    table = db.create_table("quotes", schema=Quotes, mode="overwrite")
    table.add(df)
    return table


def perform_semantic_search(table: lancedb.table.Table, query: str) -> pl.DataFrame:
    """Search `table` for `query` and return at most five hits as a Polars frame."""
    top_hits = table.search(query).limit(5)
    return top_hits.to_polars()


def main():
    """Load the quotes CSV, index it in LanceDB, and log the best match for a question."""
    # Download the dataset before touching the local database.
    source_url = "https://raw.githubusercontent.com/Abhiram970/RickBot/refs/heads/main/Rick_and_Morty.csv"
    quotes = pl.read_csv(source_url)
    connection = lancedb.connect("~/.lancedb")
    quotes_table = create_and_populate_table(connection, quotes)

    question = "What is the meaning of life?"
    matches = perform_semantic_search(quotes_table, question)
    logging.info("Question: %s", question)
    # The first row of the result is reported as the answer.
    best_line = matches["line"][0]
    logging.info("Answer: %s", best_line)
    logging.info(matches)


if __name__ == "__main__":
    # Configure logging at INFO so the messages emitted by main() are shown.
    logging.basicConfig(level=logging.INFO)
    main()
Loading

0 comments on commit efa2452

Please sign in to comment.