Skip to content

Commit

Permalink
merge main
Browse files Browse the repository at this point in the history
  • Loading branch information
Dave Berenbaum committed Aug 19, 2024
2 parents f6008fb + 61aeed4 commit c01b394
Show file tree
Hide file tree
Showing 11 changed files with 156 additions and 112 deletions.
103 changes: 103 additions & 0 deletions .github/workflows/tests-studio.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
name: Studio Tests

on:
  push:
    branches: [main]
  pull_request:
  workflow_dispatch:

env:
  FORCE_COLOR: "1"
  # Branch under test: PR head branch when available, otherwise the push ref.
  BRANCH: ${{ github.head_ref || github.ref_name }}

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  studio:
    # Forks cannot read the Studio access token, so skip them entirely.
    if: '!github.event.pull_request.head.repo.fork'
    runs-on: ubuntu-latest-16-cores
    strategy:
      matrix:
        pyv: ['3.12']
        # Keep in sync with `--splits=6` in the "Run tests" step below.
        group: [1, 2, 3, 4, 5, 6]
    services:
      postgres:
        image: postgres:16.3
        ports:
          - "5432:5432"
        env:
          POSTGRES_USER: test
          POSTGRES_DB: database
          POSTGRES_HOST_AUTH_METHOD: trust
      clickhouse:
        image: clickhouse/clickhouse-server:24
        ports:
          - "8123:8123"
          - "9010:9000"
        env:
          CLICKHOUSE_DB: studio_local_db
          CLICKHOUSE_USER: studio_local
          # Quoted so the trailing `!` can never be misread by a YAML parser.
          CLICKHOUSE_PASSWORD: "ch123456789!"
          CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
      redis:
        image: redis:7.2.5
        ports:
          - "6379:6379"
    steps:
      - name: Studio branch name
        # Pick the Studio branch to test against: `develop` for main, a
        # same-named Studio branch when one exists, `develop` otherwise.
        env:
          STUDIO_READ_ACCESS_TOKEN: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
        run: |
          echo "DataChain branch: $BRANCH"
          if [[ "$BRANCH" == "main" ]]
          then
            STUDIO_BRANCH=develop
          # `grep -q` replaces the broken `2>&1>/dev/null` (which sent stderr
          # to the log and only then redirected stdout); -q also exits on the
          # first match.
          elif git ls-remote --heads "https://${STUDIO_READ_ACCESS_TOKEN}@github.com/iterative/studio.git" "$BRANCH" | grep -qF "$BRANCH"
          then
            STUDIO_BRANCH="$BRANCH"
          else
            STUDIO_BRANCH=develop
          fi
          echo "STUDIO_BRANCH=$STUDIO_BRANCH" >> "$GITHUB_ENV"
          echo "Studio branch: $STUDIO_BRANCH"

      - name: Check out Studio
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          repository: iterative/studio
          ref: ${{ env.STUDIO_BRANCH }}
          token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}

      - name: Check out repository
        uses: actions/checkout@v4
        with:
          # Nested inside the Studio checkout so Studio's backend can import it.
          path: './backend/datachain'
          fetch-depth: 0

      - name: Set up Python ${{ matrix.pyv }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.pyv }}
          cache: 'pip'

      - name: Install uv
        run: |
          python -m pip install --upgrade uv
          uv --version

      - name: Install dependencies
        run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]

      - name: Run tests
        # Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
        run: >
          pytest
          --config-file=pyproject.toml -rs
          --splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
          -m 'not benchmark'
          tests ../datachain/tests
        working-directory: backend/datachain_server
93 changes: 0 additions & 93 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ on:

env:
FORCE_COLOR: "1"
BRANCH: ${{ github.head_ref || github.ref_name }}

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
Expand All @@ -18,7 +17,6 @@ jobs:
lint:
runs-on: ubuntu-latest
steps:

- name: Check out the repository
uses: actions/checkout@v4
with:
Expand Down Expand Up @@ -112,96 +110,6 @@ jobs:
- name: Build docs
run: nox -s docs


studio:
if: '!github.event.pull_request.head.repo.fork'
runs-on: ubuntu-latest-16-cores
strategy:
matrix:
pyv: ['3.12']
group: [1, 2, 3, 4, 5, 6]
services:
postgres:
image: postgres:16.3
ports:
- 5432:5432
env:
POSTGRES_USER: test
POSTGRES_DB: database
POSTGRES_HOST_AUTH_METHOD: trust
clickhouse:
image: clickhouse/clickhouse-server:24
ports:
- 8123:8123
- 9010:9000
env:
CLICKHOUSE_DB: studio_local_db
CLICKHOUSE_USER: studio_local
CLICKHOUSE_PASSWORD: ch123456789!
CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
redis:
image: redis:7.2.5
ports:
- 6379:6379
steps:

- name: Studio branch name
env:
BRANCH: ${{ env.BRANCH }}
STUDIO_READ_ACCESS_TOKEN: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
run: |
echo "DataChain branch: $BRANCH"
if [[ "$BRANCH" == "main" ]]
then
STUDIO_BRANCH=develop
elif git ls-remote --heads https://"$STUDIO_READ_ACCESS_TOKEN"@github.com/iterative/studio.git "$BRANCH" | grep -F "$BRANCH" 2>&1>/dev/null
then
STUDIO_BRANCH="$BRANCH"
else
STUDIO_BRANCH=develop
fi
echo "STUDIO_BRANCH=$STUDIO_BRANCH" >> $GITHUB_ENV
echo "Studio branch: $STUDIO_BRANCH"
- name: Check out Studio
uses: actions/checkout@v4
with:
fetch-depth: 0
repository: iterative/studio
ref: ${{ env.STUDIO_BRANCH }}
token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}

- name: Check out repository
uses: actions/checkout@v4
with:
path: './backend/datachain'
fetch-depth: 0

- name: Set up Python ${{ matrix.pyv }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.pyv }}
cache: 'pip'

- name: Install uv
run: |
python -m pip install --upgrade uv
uv --version
- name: Install dependencies
run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]

- name: Run tests
# Generate `.test_durations` file with `pytest --store-durations --durations-path ../.github/.test_durations ...`
run: >
pytest
--config-file=pyproject.toml -rs
--splits=6 --group=${{ matrix.group }} --durations-path=../../.github/.test_durations
-m 'not benchmark'
tests ../datachain/tests
working-directory: backend/datachain_server


examples:
runs-on: ${{ matrix.os }}
timeout-minutes: 60
Expand All @@ -212,7 +120,6 @@ jobs:
pyv: ['3.9', '3.12']
group: ['get_started', 'llm_and_nlp or computer_vision', 'multimodal']
steps:

- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.pyv }}
Expand Down
2 changes: 1 addition & 1 deletion examples/computer_vision/openimage-detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def openimage_detect(args):
.filter(C("file.path").glob("*.jpg") | C("file.path").glob("*.json"))
.agg(
openimage_detect,
partition_by=path.file_stem(path.name(C("file.path"))),
partition_by=path.file_stem(C("file.path")),
params=["file"],
output={"file": File, "bbox": BBox},
)
Expand Down
4 changes: 2 additions & 2 deletions examples/get_started/common_sql_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ def num_chars_udf(file):

(
dc.mutate(
stem=path.file_stem(path.name(C("file.path"))),
ext=path.file_ext(path.name(C("file.path"))),
stem=path.file_stem(C("file.path")),
ext=path.file_ext(C("file.path")),
)
.select("file.path", "stem", "ext")
.show(5)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ tests = [
]
dev = [
"datachain[docs,tests]",
"mypy==1.10.1",
"mypy==1.11.1",
"types-python-dateutil",
"types-pytz",
"types-PyYAML",
Expand Down
2 changes: 1 addition & 1 deletion src/datachain/lib/arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def arrow_type_mapper(col_type: pa.DataType) -> type: # noqa: PLR0911
if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
return str
if pa.types.is_list(col_type):
return list[arrow_type_mapper(col_type.value_type)] # type: ignore[misc]
return list[arrow_type_mapper(col_type.value_type)] # type: ignore[return-value, misc]
if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
return dict
if isinstance(col_type, pa.lib.DictionaryType):
Expand Down
22 changes: 14 additions & 8 deletions src/datachain/lib/convert/values_to_tuples.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from collections.abc import Sequence
from typing import Any, Union

from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
from datachain.lib.data_model import (
DataType,
DataTypeNames,
DataValuesType,
is_chain_type,
)
from datachain.lib.utils import DataChainParamsError


Expand All @@ -15,7 +20,7 @@ def __init__(self, ds_name, msg):
def values_to_tuples( # noqa: C901, PLR0912
ds_name: str = "",
output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
**fr_map,
**fr_map: Sequence[DataValuesType],
) -> tuple[Any, Any, Any]:
if output:
if not isinstance(output, (Sequence, str, dict)):
Expand Down Expand Up @@ -47,10 +52,10 @@ def values_to_tuples( # noqa: C901, PLR0912
f" number of signals '{len(fr_map)}'",
)

types_map = {}
types_map: dict[str, type] = {}
length = -1
for k, v in fr_map.items():
if not isinstance(v, Sequence) or isinstance(v, str):
if not isinstance(v, Sequence) or isinstance(v, str): # type: ignore[unreachable]
raise ValuesToTupleError(ds_name, f"signals '{k}' is not a sequence")
len_ = len(v)

Expand All @@ -64,15 +69,16 @@ def values_to_tuples( # noqa: C901, PLR0912
if len_ == 0:
raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")

typ = type(v[0])
first_element = next(iter(v))
typ = type(first_element)
if not is_chain_type(typ):
raise ValuesToTupleError(
ds_name,
f"signal '{k}' has unsupported type '{typ.__name__}'."
f" Please use DataModel types: {DataTypeNames}",
)
if typ is list:
types_map[k] = list[type(v[0][0])] # type: ignore[misc]
if isinstance(first_element, list):
types_map[k] = list[type(first_element[0])] # type: ignore[assignment, misc]
else:
types_map[k] = typ

Expand All @@ -98,7 +104,7 @@ def values_to_tuples( # noqa: C901, PLR0912
if len(output) > 1: # type: ignore[arg-type]
tuple_type = tuple(output_types)
res_type = tuple[tuple_type] # type: ignore[valid-type]
res_values = list(zip(*fr_map.values()))
res_values: Sequence[Any] = list(zip(*fr_map.values()))
else:
res_type = output_types[0] # type: ignore[misc]
res_values = next(iter(fr_map.values()))
Expand Down
1 change: 1 addition & 0 deletions src/datachain/lib/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
]
DataType = Union[type[BaseModel], StandardType]
DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
DataValuesType = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]


class DataModel(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion src/datachain/lib/hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
raise ImportError(

Check warning on line 22 in src/datachain/lib/hf.py

View check run for this annotation

Codecov / codecov/patch

src/datachain/lib/hf.py#L21-L22

Added lines #L21 - L22 were not covered by tests
"Missing dependencies for huggingface datasets:\n"
"To install run:\n\n"
" pip install 'datasets'\n"
" pip install 'datachain[hf]'\n"
) from exc

from io import BytesIO
Expand Down
34 changes: 30 additions & 4 deletions src/datachain/sql/sqlite/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,19 +221,45 @@ def path_name(path):
return func.ltrim(func.substr(path, func.length(path_parent(path)) + 1), slash)


def path_file_ext_length(path):
name = path_name(path)
def name_file_ext_length(name):
    """SQL expression: length of the extension of a bare file name.

    The length excludes the leading dot, e.g. ``'file.txt'`` -> 3.
    `name` must already be a final path component (no directory part);
    use :func:`path_file_ext_length` for full paths.
    """
    # replace(name, '.', '') is the set of non-dot characters of `name`, so
    # rtrim strips trailing extension characters and stops at the last dot;
    # the length difference is the extension length (dot excluded).
    expr = func.length(name) - func.length(
        func.rtrim(name, func.replace(name, dot, empty_str))
    )
    # A name containing no dot at all has no extension.
    return case((func.instr(name, dot) == 0, 0), else_=expr)


def path_file_ext_length(path):
    """SQL expression: extension length (dot excluded) of `path`'s last component."""
    # Reduce to the final component first so dots in directory names
    # cannot be mistaken for an extension separator.
    name = path_name(path)
    return name_file_ext_length(name)


def path_file_stem(path):
    """SQL expression: last component of `path` without its extension.

    Two cases, selected at SQL level by whether `path` contains a slash:
    a bare file name, and a path with a directory part.
    """
    path_length = func.length(path)
    parent_length = func.length(path_parent(path))

    # Bare name (no slash anywhere in `path`): drop the extension from the
    # end, then strip the now-trailing dot(s).
    # NOTE(review): this branch feeds the whole path to name_file_ext_length
    # directly — safe here only because this expression is used when `path`
    # has no slash.
    name_expr = func.rtrim(
        func.substr(
            path,
            1,
            path_length - name_file_ext_length(path),
        ),
        dot,
    )

    # Path with directories: take the substring after the parent, drop the
    # extension, then strip the trailing dot and the leading slash left over
    # from the parent boundary.
    full_path_expr = func.ltrim(
        func.rtrim(
            func.substr(
                path,
                parent_length + 1,
                path_length - parent_length - path_file_ext_length(path),
            ),
            dot,
        ),
        slash,
    )

    return case((func.instr(path, slash) == 0, name_expr), else_=full_path_expr)


def path_file_ext(path):
    """SQL expression: extension of `path` without the leading dot.

    When the last component has no dot, the extension length is 0 and
    substr starts one past the end, yielding an empty string.
    """
    return func.substr(path, func.length(path) - path_file_ext_length(path) + 1)
Expand Down
Loading

0 comments on commit c01b394

Please sign in to comment.