diff --git a/lilac/data/dataset.py b/lilac/data/dataset.py index c00b10a3..bbd1ac71 100644 --- a/lilac/data/dataset.py +++ b/lilac/data/dataset.py @@ -20,6 +20,7 @@ StrictFloat, StrictInt, StrictStr, + field_serializer, field_validator, ) from pydantic import Field as PydanticField @@ -115,11 +116,16 @@ class StatsResult(BaseModel): # Defined for numeric features. min_val: Optional[Union[float, date, datetime]] = None max_val: Optional[Union[float, date, datetime]] = None - value_samples: Optional[list[float]] = None # Used for approximating histogram bins + value_samples: list[float] = PydanticField(default=[], exclude=True, repr=False) # Samples for calculating histogram bins. # Defined for text features. avg_text_length: Optional[float] = None + def __eq__(self, other: object) -> bool: + if not isinstance(other, StatsResult): + return NotImplemented + return self.model_dump(exclude={'value_samples'}) == other.model_dump(exclude={'value_samples'}) + class MediaResult(BaseModel): """The result of a media() query.""" diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py index acff6bab..d0ae0e82 100644 --- a/lilac/data/dataset_duckdb.py +++ b/lilac/data/dataset_duckdb.py @@ -3943,12 +3943,9 @@ def _round(value: float) -> float: return round(value, round_ndigits) num_bins = min(int(np.log2(stats.total_count)), MAX_AUTO_BINS) - print(stats.value_samples) transformer = PowerTransformer().fit(np.array(stats.value_samples).reshape(-1, 1)) - print(transformer.transform(np.array(stats.value_samples).reshape(-1, 1)).ravel()) # [-2, 2], assuming post-transform normal distribution, should cover central 95% of the data. buckets = transformer.inverse_transform(np.linspace(-2.5, 2.5, num_bins).reshape(-1, 1)).ravel() - print(buckets) # Sometimes the autogenerated buckets round to the same value. # Sometimes PowerTransformer returns NaN for some unusually shaped distributions. buckets = sorted(set(_round(val) for val in buckets if not np.isnan(val))) diff --git a/web/blueprint/src/lib/components/schemaView/Histogram.svelte b/web/blueprint/src/lib/components/schemaView/Histogram.svelte index 33d6996d..4dd92465 100644 --- a/web/blueprint/src/lib/components/schemaView/Histogram.svelte +++ b/web/blueprint/src/lib/components/schemaView/Histogram.svelte @@ -28,7 +28,7 @@ } else if (end == null) { return `≥ ${formatValue(start)}`; } else { - return `${formatValue(start)} .. ${formatValue(end)}`; + return `[${formatValue(start)} .. ${formatValue(end)})`; } } const dispatch = createEventDispatcher(); diff --git a/web/lib/fastapi_client/models/StatsResult.ts b/web/lib/fastapi_client/models/StatsResult.ts index 830946b6..590b1e3b 100644 --- a/web/lib/fastapi_client/models/StatsResult.ts +++ b/web/lib/fastapi_client/models/StatsResult.ts @@ -12,7 +12,7 @@ export type StatsResult = { approx_count_distinct: number; min_val?: (number | string | null); max_val?: (number | string | null); - value_samples?: (Array | null); + value_samples?: Array; avg_text_length?: (number | null); };