Skip to content

Commit

Permalink
fix tests and serialization
Browse files Browse the repository at this point in the history
  • Loading branch information
brilee committed Feb 5, 2024
1 parent c04cc08 commit e822ef6
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 6 deletions.
8 changes: 7 additions & 1 deletion lilac/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
StrictFloat,
StrictInt,
StrictStr,
field_serializer,
field_validator,
)
from pydantic import Field as PydanticField
Expand Down Expand Up @@ -115,11 +116,16 @@ class StatsResult(BaseModel):
# Defined for numeric features.
min_val: Optional[Union[float, date, datetime]] = None
max_val: Optional[Union[float, date, datetime]] = None
value_samples: Optional[list[float]] = None # Used for approximating histogram bins
value_samples: list[float] = PydanticField(default=[], exclude=True, repr=False) # Samples for calculating histogram bins.

# Defined for text features.
avg_text_length: Optional[float] = None

# Equality between two StatsResult objects ignores `value_samples`: both sides are dumped
# with that field excluded, and the resulting dicts are compared.
def __eq__(self, other: object) -> bool:
if not isinstance(other, StatsResult):
# Not a StatsResult: return NotImplemented so Python can try the reflected comparison.
return NotImplemented
return self.model_dump(exclude={'value_samples'}) == other.model_dump(exclude={'value_samples'})


class MediaResult(BaseModel):
"""The result of a media() query."""
Expand Down
3 changes: 0 additions & 3 deletions lilac/data/dataset_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3943,12 +3943,9 @@ def _round(value: float) -> float:
return round(value, round_ndigits)

num_bins = min(int(np.log2(stats.total_count)), MAX_AUTO_BINS)
print(stats.value_samples)
transformer = PowerTransformer().fit(np.array(stats.value_samples).reshape(-1, 1))
print(transformer.transform(np.array(stats.value_samples).reshape(-1, 1)).ravel())
# [-2.5, 2.5], assuming a post-transform standard normal distribution, should cover roughly the central 99% of the data.
buckets = transformer.inverse_transform(np.linspace(-2.5, 2.5, num_bins).reshape(-1, 1)).ravel()
print(buckets)
# Sometimes the autogenerated buckets round to the same value.
# Sometimes PowerTransformer returns NaN for some unusually shaped distributions.
buckets = sorted(set(_round(val) for val in buckets if not np.isnan(val)))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
} else if (end == null) {
return `≥ ${formatValue(start)}`;
} else {
return `${formatValue(start)} .. ${formatValue(end)}`;
return `[${formatValue(start)} .. ${formatValue(end)})`;
}
}
const dispatch = createEventDispatcher();
Expand Down
2 changes: 1 addition & 1 deletion web/lib/fastapi_client/models/StatsResult.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ export type StatsResult = {
approx_count_distinct: number;
min_val?: (number | string | null);
max_val?: (number | string | null);
value_samples?: (Array<number> | null);
value_samples?: Array<number>;
avg_text_length?: (number | null);
};

0 comments on commit e822ef6

Please sign in to comment.