-
Notifications
You must be signed in to change notification settings - Fork 98
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added isnone()
function
#801
base: main
Are you sure you want to change the base?
Changes from all commits
8d35c7e
7a19674
fddab8a
be065d4
ba5d315
40ea99f
a696435
9038cc0
5644531
1f9d382
c25e359
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,15 @@ | ||
from typing import Union | ||
from typing import Optional, Union | ||
|
||
from sqlalchemy import ColumnElement | ||
from sqlalchemy import case as sql_case | ||
from sqlalchemy.sql.elements import BinaryExpression | ||
|
||
from datachain.lib.utils import DataChainParamsError | ||
from datachain.query.schema import Column | ||
from datachain.sql.functions import conditional | ||
|
||
from .func import ColT, Func | ||
|
||
CaseT = Union[int, float, complex, bool, str] | ||
CaseT = Union[int, float, complex, bool, str, Func] | ||
|
||
|
||
def greatest(*args: Union[ColT, float]) -> Func: | ||
|
@@ -87,17 +88,19 @@ def least(*args: Union[ColT, float]) -> Func: | |
) | ||
|
||
|
||
def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func: | ||
def case( | ||
*args: tuple[Union[ColumnElement, Func], CaseT], else_: Optional[CaseT] = None | ||
) -> Func: | ||
""" | ||
Returns the case function that produces case expression which has a list of | ||
conditions and corresponding results. Results can only be python primitives | ||
like string, numbes or booleans. Result type is inferred from condition results. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it still true? can result now be a Func? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, all methods here return There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean this part |
||
like string, numbers or booleans. Result type is inferred from condition results. | ||
|
||
Args: | ||
args (tuple(BinaryExpression, value(str | int | float | complex | bool): | ||
- Tuple of binary expression and values pair which corresponds to one | ||
case condition - value | ||
else_ (str | int | float | complex | bool): else value in case expression | ||
args (tuple((ColumnElement, Func), (str | int | float | complex | bool, Func))): | ||
Tuple of condition and values pair. | ||
else_ (str | int | float | complex | bool, Func): else value in case | ||
expression. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it optional? can we say (no |
||
|
||
Returns: | ||
Func: A Func object that represents the case function. | ||
|
@@ -111,45 +114,84 @@ def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func: | |
""" | ||
supported_types = [int, float, complex, str, bool] | ||
|
||
type_ = type(else_) if else_ else None | ||
def _get_type(val): | ||
if isinstance(val, Func): | ||
# nested functions | ||
return val.result_type | ||
return type(val) | ||
|
||
if not args: | ||
raise DataChainParamsError("Missing statements") | ||
|
||
type_ = _get_type(else_) if else_ is not None else None | ||
|
||
for arg in args: | ||
if type_ and not isinstance(arg[1], type_): | ||
raise DataChainParamsError("Statement values must be of the same type") | ||
type_ = type(arg[1]) | ||
arg_type = _get_type(arg[1]) | ||
if type_ and arg_type != type_: | ||
raise DataChainParamsError( | ||
f"Statement values must be of the same type, got {type_} and {arg_type}" | ||
) | ||
type_ = arg_type | ||
|
||
if type_ not in supported_types: | ||
raise DataChainParamsError( | ||
f"Only python literals ({supported_types}) are supported for values" | ||
) | ||
|
||
kwargs = {"else_": else_} | ||
return Func("case", inner=sql_case, args=args, kwargs=kwargs, result_type=type_) | ||
|
||
return Func("case", inner=sql_case, cols=args, kwargs=kwargs, result_type=type_) | ||
|
||
|
||
def ifelse(condition: BinaryExpression, if_val: CaseT, else_val: CaseT) -> Func: | ||
def ifelse( | ||
condition: Union[ColumnElement, Func], if_val: CaseT, else_val: CaseT | ||
) -> Func: | ||
""" | ||
Returns the ifelse function that produces if expression which has a condition | ||
and values for true and false outcome. Results can only be python primitives | ||
like string, numbes or booleans. Result type is inferred from the values. | ||
and values for true and false outcome. Results can be one of python primitives | ||
like string, numbers or booleans, but can also be nested functions. | ||
Result type is inferred from the values. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. numbes -> numbers |
||
|
||
Args: | ||
condition: BinaryExpression - condition which is evaluated | ||
if_val: (str | int | float | complex | bool): value for true condition outcome | ||
else_val: (str | int | float | complex | bool): value for false condition | ||
outcome | ||
condition (ColumnElement, Func): Condition which is evaluated. | ||
if_val (str | int | float | complex | bool, Func): Value for true | ||
condition outcome. | ||
else_val (str | int | float | complex | bool, Func): Value for false condition | ||
outcome. | ||
|
||
Returns: | ||
Func: A Func object that represents the ifelse function. | ||
|
||
Example: | ||
```py | ||
dc.mutate( | ||
res=func.ifelse(C("num") > 0, "P", "N"), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's add another example with a column expression? also an examples with a results as an expression? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need that many examples? All other functions have exactly one example even though there can be more as well like the version with column expression as you stated. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to my mind yes, examples is the most valuable part usually since you can get an idea of what is actually possible. We need more simple examples everywhere. |
||
res=func.ifelse(isnone("col"), "EMPTY", "NOT_EMPTY"), | ||
) | ||
``` | ||
""" | ||
return case((condition, if_val), else_=else_val) | ||
|
||
|
||
def isnone(col: Union[str, Column]) -> Func: | ||
""" | ||
Returns True if column value is None, otherwise False. | ||
|
||
Args: | ||
col (str | Column): Column to check if it's None or not. | ||
If a string is provided, it is assumed to be the name of the column. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. consistency There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm trying to figure out but not sure what is wrong in this part of docs? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. e.e. in some cases I saw it starts with lower case, sometimes I saw and all of this within this single PR and it also applies to the description, e.g. we don't have period here - is it always the case? |
||
|
||
Returns: | ||
Func: A Func object that represents the conditional to check if column is None. | ||
|
||
Example: | ||
```py | ||
dc.mutate(test=ifelse(isnone("col"), "EMPTY", "NOT_EMPTY")) | ||
``` | ||
""" | ||
from datachain import C | ||
|
||
if isinstance(col, str): | ||
# if string, it is assumed to be the name of the column | ||
col = C(col) | ||
|
||
return case((col == None, True), else_=False) # noqa: E711 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,7 @@ | |
from .window import Window | ||
|
||
|
||
ColT = Union[str, ColumnElement, "Func"] | ||
ColT = Union[str, ColumnElement, "Func", tuple] | ||
|
||
|
||
class Func(Function): | ||
|
@@ -78,7 +78,7 @@ def _db_cols(self) -> Sequence[ColT]: | |
return ( | ||
[ | ||
col | ||
if isinstance(col, (Func, BindParameter, Case, Comparator)) | ||
if isinstance(col, (Func, BindParameter, Case, Comparator, tuple)) | ||
else ColumnMeta.to_db_name( | ||
col.name if isinstance(col, ColumnElement) else col | ||
) | ||
|
@@ -381,17 +381,24 @@ def get_column( | |
col_type = self.get_result_type(signals_schema) | ||
sql_type = python_to_sql(col_type) | ||
|
||
def get_col(col: ColT) -> ColT: | ||
def get_col(col: ColT, string_as_literal=False) -> ColT: | ||
# string_as_literal is used only for conditionals like `case()` where | ||
# literals are nested inside ColT as we have tuples of condition - values | ||
# and if user wants to set some case value as column, explicit `C("col")` | ||
# syntax must be used to distinguish from literals | ||
if isinstance(col, tuple): | ||
return tuple(get_col(x, string_as_literal=True) for x in col) | ||
if isinstance(col, Func): | ||
return col.get_column(signals_schema, table=table) | ||
if isinstance(col, str): | ||
if isinstance(col, str) and not string_as_literal: | ||
column = Column(col, sql_type) | ||
column.table = table | ||
return column | ||
return col | ||
|
||
cols = [get_col(col) for col in self._db_cols] | ||
func_col = self.inner(*cols, *self.args, **self.kwargs) | ||
kwargs = {k: get_col(v, string_as_literal=True) for k, v in self.kwargs.items()} | ||
func_col = self.inner(*cols, *self.args, **kwargs) | ||
|
||
if self.is_window: | ||
if not self.window: | ||
|
@@ -423,7 +430,7 @@ def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType": | |
return sql_to_python(col) | ||
|
||
return signals_schema.get_column_type( | ||
col.name if isinstance(col, ColumnElement) else col | ||
col.name if isinstance(col, ColumnElement) else col # type: ignore[arg-type] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we have this ignore here? |
||
) | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
are there tests for Func values btw? can we add them?