From 5dc6d384b6b769cda58b2da85a52ccf44769e3b2 Mon Sep 17 00:00:00 2001 From: Stefan Krawczyk Date: Thu, 4 Jul 2024 15:25:55 -0700 Subject: [PATCH 1/6] Sketches Schema UI view This is hacky UI code. It does not handle "run ids". 1. Adds to pandas & pyspark schema capture using `h_schema.py`. 2. Adds SchemaView and related types to try to show a table... So it's either we've set something up with too many things, or this is just a factor of using typescript... Basically I dont like how many things I had to add to do this.. Otherwise it's non-obvious to me what the pattern should be to handle comparing runs -- this UI change for the schema table doesn't take that into account -- it should probably use Generic Table but I couldn't wrap my head around it. Otherwise open question whether we reuse the h_schema pyarrow stuff, or just roll our own again... --- .../result-summaries/DataObservability.tsx | 19 ++++- .../Runs/Task/result-summaries/SchemaView.tsx | 82 +++++++++++++++++++ ui/frontend/src/state/api/backendApiRaw.ts | 11 +++ ui/frontend/src/state/api/friendlyApi.ts | 3 + .../src/hamilton_sdk/tracking/pandas_stats.py | 37 +++++++-- .../hamilton_sdk/tracking/pyspark_stats.py | 58 +++++++++++-- 6 files changed, 193 insertions(+), 17 deletions(-) create mode 100644 ui/frontend/src/components/dashboard/Runs/Task/result-summaries/SchemaView.tsx diff --git a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx index 24fdda202..7682cfe60 100644 --- a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx +++ b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx @@ -5,7 +5,7 @@ import { AttributeDict2, AttributeHTML1, AttributePandasDescribe1, - AttributePrimitive1, + AttributePrimitive1, AttributeSchema1, AttributeUnsupported1, } from "../../../../../state/api/backendApiRaw"; import { @@ -20,6 +20,7 @@ import { PandasDescribe1View } from "./PandasDescribe"; import { Dict1View, Dict2View } from "./DictView"; import { DAGWorksDescribe3View } from "./DAGWorksDescribe"; import { HTML1View } from "./HTMLView"; +import {Schema1View} from "./SchemaView"; const Primitive1View = (props: { taskName: string; @@ -303,6 +304,22 @@ export const ResultsSummaryView = (props: { ); } + const schema1View = getNodeRunAttributes( + props.runAttributes, + props.runIds, + "AttributeSchema1" + ); + + if (schema1View.length > 0) { + allViews.push( + i.value)} + taskName={props.taskName || ""} + runIds={schema1View.map((i) => i.runId)} + /> + ); + } + const unsupportedViews = getNodeRunAttributes( props.runAttributes, props.runIds, diff --git a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/SchemaView.tsx b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/SchemaView.tsx new file mode 100644 index 000000000..92a3c05d9 --- /dev/null +++ b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/SchemaView.tsx @@ -0,0 +1,82 @@ +import { AttributeSchema1 } from "../../../../../state/api/friendlyApi"; +import { GenericTable } from "../../../../common/GenericTable"; +import {ColSchema} from "../../../../../state/api/backendApiRaw"; + + +// export const Schema1View = (props: { +// taskName: string; +// runIds: number[]; +// values: AttributeSchema1[]; +// }) => { +// const columns = [ +// { +// displayName: "ColName", +// Render: (item: AttributeSchema1) => {item.name}, +// }, +// { +// displayName: "Name", 
+// Render: (item: AttributeSchema1) => {item.name}, +// }, +// { +// displayName: "Value", +// Render: (item: AttributeSchema1) => {item.value}, +// }, +// ]; +// const data = props.values.map(value => [value.id, value.name, value.value]); +// return ( +//
hi
+// ); +// }; + +interface Schema1ViewProps { + attributeSchemas: AttributeSchema1[]; + taskName: string; + runIds: number[]; +} +// export type RunIdToSchema = { +// [key: number]: AttributeSchema1; +// }; + +// const buildRunIdToAttributeSchemaMapping = (props: Schema1ViewProps): { [key: number]: AttributeSchema1 } => { +// const mapping: RunIdToSchema = {}; +// +// props.runIds.forEach((runId, index) => { +// mapping[runId] = props.attributeSchemas[index]; +// }); +// +// return mapping; +// }; + +export const Schema1View: React.FC = ({ attributeSchemas, taskName, runIds }) => { + // Convert the AttributeSchema1 object into an array of its values + // const runIdtoAttributeSchema = buildRunIdToAttributeSchemaMapping({ attributeSchemas, taskName, runIds }); + const schemaArray = Object.values(attributeSchemas[0]); + console.log(attributeSchemas); + console.log(runIds); + + // return ( + // + // ) + return ( + + + + + + + + + + + {schemaArray.map((col, index) => ( + + + + + + + ))} + +
Column NameTypeNullableMetadata
{col.name}{col.type}{col.nullable}{JSON.stringify(col.metadata)}
+ ); +} diff --git a/ui/frontend/src/state/api/backendApiRaw.ts b/ui/frontend/src/state/api/backendApiRaw.ts index b378f4b06..cb2b64b52 100644 --- a/ui/frontend/src/state/api/backendApiRaw.ts +++ b/ui/frontend/src/state/api/backendApiRaw.ts @@ -524,6 +524,16 @@ export type AttributeDagworksDescribe3 = { export type AttributeHTML1 = { html: string; }; + +export type ColSchema = { + name: string; + type: string; + nullable: boolean; + metadata: object; +}; +export type AttributeSchema1 = { + [key: string]: ColSchema; +}; export type AllAttributeTypes = { documentation_loom__1: AttributeDocumentationLoom1; primitive__1: AttributePrimitive1; @@ -534,6 +544,7 @@ export type AllAttributeTypes = { dict__2: AttributeDict2; dagworks_describe__3: AttributeDagworksDescribe3; html__1: AttributeHTML1; + schema__1: AttributeSchema1; }; export type CodeVersionGit1 = { git_hash: string; diff --git a/ui/frontend/src/state/api/friendlyApi.ts b/ui/frontend/src/state/api/friendlyApi.ts index 0890a1260..c8d4b3297 100644 --- a/ui/frontend/src/state/api/friendlyApi.ts +++ b/ui/frontend/src/state/api/friendlyApi.ts @@ -204,6 +204,8 @@ export type AttributeDagworksDescribe3 = export type AttributeHTML1 = AllAttributeTypes["html__1"]; +export type AttributeSchema1 = AllAttributeTypes["schema__1"]; + export const nodeAttributeTypeMap = { AttributePrimitive1: { version: 1, type: "primitive" }, AttributeUnsupported1: { version: 1, type: "unsupported" }, @@ -213,6 +215,7 @@ export const nodeAttributeTypeMap = { AttributeDict2: { version: 2, type: "dict" }, AttributeDagworksDescribe3: { version: 3, type: "dagworks_describe" }, AttributeHTML1: {version: 1, type: "html"}, + AttributeSchema1: {version: 1, type: "schema"}, }; export type DAGWorksDescribeColumn = diff --git a/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py b/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py index f190b8b08..4feade7fe 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Union +from typing import Any, Dict, List, Union import pandas as pd from hamilton_sdk.tracking import pandas_col_stats as pcs @@ -6,6 +6,11 @@ from hamilton import driver +try: + from hamilton.plugins import h_schema +except (ImportError, ModuleNotFoundError): + h_schema = None + """Module that houses functions to compute statistics on pandas series/dataframes. 
Notes: - we should assume pandas v1.0+ so that we have a string type @@ -80,12 +85,30 @@ def execute_col( @stats.compute_stats.register -def compute_stats_df(result: pd.DataFrame, node_name: str, node_tags: dict) -> Dict[str, Any]: - return { - "observability_type": "dagworks_describe", - "observability_value": _compute_stats(result), - "observability_schema_version": "0.0.3", - } +def compute_stats_df( + result: pd.DataFrame, node_name: str, node_tags: dict +) -> List[stats.StatsType]: + summary_stats = _compute_stats(result) + + results = [ + { + "observability_type": "dagworks_describe", + "observability_value": summary_stats, + "observability_schema_version": "0.0.3", + }, + ] + if h_schema is not None: + schema = h_schema._get_arrow_schema(result) + schema.with_metadata(dict(name=node_name)) + results.append( + { + "observability_type": "schema", + "observability_value": h_schema.pyarrow_schema_to_json(schema), + "observability_schema_version": "0.0.1", + "name": "Schema", + } + ) + return results @stats.compute_stats.register diff --git a/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py b/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py index b88f79803..c813ce676 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py @@ -1,8 +1,13 @@ -from typing import Any, Dict +from typing import Any, Dict, List import pyspark.sql as ps from hamilton_sdk.tracking import stats +try: + from hamilton.plugins import h_schema +except (ImportError, ModuleNotFoundError): + h_schema = None + """Module that houses functions to introspect a PySpark dataframe. """ # this is a mapping used in the Backend/UI. @@ -69,17 +74,52 @@ def _introspect(df: ps.DataFrame) -> Dict[str, Any]: @stats.compute_stats.register -def compute_stats_psdf(result: ps.DataFrame, node_name: str, node_tags: dict) -> Dict[str, Any]: +def compute_stats_psdf( + result: ps.DataFrame, node_name: str, node_tags: dict +) -> List[Dict[str, Any]]: # TODO: create custom type instead of dict for UI o_value = _introspect(result) - return { - "observability_type": "dict", - "observability_value": { - "type": str(type(result)), - "value": o_value, + + results = [ + { + "observability_type": "dict", + "observability_value": { + "type": str(type(result)), + "value": o_value["columns"], + }, + "observability_schema_version": "0.0.2", }, - "observability_schema_version": "0.0.2", - } + { + "observability_type": "primitive", + "observability_value": { + "type": str(str), + "value": o_value["cost_explain"], + }, + "observability_schema_version": "0.0.1", + "name": "Cost Explain", + }, + { + "observability_type": "primitive", + "observability_value": { + "type": str(str), + "value": o_value["extended_explain"], + }, + "observability_schema_version": "0.0.1", + "name": "Extended Explain", + }, + ] + if h_schema is not None: + schema = h_schema._get_arrow_schema(result) + schema.with_metadata(dict(name=node_name)) + results.append( + { + "observability_type": "schema", + "observability_value": h_schema.pyarrow_schema_to_json(schema), + "observability_schema_version": "0.0.1", + "name": "Schema", + } + ) + return results if __name__ == "__main__": From ecc93376068e73b26e89e016f736367d747d02e1 Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Thu, 11 Jul 2024 17:01:30 -0700 Subject: [PATCH 2/6] Gets the schema view to use GenericGroupedTable This will make it look like the others, make it expandable, etc... 
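
For reference, the grouping this enables can be sketched roughly as follows, assuming the schema attribute reaches the frontend as the `AttributeSchema1` map of column name to `ColSchema` added to `backendApiRaw.ts` in the previous commit. The `SchemaRow` type and `flattenSchemas` helper below are illustrative only and not part of this patch.

// Illustrative sketch only; mirrors the ColSchema/AttributeSchema1 types from backendApiRaw.ts.
type ColSchema = {
  name: string;
  type: string;
  nullable: boolean;
  metadata: object;
};

// One schema attribute per run: column name -> column schema.
type AttributeSchema1 = { [key: string]: ColSchema };

// Hypothetical row shape for a table grouped by column name, so the same
// column observed across several runs collapses into a single expandable group.
type SchemaRow = ColSchema & { runId: number };

const flattenSchemas = (
  schemas: AttributeSchema1[],
  runIds: number[]
): SchemaRow[] =>
  schemas.flatMap((schema, i) =>
    Object.values(schema).map((col) => ({ ...col, runId: runIds[i] }))
  );

// Example: the same column observed by two runs becomes two rows in one group.
const rows = flattenSchemas(
  [
    { a: { name: "a", type: "int64", nullable: false, metadata: {} } },
    { a: { name: "a", type: "float64", nullable: true, metadata: {} } },
  ],
  [1, 2]
);
console.log(rows.map((r) => `run ${r.runId}: ${r.name} (${r.type})`).join(", "));
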
--- .../result-summaries/DataObservability.tsx | 9 +- .../Runs/Task/result-summaries/SchemaView.tsx | 204 +++++++++++------- 2 files changed, 137 insertions(+), 76 deletions(-) diff --git a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx index 7682cfe60..82962faac 100644 --- a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx +++ b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx @@ -5,7 +5,8 @@ import { AttributeDict2, AttributeHTML1, AttributePandasDescribe1, - AttributePrimitive1, AttributeSchema1, + AttributePrimitive1, + AttributeSchema1, AttributeUnsupported1, } from "../../../../../state/api/backendApiRaw"; import { @@ -20,7 +21,7 @@ import { PandasDescribe1View } from "./PandasDescribe"; import { Dict1View, Dict2View } from "./DictView"; import { DAGWorksDescribe3View } from "./DAGWorksDescribe"; import { HTML1View } from "./HTMLView"; -import {Schema1View} from "./SchemaView"; +import { Schema1View } from "./SchemaView"; const Primitive1View = (props: { taskName: string; @@ -313,9 +314,9 @@ export const ResultsSummaryView = (props: { if (schema1View.length > 0) { allViews.push( i.value)} - taskName={props.taskName || ""} + attributeSchemas={schema1View.map((i) => i.value)} runIds={schema1View.map((i) => i.runId)} + projectId={props.projectId} /> ); } diff --git a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/SchemaView.tsx b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/SchemaView.tsx index 92a3c05d9..7e5ee6bf6 100644 --- a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/SchemaView.tsx +++ b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/SchemaView.tsx @@ -1,82 +1,142 @@ import { AttributeSchema1 } from "../../../../../state/api/friendlyApi"; -import { GenericTable } from "../../../../common/GenericTable"; -import {ColSchema} from "../../../../../state/api/backendApiRaw"; - - -// export const Schema1View = (props: { -// taskName: string; -// runIds: number[]; -// values: AttributeSchema1[]; -// }) => { -// const columns = [ -// { -// displayName: "ColName", -// Render: (item: AttributeSchema1) => {item.name}, -// }, -// { -// displayName: "Name", -// Render: (item: AttributeSchema1) => {item.name}, -// }, -// { -// displayName: "Value", -// Render: (item: AttributeSchema1) => {item.value}, -// }, -// ]; -// const data = props.values.map(value => [value.id, value.name, value.value]); -// return ( -//
hi
-// ); -// }; +import { GenericGroupedTable } from "../../../../common/GenericTable"; +import { ColSchema } from "../../../../../state/api/backendApiRaw"; +import JsonView from "@uiw/react-json-view"; +import { RunLink } from "../../../../common/CommonLinks"; interface Schema1ViewProps { attributeSchemas: AttributeSchema1[]; - taskName: string; runIds: number[]; + projectId: number; } -// export type RunIdToSchema = { -// [key: number]: AttributeSchema1; -// }; -// const buildRunIdToAttributeSchemaMapping = (props: Schema1ViewProps): { [key: number]: AttributeSchema1 } => { -// const mapping: RunIdToSchema = {}; -// -// props.runIds.forEach((runId, index) => { -// mapping[runId] = props.attributeSchemas[index]; -// }); -// -// return mapping; -// }; +type AttributeColumn = ColSchema & { + runId: number; + projectId: number; +}; -export const Schema1View: React.FC = ({ attributeSchemas, taskName, runIds }) => { - // Convert the AttributeSchema1 object into an array of its values - // const runIdtoAttributeSchema = buildRunIdToAttributeSchemaMapping({ attributeSchemas, taskName, runIds }); - const schemaArray = Object.values(attributeSchemas[0]); - console.log(attributeSchemas); - console.log(runIds); +export const Schema1View: React.FC = ({ + attributeSchemas, + runIds, + projectId, +}) => { + const attributeColumns: AttributeColumn[] = []; - // return ( - // - // ) - return ( - - - - - - - - - - - {schemaArray.map((col, index) => ( - - - - - - + attributeSchemas.forEach((schema, i) => { + const runId = runIds[i]; + Object.keys(schema).forEach((key) => { + attributeColumns.push({ + runId: runId, + projectId: projectId, + ...schema[key], + }); + }); + }); + + const columns = [ + { + displayName: "runs", + Render: ( + value: { runId: number }[], + isSummaryRow: boolean, + isExpanded: boolean + ) => { + const runIds = value.map((item) => item.runId); + if (isSummaryRow && isExpanded) { + return <>; + } + return ( +
+ {runIds.map((taskRun, i) => ( + void 0} + /> ))} -
-
Column NameTypeNullableMetadata
{col.name}{col.type}{col.nullable}{JSON.stringify(col.metadata)}
- ); -} + + ); + }, + }, + + { + displayName: "type", + Render: ( + value: AttributeColumn[], + isSummaryRow: boolean, + isExpanded: boolean + ) => { + const toDisplay = Array.from(new Set(value.map((item) => item.type))); + console.log(value); + return !isSummaryRow || !isExpanded ? ( +
+ {toDisplay.map((item, i) => ( + {item} + ))} +
+ ) : ( + <> + ); + }, + }, + { + displayName: "nullable", + Render: ( + value: AttributeColumn[], + isSummaryRow: boolean, + isExpanded: boolean + ) => { + console.log(value); + const allExpanded = value.map((item) => item.nullable); + console.log(allExpanded); + + return !isSummaryRow || !isExpanded ? ( +
+ {allExpanded.map((item) => (item ? "✓" : "✗"))} +
+ ) : ( + <> + ); + }, + }, + { + displayName: "metadata", + Render: ( + value: AttributeColumn[], + isSummaryRow: boolean, + isExpanded: boolean + ) => { + const toDisplay = Array.from( + new Set(value.map((item) => item.metadata)) + ); + console.log(value); + return !isSummaryRow || !isExpanded ? ( +
+ {toDisplay.map((item, i) => ( + + ))} +
+ ) : ( + <> + ); + }, + }, + ]; + return ( +
+ [col.name, col])} + // description="Numeric columns in the dataset" + columns={columns} + /> +
+ ); +}; From 1c7190cace961e442e719f30aab8d8a2633c441e Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Thu, 11 Jul 2024 17:09:43 -0700 Subject: [PATCH 3/6] Adds expand/contract of attributes This allows us to hide attributes now that we have multiple --- .../result-summaries/DataObservability.tsx | 222 +++--------------- 1 file changed, 38 insertions(+), 184 deletions(-) diff --git a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx index 82962faac..5fd29a026 100644 --- a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx +++ b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx @@ -22,6 +22,7 @@ import { Dict1View, Dict2View } from "./DictView"; import { DAGWorksDescribe3View } from "./DAGWorksDescribe"; import { HTML1View } from "./HTMLView"; import { Schema1View } from "./SchemaView"; +import { HiChevronDown, HiChevronUp } from "react-icons/hi"; const Primitive1View = (props: { taskName: string; @@ -149,6 +150,16 @@ export const MultiResultSummaryView = (props: { projectId: number; runIds: number[]; }) => { + const [minimizedAttributes, setMinimizedAttributes] = useState([]); + const toggleExpanded = (attributeName: string) => { + if (minimizedAttributes.includes(attributeName)) { + setMinimizedAttributes( + minimizedAttributes.filter((i) => i !== attributeName) + ); + } else { + setMinimizedAttributes([...minimizedAttributes, attributeName]); + } + }; const attributes = props.nodeRunData.flatMap((i) => i?.attributes || []); const attributesGroupedByName = attributes.reduce((acc, item) => { if (acc[item.name]) { @@ -160,19 +171,33 @@ export const MultiResultSummaryView = (props: { }, {} as { [key: string]: NodeRunAttribute[] }); return (
- {Object.entries(attributesGroupedByName).map(([key, value]) => ( -
-

- {snakeToTitle(key)} -

- -
- ))} + {Object.entries(attributesGroupedByName).map(([key, value]) => { + const isExpanded = !minimizedAttributes.includes(key); + const Icon = isExpanded ? HiChevronUp : HiChevronDown; + return ( +
+
+

+ {snakeToTitle(key)} +

+ { + toggleExpanded(key); + }} + /> +
+ {isExpanded && ( + + )} +
+ ); + })}
); }; @@ -344,175 +369,4 @@ export const ResultsSummaryView = (props: { })} ); - - // const task_name: string = props.taskName ? props.taskName : "?"; - // // so yeah this code is messy - we should really massage things into a dict of run_id -> specific type... - // // else we're assuming that the order of run ids and results matches. - // const { - // observability_value, - // observability_type, - // observability_schema_version, - // } = resultSummariesFiltered[0] as ResultsSummary; - // if ( - // observability_type === "pandas_describe" && - // observability_schema_version === "0.0.1" - // ) { - // return ( - // item as PandasDescribeV0_0_1)} - // projectId={props.projectId} - // runIds={props.runs.map((run) => run.id as number)} - // results={resultSummariesFiltered.map( - // (item) => item.observability_value as PandasDescribeV0_0_1 - // )} - // /> - // ); - // } else if ( - // observability_type === "primitive" && - // observability_schema_version === "0.0.1" - // ) { - // type PrimitiveType = { - // type: string; - // value: string; - // }; - // // for each observability_value in resultSummariesFiltered, cast it to PrimitiveType - // // and put it into values - // const values = resultSummariesFiltered.map((item) => { - // return { runId: item.runId, ...item.observability_value }; - // }) as ({ runId: number } & PrimitiveType)[]; - // return ( - //
- // { - // return [item.runId?.toString() || "", item]; - // })} - // columns={[ - // { - // displayName: "type", - // Render: (item: PrimitiveType) => { - // return ( - //
- // - // {parsePythonType({ typeName: item.type })} - // - //
- // ); - // }, - // }, - // { - // displayName: "value", - // Render: (item: PrimitiveType) => { - // const [expanded, setExpanded] = useState(false); - // return ( - //
- //
 {
-  //                       setExpanded(!expanded);
-  //                       e.stopPropagation();
-  //                     }}
-  //                     className={`${
-  //                       expanded ? "break-word whitespace-pre-wrap" : "truncate"
-  //                     }  text-gray-500 cursor-cell`}
-  //                   >
-  //                     {item.value.toString()}
-  //                   
- // {/* - // {item.value.toString()} - // */} - //
- // ); - // }, - // }, - // ]} - // dataTypeName={"Run"} - // dataTypeDisplay={(item: string) => { - // return ( - // void 0} - // highlightedRun={null} - // > - // ); - // }} - // /> - //
- // ); - // } else if ( - // observability_type === "dict" && - // observability_schema_version === "0.0.2" - // ) { - // return ( - // run.id as number)} - // results={resultSummariesFiltered.map( - // (item) => (item.observability_value as any).value as object - // )} - // > - // ); - // } else if ( - // observability_type === "dagworks_describe" && - // observability_schema_version === "0.0.3" - // ) { - // return ( - // run.id as number)} - // results={resultSummariesFiltered.map( - // (item) => item.observability_value as DAGWorksDescribeV0_0_3 - // )} - // /> - // ); - // } - // type UnknownType = { - // unsupported_type: string; - // action: string; - // }; - // const values = resultSummariesFiltered.map((item) => { - // return { - // value: item.observability_value as UnknownType, - // runId: item.runId, - // }; - // }); - // // TODO: this does not show all the outputs -- just the first one. This code is messy so not going to fix it now. - // return ( - //
- // - // We currently do not capture data summaries for run(s){" "} - // [{runIds.join(", ")}] - // for {task_name}. We are working on adding support for - // everything -- reach out if you need it and we can prioritize! - // - //
- // { - // return [item.runId.toString() || "", item.value]; - // })} - // columns={[ - // { - // displayName: "type", - // Render: (item: UnknownType) => { - // return ( - //
- // {item.unsupported_type} - //
- // ); - // }, - // }, - // { - // displayName: "action", - // Render: (item: UnknownType) => { - // return ( - //
- // {item.action} - //
- // ); - // }, - // }, - // ]} - // dataTypeName={""} - // /> - //
- //
- // ); }; From 577e6af583651bde7fb432eb33798151fc1ab78a Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Fri, 12 Jul 2024 12:00:19 -0700 Subject: [PATCH 4/6] Refactors capturing of data to be more flexible We now have three categories: 1. Result summaries -- we can output one and if not it will be unsupported in the UI 2. Schema -- we can output one or none 3. Additional -- as many as we want We use a single dispatch function for each, and it makes the code a lot cleaner. We no longer put stuff in lists unless its an additional result. Note they can also supply their own name, if not, we will generate a unique attribute name. In the long term we'll likely want to stop using single dispatch as we need to register multiple with roles (and single dispatch requires one function per role). For now this is clean enough and easy to work with, however. --- ui/sdk/src/hamilton_sdk/adapters.py | 34 ++-------- .../{stats.py => data_observation.py} | 41 ++++++++++-- .../src/hamilton_sdk/tracking/ibis_stats.py | 6 +- .../hamilton_sdk/tracking/langchain_stats.py | 12 ++-- .../src/hamilton_sdk/tracking/numpy_stats.py | 4 +- .../src/hamilton_sdk/tracking/pandas_stats.py | 44 ++++++------ .../src/hamilton_sdk/tracking/polars_stats.py | 6 +- .../hamilton_sdk/tracking/pydantic_stats.py | 4 +- .../hamilton_sdk/tracking/pyspark_stats.py | 67 +++++++------------ ui/sdk/src/hamilton_sdk/tracking/runs.py | 52 ++++++++------ .../tracking/scikit_learn_stats.py | 30 +++++---- ui/sdk/tests/tracking/test_pandas_stats.py | 4 +- ui/sdk/tests/tracking/test_runs.py | 16 +++-- ui/sdk/tests/tracking/test_stats.py | 10 +-- 14 files changed, 168 insertions(+), 162 deletions(-) rename ui/sdk/src/hamilton_sdk/tracking/{stats.py => data_observation.py} (87%) diff --git a/ui/sdk/src/hamilton_sdk/adapters.py b/ui/sdk/src/hamilton_sdk/adapters.py index abfda447d..b74703a70 100644 --- a/ui/sdk/src/hamilton_sdk/adapters.py +++ b/ui/sdk/src/hamilton_sdk/adapters.py @@ -260,7 +260,7 @@ def post_node_execute( if success: task_run.status = Status.SUCCESS task_run.result_type = type(result) - result_summary = runs.process_result(result, node_) + result_summary, schema, additional_attributes = runs.process_result(result, node_) if result_summary is None: result_summary = { "observability_type": "observability_failure", @@ -270,18 +270,7 @@ def post_node_execute( "value": "Failed to process result.", }, } - # NOTE This is a temporary hack to make process_result() able to return - # more than one object that will be used as UI "task attributes". 
- # There's a conflict between `TaskRun.result_summary` that expect a single - # dict from process_result() and the `HamiltonTracker.post_node_execute()` - # that can more freely handle "stats" to create multiple "task attributes" - elif isinstance(result_summary, dict): - result_summary = result_summary - elif isinstance(result_summary, list): - other_results = [obj for obj in result_summary[1:]] - result_summary = result_summary[0] - else: - raise TypeError("`process_result()` needs to return a dict or sequence of dict") + other_results = ([schema] if schema is not None else []) + additional_attributes task_run.result_summary = result_summary task_attr = dict( @@ -546,12 +535,13 @@ async def post_node_execute( task_run = self.task_runs[run_id][node_.name] tracking_state = self.tracking_states[run_id] task_run.end_time = datetime.datetime.now(timezone.utc) - other_results = [] + if success: task_run.status = Status.SUCCESS task_run.result_type = type(result) - result_summary = runs.process_result(result, node_) # add node + result_summary, schema, additional = runs.process_result(result, node_) # add node + other_results = ([schema] if schema is not None else []) + additional if result_summary is None: result_summary = { "observability_type": "observability_failure", @@ -561,19 +551,6 @@ async def post_node_execute( "value": "Failed to process result.", }, } - # NOTE This is a temporary hack to make process_result() able to return - # more than one object that will be used as UI "task attributes". - # There's a conflict between `TaskRun.result_summary` that expect a single - # dict from process_result() and the `HamiltonTracker.post_node_execute()` - # that can more freely handle "stats" to create multiple "task attributes" - elif isinstance(result_summary, dict): - result_summary = result_summary - elif isinstance(result_summary, list): - other_results = [obj for obj in result_summary[1:]] - result_summary = result_summary[0] - else: - raise TypeError("`process_result()` needs to return a dict or sequence of dict") - task_run.result_summary = result_summary task_attr = dict( node_name=get_node_name(node_, task_id), @@ -603,7 +580,6 @@ async def post_node_execute( attribute_role="error", ) - # `result_summary` or "error" is first because the order influences UI display order attributes = [task_attr] for i, other_result in enumerate(other_results): other_attr = dict( diff --git a/ui/sdk/src/hamilton_sdk/tracking/stats.py b/ui/sdk/src/hamilton_sdk/tracking/data_observation.py similarity index 87% rename from ui/sdk/src/hamilton_sdk/tracking/stats.py rename to ui/sdk/src/hamilton_sdk/tracking/data_observation.py index 0b968e27f..688468657 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/data_observation.py @@ -1,15 +1,29 @@ import json from functools import singledispatch -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional import pandas as pd from hamilton_sdk.tracking import sql_utils -StatsType = Dict[str, Any] +# Multiple observations per are allowed +ObservationType = Dict[str, Any] @singledispatch -def compute_stats(result, node_name: str, node_tags: dict) -> Union[StatsType, List[StatsType]]: +def compute_schema(result, node_name: str, node_tags: dict) -> Optional[ObservationType]: + """The default schema will be None, and filtered out. + We can polymoorphically implement this for different types of results. 
+ + :param result: + :param node_name: + :param node_tags: + :return: + """ + return None + + +@singledispatch +def compute_stats(result, node_name: str, node_tags: dict) -> Optional[ObservationType]: """This is the default implementation for computing stats on a result. All other implementations should be registered with the `@compute_stats.register` decorator. @@ -29,11 +43,24 @@ def compute_stats(result, node_name: str, node_tags: dict) -> Union[StatsType, L } +@singledispatch +def compute_additional_results(result, node_name: str, node_tags: dict) -> List[ObservationType]: + """The default schema will be None, and filtered out. + We can polymoorphically implement this for different types of results. + + :param result: + :param node_name: + :param node_tags: + :return: + """ + return [] + + @compute_stats.register(str) @compute_stats.register(int) @compute_stats.register(float) @compute_stats.register(bool) -def compute_stats_primitives(result, node_name: str, node_tags: dict) -> StatsType: +def compute_stats_primitives(result, node_name: str, node_tags: dict) -> ObservationType: return { "observability_type": "primitive", "observability_value": { @@ -45,7 +72,7 @@ def compute_stats_primitives(result, node_name: str, node_tags: dict) -> StatsTy @compute_stats.register(dict) -def compute_stats_dict(result: dict, node_name: str, node_tags: dict) -> StatsType: +def compute_stats_dict(result: dict, node_name: str, node_tags: dict) -> ObservationType: """call summary stats on the values in the dict""" try: # if it's JSON serializable, take it. @@ -94,7 +121,7 @@ def compute_stats_dict(result: dict, node_name: str, node_tags: dict) -> StatsTy @compute_stats.register(tuple) -def compute_stats_tuple(result: tuple, node_name: str, node_tags: dict) -> StatsType: +def compute_stats_tuple(result: tuple, node_name: str, node_tags: dict) -> ObservationType: if "hamilton.data_loader" in node_tags and node_tags["hamilton.data_loader"] is True: # assumption it's a tuple if isinstance(result[1], dict): @@ -141,7 +168,7 @@ def compute_stats_tuple(result: tuple, node_name: str, node_tags: dict) -> Stats @compute_stats.register(list) -def compute_stats_list(result: list, node_name: str, node_tags: dict) -> StatsType: +def compute_stats_list(result: list, node_name: str, node_tags: dict) -> ObservationType: """call summary stats on the values in the list""" try: # if it's JSON serializable, take it. 
diff --git a/ui/sdk/src/hamilton_sdk/tracking/ibis_stats.py b/ui/sdk/src/hamilton_sdk/tracking/ibis_stats.py index 15ef99458..677b6f272 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/ibis_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/ibis_stats.py @@ -1,6 +1,6 @@ from typing import Any, Dict -from hamilton_sdk.tracking import stats +from hamilton_sdk.tracking import data_observation from ibis.expr.datatypes import core # import ibis.expr.types as ir @@ -73,11 +73,11 @@ def _introspect(table: relations.Table) -> Dict[str, Any]: } -@stats.compute_stats.register +@data_observation.compute_schema.register def compute_stats_ibis_table( result: relations.Table, node_name: str, node_tags: dict ) -> Dict[str, Any]: - # TODO: create custom type instead of dict for UI + # TODO: use the schema type o_value = _introspect(result) return { "observability_type": "dict", diff --git a/ui/sdk/src/hamilton_sdk/tracking/langchain_stats.py b/ui/sdk/src/hamilton_sdk/tracking/langchain_stats.py index 9d59f3018..e2fb432ec 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/langchain_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/langchain_stats.py @@ -4,12 +4,12 @@ from typing import Any, Dict -from hamilton_sdk.tracking import stats +from hamilton_sdk.tracking import data_observation from langchain_core import documents as lc_documents from langchain_core import messages as lc_messages -@stats.compute_stats.register(lc_messages.BaseMessage) +@data_observation.compute_stats.register(lc_messages.BaseMessage) def compute_stats_lc_messages( result: lc_messages.BaseMessage, node_name: str, node_tags: dict ) -> Dict[str, Any]: @@ -22,12 +22,12 @@ def compute_stats_lc_messages( } -@stats.compute_stats.register(lc_documents.Document) +@data_observation.compute_stats.register(lc_documents.Document) def compute_stats_lc_docs( result: lc_documents.Document, node_name: str, node_tags: dict ) -> Dict[str, Any]: if hasattr(result, "to_document"): - return stats.compute_stats(result.to_document(), node_name, node_tags) + return data_observation.compute_stats(result.to_document(), node_name, node_tags) else: # d.page_content # hack because not all documents are serializable result = {"content": result.page_content, "metadata": result.metadata} @@ -43,7 +43,7 @@ def compute_stats_lc_docs( from langchain_core import messages msg = messages.BaseMessage(content="Hello, World!", type="greeting") - print(stats.compute_stats(msg, "greeting", {})) + print(data_observation.compute_stats(msg, "greeting", {})) doc = lc_documents.Document(page_content="Hello, World!", metadata={"source": "local_dir"}) - print(stats.compute_stats(doc, "document", {})) + print(data_observation.compute_stats(doc, "document", {})) diff --git a/ui/sdk/src/hamilton_sdk/tracking/numpy_stats.py b/ui/sdk/src/hamilton_sdk/tracking/numpy_stats.py index 24d4abf88..47edd64ac 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/numpy_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/numpy_stats.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from hamilton_sdk.tracking import pandas_stats, stats +from hamilton_sdk.tracking import data_observation, pandas_stats """Module that houses functions to compute statistics on numpy objects Notes: @@ -10,7 +10,7 @@ """ -@stats.compute_stats.register +@data_observation.compute_stats.register def compute_stats_numpy(result: np.ndarray, node_name: str, node_tags: dict) -> Dict[str, Any]: try: df = pd.DataFrame(result) # hack - reuse pandas stuff diff --git a/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py 
b/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py index 4feade7fe..e1ac4ecbd 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/pandas_stats.py @@ -1,8 +1,8 @@ -from typing import Any, Dict, List, Union +from typing import Any, Dict, Optional, Union import pandas as pd +from hamilton_sdk.tracking import data_observation from hamilton_sdk.tracking import pandas_col_stats as pcs -from hamilton_sdk.tracking import stats from hamilton import driver @@ -84,34 +84,36 @@ def execute_col( return stats -@stats.compute_stats.register +@data_observation.compute_stats.register def compute_stats_df( result: pd.DataFrame, node_name: str, node_tags: dict -) -> List[stats.StatsType]: +) -> data_observation.ObservationType: summary_stats = _compute_stats(result) - results = [ - { - "observability_type": "dagworks_describe", - "observability_value": summary_stats, - "observability_schema_version": "0.0.3", - }, - ] + return { + "observability_type": "dagworks_describe", + "observability_value": summary_stats, + "observability_schema_version": "0.0.3", + } + + +@data_observation.compute_schema.register +def compute_schema( + result: pd.DataFrame, node_name: str, node_tags: dict +) -> Optional[data_observation.ObservationType]: if h_schema is not None: schema = h_schema._get_arrow_schema(result) schema.with_metadata(dict(name=node_name)) - results.append( - { - "observability_type": "schema", - "observability_value": h_schema.pyarrow_schema_to_json(schema), - "observability_schema_version": "0.0.1", - "name": "Schema", - } - ) - return results + return { + "observability_type": "schema", + "observability_value": h_schema.pyarrow_schema_to_json(schema), + "observability_schema_version": "0.0.1", + "name": "Schema", + } + return None -@stats.compute_stats.register +@data_observation.compute_stats.register def compute_stats_series(result: pd.Series, node_name: str, node_tags: dict) -> Dict[str, Any]: col_name = result.name if result.name else node_name return { diff --git a/ui/sdk/src/hamilton_sdk/tracking/polars_stats.py b/ui/sdk/src/hamilton_sdk/tracking/polars_stats.py index 6fb6da825..b5169c10d 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/polars_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/polars_stats.py @@ -4,8 +4,8 @@ if not hasattr(pl, "Series"): raise ImportError("Polars is not installed") +from hamilton_sdk.tracking import data_observation from hamilton_sdk.tracking import polars_col_stats as pls -from hamilton_sdk.tracking import stats from hamilton import driver @@ -83,7 +83,7 @@ def execute_col(target_output: str, col: pl.Series, name: str, position: int) -> return stats -@stats.compute_stats.register +@data_observation.compute_stats.register def compute_stats_df(result: pl.DataFrame, node_name: str, node_tags: dict) -> Dict[str, Any]: return { "observability_type": "dagworks_describe", @@ -92,7 +92,7 @@ def compute_stats_df(result: pl.DataFrame, node_name: str, node_tags: dict) -> D } -@stats.compute_stats.register +@data_observation.compute_stats.register def compute_stats_series(result: pl.Series, node_name: str, node_tags: dict) -> Dict[str, Any]: return { "observability_type": "dagworks_describe", diff --git a/ui/sdk/src/hamilton_sdk/tracking/pydantic_stats.py b/ui/sdk/src/hamilton_sdk/tracking/pydantic_stats.py index eb342b887..4effe9d8b 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/pydantic_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/pydantic_stats.py @@ -1,10 +1,10 @@ from typing import Any, Dict import pydantic -from 
hamilton_sdk.tracking import stats +from hamilton_sdk.tracking import data_observation -@stats.compute_stats.register +@data_observation.compute_stats.register def compute_stats_pydantic( result: pydantic.BaseModel, node_name: str, node_tags: dict ) -> Dict[str, Any]: diff --git a/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py b/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py index c813ce676..947b42525 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py @@ -1,7 +1,8 @@ -from typing import Any, Dict, List +from typing import Any, Dict, Optional import pyspark.sql as ps -from hamilton_sdk.tracking import stats +from hamilton_sdk.tracking import data_observation +from hamilton_sdk.tracking.data_observation import ObservationType try: from hamilton.plugins import h_schema @@ -73,53 +74,35 @@ def _introspect(df: ps.DataFrame) -> Dict[str, Any]: } -@stats.compute_stats.register -def compute_stats_psdf( - result: ps.DataFrame, node_name: str, node_tags: dict -) -> List[Dict[str, Any]]: +@data_observation.compute_stats.register +def compute_stats_psdf(result: ps.DataFrame, node_name: str, node_tags: dict) -> ObservationType: # TODO: create custom type instead of dict for UI o_value = _introspect(result) - results = [ - { - "observability_type": "dict", - "observability_value": { - "type": str(type(result)), - "value": o_value["columns"], - }, - "observability_schema_version": "0.0.2", - }, - { - "observability_type": "primitive", - "observability_value": { - "type": str(str), - "value": o_value["cost_explain"], - }, - "observability_schema_version": "0.0.1", - "name": "Cost Explain", - }, - { - "observability_type": "primitive", - "observability_value": { - "type": str(str), - "value": o_value["extended_explain"], - }, - "observability_schema_version": "0.0.1", - "name": "Extended Explain", + return { + "observability_type": "dict", + "observability_value": { + "type": str(type(result)), + "value": o_value, }, - ] + "observability_schema_version": "0.0.2", + } + + +@data_observation.compute_schema.register +def compute_schema_psdf( + result: ps.DataFrame, node_name: str, node_tags: dict +) -> Optional[ObservationType]: if h_schema is not None: schema = h_schema._get_arrow_schema(result) schema.with_metadata(dict(name=node_name)) - results.append( - { - "observability_type": "schema", - "observability_value": h_schema.pyarrow_schema_to_json(schema), - "observability_schema_version": "0.0.1", - "name": "Schema", - } - ) - return results + return { + "observability_type": "schema", + "observability_value": h_schema.pyarrow_schema_to_json(schema), + "observability_schema_version": "0.0.1", + "name": "Schema", + } + return None if __name__ == "__main__": diff --git a/ui/sdk/src/hamilton_sdk/tracking/runs.py b/ui/sdk/src/hamilton_sdk/tracking/runs.py index ff3d7ffa1..6e329c511 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/runs.py +++ b/ui/sdk/src/hamilton_sdk/tracking/runs.py @@ -6,9 +6,10 @@ import traceback from contextlib import contextmanager from datetime import datetime, timezone -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple -from hamilton_sdk.tracking import stats +from hamilton_sdk.tracking import data_observation +from hamilton_sdk.tracking.data_observation import ObservationType from hamilton_sdk.tracking.trackingtypes import DAGRun, Status, TaskRun from hamilton import node as h_node @@ -36,7 +37,9 @@ pass -def process_result(result: Any, node: 
h_node.Node) -> Any: +def process_result( + result: Any, node: h_node.Node +) -> Tuple[Optional[ObservationType], Optional[ObservationType], List[ObservationType]]: """Processes result -- this is purely a by-type mapping. Note that this doesn't actually do anything yet -- the idea is that we can return DQ results, and do other stuff with other results -- E.G. summary stats on dataframes, @@ -53,16 +56,31 @@ def process_result(result: Any, node: h_node.Node) -> Any: :param node: The node that produced the result :return: The processed result - it has to be JSON serializable! """ + statistics = None + schema = None + additional = [] try: start = py_time.time() - statistics = stats.compute_stats(result, node.name, node.tags) + statistics = data_observation.compute_stats(result, node.name, node.tags) end = py_time.time() logger.debug(f"Took {end - start} seconds to describe {node.name}") - return statistics - # TODO: introspect other nodes - # if it's a check_output node, then we want to process the pandera result/the result from it. except Exception as e: - logger.warning(f"Failed to introspect result for {node.name}. Error:\n{e}") + logger.warning(f"Failed to introspect statistics for {node.name}. Error:\n{e}") + try: + start = py_time.time() + schema = data_observation.compute_schema(result, node.name, node.tags) + end = py_time.time() + logger.debug(f"Took {end - start} seconds to introspect schema for {node.name}") + except Exception as e: + logger.warning(f"Failed to introspect schema for {node.name}. Error:\n{e}") + try: + start = py_time.time() + additional.extend(data_observation.compute_additional_results(result, node.name, node.tags)) + end = py_time.time() + logger.debug(f"Took {end - start} seconds to introspect additional results for {node.name}") + except Exception as e: + logger.warning(f"Failed to introspect additional results for {node.name}. Error:\n{e}") + return statistics, schema, additional class TrackingState: @@ -184,19 +202,11 @@ def execute_node( try: result = original_do_node_execute(run_id, node_, kwargs, task_id) - # NOTE This is a temporary hack to make process_result() able to return - # more than one object that will be used as UI "task attributes". 
- # There's a conflict between `TaskRun.result_summary` that expect a single - # dict from process_result() and the `HamiltonTracker.post_node_execute()` - # that can more freely handle "stats" to create multiple "task attributes" - result_summary = process_result(result, node_) # add node - if isinstance(result_summary, dict): - result_summary = result_summary - elif isinstance(result_summary, list): - result_summary = result_summary[0] - else: - raise TypeError("`process_result()` needs to return a dict or list of dict") - + # In subsequent versions we will stop supporting the class-based approach + # If you find yourself relying on this, switch to use adapters + result_summary, *ignored_because_this_is_a_defunct_code_path = process_result( + result, node_ + ) # add node task_run.status = Status.SUCCESS task_run.result_type = type(result) task_run.result_summary = result_summary diff --git a/ui/sdk/src/hamilton_sdk/tracking/scikit_learn_stats.py b/ui/sdk/src/hamilton_sdk/tracking/scikit_learn_stats.py index fe7087ec4..b8e230a4e 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/scikit_learn_stats.py +++ b/ui/sdk/src/hamilton_sdk/tracking/scikit_learn_stats.py @@ -1,30 +1,34 @@ from typing import List -from hamilton_sdk.tracking import stats +from hamilton_sdk.tracking import data_observation from sklearn.base import BaseEstimator """Module that houses functions to compute statistics on numpy objects""" -@stats.compute_stats.register(BaseEstimator) -def get_estimator_html_representation(result, *args, **kwargs) -> List[stats.StatsType]: +@data_observation.compute_stats.register(BaseEstimator) +def get_estimator_params(result, *args, **kwargs) -> data_observation.ObservationType: """ ref: https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_display_object_visualization.html """ - return [ - { - "name": "Parameters", - "observability_type": "dict", - "observability_value": { - "type": str(type(result)), - "value": result.get_params(deep=True), - }, - "observability_schema_version": "0.0.2", + return { + "name": "Parameters", + "observability_type": "dict", + "observability_value": { + "type": str(type(result)), + "value": result.get_params(deep=True), }, + "observability_schema_version": "0.0.2", + } + + +@data_observation.compute_additional_results.register(BaseEstimator) +def get_estimator_html(result, *args, **kwargs) -> List[data_observation.ObservationType]: + return [ { "name": "Components", "observability_type": "html", "observability_value": {"html": result._repr_html_inner()}, # get_params(deep=True), "observability_schema_version": "0.0.1", - }, + } ] diff --git a/ui/sdk/tests/tracking/test_pandas_stats.py b/ui/sdk/tests/tracking/test_pandas_stats.py index 421add51e..da4903015 100644 --- a/ui/sdk/tests/tracking/test_pandas_stats.py +++ b/ui/sdk/tests/tracking/test_pandas_stats.py @@ -20,7 +20,7 @@ def test_compute_stats_df(): } ) actual = ps.compute_stats_df(df, "test", {}) - expected = { + expected_stats = { "observability_schema_version": "0.0.3", "observability_type": "dagworks_describe", "observability_value": { @@ -206,4 +206,4 @@ def test_compute_stats_df(): }, }, } - assert actual == expected + assert actual == expected_stats diff --git a/ui/sdk/tests/tracking/test_runs.py b/ui/sdk/tests/tracking/test_runs.py index 922addffb..3ea627a9d 100644 --- a/ui/sdk/tests/tracking/test_runs.py +++ b/ui/sdk/tests/tracking/test_runs.py @@ -26,7 +26,7 @@ def create_node(name: str, type_: type) -> node.Node: @pytest.mark.parametrize( - 
"test_result,test_node,observability_type,observability_value", + "test_result,test_node,observability_type,stats", [ ( pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), @@ -517,16 +517,20 @@ def create_node(name: str, type_: type) -> node.Node: "pl_datetime_series", ], ) -def test_process_result_happy(test_result, test_node, observability_type, observability_value): +def test_process_result_happy(test_result, test_node, observability_type, stats): """Tests a happy path for the process result function.""" - actual_result = runs.process_result(test_result, test_node) + stats, schema, additional = runs.process_result(test_result, test_node) expected_result = result_base.copy() if observability_type in ["dict"]: expected_result["observability_schema_version"] = "0.0.2" if observability_type in ["primitive", "unsupported"]: expected_result["observability_schema_version"] = "0.0.1" expected_result["observability_type"] = observability_type - expected_result["observability_value"] = observability_value - assert actual_result == expected_result + expected_result["observability_value"] = stats["observability_value"] + assert stats == expected_result # Allows us to double-check that everything can be json-dumped - json.dumps(actual_result) + json.dumps(stats) + # TODO -- test schema values, but probably not here + if schema is not None: + json.dumps(schema) + [json.dumps(add) for add in additional] diff --git a/ui/sdk/tests/tracking/test_stats.py b/ui/sdk/tests/tracking/test_stats.py index 10aa9a095..af6b2377d 100644 --- a/ui/sdk/tests/tracking/test_stats.py +++ b/ui/sdk/tests/tracking/test_stats.py @@ -3,13 +3,13 @@ from typing import NamedTuple import pandas as pd -from hamilton_sdk.tracking import stats +from hamilton_sdk.tracking import data_observation def test_compute_stats_namedtuple(): config = namedtuple("config", ["secret_key"]) result = config("secret_value") - actual = stats.compute_stats(result, "test_node", {}) + actual = data_observation.compute_stats(result, "test_node", {}) assert actual == { "observability_type": "unsupported", "observability_value": { @@ -21,7 +21,7 @@ def test_compute_stats_namedtuple(): def test_compute_stats_dict(): - actual = stats.compute_stats({"a": 1}, "test_node", {}) + actual = data_observation.compute_stats({"a": 1}, "test_node", {}) assert actual == { "observability_type": "dict", "observability_value": { @@ -34,7 +34,7 @@ def test_compute_stats_dict(): def test_compute_stats_tuple_dataloader(): """tests case of a dataloader""" - actual = stats.compute_stats( + actual = data_observation.compute_stats( ( pd.DataFrame({"a": [1, 2, 3]}), {"SQL_QUERY": "SELECT * FROM FOO.BAR.TABLE", "CONNECTION_INFO": {"URL": "SOME_URL"}}, @@ -75,7 +75,7 @@ class Foo(NamedTuple): y: str f = Foo(1, "a") - actual = stats.compute_stats( + actual = data_observation.compute_stats( f, "test_node", {"some_tag": "foo-bar"}, From 38bef2dc530226ae612b2b81a11eca15e02e9a65 Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Fri, 12 Jul 2024 14:54:58 -0700 Subject: [PATCH 5/6] Adds string diff view This just treats all primitives as strings, which can show the view. This works with strings and will actually just look fine otherwise. It uses the ReactDiffViewer component which is simple, and we use elsewhere in the app. We can probably tune this a bit (the interaction is a little jumpy), but for now this is OK. 
--- ui/frontend/package-lock.json | 198 ++++++++++++++++-- ui/frontend/package.json | 5 +- .../result-summaries/DataObservability.tsx | 161 ++++++++++---- 3 files changed, 311 insertions(+), 53 deletions(-) diff --git a/ui/frontend/package-lock.json b/ui/frontend/package-lock.json index f206e334c..bf3394340 100644 --- a/ui/frontend/package-lock.json +++ b/ui/frontend/package-lock.json @@ -8,7 +8,7 @@ "name": "frontend", "version": "0.1.0", "dependencies": { - "@headlessui/react": "^1.7.7", + "@headlessui/react": "^2.1.2", "@heroicons/react": "^2.0.13", "@propelauth/react": "^2.0.6", "@reduxjs/toolkit": "^1.9.7", @@ -30,13 +30,14 @@ "dayjs": "^1.11.9", "elkjs": "^0.8.2", "fuse.js": "^6.6.2", + "headlessui": "^0.0.0", "http-proxy-middleware": "^2.0.6", "moment-timezone": "^0.5.40", "posthog-js": "^1.82.1", "prism-react-renderer": "^1.3.5", "react": "^18.2.0", "react-chartjs-2": "^5.2.0", - "react-diff-viewer-continued": "^3.2.6", + "react-diff-viewer-continued": "^3.4.0", "react-dom": "^18.2.0", "react-draggable": "^4.4.5", "react-icons": "^4.10.1", @@ -2749,24 +2750,58 @@ "@floating-ui/utils": "^0.1.3" } }, + "node_modules/@floating-ui/react": { + "version": "0.26.19", + "resolved": "https://registry.npmjs.org/@floating-ui/react/-/react-0.26.19.tgz", + "integrity": "sha512-Jk6zITdjjIvjO/VdQFvpRaD3qPwOHH6AoDHxjhpy+oK4KFgaSP871HYWUAPdnLmx1gQ+w/pB312co3tVml+BXA==", + "dependencies": { + "@floating-ui/react-dom": "^2.1.1", + "@floating-ui/utils": "^0.2.4", + "tabbable": "^6.0.0" + }, + "peerDependencies": { + "react": ">=16.8.0", + "react-dom": ">=16.8.0" + } + }, + "node_modules/@floating-ui/react-dom": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@floating-ui/react-dom/-/react-dom-2.1.1.tgz", + "integrity": "sha512-4h84MJt3CHrtG18mGsXuLCHMrug49d7DFkU0RMIyshRveBeyV2hmV/pDaF2Uxtu8kgq5r46llp5E5FQiR0K2Yg==", + "dependencies": { + "@floating-ui/dom": "^1.0.0" + }, + "peerDependencies": { + "react": ">=16.8.0", + "react-dom": ">=16.8.0" + } + }, + "node_modules/@floating-ui/react/node_modules/@floating-ui/utils": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.2.4.tgz", + "integrity": "sha512-dWO2pw8hhi+WrXq1YJy2yCuWoL20PddgGaqTgVe4cOS9Q6qklXCiA1tJEqX6BEwRNSCP84/afac9hd4MS+zEUA==" + }, "node_modules/@floating-ui/utils": { "version": "0.1.6", "resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.1.6.tgz", "integrity": "sha512-OfX7E2oUDYxtBvsuS4e/jSn4Q9Qb6DzgeYtsAdkPZ47znpoNsMgZw0+tVijiv3uGNR6dgNlty6r9rzIzHjtd/A==" }, "node_modules/@headlessui/react": { - "version": "1.7.17", - "resolved": "https://registry.npmjs.org/@headlessui/react/-/react-1.7.17.tgz", - "integrity": "sha512-4am+tzvkqDSSgiwrsEpGWqgGo9dz8qU5M3znCkC4PgkpY4HcCZzEDEvozltGGGHIKl9jbXbZPSH5TWn4sWJdow==", + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/@headlessui/react/-/react-2.1.2.tgz", + "integrity": "sha512-Kb3hgk9gRNRcTZktBrKdHhF3xFhYkca1Rk6e1/im2ENf83dgN54orMW0uSKTXFnUpZOUFZ+wcY05LlipwgZIFQ==", "dependencies": { - "client-only": "^0.0.1" + "@floating-ui/react": "^0.26.16", + "@react-aria/focus": "^3.17.1", + "@react-aria/interactions": "^3.21.3", + "@tanstack/react-virtual": "^3.8.1" }, "engines": { "node": ">=10" }, "peerDependencies": { - "react": "^16 || ^17 || ^18", - "react-dom": "^16 || ^17 || ^18" + "react": "^18", + "react-dom": "^18" } }, "node_modules/@heroicons/react": { @@ -3400,6 +3435,99 @@ "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0" } }, + "node_modules/@react-aria/focus": { + "version": "3.17.1", + "resolved": 
"https://registry.npmjs.org/@react-aria/focus/-/focus-3.17.1.tgz", + "integrity": "sha512-FLTySoSNqX++u0nWZJPPN5etXY0WBxaIe/YuL/GTEeuqUIuC/2bJSaw5hlsM6T2yjy6Y/VAxBcKSdAFUlU6njQ==", + "dependencies": { + "@react-aria/interactions": "^3.21.3", + "@react-aria/utils": "^3.24.1", + "@react-types/shared": "^3.23.1", + "@swc/helpers": "^0.5.0", + "clsx": "^2.0.0" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0-rc.1 || ^18.0.0" + } + }, + "node_modules/@react-aria/focus/node_modules/clsx": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz", + "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==", + "engines": { + "node": ">=6" + } + }, + "node_modules/@react-aria/interactions": { + "version": "3.21.3", + "resolved": "https://registry.npmjs.org/@react-aria/interactions/-/interactions-3.21.3.tgz", + "integrity": "sha512-BWIuf4qCs5FreDJ9AguawLVS0lV9UU+sK4CCnbCNNmYqOWY+1+gRXCsnOM32K+oMESBxilAjdHW5n1hsMqYMpA==", + "dependencies": { + "@react-aria/ssr": "^3.9.4", + "@react-aria/utils": "^3.24.1", + "@react-types/shared": "^3.23.1", + "@swc/helpers": "^0.5.0" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0-rc.1 || ^18.0.0" + } + }, + "node_modules/@react-aria/ssr": { + "version": "3.9.4", + "resolved": "https://registry.npmjs.org/@react-aria/ssr/-/ssr-3.9.4.tgz", + "integrity": "sha512-4jmAigVq409qcJvQyuorsmBR4+9r3+JEC60wC+Y0MZV0HCtTmm8D9guYXlJMdx0SSkgj0hHAyFm/HvPNFofCoQ==", + "dependencies": { + "@swc/helpers": "^0.5.0" + }, + "engines": { + "node": ">= 12" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0-rc.1 || ^18.0.0" + } + }, + "node_modules/@react-aria/utils": { + "version": "3.24.1", + "resolved": "https://registry.npmjs.org/@react-aria/utils/-/utils-3.24.1.tgz", + "integrity": "sha512-O3s9qhPMd6n42x9sKeJ3lhu5V1Tlnzhu6Yk8QOvDuXf7UGuUjXf9mzfHJt1dYzID4l9Fwm8toczBzPM9t0jc8Q==", + "dependencies": { + "@react-aria/ssr": "^3.9.4", + "@react-stately/utils": "^3.10.1", + "@react-types/shared": "^3.23.1", + "@swc/helpers": "^0.5.0", + "clsx": "^2.0.0" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0-rc.1 || ^18.0.0" + } + }, + "node_modules/@react-aria/utils/node_modules/clsx": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz", + "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==", + "engines": { + "node": ">=6" + } + }, + "node_modules/@react-stately/utils": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/@react-stately/utils/-/utils-3.10.1.tgz", + "integrity": "sha512-VS/EHRyicef25zDZcM/ClpzYMC5i2YGN6uegOeQawmgfGjb02yaCX0F0zR69Pod9m2Hr3wunTbtpgVXvYbZItg==", + "dependencies": { + "@swc/helpers": "^0.5.0" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0-rc.1 || ^18.0.0" + } + }, + "node_modules/@react-types/shared": { + "version": "3.23.1", + "resolved": "https://registry.npmjs.org/@react-types/shared/-/shared-3.23.1.tgz", + "integrity": "sha512-5d+3HbFDxGZjhbMBeFHRQhexMFt4pUce3okyRtUVKbbedQFUrtXSBg9VszgF2RTeQDKDkMCIQDtz5ccP/Lk1gw==", + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0-rc.1 || ^18.0.0" + } + }, "node_modules/@reactflow/background": { "version": "11.3.6", "resolved": "https://registry.npmjs.org/@reactflow/background/-/background-11.3.6.tgz", @@ -3861,6 +3989,14 @@ "url": "https://github.com/sponsors/gregberge" } }, + "node_modules/@swc/helpers": { + "version": "0.5.12", + "resolved": 
"https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.12.tgz", + "integrity": "sha512-KMZNXiGibsW9kvZAO1Pam2JPTDBm+KSHMMHWdsyI/1DbIZjT2A6Gy3hblVXUMEDvUAKq+e0vL0X0o54owWji7g==", + "dependencies": { + "tslib": "^2.4.0" + } + }, "node_modules/@tailwindcss/forms": { "version": "0.5.7", "resolved": "https://registry.npmjs.org/@tailwindcss/forms/-/forms-0.5.7.tgz", @@ -3886,6 +4022,31 @@ "tailwindcss": ">=3.0.0 || insiders" } }, + "node_modules/@tanstack/react-virtual": { + "version": "3.8.3", + "resolved": "https://registry.npmjs.org/@tanstack/react-virtual/-/react-virtual-3.8.3.tgz", + "integrity": "sha512-9ICwbDUUzN99CJIGc373i8NLoj6zFTKI2Hlcmo0+lCSAhPQ5mxq4dGOMKmLYoEFyHcGQ64Bd6ZVbnPpM6lNK5w==", + "dependencies": { + "@tanstack/virtual-core": "3.8.3" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/tannerlinsley" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0", + "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0" + } + }, + "node_modules/@tanstack/virtual-core": { + "version": "3.8.3", + "resolved": "https://registry.npmjs.org/@tanstack/virtual-core/-/virtual-core-3.8.3.tgz", + "integrity": "sha512-vd2A2TnM5lbnWZnHi9B+L2gPtkSeOtJOAw358JqokIH1+v2J7vUAzFVPwB/wrye12RFOurffXu33plm4uQ+JBQ==", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/tannerlinsley" + } + }, "node_modules/@testing-library/jest-dom": { "version": "5.17.0", "resolved": "https://registry.npmjs.org/@testing-library/jest-dom/-/jest-dom-5.17.0.tgz", @@ -6299,11 +6460,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/client-only": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/client-only/-/client-only-0.0.1.tgz", - "integrity": "sha512-IV3Ou0jSMzZrd3pZ48nLkT9DA7Ag1pnPzaiQhpW7c3RbcqqzvzzVu+L8gfqMp/8IM2MQtSiqaCxrrcfu8I8rMA==" - }, "node_modules/cliui": { "version": "7.0.4", "resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz", @@ -9620,6 +9776,11 @@ "he": "bin/he" } }, + "node_modules/headlessui": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/headlessui/-/headlessui-0.0.0.tgz", + "integrity": "sha512-CHvacVPbl8AqIg2sBNKySUmumu7o15jSrCaTrIh9GW2Eq4y/krCN/vZFOsKCwlrhWQbO4267a8xvvP8bs+qREQ==" + }, "node_modules/highlight.js": { "version": "10.7.3", "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.7.3.tgz", @@ -15396,9 +15557,9 @@ } }, "node_modules/react-diff-viewer-continued": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/react-diff-viewer-continued/-/react-diff-viewer-continued-3.3.1.tgz", - "integrity": "sha512-YhjWjCUq6cs8k9iErpWh/xB2jFCndigGAz2TKubdqrSTtDH5Ib+tdQgzBWVXMMqgtEwoPLi+WFmSsdSoYbDVpw==", + "version": "3.4.0", + "resolved": "https://registry.npmjs.org/react-diff-viewer-continued/-/react-diff-viewer-continued-3.4.0.tgz", + "integrity": "sha512-kMZmUyb3Pv5L9vUtCfIGYsdOHs8mUojblGy1U1Sm0D7FhAOEsH9QhnngEIRo5hXWIPNGupNRJls1TJ6Eqx84eg==", "dependencies": { "@emotion/css": "^11.11.2", "classnames": "^2.3.2", @@ -17562,6 +17723,11 @@ "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==" }, + "node_modules/tabbable": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/tabbable/-/tabbable-6.2.0.tgz", + "integrity": "sha512-Cat63mxsVJlzYvN51JmVXIgNoUokrIaT2zLclCXjRd8boZ0004U4KCs/sToJ75C6sdlByWxpYnb5Boif1VSFew==" + }, "node_modules/tailwind-scrollbar-hide": { "version": "1.1.7", "resolved": 
"https://registry.npmjs.org/tailwind-scrollbar-hide/-/tailwind-scrollbar-hide-1.1.7.tgz", diff --git a/ui/frontend/package.json b/ui/frontend/package.json index 660d031bb..7cc72462c 100644 --- a/ui/frontend/package.json +++ b/ui/frontend/package.json @@ -3,7 +3,7 @@ "version": "0.1.0", "private": true, "dependencies": { - "@headlessui/react": "^1.7.7", + "@headlessui/react": "^2.1.2", "@heroicons/react": "^2.0.13", "@propelauth/react": "^2.0.6", "@reduxjs/toolkit": "^1.9.7", @@ -25,13 +25,14 @@ "dayjs": "^1.11.9", "elkjs": "^0.8.2", "fuse.js": "^6.6.2", + "headlessui": "^0.0.0", "http-proxy-middleware": "^2.0.6", "moment-timezone": "^0.5.40", "posthog-js": "^1.82.1", "prism-react-renderer": "^1.3.5", "react": "^18.2.0", "react-chartjs-2": "^5.2.0", - "react-diff-viewer-continued": "^3.2.6", + "react-diff-viewer-continued": "^3.4.0", "react-dom": "^18.2.0", "react-draggable": "^4.4.5", "react-icons": "^4.10.1", diff --git a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx index 5fd29a026..cadcbf0c5 100644 --- a/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx +++ b/ui/frontend/src/components/dashboard/Runs/Task/result-summaries/DataObservability.tsx @@ -15,7 +15,10 @@ import { getNodeRunAttributes, } from "../../../../../state/api/friendlyApi"; import { parsePythonType } from "../../../../../utils"; -import { GenericTable } from "../../../../common/GenericTable"; +import { + GenericGroupedTable, + GenericTable, +} from "../../../../common/GenericTable"; import { RunLink } from "../../../../common/CommonLinks"; import { PandasDescribe1View } from "./PandasDescribe"; import { Dict1View, Dict2View } from "./DictView"; @@ -23,68 +26,156 @@ import { DAGWorksDescribe3View } from "./DAGWorksDescribe"; import { HTML1View } from "./HTMLView"; import { Schema1View } from "./SchemaView"; import { HiChevronDown, HiChevronUp } from "react-icons/hi"; +import ReactDiffViewer from "react-diff-viewer-continued"; +import { Field, Label, Switch } from "@headlessui/react"; +const DiffView = (props: { oldValue: string; newValue: string }) => { + const [splitView, setSplitView] = useState(false); + return ( +
+
+ + { + setSplitView(!splitView); + e.stopPropagation(); + }} + className="group relative inline-flex h-6 w-11 flex-shrink-0 cursor-pointer rounded-full border-2 border-transparent bg-gray-200 transition-colors duration-200 ease-in-out focus:outline-none focus:ring-2 focus:ring-dwdarkblue focus:ring-offset-2 data-[checked]:bg-dwdarkblue" + > + + + +
+ +
+ ); +}; const Primitive1View = (props: { taskName: string; runIds: number[]; values: AttributePrimitive1[]; projectId: number; }) => { + const valuesWithRunID = props.values.map((item, i) => { + return { ...item, runId: props.runIds[i] }; + }); return (
- { - return [props.runIds[i].toString() || "", item]; + { + return ["", item]; })} columns={[ { displayName: "type", - Render: (item: AttributePrimitive1) => { - return ( + Render: ( + items: AttributePrimitive1[], + isSummaryRow: boolean, + isExpanded: boolean + ) => { + const uniqueTypes = Array.from( + new Set( + items.map((item) => parsePythonType({ type_name: item.type })) + ) + ); + return isSummaryRow && isExpanded ? ( + <> + ) : (
- - {parsePythonType({ type_name: item.type })} - + {uniqueTypes.map((type, i) => { + return ( +
+ {type} +
+ ); + })}
); }, }, { displayName: "value", - Render: (item: AttributePrimitive1) => { + Render: ( + items: AttributePrimitive1[], + isSummaryRow: boolean, + isExpanded: boolean + ) => { const [expanded, setExpanded] = useState(false); return ( -
-
 {
-                      setExpanded(!expanded);
-                      e.stopPropagation();
-                    }}
-                    className={`${
-                      expanded ? "break-word whitespace-pre-wrap" : "truncate"
-                    }  text-gray-500 cursor-cell`}
-                  >
-                    {item.value.toString()}
-                  
- {/* - {item.value.toString()} - */} +
+ {isSummaryRow && isExpanded ? ( + <> + ) : isSummaryRow && !isExpanded ? ( + + ) : ( +
 {
+                        setExpanded(!expanded);
+                        e.stopPropagation();
+                      }}
+                      className={`w- ${
+                        expanded ? "break-word whitespace-pre-wrap" : "truncate"
+                      }  text-gray-500 cursor-cell`}
+                    >
+                      {items[0].value.toString()}
+                    
+ )} +
+ ); + }, + }, + { + displayName: "runs", + Render: ( + value: AttributePrimitive1[], + isSummaryRow: boolean, + isExpanded: boolean + ) => { + if (isSummaryRow && isExpanded) { + return <>; + } + return ( +
+ {props.runIds.map((taskRun, i) => ( + void 0} + /> + ))}
); }, }, ]} dataTypeName={"Run"} - dataTypeDisplay={(item: string) => { - return ( - void 0} - highlightedRun={null} - > - ); - }} />
  );

From 161b0288d70a28dae3949018540a80f29ec12b7e Mon Sep 17 00:00:00 2001
From: elijahbenizzy
Date: Mon, 15 Jul 2024 16:49:45 -0700
Subject: [PATCH 6/6] Adds back in pyspark individual metrics

We had captured all of them before. Now we capture the individual data as
well, which allows for easy comparison. The introspection is duplicated, so
we use a functools.lru_cache (which should cache based on the PySpark
dataframe object's identity).
---
 .../hamilton_sdk/tracking/pyspark_stats.py    | 32 ++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py b/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py
index 947b42525..32baaa406 100644
--- a/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py
+++ b/ui/sdk/src/hamilton_sdk/tracking/pyspark_stats.py
@@ -1,4 +1,5 @@
-from typing import Any, Dict, Optional
+import functools
+from typing import Any, Dict, List, Optional
 
 import pyspark.sql as ps
 from hamilton_sdk.tracking import data_observation
@@ -43,6 +44,8 @@
 }
 
 
+# quick cache to ensure we don't compute twice
+@functools.lru_cache(maxsize=128)
 def _introspect(df: ps.DataFrame) -> Dict[str, Any]:
     """Introspect a PySpark dataframe and return a dictionary of statistics.
 
@@ -105,6 +108,33 @@ def compute_schema_psdf(
     return None
 
 
+@data_observation.compute_additional_results.register
+def compute_additional_psdf(
+    result: ps.DataFrame, node_name: str, node_tags: dict
+) -> List[ObservationType]:
+    o_value = _introspect(result)
+    return [
+        {
+            "observability_type": "primitive",
+            "observability_value": {
+                "type": str(str),
+                "value": o_value["cost_explain"],
+            },
+            "observability_schema_version": "0.0.1",
+            "name": "Cost Explain",
+        },
+        {
+            "observability_type": "primitive",
+            "observability_value": {
+                "type": str(str),
+                "value": o_value["extended_explain"],
+            },
+            "observability_schema_version": "0.0.1",
+            "name": "Extended Explain",
+        },
+    ]
+
+
 if __name__ == "__main__":
     import numpy as np
     import pandas as pd
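
For context on the caching note in the commit message above, here is a minimal,
self-contained sketch (not part of the patch) of why functools.lru_cache
deduplicates this work: lru_cache keys entries on its arguments' __hash__/__eq__,
and an object that defines neither falls back to identity-based hashing, which is
what the commit message is relying on for the PySpark dataframe. FakeDataFrame
below is a hypothetical stand-in for ps.DataFrame so the snippet runs without Spark.

    import functools

    class FakeDataFrame:
        """Hypothetical stand-in for ps.DataFrame; hashed by object identity."""

    @functools.lru_cache(maxsize=128)
    def introspect(df):
        # Pretend this runs the expensive explain()/describe() calls.
        print("introspecting...")
        return {"cost_explain": "...", "extended_explain": "..."}

    df = FakeDataFrame()
    first = introspect(df)    # prints "introspecting..."
    second = introspect(df)   # cache hit: same object, nothing recomputed
    assert first is second

One trade-off worth noting: lru_cache keeps strong references to the dataframes it
has seen, so they are not garbage-collected until their entries are evicted from
the cache.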