From f5d7fad3eb1b6eea1576ab0eec0f730df0a86a42 Mon Sep 17 00:00:00 2001 From: Azory YData Bot Date: Mon, 18 Nov 2024 17:39:46 +0000 Subject: [PATCH] Deployed 611bd22 to 1.0 with MkDocs 1.6.1 and mike 2.1.3 --- 1.0/404.html | 2 +- .../connectors/create_connector/index.html | 2 +- 1.0/data_catalog/connectors/index.html | 2 +- .../supported_connections/index.html | 2 +- .../connectors/use_in_labs/index.html | 2 +- .../datasources/dataset_overview/index.html | 2 +- .../datasources/explore_in_labs/index.html | 2 +- 1.0/data_catalog/datasources/index.html | 2 +- .../datasources/metadata/index.html | 2 +- 1.0/data_catalog/datasources/pii/index.html | 2 +- .../datasources/profiling/index.html | 2 +- .../datasources/warnings/index.html | 2 +- 1.0/data_catalog/index.html | 2 +- .../deployment/aws/bastion_host/index.html | 2 +- .../deployment/aws/billing/index.html | 2 +- .../deployment/aws/clean/index.html | 2 +- .../deployment/aws/deploy/index.html | 2 +- .../deployment/aws/instance_types/index.html | 2 +- .../aws/pre_deploy_checklist/index.html | 2 +- .../deployment/aws/regions/index.html | 2 +- .../deployment/aws/update/index.html | 2 +- .../deployment/azure/billing/index.html | 2 +- .../deployment/azure/clean/index.html | 2 +- .../deployment/azure/deploy/index.html | 2 +- .../azure/instance_types/index.html | 2 +- .../azure/pre_deploy_checklist/index.html | 2 +- .../deployment/azure/regions/index.html | 2 +- .../deployment/google/deploy/index.html | 2 +- .../google/pre_deploy_checklist/index.html | 2 +- .../login_support/login_providers/index.html | 2 +- .../login_support/support/index.html | 2 +- 1.0/deployment_and_security/index.html | 2 +- .../security/index.html | 2 +- .../security_building_materials/index.html | 2 +- .../create_database_sd_generator/index.html | 2 +- 1.0/get-started/create_lab/index.html | 2 +- .../create_multitable_dataset/index.html | 2 +- 1.0/get-started/create_pipeline/index.html | 2 +- .../create_syntheticdata_generator/index.html | 2 +- 1.0/get-started/fabric_community/index.html | 2 +- 1.0/get-started/index.html | 2 +- 1.0/get-started/upload_csv/index.html | 2 +- 1.0/index.html | 2 +- .../integration_connectors_catalog/index.html | 2 +- .../integration_with_sdk/index.html | 2 +- .../databricks/overview/index.html | 2 +- 1.0/integrations/index.html | 2 +- .../integration_snowflake/index.html | 2 +- 1.0/labs/index.html | 2 +- 1.0/labs/overview/index.html | 2 +- 1.0/pipelines/concepts/index.html | 2 +- 1.0/pipelines/index.html | 2 +- 1.0/pipelines/runs/index.html | 2 +- .../synthesize_tabular_data/index.html | 3 +- .../synthesize_timeseries_data/index.html | 68 +++++++++++++----- .../synthesize_with_anonymization/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../synthesizer_multitable/index.html | 2 +- 1.0/sdk/index.html | 2 +- 1.0/sdk/installation/index.html | 2 +- 1.0/sdk/modules/connectors/index.html | 2 +- 1.0/sdk/modules/synthetic_data/index.html | 2 +- 1.0/sdk/quickstart/index.html | 2 +- .../reference/api/common/client/index.html | 2 +- 1.0/sdk/reference/api/common/types/index.html | 2 +- .../api/connectors/connector/index.html | 2 +- .../api/datasources/datasource/index.html | 2 +- .../api/datasources/metadata/index.html | 2 +- 1.0/sdk/reference/api/index.html | 2 +- .../api/synthesizers/base/index.html | 2 +- .../api/synthesizers/multitable/index.html | 2 +- .../api/synthesizers/regular/index.html | 2 +- .../api/synthesizers/timeseries/index.html | 2 +- 1.0/search/search_index.json | 2 +- 1.0/sitemap.xml.gz | Bin 127 -> 127 bytes 
1.0/support/help-troubleshooting/index.html | 2 +- 1.0/synthetic_data/best_practices/index.html | 2 +- 1.0/synthetic_data/index.html | 2 +- .../relational_database/index.html | 2 +- .../use_in_labs/index.html | 2 +- 1.0/synthetic_data/single_table/index.html | 2 +- .../compare_profiling/index.html | 2 +- .../report_pdf/index.html | 2 +- 1.0/synthetic_data/text/index.html | 2 +- 1.0/synthetic_data/timeseries/index.html | 2 +- 86 files changed, 135 insertions(+), 102 deletions(-) diff --git a/1.0/404.html b/1.0/404.html index 9fa1b77a..15fa35ed 100644 --- a/1.0/404.html +++ b/1.0/404.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/data_catalog/connectors/create_connector/index.html b/1.0/data_catalog/connectors/create_connector/index.html index f8995330..3b0f6a07 100644 --- a/1.0/data_catalog/connectors/create_connector/index.html +++ b/1.0/data_catalog/connectors/create_connector/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/data_catalog/connectors/index.html b/1.0/data_catalog/connectors/index.html index 5a282cd8..697ddce4 100644 --- a/1.0/data_catalog/connectors/index.html +++ b/1.0/data_catalog/connectors/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/data_catalog/connectors/supported_connections/index.html b/1.0/data_catalog/connectors/supported_connections/index.html index 24c8b4af..7aed22c4 100644 --- a/1.0/data_catalog/connectors/supported_connections/index.html +++ b/1.0/data_catalog/connectors/supported_connections/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/data_catalog/connectors/use_in_labs/index.html b/1.0/data_catalog/connectors/use_in_labs/index.html index 026ff8c4..7fba8930 100644 --- a/1.0/data_catalog/connectors/use_in_labs/index.html +++ b/1.0/data_catalog/connectors/use_in_labs/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/data_catalog/datasources/dataset_overview/index.html b/1.0/data_catalog/datasources/dataset_overview/index.html index 7f08b5b1..4946cdab 100644 --- a/1.0/data_catalog/datasources/dataset_overview/index.html +++ b/1.0/data_catalog/datasources/dataset_overview/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/data_catalog/datasources/explore_in_labs/index.html b/1.0/data_catalog/datasources/explore_in_labs/index.html index 1f43386b..8ea958f3 100644 --- a/1.0/data_catalog/datasources/explore_in_labs/index.html +++ b/1.0/data_catalog/datasources/explore_in_labs/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/data_catalog/datasources/index.html b/1.0/data_catalog/datasources/index.html index 40487414..e7acad97 100644 --- a/1.0/data_catalog/datasources/index.html +++ b/1.0/data_catalog/datasources/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/data_catalog/datasources/metadata/index.html b/1.0/data_catalog/datasources/metadata/index.html index 654c05da..b58e2b0a 100644 --- a/1.0/data_catalog/datasources/metadata/index.html +++ b/1.0/data_catalog/datasources/metadata/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/data_catalog/datasources/pii/index.html b/1.0/data_catalog/datasources/pii/index.html index 6500e2b0..cf2a9a4c 100644 --- a/1.0/data_catalog/datasources/pii/index.html +++ b/1.0/data_catalog/datasources/pii/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/data_catalog/datasources/profiling/index.html b/1.0/data_catalog/datasources/profiling/index.html index 6abbe552..482c7378 100644 --- a/1.0/data_catalog/datasources/profiling/index.html +++ b/1.0/data_catalog/datasources/profiling/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/data_catalog/datasources/warnings/index.html b/1.0/data_catalog/datasources/warnings/index.html 
index 644f22e0..2eac86c2 100644 --- a/1.0/data_catalog/datasources/warnings/index.html +++ b/1.0/data_catalog/datasources/warnings/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/data_catalog/index.html b/1.0/data_catalog/index.html index cd5de896..5b166c8c 100644 --- a/1.0/data_catalog/index.html +++ b/1.0/data_catalog/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/aws/bastion_host/index.html b/1.0/deployment_and_security/deployment/aws/bastion_host/index.html index 0c805368..30d32213 100644 --- a/1.0/deployment_and_security/deployment/aws/bastion_host/index.html +++ b/1.0/deployment_and_security/deployment/aws/bastion_host/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/aws/billing/index.html b/1.0/deployment_and_security/deployment/aws/billing/index.html index 927a8cfd..a2a85300 100644 --- a/1.0/deployment_and_security/deployment/aws/billing/index.html +++ b/1.0/deployment_and_security/deployment/aws/billing/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/aws/clean/index.html b/1.0/deployment_and_security/deployment/aws/clean/index.html index a15d4202..51c5e155 100644 --- a/1.0/deployment_and_security/deployment/aws/clean/index.html +++ b/1.0/deployment_and_security/deployment/aws/clean/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/aws/deploy/index.html b/1.0/deployment_and_security/deployment/aws/deploy/index.html index 511c0f31..1526273e 100644 --- a/1.0/deployment_and_security/deployment/aws/deploy/index.html +++ b/1.0/deployment_and_security/deployment/aws/deploy/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/aws/instance_types/index.html b/1.0/deployment_and_security/deployment/aws/instance_types/index.html index 8b668c27..fb8439b5 100644 --- a/1.0/deployment_and_security/deployment/aws/instance_types/index.html +++ b/1.0/deployment_and_security/deployment/aws/instance_types/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/aws/pre_deploy_checklist/index.html b/1.0/deployment_and_security/deployment/aws/pre_deploy_checklist/index.html index 3ac8ed25..7f06e48c 100644 --- a/1.0/deployment_and_security/deployment/aws/pre_deploy_checklist/index.html +++ b/1.0/deployment_and_security/deployment/aws/pre_deploy_checklist/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/aws/regions/index.html b/1.0/deployment_and_security/deployment/aws/regions/index.html index e0d8ff89..cc7df898 100644 --- a/1.0/deployment_and_security/deployment/aws/regions/index.html +++ b/1.0/deployment_and_security/deployment/aws/regions/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/aws/update/index.html b/1.0/deployment_and_security/deployment/aws/update/index.html index 02ed077c..c37e1590 100644 --- a/1.0/deployment_and_security/deployment/aws/update/index.html +++ b/1.0/deployment_and_security/deployment/aws/update/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/azure/billing/index.html b/1.0/deployment_and_security/deployment/azure/billing/index.html index c8327923..3816e124 100644 --- a/1.0/deployment_and_security/deployment/azure/billing/index.html +++ b/1.0/deployment_and_security/deployment/azure/billing/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/azure/clean/index.html b/1.0/deployment_and_security/deployment/azure/clean/index.html index 
58c0f180..d611ea18 100644 --- a/1.0/deployment_and_security/deployment/azure/clean/index.html +++ b/1.0/deployment_and_security/deployment/azure/clean/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/azure/deploy/index.html b/1.0/deployment_and_security/deployment/azure/deploy/index.html index 7a056309..82350846 100644 --- a/1.0/deployment_and_security/deployment/azure/deploy/index.html +++ b/1.0/deployment_and_security/deployment/azure/deploy/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/azure/instance_types/index.html b/1.0/deployment_and_security/deployment/azure/instance_types/index.html index 9b7602c9..0031c33d 100644 --- a/1.0/deployment_and_security/deployment/azure/instance_types/index.html +++ b/1.0/deployment_and_security/deployment/azure/instance_types/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/azure/pre_deploy_checklist/index.html b/1.0/deployment_and_security/deployment/azure/pre_deploy_checklist/index.html index dc2180dc..f25ea2af 100644 --- a/1.0/deployment_and_security/deployment/azure/pre_deploy_checklist/index.html +++ b/1.0/deployment_and_security/deployment/azure/pre_deploy_checklist/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/azure/regions/index.html b/1.0/deployment_and_security/deployment/azure/regions/index.html index dff48b42..f3d8ea12 100644 --- a/1.0/deployment_and_security/deployment/azure/regions/index.html +++ b/1.0/deployment_and_security/deployment/azure/regions/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/google/deploy/index.html b/1.0/deployment_and_security/deployment/google/deploy/index.html index 5fd2e92a..63c8a57b 100644 --- a/1.0/deployment_and_security/deployment/google/deploy/index.html +++ b/1.0/deployment_and_security/deployment/google/deploy/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/google/pre_deploy_checklist/index.html b/1.0/deployment_and_security/deployment/google/pre_deploy_checklist/index.html index d4179aad..0113ea6c 100644 --- a/1.0/deployment_and_security/deployment/google/pre_deploy_checklist/index.html +++ b/1.0/deployment_and_security/deployment/google/pre_deploy_checklist/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/login_support/login_providers/index.html b/1.0/deployment_and_security/deployment/login_support/login_providers/index.html index 86673de8..aab7fa15 100644 --- a/1.0/deployment_and_security/deployment/login_support/login_providers/index.html +++ b/1.0/deployment_and_security/deployment/login_support/login_providers/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/deployment/login_support/support/index.html b/1.0/deployment_and_security/deployment/login_support/support/index.html index 22134d36..d6a55368 100644 --- a/1.0/deployment_and_security/deployment/login_support/support/index.html +++ b/1.0/deployment_and_security/deployment/login_support/support/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/index.html b/1.0/deployment_and_security/index.html index 098f23f8..cac4498f 100644 --- a/1.0/deployment_and_security/index.html +++ b/1.0/deployment_and_security/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/deployment_and_security/security/index.html b/1.0/deployment_and_security/security/index.html index cd7c93c8..4316abfb 100644 --- a/1.0/deployment_and_security/security/index.html +++ 
b/1.0/deployment_and_security/security/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/deployment_and_security/security/security_building_materials/index.html b/1.0/deployment_and_security/security/security_building_materials/index.html index bde1b4e8..ee4f2eff 100644 --- a/1.0/deployment_and_security/security/security_building_materials/index.html +++ b/1.0/deployment_and_security/security/security_building_materials/index.html @@ -14,7 +14,7 @@ - + diff --git a/1.0/get-started/create_database_sd_generator/index.html b/1.0/get-started/create_database_sd_generator/index.html index f1cb9ab8..a3a7f526 100644 --- a/1.0/get-started/create_database_sd_generator/index.html +++ b/1.0/get-started/create_database_sd_generator/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/get-started/create_lab/index.html b/1.0/get-started/create_lab/index.html index 970f9e26..295229b6 100644 --- a/1.0/get-started/create_lab/index.html +++ b/1.0/get-started/create_lab/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/get-started/create_multitable_dataset/index.html b/1.0/get-started/create_multitable_dataset/index.html index ddb695ec..bf0d051c 100644 --- a/1.0/get-started/create_multitable_dataset/index.html +++ b/1.0/get-started/create_multitable_dataset/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/get-started/create_pipeline/index.html b/1.0/get-started/create_pipeline/index.html index e166a4a1..da9fd40d 100644 --- a/1.0/get-started/create_pipeline/index.html +++ b/1.0/get-started/create_pipeline/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/get-started/create_syntheticdata_generator/index.html b/1.0/get-started/create_syntheticdata_generator/index.html index 7325abdc..a5a5187f 100644 --- a/1.0/get-started/create_syntheticdata_generator/index.html +++ b/1.0/get-started/create_syntheticdata_generator/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/get-started/fabric_community/index.html b/1.0/get-started/fabric_community/index.html index a2db71c6..73141bca 100644 --- a/1.0/get-started/fabric_community/index.html +++ b/1.0/get-started/fabric_community/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/get-started/index.html b/1.0/get-started/index.html index 7edd4034..b5f84467 100644 --- a/1.0/get-started/index.html +++ b/1.0/get-started/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/get-started/upload_csv/index.html b/1.0/get-started/upload_csv/index.html index 59f35e0a..d0157f6e 100644 --- a/1.0/get-started/upload_csv/index.html +++ b/1.0/get-started/upload_csv/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/index.html b/1.0/index.html index 1debbf68..44e1495f 100644 --- a/1.0/index.html +++ b/1.0/index.html @@ -14,7 +14,7 @@ - + diff --git a/1.0/integrations/databricks/integration_connectors_catalog/index.html b/1.0/integrations/databricks/integration_connectors_catalog/index.html index 2f6a4c30..fd14efd3 100644 --- a/1.0/integrations/databricks/integration_connectors_catalog/index.html +++ b/1.0/integrations/databricks/integration_connectors_catalog/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/integrations/databricks/integration_with_sdk/index.html b/1.0/integrations/databricks/integration_with_sdk/index.html index f493a068..5bde8a06 100644 --- a/1.0/integrations/databricks/integration_with_sdk/index.html +++ b/1.0/integrations/databricks/integration_with_sdk/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/integrations/databricks/overview/index.html b/1.0/integrations/databricks/overview/index.html index 322258b0..deee7774 100644 --- a/1.0/integrations/databricks/overview/index.html 
+++ b/1.0/integrations/databricks/overview/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/integrations/index.html b/1.0/integrations/index.html index ced966e9..3701c5ec 100644 --- a/1.0/integrations/index.html +++ b/1.0/integrations/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/integrations/snowflake/integration_snowflake/index.html b/1.0/integrations/snowflake/integration_snowflake/index.html index 48806813..f00fd090 100644 --- a/1.0/integrations/snowflake/integration_snowflake/index.html +++ b/1.0/integrations/snowflake/integration_snowflake/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/labs/index.html b/1.0/labs/index.html index 98d5b847..dab48dea 100644 --- a/1.0/labs/index.html +++ b/1.0/labs/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/labs/overview/index.html b/1.0/labs/overview/index.html index ee658429..7ddf72e9 100644 --- a/1.0/labs/overview/index.html +++ b/1.0/labs/overview/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/pipelines/concepts/index.html b/1.0/pipelines/concepts/index.html index 4a7eb466..9da8d3ea 100644 --- a/1.0/pipelines/concepts/index.html +++ b/1.0/pipelines/concepts/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/pipelines/index.html b/1.0/pipelines/index.html index 3a485837..442c4e0e 100644 --- a/1.0/pipelines/index.html +++ b/1.0/pipelines/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/pipelines/runs/index.html b/1.0/pipelines/runs/index.html index 050491ed..32359d11 100644 --- a/1.0/pipelines/runs/index.html +++ b/1.0/pipelines/runs/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/sdk/examples/synthesize_tabular_data/index.html b/1.0/sdk/examples/synthesize_tabular_data/index.html index 4934221d..ed1203e8 100644 --- a/1.0/sdk/examples/synthesize_tabular_data/index.html +++ b/1.0/sdk/examples/synthesize_tabular_data/index.html @@ -16,7 +16,7 @@ - + @@ -3308,6 +3308,7 @@

Synthesize tabular data

Use YData's RegularSynthesizer to generate tabular synthetic data

+

For a more detailed tutorial, please check the YData Fabric Academy ydata-sdk notebooks.

import os
 
 from ydata.sdk.dataset import get_dataset
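A minimal end-to-end sketch of the RegularSynthesizer flow is shown below, mirroring the pattern of the time-series example further on; the 'census' sample dataset name and the token placeholder are illustrative assumptions rather than fixed values.

import os

from ydata.sdk.dataset import get_dataset
from ydata.sdk.synthesizers import RegularSynthesizer

# Do not forget to add your token as env variable
os.environ["YDATA_TOKEN"] = '{insert-token}'

# Sample an example tabular dataset ('census' is assumed here for illustration)
X = get_dataset('census')

# Initialize and train the synthesizer; until `fit` is called it exists only locally
synth = RegularSynthesizer(name='Tabular synth')
synth.fit(X)

# Request a synthetic sample with 1000 records
sample = synth.sample(n_samples=1000)
sample.head()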
diff --git a/1.0/sdk/examples/synthesize_timeseries_data/index.html b/1.0/sdk/examples/synthesize_timeseries_data/index.html
index fcd83c6a..188cb8de 100644
--- a/1.0/sdk/examples/synthesize_timeseries_data/index.html
+++ b/1.0/sdk/examples/synthesize_timeseries_data/index.html
@@ -16,7 +16,7 @@
       
       
       
-      
+      
     
     
       
@@ -3318,28 +3318,60 @@ 

Synthesize time-series data

  • Variables that refer to entities (single or multiple entities)
  • Variables that are attributes (those that don't depend on time but rather on the entity)
  • +

    For a more detailed tutorial, please check the YData Fabric Academy ydata-sdk notebooks.

    Below you can find an example:

    -
    import os
    +
    # -*- coding: utf-8 -*-
     
    -from ydata.sdk.dataset import get_dataset
    -from ydata.sdk.synthesizers import TimeSeriesSynthesizer
    +# Authentication
    +import os
     
    -# Do not forget to add your token as env variable
    -os.environ["YDATA_TOKEN"] = '<TOKEN>'
    +from ydata.sdk.dataset import get_dataset
    +from ydata.sdk.synthesizers import TimeSeriesSynthesizer
     
    -X = get_dataset('occupancy')
    -
    -# We initialize a time series synthesizer
    -# As long as the synthesizer does not call `fit`, it exists only locally
    -synth = TimeSeriesSynthesizer()
    +# Do not forget to add your token as env variable
    +os.environ["YDATA_TOKEN"] = '{insert-token}'
    +
    +
    +# Sampling an example dataset for a multi-entity & multivariate time-series dataset
     
    -# We train the synthesizer on our dataset
    -# sortbykey -> variable that define the time order for the sequence
    -synth.fit(X, sortbykey='date')
    -
    -# By default it is requested a synthetic sample with the same length as the original data
    -# The TimeSeriesSynthesizer is designed to replicate temporal series and therefore the original time-horizon is respected
    -sample = synth.sample(n_entities=1)
    +# Generate the dataset
    +time_series_data = get_dataset('timeseries')
    +
    +# Print the first few rows of the dataset
    +print(time_series_data.head())
    +
    +# Train a Synthetic data generator
    +
    +# From a pandas dataframe
    +
    +# We initialize a time series synthesizer
    +# As long as `fit` is not called, the synthesizer exists only locally
    +synth = TimeSeriesSynthesizer(name='Time-series synth')
    +
    +# We train the synthesizer on our dataset
    +# sortbykey -> variable that defines the time order for the sequence
    +synth.fit(time_series_data, sortbykey='time', entities='entity_id')
    +
    +# Generate samples from an already trained synthesizer
    +# From the synthesizer in context in the notebook
    +
    +
    +# Generate a sample with x number of entities
    +# In this example the objective is to generate a dataset with the same size as the original. For that reason, 5 entities will be generated.
    +sample = synth.sample(n_entities=5)
    +
    +sample.head()
    +
    +# From a previously trained synthetic data generation model
    +# List the trained synthetic data generators to get the synthesizer uid
    +TimeSeriesSynthesizer.list()
    +
    +synth = TimeSeriesSynthesizer(uid='{insert-synth-id}').get()
    +
    +# Generate a new synthetic dataset with the sample method
    +sample = synth.sample(n_entities=5)
    +
    +sample.head()
     
    diff --git a/1.0/sdk/examples/synthesize_with_anonymization/index.html b/1.0/sdk/examples/synthesize_with_anonymization/index.html index 5ad91fd4..adf7fe81 100644 --- a/1.0/sdk/examples/synthesize_with_anonymization/index.html +++ b/1.0/sdk/examples/synthesize_with_anonymization/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/examples/synthesize_with_conditional_sampling/index.html b/1.0/sdk/examples/synthesize_with_conditional_sampling/index.html index 681a5dfe..4bd8da57 100644 --- a/1.0/sdk/examples/synthesize_with_conditional_sampling/index.html +++ b/1.0/sdk/examples/synthesize_with_conditional_sampling/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/examples/synthesize_with_privacy_control/index.html b/1.0/sdk/examples/synthesize_with_privacy_control/index.html index 211d58db..6b894cb7 100644 --- a/1.0/sdk/examples/synthesize_with_privacy_control/index.html +++ b/1.0/sdk/examples/synthesize_with_privacy_control/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/examples/synthesizer_multitable/index.html b/1.0/sdk/examples/synthesizer_multitable/index.html index cd47841e..7a1919e2 100644 --- a/1.0/sdk/examples/synthesizer_multitable/index.html +++ b/1.0/sdk/examples/synthesizer_multitable/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/index.html b/1.0/sdk/index.html index e5277f49..7f33c941 100644 --- a/1.0/sdk/index.html +++ b/1.0/sdk/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/installation/index.html b/1.0/sdk/installation/index.html index da26de3f..529143a5 100644 --- a/1.0/sdk/installation/index.html +++ b/1.0/sdk/installation/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/modules/connectors/index.html b/1.0/sdk/modules/connectors/index.html index 92c940c8..8f2592ec 100644 --- a/1.0/sdk/modules/connectors/index.html +++ b/1.0/sdk/modules/connectors/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/modules/synthetic_data/index.html b/1.0/sdk/modules/synthetic_data/index.html index f38640fe..be8a35c1 100644 --- a/1.0/sdk/modules/synthetic_data/index.html +++ b/1.0/sdk/modules/synthetic_data/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/sdk/quickstart/index.html b/1.0/sdk/quickstart/index.html index 21fe8cad..06ca525a 100644 --- a/1.0/sdk/quickstart/index.html +++ b/1.0/sdk/quickstart/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/reference/api/common/client/index.html b/1.0/sdk/reference/api/common/client/index.html index 83808600..21d874f1 100644 --- a/1.0/sdk/reference/api/common/client/index.html +++ b/1.0/sdk/reference/api/common/client/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/reference/api/common/types/index.html b/1.0/sdk/reference/api/common/types/index.html index 2746afa8..3cb50cdd 100644 --- a/1.0/sdk/reference/api/common/types/index.html +++ b/1.0/sdk/reference/api/common/types/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/sdk/reference/api/connectors/connector/index.html b/1.0/sdk/reference/api/connectors/connector/index.html index 0c9fea3a..69a9d26a 100644 --- a/1.0/sdk/reference/api/connectors/connector/index.html +++ b/1.0/sdk/reference/api/connectors/connector/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/reference/api/datasources/datasource/index.html b/1.0/sdk/reference/api/datasources/datasource/index.html index be894787..5b0f2543 100644 --- a/1.0/sdk/reference/api/datasources/datasource/index.html +++ b/1.0/sdk/reference/api/datasources/datasource/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/reference/api/datasources/metadata/index.html 
b/1.0/sdk/reference/api/datasources/metadata/index.html index 117d28e7..f807300c 100644 --- a/1.0/sdk/reference/api/datasources/metadata/index.html +++ b/1.0/sdk/reference/api/datasources/metadata/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/reference/api/index.html b/1.0/sdk/reference/api/index.html index bd2d10ca..9db901ee 100644 --- a/1.0/sdk/reference/api/index.html +++ b/1.0/sdk/reference/api/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/sdk/reference/api/synthesizers/base/index.html b/1.0/sdk/reference/api/synthesizers/base/index.html index 4474873d..1bdd33d0 100644 --- a/1.0/sdk/reference/api/synthesizers/base/index.html +++ b/1.0/sdk/reference/api/synthesizers/base/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/reference/api/synthesizers/multitable/index.html b/1.0/sdk/reference/api/synthesizers/multitable/index.html index 041438ff..161f4c8c 100644 --- a/1.0/sdk/reference/api/synthesizers/multitable/index.html +++ b/1.0/sdk/reference/api/synthesizers/multitable/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/reference/api/synthesizers/regular/index.html b/1.0/sdk/reference/api/synthesizers/regular/index.html index 989833bf..ac48304e 100644 --- a/1.0/sdk/reference/api/synthesizers/regular/index.html +++ b/1.0/sdk/reference/api/synthesizers/regular/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/sdk/reference/api/synthesizers/timeseries/index.html b/1.0/sdk/reference/api/synthesizers/timeseries/index.html index a737e3b2..f7da0ec3 100644 --- a/1.0/sdk/reference/api/synthesizers/timeseries/index.html +++ b/1.0/sdk/reference/api/synthesizers/timeseries/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/search/search_index.json b/1.0/search/search_index.json index c8c547f1..655266fa 100644 --- a/1.0/search/search_index.json +++ b/1.0/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Welcome","text":"

    YData Fabric is a Data-Centric AI development platform that accelerates AI development by helping data practitioners achieve production-quality data.

    Much like code quality is a must for successful software engineering, Fabric accounts for the data quality requirements of data-driven applications. It introduces standards, processes, and acceleration to empower data science, analytics, and data engineering teams.

    \ud83d\ude80 YData SDK Version 1.0 Released! \ud83c\udf89

    We are excited to announce the release of YData Fabric SDK v1.0! This major release marks the beginning of long-term support for the package, ensuring stability, continuous improvements, and ongoing support for all users. YData SDK empowers developers with easy access to state-of-the-art data quality tools and generative AI capabilities. Stay tuned for more updates and new features!

    "},{"location":"#try-fabric","title":"Try Fabric","text":"
    • Get started with Fabric Community
    "},{"location":"#why-adopt-ydata-fabric","title":"Why adopt YData Fabric?","text":"

    With Fabric, you can standardize the understanding of your data, quickly identify data quality issues, streamline and version your data preparation workflows and finally leverage synthetic data for privacy-compliance or as a tool to boost ML performance. Fabric is a development environment that supports a faster and easier process of preparing data for AI development. Data practitioners are using Fabric to:

    • Establish a centralized and collaborative repository for data projects.
    • Create and share comprehensive documentation of data, encompassing data schema, structure, and personally identifiable information (PII).
    • Prevent data quality issues with standardized data quality profiling, providing visual understanding and warnings on potential issues.
    • Accelerate data preparation with customizable recipes.
    • Improve machine learning performance with optimal data preparation through solutions such as synthetic data.
    • Shorten access to data with privacy-compliant synthetic data generation.
    • Build and streamline data preparation workflows effortlessly through a user-friendly drag-and-drop interface.
    • Efficiently manage business rules, conduct comparisons, and implement version control for data workflows using pipelines.
    "},{"location":"#key-features","title":"\ud83d\udcdd Key features","text":""},{"location":"#data-catalog","title":"Data Catalog","text":"

    Fabric Data Catalog provides a centralized perspective on datasets within a project-basis, optimizing data management through seamless integration with the organization's existing data architectures via scalable connectors (e.g., MySQL, Google Cloud Storage, AWS S3). It standardizes data quality profiling, streamlining the processes of efficient data cleaning and preparation, while also automating the identification of Personally Identifiable Information (PII) to facilitate compliance with privacy regulations.

    Explore how the Data Catalog provides a centralized repository of your datasets, schema validation, and automated data profiling.

    "},{"location":"#labs","title":"Labs","text":"

    Fabric's Labs environments provide collaborative, scalable, and secure workspaces layered on a flexible infrastructure, enabling users to seamlessly switch between CPUs and GPUs based on their computational needs. Labs are familiar environments that empower data developers with powerful IDEs (Jupyter Notebooks, Visual Studio Code or H2O Flow) and a seamless experience with the tools they already love, combined with YData's cutting-edge SDK for data preparation.

    Learn how to use the Labs to generate synthetic data in a familiar Python interface.

    "},{"location":"#synthetic-data","title":"Synthetic data","text":"

    Synthetic data, enabled by YData Fabric, provides data developers with user-friendly interfaces (UI and code) for generating artificial datasets, offering a versatile solution across formats like tabular, time-series and multi-table datasets. The generated synthetic data holds the same value as the original and aligns intricately with specific business rules, contributing to machine learning model enhancement, mitigation of privacy concerns and more robust data development. Fabric offers synthetic data that is easy to adapt and configure, allowing customization of the privacy-utility trade-off.

    Learn how to create high-quality synthetic data within a user-friendly UI using Fabric\u2019s data synthesis flow.

    "},{"location":"#pipelines","title":"Pipelines","text":"

    Fabric Pipelines streamlines data preparation workflows by automating, orchestrating, and optimizing data pipelines, providing benefits such as flexibility, scalability, monitoring, and reproducibility for efficient and reliable data processing. The intuitive drag-and-drop interface, leveraging Jupyter notebooks or Python scripts, expedites the pipeline setup process, providing data developers with a quick and user-friendly experience.

    Explore how you can leverage Fabric Pipelines to build versionable and reproducible data preparation workflows for ML development.

    "},{"location":"#tutorials","title":"Tutorials","text":"

    To understand how to best apply Fabric to your use cases, start by exploring the following tutorials:

    • Handling Imbalanced Data for Improved Fraud Detection: Learn how to implement high-performing fraud detection models by incorporating synthetic data to balance your datasets.

    • Prediction with Quality Inspection: Learn how to develop data preparation workflows with automated data quality checks and Pipelines.

    • Generating Synthetic Data for Financial Transactions: Learn how to use synthetic data generation to replicate your existing relational databases while ensuring referential integrity.

    You can find additional examples and use cases at YData Academy GitHub Repository.

    "},{"location":"#support","title":"\ud83d\ude4b Support","text":"

    Facing an issue? We\u2019re committed to providing all the support you need to ensure a smooth experience using Fabric:

    • Create a support ticket: our team will help you move forward!
    • Contact a Fabric specialist: for personalized guidance or full access to the platform
    "},{"location":"data_catalog/","title":"Data Catalog","text":"

    In the realm of data management and analysis, the ability to efficiently discover, understand, and access data is crucial. Fabric's Data Catalog emerges as a pivotal solution in this context, designed to facilitate an organized, searchable, and accessible repository of metadata. This chapter introduces the concept, functionality, and advantages of the Data Catalog within Fabric's ecosystem, offering developers a comprehensive overview of its significance and utility.

    To ensure that large volumes of data can be processed through the entire data pipeline, Fabric is equipped with integrated connectors for various types of storage (from RDBMS to cloud object storage), guaranteeing the data never leaves your premises. Furthermore, Fabric's Catalog ensures timely and scalable data analysis, as it runs on top of a distributed architecture powered by Kubernetes and Dask.

    The benefits of Fabric's Data Catalog for data teams are manifold, enhancing not only the efficiency but also the effectiveness of data understanding operations:

    • Improved Data Accessibility: With the Data Catalog, developers can consume the data they need for a certain project through a user-friendly interface, significantly reducing the time spent searching for data across disparate sources. This enhanced discoverability makes it easier to initiate data analysis, machine learning projects, or any other data-driven tasks.

    • Enhanced Data Governance and Quality: Fabric's Data Catalog provides comprehensive governance tools for the data assets of data-driven projects, including data quality profiling and metadata management. These tools help maintain high data quality and compliance with regulatory standards, ensuring that developers work with reliable and standardized information throughout the project.

    • Knowledge and Insight Sharing: Through detailed metadata, data quality warnings and detailed profiling, Fabric's Data Catalog enhances the understanding of data's context and behaviour. This shared knowledge base supports better decision-making and innovation in a data-driven project.

    "},{"location":"data_catalog/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 Data Catalogs in the modern data stack
    • How to create your first Datasource from a CSV file?
    • How to create a Database in the Data Catalog?
    • How to automate data quality profiling?
    "},{"location":"data_catalog/connectors/","title":"Connectors","text":"

    Fabric connectors play an important role in the landscape of data-driven projects, acting as essential components that facilitate the movement and integration of data across different systems, platforms, and applications. Fabric connectors were designed to offer seamless and easy connectivity for data exchange between disparate data sources (such as databases, cloud storage systems, etc.).

    "},{"location":"data_catalog/connectors/#benefits","title":"Benefits","text":"
    • Data Integration: Fabric Connectors are primarily used to consume and integrate data from a variety of different sources in a single project, ensuring that data can be easily combined, transformed, and made ready for analysis or operational use.
    • Automation of data flows: They automate the process of data extraction, transformation and loading (ETL), which is crucial for keeping the data used in a given project up to date and accurate.
    • Simplification of data access: Fabric connectors simplify the process of accessing and using data from specialized or complex systems, making it easier for users without deep technical expertise to leverage data for insights.
    • Enhancement of Data Security: Designed to securely manage the credentials and access to your different storages.
    "},{"location":"data_catalog/connectors/#get-started-with-fabric-connectors","title":"Get started with Fabric Connectors","text":"
    • How to create a connector in Fabric?
    • How to use Object Storage Connectors through Labs?
    • How to use RDBMS connectors through Labs?
    "},{"location":"data_catalog/connectors/create_connector/","title":"How to create a connector in Fabric's Data Catalog?","text":"

    How to create a connector to an RDBMS in Fabric?

    To create a connector in YData Fabric, select the \"Connectors\" page from the left side menu, as illustrated in the image below.

    Click in \"Add Connector\" and a list of connector types to choose from will be shown to you.

    For the purpose of this example, we will be creating a connector to our AWS S3 storage. The credentials/secrets for your storage will be requested. After adding them, you can \"Test connection\" to ensure that all the details are correct. A confirmation message, similar to the one shown in the image below, should appear on your screen, letting you know that you can now save your connector successfully!

    Congrats! \ud83d\ude80 You have now created your first Connector! You can now create different Datasources in your project's Data Catalog. Get ready for your journey of improved quality data for AI.

    "},{"location":"data_catalog/connectors/supported_connections/","title":"Supported connections","text":"

    Fabric can read and write data from a variety of data sources.

    "},{"location":"data_catalog/connectors/supported_connections/#connectors","title":"Connectors","text":"

    Here is the list of the available connectors in Fabric.

    Connector Name (Type): Supported file types / Notes
    • AWS S3 (Object Storage): Parquet, CSV
    • Azure Blob Storage (Object Storage): Parquet, CSV
    • Azure Data Lake (Object Storage): Parquet, CSV
    • Google Cloud Storage (Object Storage): Parquet, CSV
    • Upload file (File): Parquet, CSV. Maximum file size is 700MB; bigger files should be uploaded and read from remote object storages
    • Google BigQuery (Big Table): file types not applicable
    • MySQL (RDBMS): file types not applicable. Supports reading whole schemas or specifying a query
    • Azure SQL Server (RDBMS): file types not applicable. Supports reading whole schemas or specifying a query
    • PostgreSQL (RDBMS): file types not applicable. Supports reading whole schemas or specifying a query
    • Snowflake (RDBMS): file types not applicable. Supports reading whole schemas or specifying a query
    • Oracle DB (RDBMS): file types not applicable. Supports reading whole schemas or specifying a query
    • Databricks Unity Catalog (Catalog): file types not applicable. Supports reading a table
    • Databricks Delta Lake (Lakehouse): file types not applicable. Supports reading a table
    "},{"location":"data_catalog/connectors/supported_connections/#havent-found-your-storage","title":"Haven't found your storage?","text":"

    To understand our development roadmap or to request prioritization of new data connector, reach out to us at ydata.ai/contact-us.

    "},{"location":"data_catalog/connectors/use_in_labs/","title":"Use connectors in Lab","text":""},{"location":"data_catalog/connectors/use_in_labs/#create-a-lab-environment","title":"Create a lab environment","text":""},{"location":"data_catalog/datasources/","title":"Overview","text":"

    YData Fabric Datasources are entities that represent specific data sets such as tables, file sets, or other structured formats within the YData Fabric platform. They offer a centralized framework for managing, cataloging, and profiling data, enhancing data management and quality.

    "},{"location":"data_catalog/datasources/#benefits","title":"Benefits","text":"
    • Summarized metadata information: Fabric Datasources provide comprehensive metadata management, offering detailed information about each datasource, including schema details, descriptions, tags, and data lineage. This metadata helps users understand the structure and context of their data.

    • Data Quality Management: Users can find data quality warnings, validation results, cleansing suggestions, and quality scores. These features help in identifying and addressing data quality issues automatically, ensuring reliable data for analysis and decision-making.

    • Data Profiling: Data profiling tools analyze the content and structure of datasources, providing statistical summaries, detecting patterns, assessing completeness, and evaluating data uniqueness. These insights help in understanding and improving data quality.

    • PII Identification and Management: Fabric detects and manages Personally Identifiable Information (PII) within datasources. It includes automatic PII detection, masking tools, and compliance reporting to protect sensitive data and ensure regulatory compliance.

    • Centralized Repository: Fabric Datasources serve as a centralized repository for data quality discovery and management. They provide a single point of access for all data assets, simplifying discovery, monitoring, and governance, and improving overall data management efficiency.

    "},{"location":"data_catalog/datasources/pii/","title":"PII identification","text":"

    To overcome the concerns around data privacy and enable secure data sharing, Fabric incorporates an automated Personal Identifiable Information (PII) identification engine to help detect and handle potential PII.

    What can be considered Personal Identifiable Information (PII)?

    PII is information that, when used alone or with other relevant data, can uniquely identify an individual. PII may contain direct identifiers (e.g., ID, VAT, Credit Card Number) and/or quasi-identifiers (e.g., age, gender, race, occupation). Correctly classifying these is crucial to reduce the risk of re-identification. Learn more about how Fabric mitigates the risk of re-identification using synthetic data.

    Fabric offers a standardized classification of PII that automatically highlights and tags potential PII. The automatic detection of PII can be enabled during the loading process of your datasets and can be leveraged to generate privacy-preserving synthetic data.

    After the detection, the PII information will be available through the Metadata > PII Types, where each column that may represent potential PII is associated to one or several tags that identify the type of information it might be leaking.

    You can review the automatic PII classification and add additional PII tags of your own by editing the metadata and selecting additional tags from a pre-defined list of values containing the most common types of potential PII information: email, phone, VAT, zip code, among others.
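    As a loose illustration of the tagging idea only (this is not Fabric's detection engine), a minimal sketch could flag columns whose sampled values match simple patterns for a couple of common PII types:

    import re
    import pandas as pd

    # Toy patterns for two common PII types; Fabric's engine covers many more and is far more robust
    PII_PATTERNS = {
        "EMAIL": re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$"),
        "PHONE": re.compile(r"^\+?[\d\s\-()]{7,}$"),
    }

    def tag_pii_columns(df: pd.DataFrame, sample_size: int = 100) -> dict:
        # Return a mapping column -> list of PII tags whose pattern matches most sampled values
        tags = {}
        for col in df.select_dtypes(include="object").columns:
            sample = df[col].dropna().astype(str).head(sample_size)
            if sample.empty:
                continue
            matched = [
                name for name, pattern in PII_PATTERNS.items()
                if sample.apply(lambda v: bool(pattern.match(v))).mean() > 0.8
            ]
            if matched:
                tags[col] = matched
        return tags

    df = pd.DataFrame({"email": ["ana@ydata.ai", "bob@example.com"], "age": [34, 28]})
    print(tag_pii_columns(df))  # {'email': ['EMAIL']}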

    Need a solution to enable data sharing and comply with GDPR and CCPA regulations?

    Using synthetic data has proven to foster a culture of data-sharing within organizations, overcoming the limitations of traditional privacy methods and maximizing data value. Try Fabric Community Version to enable secure data sharing.

    "},{"location":"data_catalog/datasources/profiling/","title":"Profiling","text":"Profiling sections"},{"location":"data_catalog/datasources/profiling/#data-structures-supported","title":"Data Structures supported","text":"

    The profiling offers comprehensive insights into various types of data, including tabular, time-series, text and image data.

    • Tabular data: when dealing with tabular data, such as spreadsheets or databases, the profiling provides valuable statistics on data distribution, central tendencies, and categorical variable frequencies. It identifies multivariate relations such as correlations and interactions in a visual manner. It also identifies missing data.
    • Time-series data: when dealing with data with temporal dimensions, the profiling extends its capabilities to capture trends, seasonality, cyclic patterns and missing data gaps. It can reveal information about data volatility, periodicity, and anomalies, facilitating a deeper understanding of time-dependent trends.
    • Text: when it comes to text data, such as strings or documents, the profiling offers insightful statistics on the distribution of word frequencies, common phrases, and unique words.
    "},{"location":"data_catalog/datasources/profiling/#data-types","title":"Data types","text":"

    Types are a powerful abstraction for effective data analysis, allowing analysis under higher-level lenses. Fabric Profiling is backed by a powerful type system developed specifically for data analysis that allows automated detection of different data types. Currently, the following types are recognized:

    • Numerical
    • Categorical
    • Date (and Datetime)
    • String
    • Time-series
    • LongText
    "},{"location":"data_catalog/datasources/profiling/#univariate-profiling","title":"Univariate profiling","text":"

    This section provides a comprehensive overview of individual variables within a given dataset. It is particularly useful for exploratory data analysis (EDA), as it automatically calculates detailed statistics, visualizations, and insights for each variable in the dataset. It offers information such as data type, missing values, unique values, basic descriptive statistics, histogram plots, and distribution plots. This allows data analysts and scientists to quickly understand the characteristics of each variable, identify potential data quality issues, and gain initial insights into the data's distribution and variability.
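    As an illustration only (not Fabric's profiling engine), the kind of per-variable summary described above can be sketched in a few lines of pandas:

    import pandas as pd

    def univariate_profile(df: pd.DataFrame) -> pd.DataFrame:
        # Per-column summary: inferred type, missing ratio, distinct count and basic statistics
        rows = []
        for col in df.columns:
            s = df[col]
            numeric = pd.api.types.is_numeric_dtype(s)
            rows.append({
                "variable": col,
                "dtype": str(s.dtype),
                "missing_%": round(100 * s.isna().mean(), 2),
                "distinct": s.nunique(),
                "mean": s.mean() if numeric else None,
                "std": s.std() if numeric else None,
            })
        return pd.DataFrame(rows)

    df = pd.DataFrame({"age": [25, 32, None, 45], "city": ["Porto", "Lisbon", "Porto", None]})
    print(univariate_profile(df))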

    "},{"location":"data_catalog/datasources/profiling/#multivariate-profiling","title":"Multivariate profiling","text":"Multivariate profiling metrics and visualization

    This section provides essential insights into the relationships between variables through correlation matrices and interactions. The correlation view computes and presents correlation coefficients between pairs of numerical variables, helping to identify potential linear relationships. This assists data analysts and scientists in understanding how variables change together and highlights possible multicollinearity issues.

    On the other hand, the interactions section goes beyond correlation by exploring potential nonlinear relationships and interactions between variables, providing a more comprehensive understanding of how variables interact with one another. This can be crucial in identifying hidden patterns that might not be captured through traditional correlation analysis.

    "},{"location":"data_catalog/datasources/profiling/#correlations","title":"Correlations","text":"Correlations section

    Fabric's intuitive correlation matrix and heatmap visualizations empower users to drill down into specific variable interactions and understand their dependencies. Additionally, Fabric\u2019s real-time interactivity allows users to adjust filters and refine their analysis dynamically, supporting deeper insights into correlations across complex datasets.

    Fabric Correlations are calculated pairwise, depending on the type schema:
    • numerical to numerical variable: Spearman correlation coefficient
    • categorical to categorical variable: Cramer's V association coefficient
    • numerical to categorical: Cramer's V association coefficient, with the numerical variable discretized automatically
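    A minimal sketch of this pairwise scheme (illustrative only, not Fabric's implementation; the quantile binning used for discretization is an assumption):

    import numpy as np
    import pandas as pd
    from scipy.stats import chi2_contingency, spearmanr

    def cramers_v(x: pd.Series, y: pd.Series) -> float:
        # Cramer's V association between two categorical series
        confusion = pd.crosstab(x, y)
        chi2 = chi2_contingency(confusion)[0]
        n = confusion.to_numpy().sum()
        r, k = confusion.shape
        return float(np.sqrt((chi2 / n) / min(r - 1, k - 1)))

    def pairwise_correlation(a: pd.Series, b: pd.Series) -> float:
        a_num = pd.api.types.is_numeric_dtype(a)
        b_num = pd.api.types.is_numeric_dtype(b)
        if a_num and b_num:
            # numerical vs numerical: Spearman correlation coefficient
            return float(spearmanr(a, b, nan_policy="omit")[0])
        if not a_num and not b_num:
            # categorical vs categorical: Cramer's V
            return cramers_v(a, b)
        # numerical vs categorical: discretize the numerical side first, then Cramer's V
        num, cat = (a, b) if a_num else (b, a)
        return cramers_v(pd.qcut(num, q=4, duplicates="drop"), cat)

    df = pd.DataFrame({
        "income": [30, 45, 52, 61, 75, 80],
        "age": [22, 30, 35, 41, 50, 58],
        "segment": ["A", "A", "B", "B", "C", "C"],
    })
    print(pairwise_correlation(df["income"], df["age"]))      # Spearman
    print(pairwise_correlation(df["segment"], df["income"]))  # Cramer's V after binning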

    "},{"location":"data_catalog/datasources/profiling/#interactions","title":"Interactions","text":"

    The interactions plot visually represents how two variables influence each other across different sections of the data. It goes beyond simple correlations by providing an interactive view of how one variable changes in relation to another. This plot helps users detect non-linear relationships and complex patterns, allowing for deeper insights during Exploratory Data Analysis (EDA). By dynamically highlighting these variable pair interactions, the interactions profile enables users to refine their understanding of data relationships, guiding more informed decisions in the data preparation process.

    "},{"location":"data_catalog/datasources/profiling/#missing-data","title":"Missing data","text":"

    This section offers valuable insights into the presence and distribution of missing data within a dataset. It can be particularly helpful for data preprocessing and quality assessment, as it provides a comprehensive summary of missing values across variables, indicating the percentage of missing data for each variable. Additionally, it displays a visual representation of missing data patterns through bar plots and heatmaps, allowing users to quickly identify which variables have the most significant amount of missing information.

    "},{"location":"data_catalog/datasources/profiling/#outliers","title":"Outliers **","text":"Outliers identification

    This section provides a comprehensive profile of potential dataset outliers. You can validate and observe the presence of outliers and their deviation from the general distribution of numerical variables based on observed variance. The identification of outliers allows the data analyst or scientist to assess whether they are genuine data anomalies or erroneous entries, enabling informed decisions on whether to retain, transform, or exclude these points in further analyses.

    "},{"location":"data_catalog/datasources/warnings/","title":"Warnings","text":"

    The first technical step in any data science project is to examine the data and understand its quality, value and fitness for purpose. For this reason, Fabric\u2019s Data Catalog includes an Overview and Warnings module for a better understanding of the available datasets.

    "},{"location":"data_catalog/datasources/warnings/#datasets","title":"Datasets","text":""},{"location":"data_catalog/datasources/warnings/#overview","title":"Overview","text":"

    When clicking on a Dataset available from the Data Catalog, it will show its details page, revealing an Overview and Warnings section.

    In the Overview, you\u2019ll get an overall perspective of your dataset\u2019s characteristics, where descriptive statistics will be presented, including:

    • Basic description and tags/concepts associated with the dataset
    • Memory consumption
    • Number of rows
    • Duplicate rows (percentage / number of records)
    • Number of columns
    • Total data types (numeric, categorical, string, long text, ID, date)
    • Missing data (percentage / number of cells)
    • Main data quality warnings
    "},{"location":"data_catalog/datasources/warnings/#data-quality-warnings","title":"Data Quality Warnings","text":"

    To enable data-centric development, Fabric automatically detects and signals potential data quality warnings. Warnings highlight certain peculiarities of data that might require further investigation prior to model development and deployment. However, the validity of each issued warning and whether follow-up mitigation work is needed will depend on the specific use case and on domain knowledge.

    Fabric currently supports the following warnings:

    • Constant: the column presents the same value for all observations
    • High: A high warning is raised whenever all the values in a column are the same
    • Zeros: the column presents the value \u201c0\u201d for several observations
    • Moderate: A moderate warning is raised if a column has between 10% and 25% of zeros
    • High: A high warning is raised if a column has more than 50% of records as zeros
    • Unique: the column contains only unique/distinct values
    • High: A high warning is raised if all the values of a column are different
    • Cardinality: the column (categorical) has a large number of distinct values
    • Moderate: A moderate warning is raised if a column has a cardinality equivalent to between 75% and 90% of the number of rows
    • High: A high warning is raised if a column has a cardinality equivalent to more than 90% of the number of rows
    • Infinity: the column presents infinite (\(\infty\)) values
    • High: A high warning is raised if all the values of a column are inf
    • Constant_length: the column (text) has constant length
    • High: A high warning is raised if all the values of a column have the same string length
    • Correlation: the column is highly correlated with other(s)
    • Skewness: the column distribution (numerical) is skewed
    • Moderate: A moderate warning is raised if the value for the calculated skewness is between [-1, -0.5] or [0.5, 1]
    • High: A high warning is raised if the value for the calculated skewness is lower than -1 or bigger than 1.
    • Missings: the column presents several missing values
    • Moderate: A moderate warning is raised if a column has missing values equivalent to between 30% and 60% of the number of rows
    • High: A high warning is raised if a column has missing values equivalent to more than 60% of the number of rows
    • Non-stationarity: the column (time series) presents statistical properties that change through time
    • Seasonal: the column (time series) exhibits a seasonal pattern
    • Imbalance: the column (categorical) presents a high imbalance ratio between existing categories. The imbalance is calculated as imbalanced_score = 1 - (entropy(value_counts) / log2(number_categories)); a minimal computation sketch follows this list.
    • Moderate: A moderate warning is raised if imbalanced_score is between 0.15 and 0.5.
    • High: A high warning is raised if imbalanced_score is between 0.5 and 1.
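    A minimal sketch of the imbalance score, assuming base-2 entropy so that the normalization by log2(number_categories) yields a value in [0, 1]:

    import numpy as np
    import pandas as pd

    def imbalance_score(column: pd.Series) -> float:
        # 1 - normalized entropy of the category frequencies (0 = perfectly balanced, 1 = single category)
        counts = column.value_counts()
        if len(counts) < 2:
            return 1.0
        p = counts / counts.sum()
        entropy = -(p * np.log2(p)).sum()  # base-2 entropy, assumed to match the log2 normalization
        return 1 - entropy / np.log2(len(counts))

    balanced = pd.Series(["a"] * 50 + ["b"] * 50)
    skewed = pd.Series(["a"] * 95 + ["b"] * 5)
    print(imbalance_score(balanced))  # 0.0 -> no warning
    print(imbalance_score(skewed))    # ~0.71 -> high warning (>= 0.5)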

    Fabric further enables the interactive exploration of warnings, filtering over specific warnings and severity types (i.e., Moderate and High):

    "},{"location":"data_catalog/datasources/warnings/#databases","title":"Databases","text":"

    When users create a database in YData Fabric's Data Catalog, they gain access to a powerful suite of tools designed to help them manage and understand the structure of their data. The Data Catalog provides a comprehensive view of each database, offering detailed insights into the schema and data quality. Here are the key features users can expect:

    "},{"location":"data_catalog/datasources/warnings/#schema-overview-with-database-specific-warnings","title":"Schema Overview with Database-Specific Warnings","text":"

    As soon as a database is added to the Fabric Data Catalog, users are presented with a complete overview of the schema. This includes a visual representation of the tables, columns, and relationships that exist within the database. In addition to the schema visualization, Fabric automatically scans the database for potential issues and displays database-specific warnings.

    These warnings help users identify problems that could impact data integrity, such as:

    • Self-references (columns that act as both primary and foreign keys)
    • Self-referencing columns, where a column serves as both a primary key and a foreign key in the same table, can create complexities for synthetic data generation. YData Fabric detects self-references and warns users when this relationship might lead to synthetic data inconsistency or improper referential integrity. The platform suggests creating a temporary schema or breaking these references into a more manageable structure to ensure clarity and accuracy in data generation.
    • Independent tables (tables with no defined relations)
    • Tables without relationships to other tables\u2014referred to as independent tables\u2014can result in isolated synthetic data that lacks the interdependencies present in the original data. YData Fabric flags such tables to alert users that these isolated data structures may need further inspection or modeling to ensure they align with the overall data environment.
    • Schemas with no defined relations (schemas missing foreign key constraints)
    • When a schema lacks defined relationships between tables, YData Fabric issues warnings to alert users of the absence of foreign key constraints or other relational ties. This warning is critical, as generating synthetic data without considering relationships can lead to inaccurate and fragmented datasets. Users are encouraged to define necessary relations or clarify dependencies to improve the quality of the synthetic data output.
    • Circular references (tables involved in a loop of dependencies)
    • Circular references occur when tables are interdependent in a closed loop (e.g., Table A references Table B, and Table B references Table A). These can cause significant complications during synthetic data generation, especially in maintaining referential integrity across the cycle. YData Fabric detects these loops and provides guidance on how to restructure the schema, such as breaking the cycle or temporarily isolating the tables, to avoid generating erroneous data.
    • Indirect relations between tables (complex chains of relationships)
    • YData Fabric also identifies indirect relationships between tables\u2014where two or more tables are connected via intermediary tables or columns. These complex relationships can introduce nuances that might not be immediately obvious during data modeling. The platform issues warnings to ensure that indirect relationships are clearly understood and accurately represented in synthetic data generation, preventing the loss of valuable data linkages.

    This automatic detection ensures that users can proactively address any schema complexities before they negatively impact data queries or synthetic data generation.

    "},{"location":"data_catalog/datasources/warnings/#table-level-navigation-with-detailed-warnings","title":"Table-Level Navigation with Detailed Warnings","text":"

    Similarly to single-table datasets, YData Fabric provides users with the ability to navigate through each table within the database individually. For every table, users can view a detailed overview that includes:

    • Column names and types,
    • Statistical overview
    • Warnings (see the previous section for more details)
    "},{"location":"deployment_and_security/deployment/aws/bastion_host/","title":"Bastion host","text":"

    During the installation, the user will be prompted with the possibility of allowing the creation of a bastion host. This bastion host is used by YData to provide closer support to users. If you allow the creation of this bastion host, an EC2 instance will be created during installation with NO ingress rules on its security group.

    If needed, you will have to send the bastion host Elastic IP to YData and add an ingress rule to the security group as explained below. In the CloudFormation outputs you can find the relevant information about the EC2 bastion host, such as the Elastic IP, the EC2 instance ID and the security group ID:

    "},{"location":"deployment_and_security/deployment/aws/bastion_host/#setting-the-sg-ingress-rule","title":"Setting the SG ingress rule","text":"
    • To give access to the bastion host, please go to the EC2 service \u2192 Security Groups.
    • You can search for the security group ID provided on the template outputs:
    • Go to the \"Inbound rules\" tab and click \"Edit\" inbound rules.
    • You can then add an inbound rule to allow access to the bastion host and click Save rules, as per the image below (a CLI alternative is sketched after this section).
    • For a single IP source, an IP will be provided to you via email at support time.
    "},{"location":"deployment_and_security/deployment/aws/bastion_host/#removing-the-sg-ingress-rule","title":"Removing the SG ingress rule","text":"
    • As soon as the support for the specific case ends, you must remove the SG ingress rule and click Save rules.
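    For teams that prefer the CLI, the same ingress rule can be added and removed with the AWS CLI. This is a minimal sketch; the security group ID and source IP below are placeholders that must be replaced with the values from the CloudFormation outputs and the IP provided by YData:

        # Add the temporary SSH ingress rule for the IP provided by YData (placeholder values)
        aws ec2 authorize-security-group-ingress \
            --group-id sg-0123456789abcdef0 \
            --protocol tcp --port 22 \
            --cidr 203.0.113.10/32

        # Remove the rule again as soon as the support session ends
        aws ec2 revoke-security-group-ingress \
            --group-id sg-0123456789abcdef0 \
            --protocol tcp --port 22 \
            --cidr 203.0.113.10/32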
    "},{"location":"deployment_and_security/deployment/aws/billing/","title":"Billing","text":"

    After the installation, the client will be billed for all the infrastructure costs plus the usage metrics described in the offer. With a usage-based pricing model, you only pay for what you use. The following metrics are calculated and sent to AWS in order to charge you at the current offer pricing:

    • CPU / Hour
    • Memory / Hour
    • GPU / Hour

    The following AWS services are mandatory for the platform to work and will be billed:

    • VPC
    • ACM
    • Secrets Manager
    • CloudWatch
    • EKS
    • EC2
    • EFS
    • RDS
    • Cognito
    • ECS
    • Lambda

    To check the infrastructure costs of the platform, you can use the AWS Cost Explorer and filter by the tag Environment = YData. This will aggregate all the resources deployed by the platform.
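    The same view can be obtained programmatically with the AWS CLI Cost Explorer API. This is a minimal sketch assuming the Environment = YData tag mentioned above; adjust the time period to your needs:

        # Monthly unblended cost of all resources tagged Environment=YData (example time period)
        aws ce get-cost-and-usage \
            --time-period Start=2024-10-01,End=2024-11-01 \
            --granularity MONTHLY \
            --metrics UnblendedCost \
            --filter '{"Tags": {"Key": "Environment", "Values": ["YData"]}}'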

    "},{"location":"deployment_and_security/deployment/aws/billing/#cost-estimations","title":"Cost Estimations","text":"

    YData Fabric final cost can be estimated following the logic of a usage-based plan since it depends on your users and data. The following table provides a guideline of how to compute the total cost for different usage scenarios based on the deployed infrastructure.

    EKS Nodes Instance Type vCPUs Memory (GBi) GPUs Number of instances % Usage/ CPU/Hour % Usage/ Memory/Hour % Usage/ GPU/Hour Cost AWS/Hour Cost AWS/Day Cost YData/Hour Cost YData/Day System t3a.2xlarge 8 32 0 2 20 20 0 $0.30 $14.44 $0.38 $9.22 CPU Micro (labs) t3a.large 2 8 0 1 40 40 0 $0.08 $1.80 $0.10 $2.30 CPU Small (labs) t3a.xlarge 4 16 0 1 20 20 0 $0.15 $3.61 $0.10 $2.30 CPU Medium (labs) t3a.2xlarge 8 32 0 0 0 0 0 $0.30 $0.00 $0.00 $0.00 CPU Large (labs) m5a.4xlarge 16 64 0 0 0 0 0 $0.69 $0.00 $0.00 $0.00 CPU Compute Micro (computing) r5a.4xlarge 16 128 0 1 20 20 0 $0.90 $21.70 $0.64 $15.36 GPU Micro (labs) g4dn.xlarge 4 16 1 0 0 0 0 $0.53 $0.00 $0.00 $0.00 GPU Compute Micro (computing) g3.4xlarge 16 122 1 0 0 0 0 $1.14 $0.00 $0.00 $0.00

    The example above illustrates a scenario where the Micro and Small instances are used. It also shows that, although the nodes are available, they are not necessarily being used and therefore billed: only when the infrastructure is required and actually used is it measured and billed accordingly.

    "},{"location":"deployment_and_security/deployment/aws/clean/","title":"Clean","text":"

    The following procedure explains how to delete the platform. The full procedure takes around 45m to 1h to be completed. To clean up YData Fabric, you will need to delete the CloudFormation stack and remove the subscription.

    Please take into consideration that this will delete everything associated with the installation.

    "},{"location":"deployment_and_security/deployment/aws/clean/#deleting-the-stacks","title":"Deleting the stacks","text":"
    • Go to the regions where the product is installed
    • Go to the CloudFormation service
    • Select the ydata stack
    • Click the Delete button
    • Select the Extension stack and click the Delete button.

    Note

    This will disable the extension. If you are using this extension for any other project, please do not delete this stack.

    "},{"location":"deployment_and_security/deployment/aws/clean/#deleting-the-subscription","title":"Deleting the subscription","text":"
    • Go to the **AWS Marketplace Subscriptions** \u2192 Manage subscriptions
    • Click the YData product
    • Actions \u2192 Cancel subscription
    • Click the checkbox and click Yes, cancel subscription

    Following the above steps completes the process of deleting YData Fabric from your AWS Cloud instance.

    "},{"location":"deployment_and_security/deployment/aws/deploy/","title":"Deploy","text":""},{"location":"deployment_and_security/deployment/aws/deploy/#installation-process","title":"Installation process","text":"

    The following procedure explains how to install the platform using the CloudFormation template and how to connect to the platform after the installation. The full procedure takes around 45m to 1h to complete. In order to install the platform in your account, the user must have basic knowledge of the tools used, such as CloudFormation, Route53 and Cognito.

    "},{"location":"deployment_and_security/deployment/aws/deploy/#configure-the-product","title":"Configure the product","text":"

    Make sure that you comply with the pre-flight checks

    You can check the prerequisites and pre-deploy checks.

    Start with the basic configuration for the app installation:

    • Ensure you are in the right region.
    • Choose the stack name ("ydata-platform" is the default name)
    "},{"location":"deployment_and_security/deployment/aws/deploy/#network","title":"Network","text":"

    Define your network configuration to access the platform. Using the ACM Certificate ARN OR the Hosted Zone ID and the Domain chosen from the preflight checklist, fill in the following parameters:

    "},{"location":"deployment_and_security/deployment/aws/deploy/#oauth","title":"OAuth","text":"

    Define how your users will authenticate in the platform (you can use multiple providers).

    "},{"location":"deployment_and_security/deployment/aws/deploy/#analytics","title":"Analytics","text":"

    You can opt in or out of the collection of usage metrics that help us understand how users interact with the product. No user data is collected at any point. You can find our privacy policy at ydata.ai/privacy.

    "},{"location":"deployment_and_security/deployment/aws/deploy/#bastion-host","title":"Bastion host","text":"

    A bastion host is created and used to provide closer support to users. The bastion host is only accessible on user demand, by granting access to the EC2 instance through an SG ingress rule. Set it to "Allow" to have it available. More information here.

    "},{"location":"deployment_and_security/deployment/aws/deploy/#create","title":"Create","text":"
    • Check the \u201cI acknowledge that AWS CloudFormation might create IAM resources with custom names.\u201d
    • Click Create Stack
    "},{"location":"deployment_and_security/deployment/aws/deploy/#2-following-the-installation-process","title":"2. Following the installation process","text":"

    Now we can follow the step-by-step installation of YData Fabric.

    • Click the "Create" button and the installation of the platform will start:

    The process will take approximately 45-60 minutes.

    • If the installation process occurs without any issues, you will see the CREATE_COMPLETE status in the stack:

    • If any error occurs during installation, please open a support case at support.ydata.ai.
    "},{"location":"deployment_and_security/deployment/aws/deploy/#3-post-installation-configuration","title":"3. Post installation configuration","text":""},{"location":"deployment_and_security/deployment/aws/deploy/#dns-configuration","title":"DNS Configuration","text":"

    If you have your domain registered in Route53, you can check the CF Outputs, and click the domain name to access the platform:

    If you are using another DNS provider or a Route53 in another account, you will need to create a CNAME record pointing to the ALB endpoint (ALBDNSName). As an example: CNAME \u2192 ydata-alb-xxxxxxxxx.eu-west-1.elb.amazonaws.com

    "},{"location":"deployment_and_security/deployment/aws/deploy/#4-connecting-to-the-platform","title":"4. Connecting to the platform","text":"

    To connect to the platform, please allow 20-30 minutes for the platform to be completely initialized, and access it using the URL displayed in the CF Outputs. For the login process, if you chose a custom login provider, you need to ensure that the users are created.

    Otherwise, you will need to create the users in the Cognito generated by the CloudFormation stack.

    More information can be found at Login providers.

    🚀 Congratulations, you are now ready to start exploring your data with YData Fabric!

    "},{"location":"deployment_and_security/deployment/aws/instance_types/","title":"Instance types","text":"Name ID System Pool CPU MIcro Pool CPU Small Pool CPU Medium Pool CPU Large Pool CPU Compute Micro Pool GPU MIcro Pool GPU Compute Micro Pool Bastion Host N. Virginia us-east-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Ohio us-east-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano N. California us-west-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Oregon us-west-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Cape Town af-south-1 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano Hong Kong ap-east-1 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano Mumbai ap-south-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g4dn.2xlarge t3a.nano Osaka ap-northeast-3 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano Seoul ap-northeast-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Singapore ap-southeast-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Sydney ap-southeast-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Tokyo ap-northeast-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Canada Central ca-central-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Frankfurt eu-central-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Ireland eu-west-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano London eu-west-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Milan eu-south-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g4dn.2xlarge t3a.nano Paris eu-west-3 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g4dn.2xlarge t3a.nano Stockholm eu-north-1 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano Bahrain me-south-1 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano S\u00e3o Paulo sa-east-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g4dn.2xlarge t3a.nano"},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/","title":"Checklist and Prerequisites","text":"

    Deploying YData Fabric in the AWS cloud offers a scalable and efficient solution for managing and generating synthetic data. AWS provides a robust infrastructure that ensures high availability, security, and performance, making it an ideal platform for YData Fabric.

    This cloud deployment allows for rapid scaling of resources to meet varying workloads, ensuring optimal performance and cost-efficiency.

    With AWS's comprehensive security features, including data encryption, network firewalls, and identity management, your synthetic data and models are protected against unauthorized access and threats. Additionally, AWS's global infrastructure allows for the deployment of YData Fabric in multiple regions, ensuring low latency and high availability for users worldwide.

    Prerequisites

    If you don't have an AWS account, create a free account before you begin.

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#basic-configuration","title":"Basic Configuration","text":"
    • Stack name: The name of the CloudFormation stack
    • Location: where to install the platform and create the resources. You can check the available supported regions here:
    • **Available regions:** You can find the AWS regions where YData Fabric is available here.
    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#permissions","title":"Permissions","text":"

    Check and add (if needed) the necessary permissions to the account and region where the platform will be installed.

    • Go to Identity and Access Management (IAM)
    • Select your user or role used for deployment
    • Under the permissions tab, check if you have the following permissions:
      • AdministratorAccess

    *This will be updated in the future to include only the permissions necessary to create and access the application.

    You can find AWS official documentation here.
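    As a quick sanity check before deploying, you can confirm which identity you are using and which managed policies are attached to it with the AWS CLI. This is a sketch assuming an IAM user; the user name is a placeholder:

        # Confirm the identity that will run the deployment
        aws sts get-caller-identity

        # List the managed policies attached to that user (replace the placeholder user name)
        aws iam list-attached-user-policies --user-name my-deployment-user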

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#service-linked-roles","title":"Service Linked Roles","text":"

    During the deployment, all the required Service-Linked Roles are created by AWS by default, with the exception of the EKS Service-Linked Role.

    Please go to IAM → Roles and verify that the following Service-Linked Role exists in IAM:

    • AWSServiceRoleForAmazonEKS

    Otherwise, please create the missing service linked role:

    • Click \u201cCreate role\u201d
    • Choose AWS service and EKS:

    • Click \u201cNext\u201d \u2192 \u201cNext\u201d
    • Click \u201cCreate role\u201d

    *You can find AWS official documentation for service-linked roles.*
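    The same check and creation can also be done with the AWS CLI; this is a minimal sketch:

        # Check whether the EKS Service-Linked Role already exists
        aws iam get-role --role-name AWSServiceRoleForAmazonEKS

        # If the command above returns NoSuchEntity, create the role
        aws iam create-service-linked-role --aws-service-name eks.amazonaws.com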

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#quotas","title":"Quotas","text":"

    Check and set (if needed) new quotas for the region where the application will be installed.

    • Go to Service Quotas (ensure that you are in the right region).
    • Select AWS Services \u2192 Amazon Elastic Compute Cloud (Amazon EC2)
    • Check for the following quota limits:
    Quota | Minimum | Recommended
    --- | --- | ---
    Running On-Demand Standard (A, C, D, H, I, M, R, T, Z) instances | 50¹ | 100²
    Running On-Demand G and VT instances | 0¹ | 20²

    ¹ These limits are required only for the installation of the platform; usage is limited. ² Each limit will depend on the platform usage and each client's requirements.

    If needed, request a new limit from the AWS support team. More on available instance types can be found here.
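    The current limits can also be inspected and increased through the Service Quotas API. The sketch below assumes the quota code L-1216C47A for "Running On-Demand Standard (A, C, D, H, I, M, R, T, Z) instances"; you can confirm the code with aws service-quotas list-service-quotas --service-code ec2 before requesting an increase:

        # Inspect the current limit for On-Demand Standard instances (quota code assumed, see above)
        aws service-quotas get-service-quota --service-code ec2 --quota-code L-1216C47A

        # Request an increase to the recommended value
        aws service-quotas request-service-quota-increase \
            --service-code ec2 --quota-code L-1216C47A --desired-value 100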

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#network-configuration","title":"Network configuration","text":"

    Choose how you want to connect to the platform.

    The parameters below will be used during the deployment process.

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#dns-configuration","title":"DNS Configuration:","text":"

    In AWS, you will connect to the platform using your own custom DNS domain, for example: platform.ydata.ai. For that, a registered domain is necessary.

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#domain-name-and-route53-hosted-zone-id","title":"Domain Name and Route53 Hosted Zone ID","text":"

    If you have your domain registered in Route53, you can pass the Route53 Hosted Zone ID and the Domain Name, and the CloudFormation template will create an ACM certificate and a Route53 record pointing to the ALB used to connect to the platform, so no further steps are required before or after the installation.

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#domain-name-and-acm-certificate-arn","title":"Domain Name and ACM Certificate ARN","text":"

    Otherwise, if you have your domain registered in another provider or in Route53 in another account, you will need to do one of the following steps:

    Create the certificate on ACM and validate it: request a public certificate and wait until the certificate is granted.

    After the certificate is requested, copy the CNAME value and name, and create the record in your DNS provider so the certificate can be validated.

    Alternatively, import the certificate into ACM.

    After the certificate is imported, ensure the certificate is validated.

    After the installation, you will need to create another CNAME record pointing to the ALB endpoint, available in the CF Outputs.

    For example: CNAME \u2192 ydata-alb-xxxxxxxxx.eu-west-1.elb.amazonaws.com
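    If the hosted zone lives in another AWS account, or you simply prefer the CLI, the CNAME record can be created as in the minimal sketch below, where the hosted zone ID, record name and ALB endpoint are placeholders:

        aws route53 change-resource-record-sets \
            --hosted-zone-id Z0000000EXAMPLE \
            --change-batch '{
              "Changes": [{
                "Action": "UPSERT",
                "ResourceRecordSet": {
                  "Name": "platform.example.com",
                  "Type": "CNAME",
                  "TTL": 300,
                  "ResourceRecords": [{"Value": "ydata-alb-xxxxxxxxx.eu-west-1.elb.amazonaws.com"}]
                }
              }]
            }'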

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#login-provider","title":"Login Provider","text":"

    In AWS, you can use multiple providers to connect to the platform. During the parameter section, you can choose to create a Cognito user pool or to use one of your own:

    If you set this to True, you don't need to specify any other parameters under the OAuth Configuration, unless you want to use a custom one.

    You can only have one Cognito

    You can only choose one Cognito:

    • The one created during the platform installation.
    • One created by you, for which you need to pass the credential parameters.

    If both are set, the provided parameters will be ignored and the one created during installation will be used.

    Some regions do not support Cognito

    Cognito is not currently supported in some regions. For these regions, you will need to use the region-specific template and pass your own custom OAuth configuration.

    Check regions information here.

    You can log in to our app currently using the following providers - at least one is required, but you can choose multiple ones:

    • Google
    • Microsoft
    • Cognito (your own or the default created during installation)
    • GitHub

    More detailed instructions for each login provider can be found here. If you require another authentication method, please open a support case at support.ydata.ai.

    After configuring your login provider, please save the values. These values will be used during the deployment process.

    As soon as the above steps are all completed, you are ready to start the deployment.

    "},{"location":"deployment_and_security/deployment/aws/regions/","title":"\ud83c\udf10 Regions","text":"Name ID Supported Notes N. Virginia us-east-1 \u2705 \u2796 Ohio us-east-2 \u2705 \u2796 N. California us-west-1 \u2705 \u2796 Oregon us-west-2 \u2705 \u2796 Cape Town af-south-1 \u2705 \u2796 Melbourne ap-southeast-4 \ud83d\udd34 No GPU machine types available at the moment Hong Kong ap-east-1 \u2705 \u2796 Hyderabad ap-south-2 \ud83d\udd34 No GPU machine types available at the moment Jakarta ap-southeast-3 \ud83d\udd34 No GPU machine types available at the moment Mumbai ap-south-1 \u2705 \u2796 Osaka ap-northeast-3 \u2705 \u2796 Seoul ap-northeast-2 \u2705 \u2796 Singapore ap-southeast-1 \u2705 \u2796 Sydney ap-southeast-2 \u2705 \u2796 Tokyo ap-northeast-1 \u2705 \u2796 Canada Central ca-central-1 \u2705 \u2796 Frankfurt eu-central-1 \u2705 \u2796 Ireland eu-west-1 \u2705 \u2796 London eu-west-2 \u2705 \u2796 Milan eu-south-1 \u2705 \u2796 Paris eu-west-3 \u2705 \u2796 Spain eu-south-2 \ud83d\udd34 No GPU machine types available at the moment Stockholm eu-north-1 \u2705 \u2796 Zurich eu-central-2 \ud83d\udd34 No GPU machine types available at the moment Bahrain me-south-1 \u2705 \u2796 UAE me-central-1 \ud83d\udd34 No GPU machine types available at the moment Tel Aviv il-central-1 \ud83d\udd34 No GPU machine types available at the moment S\u00e3o Paulo sa-east-1 \u2705 \u2796"},{"location":"deployment_and_security/deployment/aws/update/","title":"Update Fabric","text":"

    YData is committed to providing our users with cutting-edge tools and features to enhance their data management and synthetic data generation capabilities. Our solution updates policy is designed to ensure that YData Fabric remains at the forefront of technological advancements while maintaining the highest standards of reliability, security, and user satisfaction.

    Key Aspects of Our Update Policy

    • Regular Updates: We release regular updates that include new features, performance improvements, and bug fixes. These updates are aimed at enhancing the overall functionality and user experience of YData Fabric.
    • User Feedback Integration: We actively seek and incorporate feedback from our user community. This ensures that our updates address real-world challenges and meet the evolving needs of our users.
    • Seamless Deployment: Updates are designed to be deployed seamlessly with minimal disruption to ongoing operations. Our team provides detailed documentation and support to facilitate smooth transitions.
    • Security Enhancements: We prioritize the security of our platform. Each update undergoes rigorous testing to ensure that it enhances the security posture of YData Fabric without introducing vulnerabilities.
    • Compatibility and Compliance: Updates are developed to ensure compatibility with existing systems and compliance with industry standards and regulations, safeguarding the integrity and continuity of user operations.

    By adhering to this policy, YData ensures that users consistently benefit from the latest advancements in data technology, reinforcing our commitment to innovation and excellence in the field of data science and synthetic data generation.

    All updates to Fabric are triggered by the user/organization, by following the next steps to update your CloudFormation stack.

    "},{"location":"deployment_and_security/deployment/aws/update/#1-get-the-most-recent-version","title":"1. Get the most recent version","text":"
    • Go to the **AWS Marketplace Subscriptions** \u2192 Manage subscriptions
    • Click the YData Fabric subscription
    • Click Launch more software.
    • Check for new versions and click Continue to Launch. At this stage you will find the link for the new version.

    Click the deployment template associated with your installation.

    • Here you will have the new template URL. Copy the link as per the image below:

    • Go to the deployed CloudFormation stack and click the "Update" button.
    • Choose \u201cReplace current template\u201d and provide the new stack URL.

    • For the parameters, use the same values or change them if needed. Click Next → Next → Submit.

    2. Following the update process

    Now you can follow the update process. Unlike the initial deployment, the update will only take approximately 15-60 minutes, depending on the update complexity.

    🚀 Congratulations, you now have the latest version of YData Fabric!

    "},{"location":"deployment_and_security/deployment/azure/billing/","title":"Billing","text":"

    After the installation, the client will be billed for all the infrastructure costs plus the usage metrics described in the offer.

    With a usage-based pricing model, you only pay for what you use.

    The following metrics are calculated and sent to Azure in order to charge you at the current offer pricing:

    • CPU / Hour
    • Memory / Hour
    • GPU / Hour

    The following Azure services are mandatory for the platform to work and will be billed:

    • Virtual networks
    • IP Address
    • Private DNS Zones
    • Container Registry
    • Storage Account
    • MySQL Server
    • Deployment Scripts
    • Kubernetes Services
    • Key Vault
    • Container Instances

    To check the infrastructure costs of the platform, you can use the Azure Cost analysis (under the Cost Management + Billing service) and filter by the resource groups created during the deployment. This will aggregate all the resources deployed by the platform.

    "},{"location":"deployment_and_security/deployment/azure/billing/#cost-estimations","title":"Cost Estimations","text":"

    YData Fabric final cost can be estimated following the logic of a usage-based plan since it depends on your users and data. The following table provides a guideline of how to compute the total cost for different usage scenarios based on the deployed infrastructure.

    AKS Nodes Instance Type vCPUs Memory (GBi) GPUs Number of instances % Usage/ CPU/Hour % Usage/ Memory/Hour % Usage/ GPU/Hour Cost Azure/Hour Cost Azure/Day Cost YData/Hour Cost YData/Day System Standard_D2s_v3 8 32 0 2 30 30 0 0.4800 23.04 0.288 6.912 CPU Micro (labs) Standard_D2s_v3 2 8 0 1 50 50 0 0.1200 2.88 0.06 1.44 CPU Small (labs) Standard_D4s_v3 4 16 0 1 50 50 0 0.2400 5.76 0.12 2.88 CPU Medium (labs) Standard_D8s_v3 8 32 0 0 0 0 0 0.4800 0 0 0 CPU Large (labs) Standard_D16s_v3 16 64 0 0 0 0 0 0.9600 0 0 0 CPU Compute Micro (computing) Standard_D32s_v3 32 128 0 1 80 80 0 1.9200 46.08 1.536 36.864 GPU Micro (labs) Standard_NC6s_v3 6 112 1 0 0 0 0 3.8230 0 0 0 GPU Compute Micro (computing) Standard_NC6s_v3 6 112 1 0 0 0 0 3.8230 0 0 0

    The example above illustrates a scenario where the Micro and Small instances are used. It also shows that, although the nodes are available, they are not necessarily being used and therefore billed: only when the infrastructure is required and actually used is it measured and billed accordingly.

    "},{"location":"deployment_and_security/deployment/azure/clean/","title":"Clean","text":"

    The following procedure explains how to delete the platform. The full procedure takes around 45m to 1h to complete. To clean up YData Fabric, you will need to delete the managed app.

    Please take into consideration that this will delete everything associated with the installation.

    • Start by opening the resource group where the managed app is installed, select the Managed Application and click \"Delete\".

    This will delete the managed app and the managed resource group where all the components are installed.

    "},{"location":"deployment_and_security/deployment/azure/deploy/","title":"Deploy","text":""},{"location":"deployment_and_security/deployment/azure/deploy/#installation-process","title":"Installation process","text":"

    Ensure that you have completed the pre-deploy checklist

    Validate that you have checked all the deployment requirements before moving forward with the deployment.

    "},{"location":"deployment_and_security/deployment/azure/deploy/#basic-configuration","title":"Basic configuration","text":"
    • Start by defining the basic configuration for the app installation.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#jit-access","title":"JIT Access","text":"
    • Enable the Just in Time (JIT) access for the app installation as shown in the image below. You can see more about JIT access in the pre-deploy checklist.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#network-configuration","title":"Network configuration","text":"
    • Define your network configuration to access YData Fabric.
    New Public IP / Existing Public IP
    • If you choose a new Public IP, you can choose the name or leave the (new) default, but the remaining properties are ignored, since the Standard SKU and Static assignment are the ones recommended by Azure.
    • After that, choose a DNS label for the domain as shown below.
    • If you opt for an existing Public IP, you can choose that IP from the dropdown. The DNS Public Endpoint is automatically filled in, since this is configured at the IP Address level. If your IP is disabled in the dropdown, please ensure you have the DNS name label defined and that the IP is not allocated to any other resource.

    For the DNS Custom Domain, you can use a custom domain such as platform.ydata.ai. After the installation process, you will need to create a CNAME or an A record in your DNS provider. More information in the Post installation step.

    "},{"location":"deployment_and_security/deployment/azure/deploy/#oauth","title":"OAuth","text":"
    • Define how you will authenticate to the app after the deployment is completed.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#analytics","title":"Analytics","text":"
    • You can opt in or out of the collection of usage metrics that help us understand how users interact with the product. No user data is collected at any point. Read more in YData's privacy policy.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#create","title":"Create","text":"
    • Click \u201cNext\u201d. Check the provided parameters.
    • Insert the contact information
    • Read and accept the terms and conditions. Finally, click "Create".
    "},{"location":"deployment_and_security/deployment/azure/deploy/#following-the-installation-process","title":"Following the installation process","text":"
    • After clicking the "Create" button, the installation of the managed app will start, as shown in the image below.

    The process will take approximately 45-60 minutes.

    • If any error occurs during installation, please open a support case at support.ydata.ai.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#post-installation-configuration","title":"Post installation configuration","text":""},{"location":"deployment_and_security/deployment/azure/deploy/#ip-configuration","title":"IP configuration","text":"

    If you chose to use an existing IP for the platform, you will need to create a role assignment on the resource group where the IP is located. To do this, open your managed resource group (where the resources are created) and open the ydata-cluster-managed-identity Managed Identity.

    • Click \u201cAzure Role Assignments\u201d

    • Click \u201cAdd role assignment\u201d as shown in the image below.

    • Choose the Scope \u201cResource group\u201d.
    • Choose the subscription where the resource group is located.
    • Select the resource group where the IP is located.
    • Add the role "Network Contributor" and click "Save" (a CLI alternative is sketched below).
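    The same role assignment can be created with the Azure CLI; this is a minimal sketch where the managed resource group, subscription ID and IP resource group are placeholders:

        # Resolve the principal ID of the managed identity created by the installation
        IDENTITY_ID=$(az identity show \
            --name ydata-cluster-managed-identity \
            --resource-group <managed-resource-group> \
            --query principalId -o tsv)

        # Grant Network Contributor on the resource group that holds the existing IP
        az role assignment create \
            --assignee "$IDENTITY_ID" \
            --role "Network Contributor" \
            --scope /subscriptions/<subscription-id>/resourceGroups/<ip-resource-group>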
    "},{"location":"deployment_and_security/deployment/azure/deploy/#dns-configuration","title":"DNS Configuration","text":"

    If you opt for the DNS Custom Domain, you will need to create a CNAME record pointing to the DNS Public Endpoint or an A record pointing to the IP. Example in Route53:

    "},{"location":"deployment_and_security/deployment/azure/deploy/#connecting-to-ydata-fabric","title":"Connecting to YData Fabric","text":"

    You can get the full URL in the Managed APP \u2192 \u201cParameters and Outputs\u201d tab \u2192 Outputs

    🚀 Congratulations, you are now ready to start exploring your data with YData Fabric!

    "},{"location":"deployment_and_security/deployment/azure/instance_types/","title":"Instance types","text":"Name ID System Pool CPU MIcro Pool CPU Small Pool CPU Medium Pool CPU Large Pool CPU Compute Micro Pool GPU MIcro Pool GPU Compute Micro Pool West Europe westeurope Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 West US westus Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 West US 2 westus2 Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 Canada Central canadacentral Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 Sweden Central swedencentral Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 Australia East australiaeast Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3"},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/","title":"Checklist and Prerequisites","text":"

    Deploying YData Fabric in Microsoft Azure offers a scalable and efficient solution for managing and generating synthetic data. Azure provides a robust infrastructure that ensures high availability, security, and performance, making it an ideal platform for YData Fabric.

    This cloud deployment allows for rapid scaling of resources to meet varying workloads, ensuring optimal performance and cost-efficiency.

    With Microsoft's comprehensive security features, including data encryption, network firewalls, and identity management, your synthetic data and models are protected against unauthorized access and threats. Additionally, Azure's global infrastructure allows for the deployment of YData Fabric in multiple regions, ensuring low latency and high availability for users worldwide.

    Prerequisites

    If you don't have an Azure account, create a free account before you begin.

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#basic-configuration","title":"Basic Configuration","text":"
    • Subscription: where the platform will be installed
    • Resource group: where the managed app will be installed:

      • A new one is recommended and can be created automatically during the deployment.
    • Location: where to install the Managed APP and create the resource groups. The available locations for now are:

      • West Europe - Netherlands [westeurope]
      • West US - California [westus]
      • West US - Washington [westus2]
      • Canada Central [canadacentral]
      • Sweden Central [swedencentral]*

    If you need another region, please open a support case at support.ydata.ai.

    *Regions without available GPU machine types at the time.

    • Application Name: the Managed APP name

    • Managed Resource Group: the resource group created by the Managed APP and where all the infrastructure services will be created (this is created automatically).
    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#permissions","title":"Permissions","text":"

    Check and add (if needed) the necessary permissions to the subscription where the platform will be installed.

    • Go to Subscriptions.
    • Select the subscription where YData Fabric will be installed.
    • Click \u201cView my access\u201d as shown in the image below.

    • Check if you have at least the following configurations:

    • Contributor

    And the following permissions:

    • Microsoft.Authorization/roleAssignments/read

    • Microsoft.Authorization/roleAssignments/write

    • If not, please create a custom role with these two permissions and create the role assignment for the user in the subscription.

    For more information check Azure's official documentation on Azure custom roles and Azure built-in roles.

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#resource-providers","title":"Resource Providers","text":"

    Check and activate (if needed) resource providers for the subscription where the YData platform will be installed following the next steps.

    • Go to Subscriptions
    • Select the subscription where YData Fabric will be installed
    • Go to Resource Providers
    • Using the filter, check if you have the following resource providers registered. If not, please click the resource provider and click \u201cRegister\u201d.

      • Microsoft.Compute
      • Microsoft.ContainerInstance

    For more information check Azure's official documentation on resource providers and Azure Resource Manager.
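    A quick way to check and register these providers is through the Azure CLI; this is a minimal sketch:

        # Check the registration state of the required resource providers
        az provider show --namespace Microsoft.Compute --query registrationState -o tsv
        az provider show --namespace Microsoft.ContainerInstance --query registrationState -o tsv

        # Register them if the state is not "Registered"
        az provider register --namespace Microsoft.Compute
        az provider register --namespace Microsoft.ContainerInstance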

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#register-features","title":"Register features","text":"

    Check and register (if needed) the required features.

    • Install and update the aks-preview extension:

        az extension add --name aks-preview\n    az extension update --name aks-preview\n
    • Register the 'EnableWorkloadIdentityPreview' feature flag
        az feature register --namespace \"Microsoft.ContainerService\" --name \"EnableWorkloadIdentityPreview\"\n
    • Wait until the feature is registered:

        az feature show --namespace \"Microsoft.ContainerService\" --name \"EnableWorkloadIdentityPreview\"\n
        {\n        \"id\": \"/subscriptions/xxxxx/providers/Microsoft.Features/providers/Microsoft.ContainerService/features/EnableWorkloadIdentityPreview\",\n        \"name\": \"Microsoft.ContainerService/EnableWorkloadIdentityPreview\",\n        \"properties\": {\n            \"state\": \"Registered\"\n        },\n        \"type\": \"Microsoft.Features/providers/features\"\n    }\n

    • After the feature status is \u201cRegistered\u201d, refresh the registration of the container service resource provider:
        az provider register --namespace Microsoft.ContainerService\n

    Read more in Azure's official documentation on Azure Kubernetes Services (AKS).

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#resource-compute-quotas","title":"Resource compute quotas","text":"

    Check and set (if needed) new quotas for the region where the managed app will be installed.

    • Go to Subscriptions.
    • Select the subscription where YData Fabric will be installed
    • Click \u201cUsage + quotas\u201d
    • Filter by the region where YData Fabric will be installed

    • Check for the following quota limits:
    Quota | Minimum | Recommended
    --- | --- | ---
    Total Regional vCPUs | 16* | 100**
    Standard DSv3 Family vCPUs | 16* | 100**
    Standard NCSv3 Family vCPUs*** | 6* | 20**
    Standard DDSv4 Family vCPUs | 10 | 10

    * These limits are required only for the installation of the platform; usage is limited.

    ** Each limit will depend on the platform usage and each client's requirements.

    *** Not available in the Sweden region.

    • If needed, request a new limit from the Azure support team as per the image below.

    Check Azure's official documentation on quotas, increase regional vCPU quotas and increase VM-family quotas.

    More on available instance types can be found here.
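    The current vCPU usage and limits for the target region can also be listed with the Azure CLI; this is a minimal sketch using westeurope as an example region:

        # List compute usage against quota limits for the chosen region
        az vm list-usage --location westeurope --output table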

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#jit-access","title":"JIT Access","text":"

    The JIT Access feature will prevent YData Fabric from having write access to the managed app at any time.

    • To use the just-in-time access, you must have an Azure Active Directory P2 license.
    • Without this license and with JIT enabled, YData will not be able to provide closer support or make updates to the solution.

    To check your current license, go to the Azure Portal \u2192 Azure Active Directory \u2192 Licenses and check your license. To activate the P2 license, click the \u201cTry/Buy\u201d button.

    For more information, check Azure's official documentation on assigning and removing licenses in Azure Active Directory, and learn how to enable JIT access and approve requests.

    After accepting the request, the YData team will have access in order to make updates and give you closer support. For any other requests, open a support case at support.ydata.ai.

    "},{"location":"deployment_and_security/deployment/azure/regions/","title":"\ud83c\udf10 Regions","text":"Name ID Supported Notes West Europe westeurope \u2705 \u2796 West US westus \u2705 \u2796 West US 2 westus2 \u2705 \u2796 CanadaCentral canadacentral \u2705 \u2796 (Europe) Sweden Central swedencentral \u2705 \u2796 (Asia Pacific) Australia East australiaeast \u2705 \u2796

    For more zones, please contact us through support@ydata.ai.

    "},{"location":"deployment_and_security/deployment/google/deploy/","title":"Deploy","text":""},{"location":"deployment_and_security/deployment/google/deploy/#installation-process","title":"Installation process","text":"

    The following information needs to be passed to the YData team:

    • The SA JSON file generated in the preflight-checklist.
    • Project ID
    • Region
    • DNS Cloud Zone name
    • Domain name - the domain that will be used to connect to the platform
    • Login provider credentials (ex on google: Client ID, Client Secret, Domain)
    "},{"location":"deployment_and_security/deployment/google/deploy/#wait-for-the-installation-to-be-done","title":"Wait for the installation to be done","text":"

    The YData team will take care of the deployment for you. As soon as it is finished, the team will let you know.

    "},{"location":"deployment_and_security/deployment/google/deploy/#post-installation-configuration","title":"Post installation configuration","text":"

    A DNS configuration is needed. For that, if you opt for the IP, you will need to create a record pointing to the Load Balancer IP, as shown in the image below.
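    If the zone is managed in Cloud DNS, the A record can also be created with the gcloud CLI; this is a minimal sketch where the zone name, record name and Load Balancer IP are placeholders:

        # Create an A record pointing your domain to the Load Balancer IP
        gcloud dns record-sets transaction start --zone=<zone-name>
        gcloud dns record-sets transaction add <load-balancer-ip> \
            --name=ydatafabric.yourdomain.com. --ttl=300 --type=A --zone=<zone-name>
        gcloud dns record-sets transaction execute --zone=<zone-name>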

    "},{"location":"deployment_and_security/deployment/google/deploy/#connecting-to-ydata-fabric","title":"Connecting to YData Fabric","text":"

    The YData team will share with you the link/URL that you can now use to access YData Fabric.

    🚀 Congratulations, you are now ready to start exploring your data with YData Fabric!

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/","title":"Checklist and Prerequisites","text":"

    The deployment is executed using Terraform and is fully automated. It is triggered by YData's team, and the progress can be monitored on the client side.

    As a pre-condition, the client must create a service account and share it with YData\u2019s team. The required permissions will be shared in this document.

    The bastion host will be used to provide technical support to the team in case of issues and troubleshooting with the usage of the platform, and this access will only be used for this purpose.

    Prerequisites

    If you don't have a GCP subscription, create a free account before you begin.

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#observations-prerequisites","title":"Observations & prerequisites","text":"
    • The deployment will create one public and private key to establish the connection to the bastion host.
    • With this deployment, a security group allowing YData's IP to establish the connection to the bastion host via SSH will be created. This should be deleted after the deployment and re-added if needed.
    • The Bastion host can be stopped after the deployment to prevent any charges and created/started to give support.
    • The private subnets will have a NAT Gateway attached; this is needed since GKE needs access to the public internet to connect to the Data Sources and to pull images from public registries.
    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#basic-configuration","title":"Basic Configuration","text":"
    • Project: where the platform will be installed.
    • Location: where to install YData Fabric inside the project.
    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#enable-apis","title":"Enable API's","text":"
    • Please check if the following APIs for the chosen project are enabled:
      • API Keys API
      • Artifact Registry API
      • Certificate Manager API
      • Cloud Resource Manager API
      • Cloud Key Management Service (KMS) API
      • Compute Engine API
      • Kubernetes Engine API
    • Cloud DNS API
      • Cloud Filestore API
      • Cloud Run API
      • Identity and Access Management (IAM) API
      • Services Networking API
      • Cloud SQL Admin API
      • Cloud Storage
      • Serverless VPC Access API
      • Secret Manager API
      • Cloud Scheduler API
      • Service Usage API
    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#permissions","title":"Permissions","text":"

    The following service account should be created and transferred to YData so the deployment can be triggered. It is recommended (but not required) that you create a new project for the YData platform. This will make it easier to control costs and to ensure that YData only has access to its resources. You can create the service account using the provided gcloud CLI commands (recommended) or create the service account manually using the Google Cloud UI.

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#gcloud-cli","title":"GCloud CLI","text":"

    The following commands will create a new service account with the required permissions to complete the deployment. The generated JSON file must be sent to YData.

    1. Download the following file: https://raw.githubusercontent.com/ydataai/gcp-deploy-permissions/main/clients_custom_role.yaml
    2. Create the new SA for the deployment
        export PROJECT_ID=\n    export SERVICE_ACCOUNT_NAME=\n\n    gcloud config set project $PROJECT_ID\n
    • Create a new SA
        gcloud iam service-accounts create $SERVICE_ACCOUNT_NAME --display-name \"GCP Service Account for the Ydata platform\"\n
    • Get the new key file for the created SA
        export SA_EMAIL=$(gcloud iam service-accounts list --filter $SERVICE_ACCOUNT_NAME --format 'value(email)')\n\n    gcloud iam service-accounts keys create gcp-ydata-platform-service-account.json --iam-account $SA_EMAIL\n
    • Create a new role and associate this role to the new SA
        gcloud iam roles create ydata_platform_gcp_iam_role --project $PROJECT_ID --file clients_custom_role.yaml\n\n    gcloud projects add-iam-policy-binding $PROJECT_ID --member \"serviceAccount:$SA_EMAIL\" --role \"projects/$PROJECT_ID/roles/ydata_platform_gcp_iam_role\"\n
    • Activate the new SA locally
        gcloud auth activate-service-account --project=$PROJECT_ID --key-file=gcp-ydata-platform-service-account.json\n
    • Test the new SA by setting the new account
        gcloud config set account $SA_EMAIL\n    gcloud config set project $PROJECT_ID\n
    • Check if you are logged in with the new SA:
        gcloud auth list\n
    • Try a command.
        gcloud container clusters list\n
    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#gcp-console","title":"GCP Console","text":"

    Go to IAM -> Service Accounts -> Create Service Account. Choose a name for the service account and click "Create and Continue". For the Roles, add the following ones (you can search by these terms and select the resulting role):

    • roles/container.admin
    • roles/compute.admin
    • roles/iam.serviceAccountAdmin
    • roles/dns.admin
    • roles/iam.roleAdmin
    • roles/resourcemanager.projectIamAdmin
    • roles/cloudsql.admin
    • roles/servicenetworking.networksAdmin
    • roles/iam.serviceAccountKeyAdmin
    • roles/serviceusage.serviceUsageAdmin
    • roles/file.editor
    • roles/storage.admin
    • roles/cloudkms.admin
    • roles/serviceusage.apiKeysAdmin
    • roles/artifactregistry.admin
    • roles/secretmanager.admin
    • roles/vpcaccess.admin
    • roles/run.admin
    • roles/deploymentmanager.editor
    • roles/cloudscheduler.admin

    After it finishes, click Continue and Done. Open the service account and create a new JSON key. The transferred key will be used by YData.

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#resource-compute-quotas","title":"Resource Compute Quotas","text":"

    Check and set (if needed) new quotas for the region where Fabric will be installed.

    • Go to IAM & Admin
    • Click \u201cQuotas & System Limits\u201d on the left
    • Filter by your region and check for the following quotas
    Quota | Recommended
    --- | ---
    CPUs (all regions) | >200*
    C2D CPUs | 200*
    N2D CPUs | 24*
    Zonal & Regional 1-10 TiB (Enterprise) capacity (GB) per region | 1024 GiB

    *Each limit will depend on the platform usage and each client's requirements.*
    • If needed, request a new limit from Google's support team:

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#network-configuration","title":"Network configuration","text":"

    Choose how you want to connect to the platform.

    In GCP, it\u2019s possible to connect to YData Fabric using your own DNS custom domain, for example: ydatafabric.yourdomain.com. (It\u2019s necessary to have a domain registered).

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#domain-name-and-gcp-cloud-dns-zone","title":"Domain Name and GCP Cloud DNS Zone","text":"

    If you have your domain registered in GCP Cloud DNS, you can use the Zone Name and the Domain Name, and the Deployment will create a Managed Certificate and the Cloud DNS record pointing to the Load Balancer used to connect the platform.

    Otherwise, if you have the domain registered in another provider, it is recommended to create a Public Cloud DNS Zone, create a new record in your provider pointing to Google's NS records, and pass this Zone Name and Domain Name, so the deployment occurs without any issues.

    If you don't want to create the Public Cloud DNS Zone, you can point your domain to the IP available after the installation by creating an A record.

    These parameters will be used during the deployment process.

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#login-provider","title":"Login Provider","text":"

    Choose how you want to log in to the platform. You can currently log in to our app using the following providers - at least one is required, but you can choose multiple ones:

    • Google
    • Microsoft
    • Cognito
    • GitHub

    You can find detailed instructions for each type of login provider on the Login Providers page. After configuring your login provider, please save the values. These values will be used during the deployment process.

    If you require another authentication method, please open a support case at support.ydata.ai.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/","title":"Login Providers","text":"

    YData Fabric offers a flexible and secure authentication system, allowing users to log in using a variety of trusted identity providers. This technical documentation provides a comprehensive guide to configuring and managing login providers for YData Fabric, including Google, Microsoft, and Amazon Cognito. By leveraging these providers, users can benefit from seamless and secure access to YData Fabric, ensuring a smooth and efficient user experience.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#google","title":"Google","text":"
    1. Open the Google Cloud Console.
    2. At the top-left, click Menu>APIs & Services>Credentials.
    3. Click Create Credentials>OAuth client ID.
    4. Click Application type>Web application.
    5. In the \"Name\" field, type a name for the credential. This name is only shown in the Cloud Console.
    6. Leave the "Authorized JavaScript origins" empty. Add a new "Authorized redirect URI" with the platform endpoint suffixed with */dex/callback* (an example redirect URI is shown after these steps). For the provided example:

      If you are using the DNS Public Endpoint, or if you are using the DNS Custom Domain:

    7. Click \u201cCreate\u201d

    8. Save the following credentials:

      • a. Client ID: the Client ID for the Web Application

      • b. Client Secret: the Client Secret for the Web Application

      • c. APP Hosted domain: Google supports whitelisting allowed domains when using G Suite. For example, for a company with emails like person@example.com, the APP Hosted domain is example.com.

    9. Use the credentials as inputs for YData Fabric.
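    As an illustration only, assuming the custom domain platform.ydata.ai used as an example elsewhere in this documentation, the authorized redirect URI would look like:

        https://platform.ydata.ai/dex/callback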

    You can find more details in Google's official documentation.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#microsoft","title":"Microsoft","text":"
    1. Open the Azure Portal
    2. Go to \u201cEntra ID\u201d
    3. Click \u201cApp registrations\u201d
    4. Click \u201cNew registration\u201d
    5. Choose a name
    6. For the supported account types, choose the most appropriate option for you.
    7. For the Redirect URI, choose \u201cWeb\u201d and fill it with the platform endpoint plus the suffix */dex/callback*. For the provided example:

      Use the endpoint that matches your setup: the DNS Public Endpoint or the DNS Custom Domain.

    8. Click \u201cRegister\u201d

    9. Go to \u201cCertificates & Secrets\u201d, generate a new secret and save the value (not the secret ID). Please choose a long expiration period. This value cannot be changed after the installation of the platform.
    10. Go to \u201cOverview\u201d and save the following credentials:

      • a. Client ID

        The Application (client) ID

      • b. Client Secret

        The secret generated in step 9 (not the secret id).

      • c. Tenant ID

        The Directory (tenant) ID

    11. Use the credentials as inputs for YData Fabric.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#consent-workflow","title":"Consent workflow","text":"

    The admin consent workflow must be configured so that you can access the platform using the app registered above.

    1. Open the Azure Portal
    2. Go to \u201cAzure Active Directory\u201d
    3. Click \u201cEnterprise applications\u201d
    4. Open the \u201cConsent and permissions\u201d page \u2192 \u201cUser consent settings\u201d
    5. Check with the AD administrator whether an administrator is required to grant consent for the app, or whether all users can consent for apps.
    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#give-access-only-to-a-set-of-users-andor-groups","title":"Give access only to a set of users and/or groups","text":"
    1. In order to give access only to a set of users or groups, open your app and click the link \u201cManaged application in local directory\u201d on the right side:
    2. Then, click \u201cProperties\u201d and enable \u201cAssignment required\u201d
    3. To add users and/or groups, go to \u201cUsers and Groups\u201d and click \u201cAdd user/group\u201d.

    With the above steps, only the users and groups listed here can access YData Fabric. For more information check Microsoft's official documentation for the Microsoft identity platform and Microsoft Entra.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#aws-cognito","title":"AWS Cognito","text":"
    1. Go to the Amazon Cognito console. If prompted, enter your AWS credentials.
    2. Choose User Pools. Create a new User Pool.
    3. The \u201cConfigure security requirements\u201d, \u201cConfigure sign-up experience\u201d and \u201cConfigure message delivery\u201d steps are up to your preferences; configure them as you see fit or leave the defaults.
    4. In the \u201cIntegrate your app\u201d step, please set the attributes as follows:

      1. \u201cUser Pool Name\u201d - a name of your choice
      2. Tick the \u201cUse the Cognito Hosted UI\u201d check box.
      3. For the \u201cDomain type\u201d, you can use a Cognito domain or a custom domain.
      4. For the \u201cInitial app client\u201d, choose \u201cPublic client\u201d and set an \u201cApp client name\u201d.
      5. For \u201cClient secret\u201d, choose \u201cGenerate a client secret\u201d.
      6. In the \u201cAllowed callback URLs\u201d, set your callback URL to the platform endpoint plus the suffix */dex/callback*. For the provided example:
      Use the endpoint that matches your setup: the DNS Public Endpoint or the DNS Custom Domain.

      1. In the \u201cAdvanced app client settings\u201d \u2192 \u201cAuthentication flows\u201d step, choose \u201cALLOW_USER_PASSWORD_AUTH\u201d
      2. For the \u201cOpenID Connect scopes\u201d choose: \u201cEmail\u201d, \u201cOpenID\u201d and \u201cProfile\u201d.
      3. Review your settings, and \u201cCreate User Pool\u201d.
      4. Click your new user pool, go to the \u201cApp integration\u201d tab and \u201cApp clients and analytics\u201d.
      5. Copy and save the Client ID and Client secret.
      6. For the \u201cIssuer URL\u201d, go to https://cognito-idp.[region].amazonaws.com/[user_pool_id]/.well-known/openid-configuration and copy and save the issuer URL (see the sketch after this list).
      7. Use these credentials as inputs for YData Fabric.
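
    A minimal Python sketch of that issuer lookup (the region and user pool id are placeholders you must replace; it only assumes Cognito's standard OpenID Connect discovery document):

        # Fetch Cognito's OpenID Connect discovery document and read the issuer URL\n    import requests\n\n    region = 'eu-west-1'                     # placeholder: your AWS region\n    user_pool_id = 'eu-west-1_XXXXXXXXX'     # placeholder: your Cognito user pool id\n\n    url = f'https://cognito-idp.{region}.amazonaws.com/{user_pool_id}/.well-known/openid-configuration'\n    issuer = requests.get(url, timeout=10).json()['issuer']\n    print(issuer)  # use this value as the 'Issuer URL' input in YData Fabric\n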
    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#adding-new-users","title":"Adding new users","text":"
    1. Go to the Cognito service.
    2. Click the YData platform Cognito user pool.
    3. Go to the Users tab
    4. Click Create user
    5. Create the users:
    6. The user will receive an e-mail with the temporary credentials.

    For more information check Amazon's official Cognito documentation on user pools and user pool app clients.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#github","title":"Github","text":"
    1. Go to the GitHub OAuth Application page. If prompted, enter your GitHub credentials.
    2. For the \u201cApplication Name\u201d, choose anything.
    3. For the \u201cHomepage URL\u201d and \u201cAuthorization callback URL\u201d, fill in the platform endpoint and the platform endpoint with the suffix */dex/callback*, respectively. For the provided example:
    Use the endpoint that matches your setup: the DNS Public Endpoint or the DNS Custom Domain.
    1. Open your new APP and generate a new secret
    2. Save the Client ID and Client secret
    3. For the org, use your GitHub organization name.

    Finally, use these credentials as inputs to log in to YData Fabric. For more information check GitHub's official login documentation.

    "},{"location":"deployment_and_security/deployment/login_support/support/","title":"Support","text":"

    The YData Fabric support ticketing mechanism is designed to ensure that our users receive timely and efficient assistance for any issues they encounter while using our platform. This guide provides an in-depth overview of how the support ticketing system works, including how to submit a ticket and communicate with our support team.

    "},{"location":"deployment_and_security/deployment/login_support/support/#submitting-a-support-ticket","title":"Submitting a Support Ticket","text":"

    While logged into your YData Fabric instance, navigate to the Support section from the main dashboard, as shown in the image below.

    To create a new ticket, make sure to fill in the following fields:

    • Subject: A summary of your problem
    • Description: The detailed description of your issue. Please make sure to be thorough in your description, as it will help the team to provide you with better support. If possible, describe the steps you took until you found the issue or the blocker that you are asking support for.
    • Fabric Modules: Optional, but highly recommended. If the issue happened while creating or interacting with the Data Catalog, Labs or Synthetic Data generation module, users can attach the operational logs (which the platform collects). The logs are purely operational, relate only to the selected component and include no user data whatsoever (for instance, datasets are never sent). The files are uploaded in the background to a location accessible by YData\u2019s support team (a private Amazon S3 bucket in the eu-west-1 region).

    Attaching these logs considerably increases the ability of YData\u2019s support team to offer timely and effective support. After receiving the ticket (and any attached logs), YData\u2019s support team will diagnose the issue and follow up via e-mail as soon as possible. E-mail is used as the default communication channel from that moment onwards.

    "},{"location":"deployment_and_security/security/","title":"Security","text":"

    This section describes YData\u2019s security measures to provide a best-in-class experience for its customers, ensuring not only a good product and service but also risk management and compliance.

    Visit YData's Trust page to check all the Policies, Controls and Monitoring in place.

    "},{"location":"deployment_and_security/security/#hosting-security","title":"Hosting security","text":"

    YData is not a cloud service provider; however, when the setup is not made on the customer premises, we use providers such as Google, Microsoft and Amazon Web Services, which host the platform in their data centers. They are leading cloud infrastructure providers with top-class safety standards and are able to respond quickly to both operational and security issues, including through well-defined change management policies and procedures that determine when and how change occurs.

    "},{"location":"deployment_and_security/security/#clouds-compliance-standards","title":"Clouds compliance standards","text":"GoogleAWSMicrosoft Azure
    • CSA
    • ISO 27018
    • SOC 3
    • ISO 27001
    • SOC 1
    • ISO 27017
    • SOC 2
    • CSA
    • ISO 27017
    • SOC 2
    • ISO 9001
    • ISO 27018
    • SOC 3
    • ISO 27001
    • SOC 1
    • CSA
    • ISO 27017
    • ISO 22301
    • SOC
    • ISO 9001
    • ISO 27018
    • ISO 20000-1
    • ISO 27001
    • ISO 27701
    • WCAG

    Both physical access perimeters and entry points are strictly controlled by professional security personnel. Authorized personnel must pass at least two-step verification to gain access to the data center floors.

    "},{"location":"deployment_and_security/security/#corporate-security","title":"Corporate security","text":"

    YData has applied internal security policies that are in line with the industry's ISO 27001 and SOC 2 standards. We regularly train our employees in security and privacy awareness, covering both technical and non-technical roles. Training materials are developed for individual roles so that employees can fulfill their responsibilities appropriately.

    • Two-step verification is enforced for all services
    • Hard drive encryption is enforced on all our devices
    • Strong password requirements and rotation are enforced
    "},{"location":"deployment_and_security/security/#verification-and-access-management","title":"Verification and Access Management","text":"

    Users can log in via a secured authentication provider, such as Security Assertion Markup Language (SAML), Microsoft Active Directory, Google Sign-In or OpenID services. All requests to any of YData\u2019s APIs must be approved. Data writing requests require at least reporting access as well as an API key. Data reading requests require full user access as well as application keys. These keys act as bearer tokens to allow access to the YData service functionality. We also use Auth0 for user identification. Auth0 never stores a plain-text password: the password is encrypted when the user logs in and compared with Auth0's stored encrypted password to check whether the user entered the correct password.

    Users can change and save their password as they wish, and can use all types of characters to strengthen it.

    "},{"location":"deployment_and_security/security/#certificate-management-communications","title":"Certificate Management & Communications","text":"

    All certificates are generated and used inside the Kubernetes cluster, using cert-manager. Exceptions for specific cloud provider certificates are described below. Every component inside the cluster uses its own certificate, sharing the same issuer, so all the components exchange encrypted communication between them.

    • AWS: The public certificate is generated using the AWS Certificate Manager service.
    • Microsoft Azure: During the deployment, a certificate is requested and provisioned by Let\u2019s Encrypt for the specified domain.

    "},{"location":"deployment_and_security/security/#protection-of-customer-data","title":"Protection of Customer Data","text":"

    User uploaded information or data is considered confidential and is stored in encrypted form, separate from other networks, including the public network where applicable. Data is not allowed to leave the platform without a user request. All data in transit is protected with transport layer security (TLS), and HTTP traffic sent by users is protected using HTTP Strict Transport Security (HSTS). The application is not usable if encrypted communication is compromised. User uploaded data is not transferred from one data center to another. Encryption is used in many places to protect customer information, such as encryption at rest, PGP encryption for system backups, KMS-based key protection and GPG encryption. Access to stored customer data for business or administrative purposes requires passing through several security levels, including multi-factor authentication (MFA).

    "},{"location":"deployment_and_security/security/#secure-build-materials-sbom","title":"Secure Build Materials (SBOM)","text":"

    To enhance transparency and facilitate security assessments, we provide access to Secure Build Materials (SBOM) for our products and services. SBOM files offer detailed insights into the components, dependencies, and associated vulnerabilities within our software stack. These files enable stakeholders, including customers, auditors, and security researchers, to evaluate the security posture of our offerings comprehensively. For access to SBOM files and additional security-related information, please visit our Security Resources page. Find more information here.

    "},{"location":"deployment_and_security/security/#certification-attestation-and-framework","title":"Certification, Attestation and Framework","text":"

    YData uses the React frontend framework (originally maintained by Facebook), which combines the use of unique user tokens to protect your users against common threats such as cross-site scripting (XSS) and cross-site request forgery (CSRF / XSRF). This makes it impossible for a user to access data from another user's account.

    "},{"location":"deployment_and_security/security/#laws-and-regulations","title":"Laws and Regulations","text":"

    The cloud service providers used by YData are compliant with the General Data Protection Regulation (GDPR). YData is working to expand its products, methods and processes to fulfill its responsibilities as a data processor. YData's security and privacy teams have established a vendor management program that determines the approvals YData needs when third parties or external vendors are involved. Our security team recognizes that the company\u2019s information resources and vendor reliance are critical to our continued activities and service delivery. These assessments are designed to evaluate technical, physical and administrative controls and to ensure they meet the expectations of YData and its customers. Infrastructure and applications are continuously monitored. Our CCPA compliance process may provide additions so that our customers can fulfill their obligations under the CCPA when there is access to personal data, while we have no plans to transfer, process, use or store personal information.

    "},{"location":"deployment_and_security/security/#data-security","title":"Data Security","text":"
    • No data ever leaves the customer's cloud.
    • All the data is stored using cloud-specific services to ensure security, privacy and compliance with YData\u2019s customers' requirements.
    "},{"location":"deployment_and_security/security/#data-encryption","title":"Data Encryption","text":"

    YData\u2019s customers communicate with the servers through SSL/TLS connections, which are encrypted. YData protects the servers where YData Fabric is deployed from DDoS, SQL injection and other fraudulent activities. Anyone attempting to intercept the data transfer would only see a scrambled mixture of characters that cannot be decrypted. All data in databases is encrypted with the industry standard AES-256.

    "},{"location":"deployment_and_security/security/#api-security","title":"API Security","text":"

    To use the API, the user needs a JWT token that is automatically generated by Fabric for a specific user. The token is signed and encrypted using a random key created during the deployment and known only by the service responsible for its provisioning.
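
    As a minimal sketch of how such a token is typically used (the endpoint path below is a placeholder for illustration, not a documented Fabric route), the JWT is attached to requests as a standard Bearer token:

        import requests\n\n    FABRIC_ENDPOINT = 'https://fabric.example.com'   # placeholder: your platform endpoint\n    JWT_TOKEN = '<your-fabric-generated-jwt>'        # the token generated by Fabric for your user\n\n    # The JWT is sent in the Authorization header as a Bearer token\n    response = requests.get(f'{FABRIC_ENDPOINT}/api/<resource>',  # placeholder path, illustration only\n                            headers={'Authorization': f'Bearer {JWT_TOKEN}'},\n                            timeout=10)\n    response.raise_for_status()\n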

    "},{"location":"deployment_and_security/security/#availability-and-disaster-recovery","title":"Availability and disaster recovery","text":"

    When using one of the cloud providers, the data stored in the bucket and database is distributed and copied to different servers. If a bucket or database fails, it is usually recovered from a different server without affecting other users. Databases are backed up on a daily basis and can be restored if the software or server fails significantly. Backups are stored in various European and North American data centers (depending on the customer location) for extra protection. It is not possible for YData to recover individual customer information - if you delete something in your account, it will be permanently deleted, and we will not be able to recover it.

    "},{"location":"deployment_and_security/security/#monitoring","title":"Monitoring","text":"

    The functionality of our applications and databases is monitored 24/7 through the built-in monitoring tools provided by Google, Azure and Amazon Web Services. Internal errors or failures of our various integrations trigger logs and notifications. This usually helps us identify the problem very quickly and remedy the situation.

    "},{"location":"deployment_and_security/security/#full-disclosure-policy","title":"Full disclosure policy","text":"

    If something serious happens and your data is compromised (such as a data breach), we will disclose it in full, as required by GDPR. Transparency is important to us and we will provide you with all the necessary information to properly assess the situation and potential impact. So far no customer data has been compromised, and we aim to keep it that way.

    "},{"location":"deployment_and_security/security/security_building_materials/","title":"Secure Build Materials (SBOM)","text":"

    To enhance transparency and facilitate security assessments, we provide access to Secure Build Materials (SBOM) for our products and services.

    SBOM files offer detailed insights into the components, dependencies, and associated vulnerabilities within our software stack. These files enable stakeholders, including customers, auditors, and security researchers, to evaluate the security posture of our offerings comprehensively.
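
    For example, a short Python sketch (assuming the SBOM file is reachable from your environment; otherwise download it first and open it with json.load) can list the components and versions declared in one of the CycloneDX files below:

        # List component names and versions from a CycloneDX SBOM file\n    import requests\n\n    sbom_url = 'https://repos-sboms.s3.eu-west-1.amazonaws.com/api-gateway/docker-sbom.cyclonedx.json'\n    sbom = requests.get(sbom_url, timeout=30).json()\n\n    for component in sbom.get('components', []):\n        print(component.get('name'), component.get('version'))\n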

    "},{"location":"deployment_and_security/security/security_building_materials/#all-files","title":"All files","text":"

    https://s3.console.aws.amazon.com/s3/buckets/repos-sboms?region=eu-west-1&bucketType=general&tab=objects

    "},{"location":"deployment_and_security/security/security_building_materials/#individual-raw-files","title":"Individual raw files","text":"
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/api-gateway/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/api-gateway/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/authentication-service/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/authentication-service/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-adapter/metering-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-adapter/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-adapter/quota-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-asg-tags-lambda/command-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-asg-tags-lambda/lambda-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-asg-tags-lambda/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/azure-adapter/metering-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/azure-adapter/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/azure-adapter/quota-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/backoffice-console/command-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/backoffice-console/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/backoffice/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/backoffice/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dashboard-app/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dashboard-app/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/datasource-controller/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/datasource-controller/manager-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/datasource-controller/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dex-theme/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dex-theme/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/dask-gateway-scheduler/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/dask-gateway-worker/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/h2oflow/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/h2oflow/gpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_python/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_python_community/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_python_tensorflow/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_python_torch/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_r/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_r/gpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/pipelines_python_tensorflow/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/pipelines_python_torch/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/pipelines_python_ydata/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/pipelines_ydata/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode/gpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode_tensorflow/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode_torch/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode_ydata/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/ydata/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/gcp-adapter/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/gcp-adapter/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/laboratory-controller/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/laboratory-controller/manager-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/laboratory-controller/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/metering-service/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/metering-service/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/profile-controller/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/profile-controller/manager-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/profile-controller/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/quota-manager/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/quota-manager/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/static-content-server/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/static-content-server/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/synthesizer-controller/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/synthesizer-controller/manager-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/synthesizer-controller/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/uploader-service/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/uploader-service/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/ydata-lib-platform-integration-tool/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/ydata-lib-platform-integration-tool/package-sbom.cyclonedx.json
    "},{"location":"get-started/","title":"Get started with Fabric","text":"

    This Get Started guide is here to help you if you are not yet familiar with YData Fabric or if you just want to learn more about data quality, data preparation workflows and how you can start leveraging synthetic data. See also the YData Fabric Community section.

    "},{"location":"get-started/#create-your-first-dataset-with-the-data-catalog","title":"\ud83d\udcda Create your first Dataset with the Data Catalog","text":""},{"location":"get-started/#create-your-multi-table-dataset-with-the-data-catalog","title":"\ud83d\udcbe Create your Multi-Table Dataset with the Data Catalog","text":""},{"location":"get-started/#create-your-first-synthetic-data-generator","title":"\u2699\ufe0f Create your first Synthetic Data generator","text":""},{"location":"get-started/#create-a-relational-database-synthetic-data-generator","title":"\ud83d\uddc4\ufe0f Create a Relational Database Synthetic Data generator","text":""},{"location":"get-started/#create-your-first-lab","title":"\ud83e\uddea Create your first Lab","text":""},{"location":"get-started/#create-your-first-data-pipeline","title":"\ud83c\udf00 Create your first data Pipeline","text":""},{"location":"get-started/create_database_sd_generator/","title":"How to create your first Relational Database Synthetic Data generator","text":"

    Check this quickstart video on how to create your first Relational Database Synthetic Data generator.

    To generate your first synthetic relational database, you need to have a multi-table dataset already available in your Data Catalog. Check this tutorial to see how you can add your first dataset to Fabric\u2019s Data Catalog.

    With your database created as a Datasource, you are now able to start configuring your Synthetic Data (SD) generator to create a replica of your database. You can either select \"Synthetic Data\" from the left side menu, or select \"Create Synthetic Data\" in your project Home, as shown in the image below.

    You'll be asked to select the dataset you wish to generate synthetic data from and verify the tables you'd like to include in the synthesis process, validating their data types - Time-series or Tabular.

    Table data types are relevant for synthetic data quality

    In case some of your tables hold time-series information (meaning there is a time relation between records), it is very important that during the process of configuring your synthetic data generator you update your tables' data types accordingly. This will not only ensure the quality of that particular table, but also the overall database quality and relations.

    All the PKs and FKs identified from the database schema definition have an anonymization setting created automatically. By default, a standard incremental integer is used as the anonymization configuration, but users can change it to other pre-defined generation options or to a regex-based one (where the user provides the expected generation pattern).

    Finally, the last step of the process is the Synthetic Data generator specific configuration; for this particular case we need to define both the Display Name and the Destination connector. The Destination connector is mandatory and allows you to select the database where the generated synthetic database is expected to be written. After providing both inputs we can finish the process by clicking the \"Save\" button, as per the image below.

    Your Synthetic Data generator is now training and listed under \"Synthetic Data\". While the model is being trained, the Status will be \ud83d\udfe1; as soon as the training is completed successfully, it will transition to \ud83d\udfe2. Once the Synthetic Data generator has finished training, you're ready to start generating your first synthetic dataset. You can start by exploring an overview of the model configurations and even validate the quality of the synthetic data generator from a referential integrity point of view.

    Next, you can generate synthetic data samples by accessing the Generation tab or clicking \"Go to Generation\". In this section, you are able to generate as many synthetic samples as you want. For that you need to define the size of your database in comparison to the real one. This ratio is provided as a percentage. In the example below, we have requested a sample with 100% size, meaning a synthetic database with the same size as the original.

    A new line in your \"Sample History\" will be shown, and as soon as the sample generation is completed you will be able to check the quality of the synthetic data already available in your destination database.

    Congrats! \ud83d\ude80 You have now successfully created your first Relational Synthetic Database with Fabric. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_lab/","title":"How to create your first Lab environment","text":"

    Labs are code environments for a more flexible development of data-driven solutions while leveraging Fabric capabilities combined with already loved tools such as scikit-learn, numpy and pandas. To create your first Lab, you can use the \u201cCreate Lab\u201d from Fabric\u2019s home, or you can access it from the Labs module by selecting it on the left side menu, and clicking the \u201cCreate Lab\u201d button.

    Next, a menu with different IDEs will be shown. As a quickstart, select Jupyter Lab. As Labs are development environments, you will also be asked which language you would prefer your environment to support: R or Python. Select Python.

    Select IDE Select language

    Bundles are environments with pre-installed packages. Select YData bundle, so we can leverage some other Fabric features such as Data Profiling, Synthetic Data and Pipelines.

    As a last step, you will be asked to configure the infrastructure resources for this new environment as well as give it a Display Name. We will keep the defaults, but you have the flexibility to select GPU acceleration or whether you need more computational resources for your developments.

    Finally, your Lab will be created and added to the \"Labs\" list, as per the image below. The status of the lab will be \ud83d\udfe1 while preparing; this process takes a few minutes, as the infrastructure is being allocated to your development environment. As soon as the status changes to \ud83d\udfe2, you can open your lab by clicking the button as shown below:

    Create a new notebook in the JupyterLab and give it a name. You are now ready to start your developments!

    Create a new notebook Notebook created

    Congrats! \ud83d\ude80 You have now successfully created your first Lab, a code environment, so you can benefit from the most advanced Fabric features as well as compose complex data workflows. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_multitable_dataset/","title":"How to create your first Relational database in Fabric's Catalog","text":"

    To create your first multi-table dataset in the Data Catalog, you can start by clicking on \"Add Dataset\" from the Home section. Or go to the Data Catalog (on the left side menu) and click \u201cAdd Dataset\u201d.

    After that the below modal will be shown. You will need to select a connector. To create a multi-table dataset, we need to choose an RDBMS connector like Azure SQL, Snowflake or MySQL. In this case let's select MySQL.

    Once you've selected the \u201cMySQL\u201d connector, a new screen will appear, enabling you to enter the connection details such as database username, host, password as well as the database name.

    With the Connector created, you'll be able to add a dataset and specify its properties:

    • Name: The name of your dataset;
    • Table: You can create a dataset with all the tables from the schema or select the tables that you need in your project.
    • Query: Create a single table dataset by providing a query

    Now both the Connector to the MySQL Berka database and the Berka dataset will be added to our Catalog. As soon as the status is green, you can navigate your Dataset. Click Open dataset as per the image below.

    Within the Dataset details, you can gain valuable insights like your database schema.

    For each and every table you can explore both an overview of the structure (number of columns, number of rows, etc.) and a useful summary of the quality and warnings regarding your dataset's behaviour.

    Congrats! \ud83d\ude80 You have now successfully created your first Connector and Multi-table Dataset in Fabric\u2019s Data Catalog. To get both the ID of your database and the ID of your project, you can decompose the URL from the Database schema overview page. The structure is as follows:

        https://fabric.ydata.ai/rdbms/{your-dataset-id}?ns={your-project-id}\n
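
    A small Python sketch of that decomposition (using the placeholder URL pattern above):

        # Extract the dataset id and the project id from the schema overview URL\n    from urllib.parse import urlparse, parse_qs\n\n    url = 'https://fabric.ydata.ai/rdbms/{your-dataset-id}?ns={your-project-id}'\n    parsed = urlparse(url)\n    dataset_id = parsed.path.rsplit('/', 1)[-1]     # {your-dataset-id}\n    project_id = parse_qs(parsed.query)['ns'][0]    # {your-project-id}\n    print(dataset_id, project_id)\n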

    Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_pipeline/","title":"How to create your first Pipeline","text":"

    Check this quickstart video on how to create your first Pipeline.

    The best way to get started with Pipelines is to use the interactive Pipeline editor available in the Labs with Jupyter Lab set as IDE. If you don't have a Lab yet, or you don't know how to create one, check our quickstart guide on how to create your first lab.

    Open an already existing lab.

    A Pipeline comprises one or more nodes that are connected (or not!) with each other to define execution dependencies. Each pipeline node is, and should be, implemented as a component that manages a single task, such as reading the data, profiling the data, training a model, or even publishing a model to production environments.

    In this tutorial we will build a simple and generic pipeline that uses a Dataset from Fabric's Data Catalog and profiles it to check its quality. We have the notebook templates already available. For that you need to access the \"Academy\" folder, as per the image below.

    Make sure to copy all the files in the folder \"3 - Pipelines/quickstart\" to the root folder of your lab, as per the image below.

    Now that we have our notebooks, we need to make a small change in the notebook \"1. Read dataset\". Go back to your Data Catalog and, from one of the datasets in your Catalog list, select the three vertical dots and click \"Explore in Labs\", as shown in the image below.

    The following screen will be shown. Click copy.

    Now that we have copied the code, let's get back to our \"1. Read data.ipynb\" notebook and replace the first code cell with the new code. This will allow us to use a dataset from the Data Catalog in our pipeline.

    Placeholder code Replaced with code snippet

    With our notebooks ready, we can now configure our Pipeline. For this quickstart we will be leveraging an already existing pipeline - double-click the file my_first_pipeline.pipeline. You should see a pipeline as depicted in the images below. To create a new Pipeline, you can open the lab launcher tab and select \"Pipeline Editor\".

    Open Pipeline My first pipeline

    Before running the pipeline, we need to check each component/step's properties and configuration. Right-click each one of the steps, select \"Open Properties\", and a menu will appear on the right side. Make sure that you have \"YData - CPU\" selected as the Runtime Image, as shown below.

    Open properties Runtime image

    We are now ready to create and run our first pipeline. In the top left corner of the pipeline editor, the run button will be available for you to click.

    Accept the default values shown in the run dialog and start the run.

    If the following message is shown, it means that you have created a run of your first pipeline.

    Now that you have created your first pipeline, you can select the Pipeline from Fabric's left side menu.

    Your most recent pipeline will be listed, as shown in the image below.

    To check the run of your pipeline, jump into the \"Run\" tab. You will be able to see your first pipeline running!

    By clicking on top of the record you will be able to see the progress of the run step-by-step, and visualize the outputs of each and every step by clicking on each step and selecting the Visualizations tab.

    Congrats! \ud83d\ude80 You have now successfully created your first Pipeline in a code environment, so you can benefit from Fabric's orchestration engine to create scalable, versionable and comparable data workflows. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_syntheticdata_generator/","title":"How to create your first Synthetic Data generator","text":"

    Check this quickstart video on how to create your first Synthetic Data generator.

    To generate your first synthetic data, you need to have a Dataset already available in your Data Catalog. Check this tutorial to see how you can add your first dataset to Fabric\u2019s Data Catalog.

    With your first dataset created, you are now able to start the creation of your Synthetic Data generator. You can either select \"Synthetic Data\" from your left side menu, or you can select \"Create Synthetic Data\" in your project Home as shown in the image below.

    You'll be asked to select the dataset you wish to generate synthetic data from and verify the columns you'd like to include in the synthesis process, validating their Variable and Data Types.

    Data types are relevant for synthetic data quality

    Data Types are important to revisit and align with the objectives for the synthetic data, as they can highly impact the quality of the generated data. For example, let's say we have a column that is a \"Name\": while in some situations it would make sense to consider it a String, in the context of a dataset where \"Name\" refers to the name of purchased products, it might be more beneficial to set it as a Category.

    Finally, the last step of the process is the Synthetic Data specific configuration; for this particular case we only need to define a Display Name, and we can finish the process by clicking the \"Save\" button, as per the image below.

    Your Synthetic Data generator is now training and listed under \"Synthetic Data\". While the model is being trained, the Status will be \ud83d\udfe1; as soon as the training is completed successfully, it will transition to \ud83d\udfe2, as per the image below.

    Once the Synthetic Data generator has finished training, you're ready to start generating your first synthetic dataset. You can start by exploring an overview of the model configurations and even download a PDF report with a comprehensive overview of your Synthetic Data Quality Metrics. Next, you can generate synthetic data samples by accessing the Generation tab or clicking \"Go to Generation\".

    In this section, you are able to generate as many synthetic samples as you want. For that you need to define the number of rows to generate and click \"Generate\", as depicted in the image below.

    A new line in your \"Sample History\" will be shown, and as soon as the sample generation is completed you will be able to \"Compare\" your synthetic data with the original data, add it as a Dataset with \"Add to Data Catalog\" and, last but not least, download it as a file with \"Download csv\".

    Congrats! \ud83d\ude80 You have now successfully created your first Synthetic Data generator with Fabric. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/fabric_community/","title":"Get started with Fabric Community","text":"

    Fabric Community is a SaaS version that allows you to explore all the functionalities of Fabric first-hand: free, forever, for everyone. You\u2019ll be able to validate your data quality with automated profiling, unlock data sharing and improve your ML models with synthetic data, and increase your productivity with seamless integration:

    • Build 1 personal project;
    • Create your first Data Catalog and benefit from automated data profiling;
    • Train up to 2 synthetic data generation models and generate synthetic data for datasets with up to 50 columns and 100K rows;
    • Optimize synthetic data quality for your use cases with an evaluation PDF report;
    • Create 1 development environment (Labs) and integrate it with your familiar ML packages and workflows.
    "},{"location":"get-started/fabric_community/#register","title":"Register","text":"

    To register for Fabric Community:

    • Access the Fabric Community Try Now and create your YData account by submitting the form
    • Check your email for your login credentials
    • Log in to fabric.ydata.ai and enjoy!

    Once you login, you'll access the Home page and get started with your data preparation!

    "},{"location":"get-started/upload_csv/","title":"How to create your first Dataset from a CSV file","text":"

    Check this quickstart video on how to create your first Dataset from a CSV file.

    To create your first dataset in the Data Catalog, you can start by clicking on \"Add Dataset\" from the Home section. Or go to the Data Catalog (on the left side menu) and click \u201cAdd Dataset\u201d.

    After that the below modal will be shown. You will need to select a connector. To upload a CSV file, we need to select \u201cUpload CSV\u201d.

    Once you've selected the \u201cUpload CSV\u201d connector, a new screen will appear, enabling you to upload your file and designate a name for your connector. This file upload connector will subsequently empower you to create one or more datasets from the same file at a later stage.

    Loading area Upload csv file

    With the Connector created, you'll be able to add a dataset and specify its properties:

    • Name: The name of your dataset;
    • Separator: This is an important parameter to make sure that we can parse your CSV correctly. The default value is \u201c,\u201d.
    • Data Type: Whether your dataset contains tabular or time-series (i.e., containing temporal dependency) data.

    Your created Connector (\u201cCensus File\u201d) and Dataset (\u201cCensus\u201d) will be added to the Data Catalog. As soon as the status is green, you can navigate your Dataset. Click Open Dataset as per the image below.

    Within the Dataset details, you can gain valuable insights through our automated data quality profiling. This includes comprehensive metadata and an overview of your data, encompassing details like row count, identification of duplicates, and insights into the overall quality of your dataset.

    Or perhaps you want to explore further, through visualization, the profile of your data with both univariate and multivariate views.

    Congrats! \ud83d\ude80 You have now successfully created your first Connector and Dataset in Fabric\u2019s Data Catalog. Get ready for your journey of improved quality data for AI.

    "},{"location":"integrations/","title":"Integrations","text":"

    Recognizing the modern enterprise data stack comprises a vast array of services and tools, YData Fabric is augmented by a growing ecosystem of partners and integrations, acting both upstream and downstream in the lifecycle of an AI project.

    The list below is a non-exhaustive compilation of MLOps, Data and Cloud Providers which smoothly integrate with Fabric:

    • DVC: Enhancing data versioning
    • Databricks: Enhancing feature/data engineering before improving with YData

      • \ud83d\udcda Follow Databricks step-by-step tutorials
      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • Snowflake: Enhancing feature/data engineering before improving with YData

      • \ud83d\udcda Follow Snowflake step-by-step tutorials
      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • H2O: Framework available through code and Fabric Labs (H2O Flow)

    • Algorithmia: Integration for easy model deployment

      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • UbiOps: Integration for easy model deployment

      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • Great Expectations: Data profiling is integrated with Great Expectations

    • Azure ML: Integration for easy model deployment

      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • AWS SageMaker: Integration for easy model deployment

      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • Google Vertex AI: Integration for easy model deployment

    Up-to-date examples

    \ud83d\udc49 For the most up-to-date examples and ready-to-use recipes of how to integrate with YData Fabric with some services above, check out the Integrations section of YData\u2019s Academy.

    "},{"location":"integrations/databricks/integration_connectors_catalog/","title":"Connectors & Catalog","text":"

    YData Fabric provides a seamless integration with Databricks, allowing you to connect, query, and manage your data in Databricks Unity Catalog and Delta Lake with ease. This section will guide you through the benefits, setup, and usage of the available Databricks connectors in Fabric.

    Prerequisites

    Before using the YData SDK in Databricks notebooks, ensure the following prerequisites are met:

    • Access to a Databricks workspace
    • A valid YData Fabric account and API key
    • Credentials for Databricks (tokens, Databricks host, warehouse, database, schema, etc.).
    "},{"location":"integrations/databricks/integration_connectors_catalog/#delta-lake","title":"Delta Lake","text":"

    Databricks Delta Lake is an open-source storage layer that brings reliability to data lakes. Built on top of Apache Spark, Delta Lake provides ACID (Atomicity, Consistency, Isolation, Durability) transaction guarantees, scalable metadata handling, and unifies streaming and batch data processing.

    This tutorial covers how you can leverage YData Fabric connectors to integrate with Databricks Delta Lake.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#setting-up-the-delta-lake-connector","title":"Setting Up the Delta Lake Connector","text":"

    To create a Delta Lake connector in the YData Fabric UI, you need to meet the pre-requisites mentioned above.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#step-by-step-creation-through-the-ui","title":"Step-by-step creation through the UI","text":"

    To create a connector in YData Fabric, select the \"Connectors\" page from the left side menu, as illustrated in the image below.

    Now, click the \"Create Connector\" button and the following menu with the available connectors will be shown.

    Depending on the cloud vendor where your Databricks instance is deployed, select the Delta Lake connector for AWS or Azure. After selecting the connector type \"Databricks Delta Lake\", the menu below will be shown. This is where you configure the connection to your Delta Lake. For that you will need the following information:

    • Databricks Host: The URL of your Databricks cluster
    • Access token: Your Databricks' user token
    • Catalog: The name of a Catalog that you want to connect to
    • Schema: The name of the schema that you want to connect to

    Depending on the cloud selected, you will be asked for the credentials to your staging storage (AWS S3 or Azure Blob Storage). In this example we are using AWS and for that reason the below inputs refer to AWS S3.

    • Key ID: The AWS access key ID used to access the staging S3 bucket.
    • Key Secret: The AWS secret access key for the staging S3 bucket.

    And finally, the name for your connector: - Display name: A unique name for your connector. Test your connection and that's it! \ud83d\ude80

    You are now ready to create different Datasources using this connector - read the data from a table, evaluate the quality of the data or even read a full database and generate a synthetic replica of your data! Read more about Fabric Datasources in here.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#use-it-inside-the-labs","title":"Use it inside the Labs","text":"

    \ud83d\udc68\u200d\ud83d\udcbb Full code example and recipe can be found here.

    In case you prefer a Python interface, we also have connectors available through Fabric SDK inside the labs. For a seamless integration between the UI and the Labs environment, Fabric offers an SDK that allows you to re-use connectors, datasources and even synthesizers.

    Start by creating your code environment through the Labs. In case you need to get started with the Labs, check this step-by-step guide.

        # Importing YData's packages\n    from ydata.labs import Connectors\n    # Getting a previously created Connector\n    connector = Connectors.get(uid='insert-connector-id',\n                               namespace='insert-namespace-id')\n    print(connector)\n
    "},{"location":"integrations/databricks/integration_connectors_catalog/#read-from-your-delta-lake","title":"Read from your Delta Lake","text":"

    Using the Delta Lake connector it is possible to do the following (a code sketch follows this list):

    • Get the data from a Delta Lake table
    • Get a sample from a Delta Lake table
    • Get the data from a query to a Delta Lake instance
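
    As a rough sketch only (the read and query method names below are assumptions made by analogy with the Delta Share examples later on this page and may differ in your SDK version), using the connector from the Labs could look like this:

        # Sketch: read data through a previously created Delta Lake connector\n    from ydata.labs import Connectors\n\n    connector = Connectors.get(uid='insert-connector-id',\n                               namespace='insert-namespace-id')\n\n    table = connector.read_table(table_name='insert-table-name')        # full table (assumed API)\n    sample = connector.read_table(table_name='insert-table-name',\n                                  sample_size=100)                      # sample (assumed API)\n    data = connector.query('SELECT * FROM insert-table-name LIMIT 10')  # query (assumed API)\n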
    "},{"location":"integrations/databricks/integration_connectors_catalog/#unity-catalog","title":"Unity Catalog","text":"

    Databricks Unity Catalog is a unified governance solution for all data and AI assets within the Databricks Lakehouse Platform.

    Databricks Unity Catalog leverages the concept of Delta Sharing, meaning this is a great way not only to ensure alignment between Catalogs but also to limit access to data. This means that by leveraging the Unity Catalog connector, users can only access the set of data assets that were authorized for a given Share.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#step-by-step-creation-through-the-ui_1","title":"Step-by-step creation through the UI","text":"

    How to create a connector to Databricks Unity Catalog in Fabric?

    The process to create a new Databricks Unity Catalog connector in YData Fabric is similar to what we have covered before.

    After selecting the connector \"Databricks Unity Catalog\", you will be requested to upload your Delta Sharing token as depicted in the image below.

    Test your connection and that's it! \ud83d\ude80

    "},{"location":"integrations/databricks/integration_connectors_catalog/#use-it-inside-the-labs_1","title":"Use it inside the Labs","text":"

    \ud83d\udc68\u200d\ud83d\udcbb Full code example and recipe can be found here.

    In case you prefer a Python interface, we also have connectors available through Fabric inside the labs. Start by creating your code environment through the Labs. In case you need to get started with the Labs, check this step-by-step guide.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#navigate-your-delta-share","title":"Navigate your Delta Share","text":"

    With your connector created you are now able to explore the schemas and tables available in a Delta share.

    List available shares
        #List the available shares for the provided authentication\n    connector.list_shares()\n
    List available schemas
        #List the available schemas for a given share\n    connector.list_schemas(share_name='teste')\n
    List available tables
        #List the available tables for a given schema in a share\n    connector.list_tables(schema_name='berka',\n                           share_name='teste')\n\n    #List all the tables regardless of share and schema\n    connector.list_all_tables()\n
    "},{"location":"integrations/databricks/integration_connectors_catalog/#read-from-your-delta-share","title":"Read from your Delta Share","text":"

    Using the Delta Lake connector it is possible to:

    • Get the data from a Delta Lake table
    • Get a sample from a Delta Lake table
    Read the data from a table
        #This method reads all the data records in the table\n    table = connector.read_table(table_name='insert-table-name',\n                                 schema_name='insert-schema-name',\n                                 share_name='insert-share-name')\n    print(table)\n
    Read a data sample from a table
        #This method reads a sample of 100 data records from the table\n    table = connector.read_table(table_name='insert-table-name',\n                                 schema_name='insert-schema-name',\n                                 share_name='insert-share-name',\n                                 sample_size=100)\n    print(table)\n

    I hope you enjoyed this quick tutorial on seamlessly integrating Databricks with your data preparation workflows. \ud83d\ude80

    "},{"location":"integrations/databricks/integration_with_sdk/","title":"YData SDK in Databricks Notebooks","text":"

    The YData Fabric SDK provides a powerful set of tools for integrating and enhancing data within Databricks notebooks. This guide covers the installation, basic usage, and advanced features of the Fabric SDK, helping users maximize the potential of their data for AI and machine learning applications.

    \ud83d\udc68\u200d\ud83d\udcbb Full code example and recipe can be found here.

    Prerequisites

    Before using the YData Fabric SDK in Databricks notebooks, ensure the following prerequisites are met:

    • Access to a Databricks workspace
    • A valid YData Fabric account and API key
    • Basic knowledge of Python and Databricks notebooks
    • A safe connection between your Databricks cluster and Fabric

    Best Practices

    • Data Security: Ensure API keys and sensitive data are securely managed.
    • Efficient Coding: Use vectorized operations for data manipulation where possible.
    • Resource Management: Monitor and manage the resources used by your clusters (Databricks and Fabric) to optimize performance.
    "},{"location":"integrations/databricks/integration_with_sdk/#installation","title":"Installation","text":"

    To install the YData SDK in a Databricks notebook, use the following command:

    %pip install ydata-sdk\ndbutils.library.restartPython()\n
    Ensure the installation is successful before proceeding to the next steps.

    "},{"location":"integrations/databricks/integration_with_sdk/#basic-usage-data-integration","title":"Basic Usage - data integration","text":"

    This section provides step-by-step instructions on connecting to YData Fabric and performing essential data operations using the YData SDK within Databricks notebooks. This includes establishing a secure connection to YData Fabric and accessing datasets.

    "},{"location":"integrations/databricks/integration_with_sdk/#connecting-to-ydata-fabric","title":"Connecting to YData Fabric","text":"

    First, establish a connection to YData Fabric using your API key:

    import os\n\n# Add your Fabric token as part of your environment variables for authentication\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'\n
    "},{"location":"integrations/databricks/integration_with_sdk/#data-access-manipulation","title":"Data access & manipulation","text":"

    Once connected, you can access and manipulate data within YData Fabric. For example, to list available datasets:

    from ydata.sdk.datasources import DataSource\n\n#return the list of available DataSources\nDataSource.list()\n

    To load a specific dataset into a Pandas DataFrame:

    #get the data from an existing datasource\ndataset = DataSource.get('<DATASOURCE-ID>')\n
    "},{"location":"integrations/databricks/integration_with_sdk/#advanced-usage-synthetic-data-generation","title":"Advanced Usage - Synthetic data generation","text":"

    This section explores one of the most powerful features of the Fabric SDK for enhancing and refining data within Databricks notebooks: generating synthetic data to augment datasets or to create privacy-preserving versions of them. By leveraging these advanced capabilities, users can significantly enhance the robustness and performance of their AI and machine learning models, unlocking the full potential of their data.

    "},{"location":"integrations/databricks/integration_with_sdk/#privacy-preserving","title":"Privacy-preserving","text":"

    Leveraging synthetic data allows you to create privacy-preserving datasets that maintain real-world value, enabling users to work with sensitive information securely while retaining the utility of the real data.

    Check the SDK documentation for more information regarding privacy-controls and anonymization.
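    As a minimal sketch (assuming the Fabric dataset loaded above contains a column named 'Name'; the rules follow the anonymize parameter of the synthesizer's fit method described later in this documentation):

    from ydata.sdk.synthesizers import RegularSynthesizer\n\n# Columns listed in the rules are replaced by fake values before the model ever sees them\nrules = {'Name': 'name'}\n\nsynth = RegularSynthesizer(name='<NAME-YOUR-MODEL>')\nsynth.fit(dataset, anonymize=rules)\n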

    "},{"location":"integrations/databricks/integration_with_sdk/#from-a-datasource-in-ydata-fabric","title":"From a datasource in YData Fabric","text":"

    Users can generate synthetic data from datasources that already exist in Fabric:

    Train a synthetic data generator
    # From an existing Fabric datasource\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n\nsynth = RegularSynthesizer(name='<NAME-YOUR-MODEL>')\nsynth.fit(X=dataset)\n

    Sample from a Synthetic data generator

    After your synthetic data generator has been trained successfully, you can generate as many synthetic datasets as needed. Sampling from the model that we have just trained:
    from ydata.sdk.synthesizers import RegularSynthesizer\nsample = synth.sample(100)\nsample.head()\n

    It is also possible to generate data from other synthetic data generation models previously trained:

    Generating synthetic data from a previously trained model
    from ydata.sdk.synthesizers import RegularSynthesizer\n\nexisting_synth = RegularSynthesizer('<INSERT-SYNTHETIC-DATA-GENERATOR-ID>').get()\nsample = existing_synth.sample(100)\n
    "},{"location":"integrations/databricks/integration_with_sdk/#from-a-datasource-in-databricks","title":"From a datasource in Databricks","text":"

    Another important integration is to train a synthetic data generator from a dataset that you are currently exploring in your notebook environment. To do so, we recommend that you create your dataset using the YData Fabric integration connector to your Delta Lake and follow the flow for the creation of synthetic data generation models from existing Fabric datasources.

    For a small dataset you can also follow this tutorial.

    "},{"location":"integrations/databricks/integration_with_sdk/#data-augmentation","title":"Data augmentation","text":"

    Another key focus is on generating synthetic data to augment existing datasets. This technique, particularly through conditional synthetic data generation, allows users to create targeted, realistic datasets. By addressing data imbalances and enriching the training data, conditional synthetic data generation significantly enhances the robustness and performance of machine learning (ML) models, leading to more accurate and reliable outcomes.

    Read data from a delta table
    # Read data from the catalog\ndf = spark.sql(\"SELECT * FROM ydata.default.credit_scoring_labeled\")\n\n# Display the dataframe\ndisplay(df)\n

    After reading the data we need to convert it to a pandas DataFrame in order to create our synthetic data generation model. For the augmentation use-case we will be leveraging conditional synthetic data generation.

    Training a conditional synthetic data generator
    from ydata.sdk.synthesizers import RegularSynthesizer\n\n# Convert Spark dataframe to pandas dataframe\npandas_df = df.toPandas()\npandas_df = pandas_df.drop('ID', axis=1)\n\n# Train a synthetic data generator using ydata-sdk\nsynth = RegularSynthesizer(name='Synth credit scoring | Conditional')\nsynth.fit(pandas_df, condition_on='Label')\n\n# Display the trained synthesizer\ndisplay(synth)\n

    Now that we have a trained conditional synthetic data generator, we are able to generate samples while controlling the population behaviour based on the column we conditioned the generation on.

    Generating a synthetic sample conditioned to column 'Label'
    #generate synthetic samples condition to Label\nsynthetic_sample = synth.sample(n_samples=len(pandas_df), condition_on={\n            \"Label\": {\n                        \"categories\": [{\n                            \"category\": 1,\n                            \"percentage\": 0.7\n                        }]\n        }\n    }\n)\n

    After generating the synthetic data we can combine it with our dataset.

    Convert the dataframe to Spark dataframe
    # Enable Arrow-based columnar data transfers\nspark.conf.set(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n\n#Create a spark dataframe from the synthetic dataframe\nsynthetic_df = spark.createDataFrame(synthetic_sample)\n\ndisplay(synthetic_df)\n
    Combining the datasets
    # Concatenate the original dataframe with the synthetic dataframe\n#removing the column ID as it is not used\ndf = df.drop('ID')\nconcatenated_df = df.union(synthetic_df)\n\n# Display the concatenated dataframe\ndisplay(concatenated_df)\n

    Afterwards you can use your augmented dataset to train a Machine Learning model using MLflow.
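    As a hedged sketch of that next step (assuming scikit-learn and MLflow are available in the cluster and that 'Label' is the target column; the model choice is illustrative, not the canonical Fabric workflow):

    import mlflow\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\n\n# Convert the augmented Spark dataframe back to pandas for model training\ntrain_df = concatenated_df.toPandas()\nX = train_df.drop('Label', axis=1)   # assumes the remaining features are numeric\ny = train_df['Label']\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\n# Autolog parameters, metrics and the model to MLflow\nmlflow.autolog()\n\nwith mlflow.start_run():\n    model = LogisticRegression(max_iter=1000)\n    model.fit(X_train, y_train)\n    print('Test accuracy:', model.score(X_test, y_test))\n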

    "},{"location":"integrations/databricks/overview/","title":"Overview","text":"

    This section provides a detailed guide on integrating YData Fabric with Databricks. By combining Databricks and YData Fabric, users gain a comprehensive AI solution. Fabric enables access to previously siloed data, enhances understanding, and improves data quality. Meanwhile, Databricks provides the scalability needed to deliver robust AI capabilities.

    "},{"location":"integrations/databricks/overview/#integration-benefits","title":"Integration benefits","text":"
    • Enhanced Data Accessibility: Seamlessly access and integrate previously siloed data.
    • Improved Data Quality: Use YData Fabric's tools to enhance the quality of your data through data preparation and augmentation.
    • Scalability: Leverage Databricks' robust infrastructure to scale data processing and AI workloads.
    • Streamlined Workflows: Simplify data workflows with connectors and SDKs, reducing manual effort and potential errors.
    • Comprehensive Support: Benefit from extensive documentation and support for both platforms, ensuring smooth integration and operation.
    "},{"location":"integrations/databricks/overview/#integration-methods","title":"Integration methods","text":""},{"location":"integrations/databricks/overview/#data-catalog-connectors","title":"Data Catalog - Connectors","text":"

    YData Fabric provides a range of connectors that enable direct integration with Databricks' Unity Catalog and Delta Lake. These connectors streamline data transfer and ensure seamless interoperability between the two platforms.

    Key Features:

    • Easy configuration
    • Secure data transfer
    • Data synchronization
    "},{"location":"integrations/databricks/overview/#sdk","title":"SDK","text":"

    The YData Fabric SDK offers a programmatic approach to integrating with Databricks. It provides developers with the tools and libraries needed to automate and customize data workflows between YData Fabric and Databricks.

    Key Features:

    • Python based interface
    • Flexible and customizable
    • Comprehensive documentation and support

    Find a comprehensive guideline on using YData Fabric SDK in Databricks Notebooks.

    "},{"location":"integrations/databricks/overview/#api","title":"API","text":"

    The YData Fabric API allows for integration via RESTful services, providing a versatile method to interact with Databricks. This approach is ideal for applications requiring direct API calls and custom integrations.

    Key Features:

    • RESTful architecture
    • Language-agnostic integration
    • Detailed API documentation
    • Support for a wide range of operations
    "},{"location":"integrations/databricks/overview/#integration-diagram","title":"Integration diagram","text":"

    The integration diagram below illustrates the interaction between YData Fabric and Databricks, highlighting the data flow and key components involved in the integration process.

    "},{"location":"integrations/snowflake/integration_snowflake/","title":"\u2744\ufe0f Integrate Fabric with Snowflake - from Analytics to Machine Learning","text":"

    YData Fabric provides a seamless integration with Snowflake, allowing you to connect, query, and manage your data in Snowflake with ease. This section will guide you through the benefits, setup, and usage of the Snowflake connector within YData Fabric.

    "},{"location":"integrations/snowflake/integration_snowflake/#benefits-of-integration","title":"Benefits of Integration","text":"

    Integrating YData Fabric with Snowflake offers several key benefits:

    • Scalability: Snowflake's architecture scales effortlessly with your data needs, while YData Fabric's tools ensure efficient data integration and management.
    • Performance: Leveraging Snowflake's high performance for data querying and YData Fabric's optimization techniques enhances overall data processing speed.
    • Security: Snowflake's robust security features, combined with YData Fabric's data governance capabilities, ensure your data remains secure and compliant.
    • Interoperability: YData Fabric simplifies the process of connecting to Snowflake, allowing you to quickly set up and start using the data without extensive configuration. Benefit from the unique Fabric functionalities like data preparation with Python, synthetic data generation and data profiling.
    "},{"location":"integrations/snowflake/integration_snowflake/#setting-up-the-snowflake-connector","title":"Setting Up the Snowflake Connector","text":"

    How to create a connector to Snowflake in Fabric?

    To create a Snowflake connector in the YData Fabric UI, you need to meet the following prerequisites and follow these steps:

    Prerequisites

    Before setting up the connector, ensure you have the following:

    • A Snowflake account with appropriate access permissions.
    • YData Fabric installed and running in your environment.
    • Credentials for Snowflake (username, password, account identifier, warehouse, database, schema).
    "},{"location":"integrations/snowflake/integration_snowflake/#step-by-step-creation-through-the-ui","title":"Step-by-step creation through the UI","text":"

    To create a connector in YData Fabric, select the \"Connectors\" page from the left side menu, as illustrated in the image below.

    Now, click on the \"Create Connector\" button and the following menu with the available connectors will be shown.

    After selecting the connector type \"Snowflake\" the below menu will be shown. This is where you can configure the connection to your Snowflake instance. For that you will need the following information:

    • Username: Your Snowflake username.
    • Password: Your Snowflake password.
    • Host/Account Identifier: Your Snowflake account identifier (e.g., xy12345.us-east-1).
    • Port: The Snowflake port number.
    • Database: The Snowflake database to connect to.
    • Schema: The schema within the database.
    • Warehouse: The Snowflake warehouse to use.
    • Display Name: A unique name for your connector.

    Test your connection and that's it! \ud83d\ude80

    You are now ready to create different Datasources using this connector - read the data from a query, evaluate the quality of the data from a table, or even read a full database and generate a synthetic replica of your data! Read more about Fabric Datasources here.

    "},{"location":"integrations/snowflake/integration_snowflake/#use-it-inside-the-labs","title":"Use it inside the Labs","text":"

    \ud83d\udc68\u200d\ud83d\udcbb Full code example and recipe can be found here.

    In case you prefer a Python interface, we also have connectors available through Fabric SDK inside the labs. For a seamless integration between the UI and the Labs environment, Fabric offers an SDK that allows you to re-use connectors, datasources and even synthesizers.

    Start by creating your code environment through the Labs. In case you need to get started with the Labs, check this step-by-step guide.

    # Importing YData's packages\n    from ydata.labs import Connectors\n    # Getting a previously created Connector\n    connector = Connectors.get(uid='insert-connector-id',\n                               namespace='insert-namespace-id')\n    print(connector)\n
    "},{"location":"integrations/snowflake/integration_snowflake/#navigate-your-database","title":"Navigate your database","text":"

    With your connector created you are now able to explore your database and available datasets.

    List available schemas and get the metadata of a given schema
        # returns a list of schemas\n    schemas = connector.list_schemas()\n\n    # get the metadata of a database schema, including columns and relations between tables (PK and FK)\n    schema = connector.get_database_schema('PATIENTS')\n
    "},{"location":"integrations/snowflake/integration_snowflake/#read-from-a-snowflake-instance","title":"Read from a Snowflake instance","text":"

    Using the Snowflake connector it is possible to:

    • Get the data from a Snowflake table
    • Get a sample from a Snowflake table
    • Get the data from a query to a Snowflake instance
    • Get the full data from a selected database
    Read full and a sample from a table
        # returns the whole data from a given table\n    table = connector.get_table('cardio_test')\n    print(table)\n\n    # Get a sample with n rows from a given table\n    table_sample = connector.get_table_sample(table='cardio_test', sample_size=50)\n    print(table_sample)\n
    Get the data from a query
    # returns the output of the given query\n    query_output = connector.query('SELECT * FROM patients.cardio_test;')\n    print(query_output)\n
    "},{"location":"integrations/snowflake/integration_snowflake/#write-to-a-snowflake-instance","title":"Write to a Snowflake instance","text":"

    If you need to write your data into a Snowflake instance you can also leverage your Snowflake connector for the following actions:

    • Write the data into a table
    • Write a new database schema

    The if_exists parameter allows you to decide whether you want to append, replace or fail in case a table with the same name already exists in the schema.

    Writing a dataset to a table in a Snowflake schema
        connector.write_table(data=tables['cardio_test'],\n                          name='cardio',\n                          if_exists='fail')\n

    table_names allows you to define new names for the tables in the database. If not provided, the table names from your dataset will be used. Writing a full database to a Snowflake schema

        connector.write_database(data=database,\n                         schema_name='new_cardio',\n                         table_names={'cardio_test': 'cardio'})\n

    I hope you enjoyed this quick tutorial on seamlessly integrating Snowflake with your data preparation workflows. \u2744\ufe0f\ud83d\ude80

    "},{"location":"labs/","title":"Fabric coding environment","text":"

    YData Fabric Labs are on-demand, cloud-based data development environments with automatically provisioned hardware (multiple infrastructure configurations, including GPUs, are possible) and full platform integration via a Python interface (allowing access to Data Sources, Synthesizers, and the Workspace\u2019s shared files).

    With Labs, you can create environments with support for familiar IDEs like Visual Studio Code, Jupyter Lab and H2O Flow, with support for both Python and R included.

    For Python specifically, pre-configured bundles including TensorFlow, PyTorch and/or the main popular data science libraries are also available, jumpstarting data development. Additional libraries can be easily installed leveraging a simple !pip install.
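    For example, inside a Lab notebook cell (the package below is just an illustration):

    # Install an additional library directly from the notebook\n!pip install xgboost\n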

    "},{"location":"labs/#get-started-with-your-first-lab","title":"Get started with your first lab","text":"

    \ud83e\uddea Follow this step-by-step guided tutorial to create your first Lab.

    "},{"location":"labs/#tutorials-recipes","title":"Tutorials & recipes","text":"

    Leverage YData's extensive collection of tutorials and recipes available in YData Academy. Quickstart or accelerate your data development with recipes and tutorial use-cases.

    "},{"location":"labs/overview/","title":"Overview","text":"

    Labs exist for Data practitioners to tackle more complex use cases through a familiar environment supercharged with infrastructure, integration with other Fabric modules and access to advanced synthesis and profiling technology via a familiar python interface.

    It is the preferred environment for Data practitioners to express their domain expertise with all the required tools, technology and computational power at their fingertips. It is thus the natural continuation of the data understanding works which started in Data Sources.

    "},{"location":"labs/overview/#supported-ides-and-images","title":"Supported IDE's and images","text":""},{"location":"labs/overview/#ides","title":"IDEs","text":"

    YData Fabric supports integration with various Integrated Development Environments (IDEs) to enhance productivity and streamline workflows. The supported IDEs include:

    • Visual Studio Code (VS Code): A highly versatile and widely-used code editor that offers robust support for numerous programming languages and frameworks. Its integration with Git and extensions like GitLens makes it ideal for version control and collaborative development.
    • Jupyter Lab: An interactive development environment that allows for notebook-based data science and machine learning workflows. It supports seamless Git integration through extensions and offers a user-friendly interface for managing code, data, and visualizations.
    • H2O Flow: A web-based interface specifically designed for machine learning and data analysis with the H2O platform. It provides a flow-based, interactive environment for building and deploying machine learning models.
    "},{"location":"labs/overview/#labs-images","title":"Labs images","text":"

    In the Labs environment, users have access to the following default images, tailored to different computational needs:

    "},{"location":"labs/overview/#python","title":"Python","text":"

    All the below images support Python as the programming language. Current Python version is x

    • YData CPU: Optimized for general-purpose computing and data analysis tasks that do not require GPU acceleration. This image includes access to YData Fabric unique capabilities for data processing (profiling, constraints engine, synthetic data generation, etc).
    • YData GPU: Designed for tasks that benefit from GPU acceleration, providing enhanced performance for large-scale data processing and machine learning operations. Also includes access to YData Fabric unique capabilities for data processing.
    • YData GPU TensorFlow: Specifically configured for TensorFlow-based machine learning and deep learning applications, leveraging GPU capabilities to accelerate training and inference processes. These images ensure that users have the necessary resources and configurations to efficiently conduct their data science and machine learning projects within the Labs environment.
    • YData GPU Torch: Specifically configured for Torch-based machine learning and deep learning applications, leveraging GPU capabilities to accelerate training and inference processes. These images ensure that users have the necessary resources and configurations to efficiently conduct their data science and machine learning projects within the Labs environment.
    "},{"location":"labs/overview/#r","title":"R","text":"

    An image for R that allows you to leverage the latest version of the language as well as the most used libraries.

    "},{"location":"labs/overview/#existing-labs","title":"Existing Labs","text":"

    Existing Labs appear in the Labs pane of the web application. Besides information about its settings and status, three buttons exist:

    • Open: Open the Lab\u2019s IDE in a new browser tab
    • Pause: Pause the Lab. When resumed, all data will be available.
    • Delete: Lab will be deleted. Data not saved in the workspace\u2019s shared folder (see below) will be deleted.

    The details list of a Lab, with the status and its main actions.

    The Status column indicates the Labs\u2019 status. A Lab can have 4 statuses:

    • \ud83d\udfe2 Lab is running
    • \ud83d\udfe1 Lab is being created (hardware is being provisioned) or is either pausing or starting
    • \ud83d\udd34 Lab was shutdown due to an error. A common error is the Lab going out-of-memory. Additional details are offered in the web application.
    • \u26ab Lab is paused
    "},{"location":"labs/overview/#git-integration","title":"Git integration","text":"

    Integrating Git with Jupyter Notebooks and Visual Studio Code (VS Code) streamlines version control and collaborative workflows for data developers. This integration allows you to track changes, manage project versions, and collaborate effectively within familiar interfaces.

    "},{"location":"labs/overview/#jupyter-lab","title":"Jupyter Lab","text":"

    Inside of Labs that use Jupyter Lab as IDE, you will find the jupyterlab-git extension installed in the environment.

    To create or clone a new repository you need to perform the following steps:

    Select the Jupyter Lab Git extension. Cloning a repository to your local environment.

    For more complex actions like forking and merging branches, see the gif below:

    "},{"location":"labs/overview/#visual-code-vs-code","title":"Visual Code (VS Code)","text":"

    To clone or create a new git repository you can click on \"Clone Git Repository...\" and paste the repository URL into the text box at the top center of the screen, as depicted in the image below.

    Clone Git repository. Cloning a repository to your local environment."},{"location":"labs/overview/#building-pipelines","title":"Building Pipelines","text":"

    Building data pipelines and breaking them down into modular components can be challenging. For instance, a typical machine learning or deep learning pipeline starts with a series of preprocessing steps, followed by experimentation and optimization, and finally deployment. Each of these stages presents unique challenges within the development lifecycle.

    Fabric Jupyter Labs simplifies this process by incorporating Elyra as the Pipeline Visual Editor. The visual editor enables users to build data pipelines from notebooks, Python scripts, and R scripts, making it easier to convert multiple notebooks or script files into batch jobs or workflows.

    Currently, these pipelines can be executed either locally in JupyterLab or on Kubeflow Pipelines, offering flexibility and scalability for various project needs. Read more about pipelines.

    "},{"location":"pipelines/","title":"Pipelines","text":"

    The Pipelines module of YData Fabric is a general-purpose job orchestrator with built-in scalability and modularity plus reporting and experiment tracking capabilities. With automatic hardware provisioning, on-demand or scheduled execution, run fingerprinting and a UI interface for review and configuration, Pipelines equip the Fabric with operational capabilities for interfacing with up/downstream systems (for instance to automate data ingestion, synthesis and transfer workflows) and with the ability to experiment at scale (crucial during the iterative development process required to discover the data improvement pipeline yielding the highest quality datasets).

    YData Fabric's Pipelines are based on Kubeflow Pipelines and can be created via an interactive interface in Labs with Jupyter Lab as the IDE (recommended) or via Kubeflow Pipeline\u2019s Python SDK.

    With its full integration with Fabric's scalable architecture and the ability to leverage Fabric\u2019s Python interface, Pipelines are the recommended tool to scale up notebook work to experiment at scale or move from experimentation to production.

    "},{"location":"pipelines/#benefits","title":"Benefits","text":"

    Using Pipelines for data preparation offers several benefits, particularly in the context of data engineering, machine learning, and data science workflows. Here are some key advantages:

    • Modularity: they allow to break down data preparation into discrete, reusable steps. Each step can be independently developed, tested, and maintained, enhancing code modularity and readability.
    • Automation: they automate the data preparation process, reducing the need for manual intervention and ensuring that data is consistently processed. This leads to more efficient workflows and saves time.
    • Scalability: Fabric's distributed infrastructure combined with kubernetes based pipelines allows to handle large volumes of data efficiently, making them suitable for big data environments.
    • Reproducibility: By defining a series of steps that transform raw data into a ready-to-use format, pipelines ensure that the same transformations are applied every time. This reproducibility is crucial for maintaining data integrity and for validating results.
    • Maintainability & Versioning: support versioning of the data preparation steps. This versioning is crucial for tracking changes, auditing processes, and rolling back to previous versions if needed.
    • Flexibility: above all, they can be customized to fit the specific requirements of different projects. They can be adapted to include various preprocessing techniques, feature engineering steps, and data validation processes.
    "},{"location":"pipelines/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 How to create your first Pipeline
    • How to build a pipeline with YData Fabric
    "},{"location":"pipelines/concepts/","title":"Concepts","text":"

    An example pipeline (as seen in the Pipelines module of the dashboard), where each single-responsibility block corresponds to a step in a typical machine learning workflow

    Each Pipeline is a set of connected blocks. A block is a self-contained set of code, packaged as a container, that performs one step in the Pipeline. Usually, each Pipeline block corresponds to a single responsibility task in a workflow. In a machine learning workflow, each step would correspond to one block, i.e, data ingestion, data cleaning, pre-processing, ML model training, ML model evaluation.

    Each block is parametrized by:

    • code: the code it executes (for instance, a Jupyter Notebook, a Python file, an R script)
    • runtime: which specifies the container environment it runs in, allowing modularization and inter-step independence of software requirements (for instance, specific Python versions for different blocks)
    • hardware requirements: depending on the workload, a block may have different needs regarding CPU/GPU/RAM. These requirements are automatically matched with the hardware availability of the cluster the Platform\u2019s running in. This, combined with the modularity of each block, allows cost and efficiency optimizations by up/downscaling hardware according to the workload.
    • file dependencies: local files that need to be copied to the container environment
    • environment variables, useful, for instance to apply specific settings or inject authentication credentials
    • output files: files generated during the block\u2019s workload, which will be made available to all subsequent Pipeline steps

    The hierarchy of a Pipeline, in an ascending manner, is as follows:

    • Run: A single execution of a Pipeline. Usually, Pipelines are run due to changes on the code, on the data sources or on its parameters (as Pipelines can have runtime parameters)
    • Experiment: Groups of runs of the same Pipeline (may have different parameters, code or settings, which are then easily comparable). All runs must have an Experiment. An Experiment can contain Runs from different Pipelines.
    • Pipeline Version: Pipeline definitions can be versioned (for instance, early iterations on the flow of operations; different versions for staging and production environments)
    • Pipeline

    \ud83d\udcd6 Get started with the concepts and a step-by-step tutorial

    "},{"location":"pipelines/concepts/#runs-recurring-runs","title":"Runs & Recurring Runs","text":"

    A run is a single execution of a pipeline. Runs comprise an immutable log of all experiments that you attempt, and are designed to be self-contained to allow for reproducibility. You can track the progress of a run by looking at its details page on the pipeline's UI, where you can see the runtime graph, output artifacts, and logs for each step in the run.

    A recurring run, or job in the backend APIs, is a repeatable run of a pipeline. The configuration for a recurring run includes a copy of a pipeline with all parameter values specified and a run trigger. You can start a recurring run inside any experiment, and it will periodically start a new copy of the run configuration. You can enable or disable the recurring run from the pipeline's UI. You can also specify the maximum number of concurrent runs to limit the number of runs launched in parallel. This can be helpful if the pipeline is expected to run for a long period and is triggered to run frequently.

    "},{"location":"pipelines/concepts/#experiment","title":"Experiment","text":"

    An experiment is a workspace where you can try different configurations of your pipelines. You can use experiments to organize your runs into logical groups. Experiments can contain arbitrary runs, including recurring runs.

    "},{"location":"pipelines/concepts/#pipeline-pipeline-version","title":"Pipeline & Pipeline Version","text":"

    A pipeline is a description of a workflow, which can include machine learning (ML) tasks, data preparation or even the generation of synthetic data. The pipeline outlines all the components involved in the workflow and illustrates how these components interrelate in the form of a graph. The pipeline configuration defines the inputs (parameters) required to run the pipeline and specifies the inputs and outputs of each component.

    When you run a pipeline, the system launches one or more Kubernetes Pods corresponding to the steps (components) in your workflow. The Pods start Docker containers, and the containers, in turn, start your programs.

    Pipelines can be easily versioned for reproducibility of results.

    "},{"location":"pipelines/concepts/#artifacts","title":"Artifacts","text":"

    For each block/step in a Run, Artifacts can be generated. Artifacts are raw output data which is automatically rendered in the Pipeline\u2019s UI in a rich manner - as formatted tables, text, charts, bar graphs/scatter plots/line graphs, ROC curves, confusion matrices or inline HTML.

    Artifacts are useful to attach, to each step/block of a data improvement workflow, relevant visualizations, summary tables, data profiling reports or text analyses. They are logged by creating a JSON file with a simple, pre-specified format (according to the output artifact type). Additional types of artifacts are supported (like binary files - models, datasets), yet will not benefit from rich visualizations in the UI.
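    As a minimal sketch of logging a metrics Artifact from inside a pipeline step (the file name and schema follow the Kubeflow Pipelines metrics convention, assumed here rather than a Fabric-specific API):

    import json\n\n# Metrics logged this way are rendered in the Run's UI and can be compared across Runs\nmetrics = {\n    \"metrics\": [\n        {\"name\": \"accuracy\", \"numberValue\": 0.93, \"format\": \"PERCENTAGE\"},\n        {\"name\": \"rows-synthesized\", \"numberValue\": 50000, \"format\": \"RAW\"},\n    ]\n}\n\n# Kubeflow Pipelines picks this file up as an output artifact of the step\nwith open(\"/mlpipeline-metrics.json\", \"w\") as file:\n    json.dump(metrics, file)\n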

    Compare side-by-side

    \ud83d\udca1 Artifacts and Metrics can be compared side-by-side across runs, which makes them a powerful tool when doing iterative experimentation over data quality improvement pipelines.

    "},{"location":"pipelines/concepts/#pipelines-examples-in-ydata-academy","title":"Pipelines examples in YData Academy","text":"

    \ud83d\udc49 Use cases on YData\u2019s Academy contain examples of full use-cases as well as Pipelines interface to log metrics and artifacts.

    "},{"location":"pipelines/runs/","title":"Creating & managing runs","text":""},{"location":"pipelines/runs/#viewing-run-details","title":"Viewing Run details","text":"

    To view a specific Run, we need to go into the Experiments list and click on the desired Run. Alternatively, it is possible to access the Runs list and select the desired Run directly.

    Accessing Runs through their Experiment

    Viewing the full list of Runs, for all Pipelines and Experiments. Runs can be filtered and sorted based on different fields (including Metrics).

    Once a Run is selected, its graph can be viewed (in real-time, if the Run is being executed). The graph shows the execution status of each block. Clicking on each block will reveal the block\u2019s details, including artifacts, various configuration details and logs (useful for troubleshooting).

    The details page of a step, showing a profiling report (as HTML) as an Artifact

    The Run Output tab includes outputs such as metrics or binary artifacts.

    "},{"location":"pipelines/runs/#creating-runs","title":"Creating Runs","text":"

    Besides triggering Execution via the pipeline editor in Jupyter Lab or the Python SDK, the Pipelines management UI can also be used.

    "},{"location":"pipelines/runs/#one-off","title":"One-off","text":"

    To create a one-off run of a Pipeline, choose a Pipeline in the Pipelines section (including the specific Pipeline version, in case there are multiple definitions) and click + Create Run.

    Creating a Run of a specific Pipeline

    To finish creating the Run, additional information is needed:

    • a Description (optional)
    • the Experiment (mandatory and can be chosen from the list of existing ones)
    • the Run Type (which should be one-off)
    • any eventual runtime parameters of the Pipeline.

    Clicking Start will trigger the execution. Each Run will have a unique, automatically created ID.

    \ud83d\udca1 One-off runs are useful, for instance, for quickly trying out different parameters or for stable data pipelines where the input data has changed (unexpectedly) and the pipeline needs to be run again."},{"location":"pipelines/runs/#recurring","title":"Recurring","text":"

    To create a Recurring Run, the procedure shown above should be followed, but instead a Recurring Run Type should be chosen.

    The main configuration parameters of a Recurring Run are the frequency, start date and end date, as well as the maximum number of concurrent Runs of the Pipeline. The maximum number of concurrent Runs is a particularly relevant parameter for Pipelines whose execution time may stretch into the next scheduled Run\u2019s start time - it should be tweaked to avoid overwhelming the available infrastructure. Recurrence can also be configured via cron-like definitions.

    Configuring a Recurrent Run

    The recurring run will keep executing until its end date or until it is manually disabled. Configured Recurring Runs are listed in the Recurring Runs section.

    \ud83d\udca1 Recurring runs are useful in several situations: - determining the average execution time of a Pipeline (in case there are run-dependent time fluctuations) - when any of the inputs (for instance, input data read from a remote location) changes at a predictable pace"},{"location":"pipelines/runs/#creating-a-pipeline","title":"Creating a Pipeline","text":"

    The recommended way to create a Pipeline is to use the interactive Pipeline editor available on Labs with Jupyter Lab set as IDE. It allows the:

    • addition of blocks by dragging and dropping notebooks/Python scripts/R scripts (can be a mixture)
    • connecting blocks in linear and non-linear ways to define the execution sequence
    • configuring the parameters of each block in-line.

    Building a simple synthetic data generation pipeline in the interactive editor by dragging and dropping Jupyter Notebooks (Python/R files could also be dragged), leveraging input files for credentials, environment variables for workflow settings, software runtime specification and per-block hardware needs.


    The built Pipeline can be run directly from the editor. It will then be automatically available in the dashboard\u2019s web UI, where it can be viewed and managed.

    \ud83d\udc49 To build Pipelines fully via code (in any Python IDE), refer to the [Kubeflow Pipelines SDK](https://www.kubeflow.org/docs/components/pipelines/sdk/sdk-overview/)."},{"location":"pipelines/runs/#managing-pipelines","title":"Managing Pipelines","text":"

    The Pipelines management interface is accessible in the platform\u2019s dashboard, via the sidebar item Pipelines.

    The Pipelines management module

    It has 6 main sub-modules:

    • Pipelines: list of existing Pipelines, which can be further drilled-down into the versions of each Pipeline, as Pipeline definitions can be versioned.
    • Experiments: a list of all available Experiments (groups of Runs), regardless of their origin Pipeline.
    • Runs: a list of all available Runs, regardless of their origin Pipeline/Experiment.
    • Recurring Runs: an interface to view and configure the Runs triggered on a schedule.
    • Artifacts: list of Artifacts generated by all Runs of all Pipelines
    • Executions: a list of all executed blocks/steps across all Runs of all Pipelines
    \ud83d\udca1 Pipelines created via code can be compiled to a `.pipeline` file, which can then be submitted via the *+ Upload pipeline* button."},{"location":"pipelines/runs/#creating-a-new-experiment","title":"Creating a new Experiment","text":"

    An experiment is used to group together the runs of a single or different Pipelines. It is particularly useful for organization and Artifacts/Metrics comparison purposes.

    To create a new Experiment, access the Experiments section and click + Create Experiment. An Experiment requires a name and an optional description.

    "},{"location":"pipelines/runs/#comparing-runs","title":"Comparing Runs","text":"

    Comparing runs is particularly useful in iterative data improvement scenarios, as Artifacts, Metrics and Parameters can be directly compared side-by-side. Runs using different pre-processing techniques, settings, algorithms can be put against each other side-by-side in a visual and intuitive interface.

    To compare multiple Runs, select the Runs of interest (either from the Experiments or Runs pane) and select Compare runs:

    Selecting Runs to compare from the Experiments list

    In case of this particular data quality improvement Pipeline, the Metrics of each Run are shown side by side.

    Up to 10 runs can be selected for side-by-side comparison. In case any step of the Run has logged Artifacts, the equivalent Artifacts are shown in a comparative interface.

    Comparing the confusion matrices of three Runs of a Pipeline, which were logged as Artifacts during one of the Pipeline\u2019s steps.

    "},{"location":"pipelines/runs/#cloning-runs","title":"Cloning Runs","text":"

    For full reproducibility purposes, it is possible to select a previous run and clone it. Cloned runs will use exactly the same runtime input parameters and settings. However, any time dependent inputs (like the state of a remote data source at a particular point in time) will not be recreated.

    To clone a Run, click the Clone run button available in a Run\u2019s detail page or in the list of Runs/Experiment (when a single Run is selected). It will be possible to review the settings prior to triggering the execution.

    "},{"location":"pipelines/runs/#archiving-runs","title":"Archiving Runs","text":"

    Archiving a Run will move it to the Archived section of the Runs and Experiments lists. This section can be used to save older executions, to highlight the best runs or to record anomalous executions which require further digging into.

    Archive a Run by clicking the Archive button from the Run\u2019s details page (or from the list of Runs/Experiments when a Run is selected).

    The Archived section, which is in all ways similar to the list of active Runs. The Restore button (highlighted) moves Runs between the two sections.

    When a Run is archived, it can be restored through the Restore button.

    \ud83d\udca1 **Learn by example** To understand how to best apply the full capabilities of Pipelines in real world use cases, check out the [use cases section of YData\u2019s Academy](https://github.com/ydataai/academy/tree/master/5%20-%20use-cases). Most use cases include a pipeline leveraging common and use case specific features of the Pipelines module. These pipelines are offered in `.pipeline` files which can be interactively explored in Jupyter Lab, inside Labs.

    "},{"location":"sdk/","title":"Overview","text":"

    The Fabric SDK is an ecosystem of methods that allows users, through a Python interface, to adopt a data development approach focused on improving the quality of the data. The solution includes a set of integrated components for data ingestion, standardized data quality evaluation and data improvement, such as synthetic data generation, allowing an iterative improvement of the datasets used in high-impact business applications.

    YData Fabric SDK for improved data quality everywhere!

    To start using it, create a Fabric community account at ydata.ai/register.

    "},{"location":"sdk/#benefits","title":"Benefits","text":"

    The Fabric SDK interface enables the integration of data quality tooling with other platforms, offering several benefits in the realm of data science development and data management:

    • Interoperability: seamless integration with other data platforms and systems like Databricks, Snowflake, etc. This ensures that all your software will work cohesively with all the elements of your data architecture.
    • Collaboration: ease of integration with a multitude of tools and services, reducing the need to reinvent the wheel and fostering a collaborative environment for all developers (data scientists, data engineers, software developers, etc.)
    • Improved usage experience: Fabric SDK enables a well-integrated software solution, which allows a seamless transition between different tools or platforms without facing compatibility issues.
    "},{"location":"sdk/#current-functionality","title":"Current functionality","text":"

    Fabric SDK is currently composed of the following main modules:

    • Datasources

      • YData\u2019s SDK includes several connectors for easy integration with existing data sources. It supports several storage types, like filesystems and RDBMS. Check the list of connectors.
      • SDK\u2019s Datasources run on top of Dask, which allows it to deal with not only small workloads but also larger volumes of data.
    • Synthetic data generators

      • Simplified interface to train a generative model and learn in a data-driven manner the behavior, the patterns and original data distribution. Optimize your model for privacy or utility use-cases.
      • From a trained synthetic data generator, you can generate synthetic samples as needed and parametrise the number of records needed.
      • Anonymization and privacy-preserving capabilities to ensure that synthetic datasets do not contain Personal Identifiable Information (PII) and can safely be shared!
      • Conditional sampling can be used to restrict the domain and values of specific features in the sampled data.
    • Synthetic data quality report Coming soon

      • An extensive synthetic data quality report that measures 3 dimensions: privacy, utility and fidelity of the generated data. The report can be downloaded in PDF format for ease of sharing and compliance purposes or as a JSON to enable the integration in data flows.
    • Profiling Coming soon

      • A set of metrics and algorithms summarizes datasets quality in three main dimensions: warnings, univariate analysis and a multivariate perspective.
    "},{"location":"sdk/#supported-data-formats","title":"Supported data formats","text":"TabularTime-SeriesTransactionalRelational databases

    The RegularSynthesizer is perfect to synthesize high-dimensional data that is time-independent, with high quality results.

    The TimeSeriesSynthesizer is perfect to synthesize both regularly and not evenly spaced time-series, from smart-sensors to stock.

    The TimeSeriesSynthesizer supports transactional data, known to have highly irregular time intervals between records and directional relations between entities.

    Coming soon

    The MultiTableSynthesizer is perfect to learn how to replicate the data within a relational database schema.

    "},{"location":"sdk/installation/","title":"Installation","text":"

    YData SDK is generally available through both PyPI and Conda, allowing an easy installation process. This experience allows combining YData SDK with other packages such as Pandas, Numpy or Scikit-Learn.

    YData SDK is available for the public through a token-based authentication system. If you don\u2019t have one yet, you can get your free license key during the installation process. You can check what features are available in the free version here.

    "},{"location":"sdk/installation/#installing-the-package","title":"Installing the package","text":"

    YData SDK supports Python versions greater than 3.8 and can be installed on Windows, Linux or macOS operating systems.

    Prior to the package installation, it is recommended the creation of a virtual or conda environment:

    pyenv
    pyenv virtualenv 3.10 ydatasdk\n

    And install ydata-sdk

    pypi
    pip install ydata-sdk\n
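    If you prefer conda for environment management, an equivalent setup would be (a sketch; the environment name is arbitrary):

    conda
    # Create and activate a conda environment, then install the SDK from PyPI\nconda create -n ydatasdk python=3.10\nconda activate ydatasdk\npip install ydata-sdk\n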
    "},{"location":"sdk/installation/#authentication","title":"Authentication","text":"

    Once you've installed the ydata-sdk package, you will need a token to run its functionalities. YData SDK uses a token-based authentication system. To get access to your token, you need to create a YData account.

    YData SDK offers a free-trial and an enterprise version. To access your free-trial token, you need to create a YData account.

    The token will be available here, after login:

    With your account token copied, you can set a new environment variable YDATA_TOKEN at the beginning of your development session.

    import os\n\n    os.environ['YDATA_TOKEN'] = '{add-your-token}'\n

    Once you have set your token, you are good to go to start exploring the incredible world of data-centric AI and smart synthetic data generation!

    Check out our quickstart guide!

    "},{"location":"sdk/quickstart/","title":"Quickstart","text":"

    YData SDK allows you, with an easy and familiar interface, to adopt a Data-Centric AI approach for the development of Machine Learning solutions. YData SDK features were designed to support structured data, including tabular data, time-series and transactional data.

    "},{"location":"sdk/quickstart/#read-data","title":"Read data","text":"

    To start leveraging the package features you should consume your data either through the Connectors or pandas.Dataframe. The list of available connectors can be found here [add a link].

    From pandas dataframeFrom a connector
        # Example for a Google Cloud Storage Connector\n    credentials = \"{insert-credentials-file-path}\"\n\n    # We create a new connector for Google Cloud Storage\n    connector = Connector(connector_type='gcs', credentials=credentials)\n\n    # Create a Datasource from the connector\n    # Note that a connector can be re-used for several datasources\n    X = DataSource(connector=connector, path='gs://<my_bucket>.csv')\n
        # Load a small dataset\n    X = pd.read_csv('{insert-file-path.csv}')\n\n    # Init a synthesizer\n    synth = RegularSynthesizer()\n\n    # Train the synthesizer with the pandas Dataframe as input\n    # The data is then sent to the cluster for processing\n    synth.fit(X)\n

    The synthesis process returns a pandas.DataFrame object. Note that if you are using the ydata-sdk free version, all of your data is sent to a remote cluster on YData's infrastructure.

    "},{"location":"sdk/quickstart/#data-synthesis-flow","title":"Data synthesis flow","text":"

    The process of data synthesis can be described in the following steps:

    stateDiagram-v2\n  state read_data\n  read_data --> init_synth\n  init_synth --> train_synth\n  train_synth --> generate_samples\n  generate_samples --> [*]

    The code snippet below shows how easy it can be to start generating new synthetic data. The package includes a set of example datasets for a quickstart.

        from ydata.sdk.dataset import get_dataset\n\n    #read the example data\n    X = get_dataset('census')\n\n    # Init a synthesizer\n    synth = RegularSynthesizer()\n\n    # Fit the synthesizer to the input data\n    synth.fit(X)\n\n    # Sample new synthetic data. The below request ask for new 1000 synthetic rows\n    synth.sample(n_samples=1000)\n

    Do I need to prepare my data before synthesis?

    The sdk ensures that the original behaviour is replicated. For that reason, there is no need to preprocess outlier observations or missing data.

    By default all the missing data is replicated as NaN.

    "},{"location":"sdk/examples/synthesize_tabular_data/","title":"Synthesize tabular data","text":"

    Use YData's RegularSynthesizer to generate tabular synthetic data

    import os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n\n\ndef main():\n    \"\"\"In this example, we demonstrate how to train a synthesizer from a pandas\n    DataFrame.\n\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\n    X = get_dataset('census')\n\n    # We initialize a regular synthesizer\n    # As long as the synthesizer does not call `fit`, it exists only locally\n    synth = RegularSynthesizer()\n\n    # We train the synthesizer on our dataset\n    synth.fit(X)\n\n    # We request a synthetic dataset with 50 rows\n    sample = synth.sample(n_samples=50)\n\n    print(sample.shape)\n\n\nif __name__ == \"__main__\":\n    main()\n
    "},{"location":"sdk/examples/synthesize_timeseries_data/","title":"Synthesize time-series data","text":"

    Use YData's TimeSeriesSynthesizer to generate time-series synthetic data

    Tabular data is the most common type of data we encounter in data problems.

    When thinking about tabular data, we assume independence between different records, but this does not happen in reality. Suppose we check events from our day-to-day life, such as room temperature changes, bank account transactions, stock price fluctuations, and air quality measurements in our neighborhood. In that case, we might end up with datasets where measures and records evolve and are related through time. This type of data is known to be sequential or time-series data.

    Thus, sequential or time-series data refers to any data containing elements ordered into sequences in a structured format. Dissecting any time-series dataset, we see differences in variables' behavior that need to be understood for an effective generation of synthetic data. Typically any time-series dataset is composed of the following:

    • Variables that define the order of time (these can be simple with one variable or composed)
    • Time-variant variables
    • Variables that refer to entities (single or multiple entities)
    • Variables that are attributes (those that don't depend on time but rather on the entity)

    Below find an example:

    import os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import TimeSeriesSynthesizer\n\n# Do not forget to add your token as env variable\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'\n\nX = get_dataset('occupancy')\n\n# We initialize a time series synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = TimeSeriesSynthesizer()\n\n# We train the synthesizer on our dataset\n# sortbykey -> variable that define the time order for the sequence\nsynth.fit(X, sortbykey='date')\n\n# By default it is requested a synthetic sample with the same length as the original data\n# The TimeSeriesSynthesizer is designed to replicate temporal series and therefore the original time-horizon is respected\nsample = synth.sample(n_entities=1)\n
    "},{"location":"sdk/examples/synthesize_with_anonymization/","title":"Anonymization","text":"

    YData Synthesizers offers a way to anonymize sensitive information such that the original values are not present in the synthetic data but replaced by fake values.

    Does the model retain the original values?

    No! The anonymization is performed before the model training such that it never sees the original values.

    The anonymization is performed by specifying which columns need to be anonymized and how to perform the anonymization. The anonymization rules are defined as a dictionary with the following format:

    {column_name: anonymization_rule}

    While there are some predefined anonymization rules such as name, email, company, it is also possible to create a rule using a regular expression. The anonymization rules have to be passed to a synthesizer in its fit method using the parameter anonymize.

    What is the difference between anonymization and privacy?

    Anonymization makes sure sensitive information is hidden from the data. Privacy makes sure it is not possible to infer the original data points from the synthetic data points via statistical attacks.

    Therefore, for data sharing anonymization and privacy controls are complementary.

    The example below demonstrates how to anonymize the column Name by fake names and the column Ticket by a regular expression:

    import os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n\n\ndef main():\n    \"\"\"In this example, we demonstrate how to train a synthesizer from a pandas\n    DataFrame.\n\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\n    X = get_dataset('titanic')\n\n    # We initialize a regular synthesizer\n    # As long as the synthesizer does not call `fit`, it exists only locally\n    synth = RegularSynthesizer(name=\"Titanic\")\n\n    # We define anonymization rules, which is a dictionary with format:\n    # {column_name: anonymization_rule, ...}\n    # while here are some predefined anonymization rules like: name, email, company\n    # it is also possible to create a rule using a regular expression\n    rules = {\n        \"Name\": \"name\",\n        \"Ticket\": \"[A-Z]{2}-[A-Z]{4}\"\n    }\n\n    # or a different option for anonymization configuration\n\n    rules = {\n        'Name': {'type': 'name'},\n        'Ticket': {'type': 'regex',\n                   'regex': '[A-Z]{2}-[A-Z]{4}'}\n    }\n\n    # We train the synthesizer on our dataset\n    synth.fit(\n        X,\n        anonymize=rules\n    )\n\n    # We request a synthetic dataset with 50 rows\n    sample = synth.sample(n_samples=50)\n\n    print(sample[[\"Name\", \"Ticket\"]].head(3))\n\n\nif __name__ == \"__main__\":\n    main()\n

    "},{"location":"sdk/examples/synthesize_with_conditional_sampling/","title":"Conditional sampling","text":"

    YData Synthesizers support conditional sampling. The fit method has an optional parameter named condition_on, which receives a list of features to condition upon. Furthermore, the sample method receives the conditions to be applied through another optional parameter also named condition_on. For now, two types of conditions are supported:

    • Condition upon a categorical (or string) feature. The parameters are the name of the feature and a list of values (i.e., categories) to be considered. Each category also has its percentage of representativeness. For example, if we want to condition upon two categories, we need to define the percentage of rows each of these categories will have in the synthetic dataset. Naturally, the sum of such percentages needs to be 1. The default percentage is also 1 since it is the required value for a single category.
    • Condition upon a numerical feature. The parameters are the name of the feature and the minimum and maximum of the range to be considered. This feature will present a uniform distribution in the synthetic dataset, limited by the specified range.

    The example below demonstrates how to train and sample from a synthesizer using conditional sampling:

    import os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n\n# Do not forget to add your token as env variables.\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined.\n\n\ndef main():\n    \"\"\"In this example, we demonstrate how to train and\n    sample from a synthesizer using conditional sampling.\"\"\"\n    X = get_dataset('census')\n\n    # We initialize a regular synthesizer.\n    # As long as the synthesizer does not call `fit`, it exists only locally.\n    synth = RegularSynthesizer()\n\n    # We train the synthesizer on our dataset setting\n    # the features to condition upon.\n    synth.fit(\n        X,\n        name=\"census_synthesizer\",\n        condition_on=[\"sex\", \"native-country\", \"age\"]\n    )\n\n    # We request a synthetic dataset with specific condition rules.\n    sample = synth.sample(\n        n_samples=500,\n        condition_on={\n            \"sex\": {\n                \"categories\": [{\n                    \"category\": 'Female',\n                    \"percentage\": 0.7\n                }]\n            },\n            \"native-country\": {\n                \"categories\": [{\n                    \"category\": 'United-States',\n                    \"percentage\": 0.6\n                }, {\n                    \"category\": 'Mexico',\n                    \"percentage\": 0.4\n                }]\n            },\n            \"age\": {\n                \"minimum\": 55,\n                \"maximum\": 60\n            }\n        }\n    )\n    print(sample)\n\n\nif __name__ == \"__main__\":\n    main()\n
    "},{"location":"sdk/examples/synthesize_with_privacy_control/","title":"Privacy control","text":"

    YData Synthesizers offer three different levels of privacy:

    1. high privacy: the model is optimized for privacy purposes,
    2. high fidelity (default): the model is optimized for high fidelity,
    3. balanced: tradeoff between privacy and fidelity.

    The default privacy level is high fidelity. The privacy level can be changed by the user at the moment a synthesizer is trained, using the parameter privacy_level. The parameter expects a PrivacyLevel value.

    What is the difference between anonymization and privacy?

    Anonymization makes sure sensitive information is hidden from the data. Privacy makes sure it is not possible to infer the original data points from the synthetic data points via statistical attacks.

    Therefore, for data sharing, anonymization and privacy controls are complementary.

    The example below demonstrates how to train a synthesizer configured for high privacy:

    import os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import PrivacyLevel, RegularSynthesizer\n\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n\n\ndef main():\n    \"\"\"In this example, we demonstrate how to train a synthesizer\n    with a high-privacy setting from a pandas DataFrame.\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\n    X = get_dataset('titanic')\n\n    # We initialize a regular synthesizer\n    # As long as the synthesizer does not call `fit`, it exists only locally\n    synth = RegularSynthesizer()\n\n    # We train the synthesizer on our dataset setting the privacy level to high\n    synth.fit(\n        X,\n        name=\"titanic_synthesizer\",\n        privacy_level=PrivacyLevel.HIGH_PRIVACY\n    )\n\n    # We request a synthetic dataset with 50 rows\n    sample = synth.sample(n_samples=50)\n    print(sample)\n\n\nif __name__ == \"__main__\":\n    main()\n
    "},{"location":"sdk/examples/synthesizer_multitable/","title":"Synthesize Relational databases","text":"

    Integrate Fabric's MultiTableSynthesizer into your data flows and generate synthetic relational databases or multi-table datasets.

    The capability to generate synthetic data from relational databases is a powerful and innovative approach to streamline access to data and improve the data democratization strategy within an organization. Fabric's SDK provides an easy-to-use code interface to integrate the generation of synthetic multi-table databases into your existing data flows.

    How to get your datasource?

    Learn how to create your multi-table data in Fabric here before creating your first multi-table synthetic data generator!

    Get your datasource and connector ID

    Datasource uid: You can find your datasource ID through the Fabric UI. Open your relational dataset and click the \"Explore in Labs\" button. Copy the uid available in the code snippet.

    Connector uid: You can find your connector ID through the Fabric UI. Open the connector tab in your Data Catalog. Under the connector's \"Actions\", select \"Explore in Lab\". Copy the uid available in the code snippet.

    Quickstart example:

    import os\n\nfrom ydata.sdk.datasources import DataSource\nfrom ydata.sdk.synthesizers import MultiTableSynthesizer\n\n# Authenticate to Fabric to leverage the SDK - https://docs.sdk.ydata.ai/latest/sdk/installation/\n# Make sure to add your token as an environment variable.\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n\n# In this example, we demonstrate how to train a synthesizer from an existing RDBMS Dataset.\n# Make sure to follow the step-by-step guide to create a Dataset in Fabric's catalog: https://docs.sdk.ydata.ai/latest/get-started/create_multitable_dataset/\nX = DataSource.get('<DATASOURCE_UID>')\n\n# Init a multi-table synthesizer. Provide a connector so that the data synthesis process writes the\n# synthetic data into the destination database.\n# Provide a connector ID as the write_connector argument. See in this tutorial how to get a connector ID.\nsynth = MultiTableSynthesizer(write_connector='<CONNECTOR_UID>')\n\n# Start the training of your synthetic data generator\nsynth.fit(X)\n\n# As soon as the training process is completed, you are able to sample a synthetic database.\n# The expected input is a percentage of the original database size.\n# In this case, a synthetic database with the same size as the original is requested.\n# Your synthetic sample is written to the database provided in the write_connector.\nsynth.sample(frac=1.)\n
    "},{"location":"sdk/modules/connectors/","title":"Connectors","text":"

    YData SDK allows users to consume data assets from remote storage through Connectors. YData Connectors support different types of storage, from filesystems to RDBMSs.

    Below is the list of available connectors:

    | Connector Name | Type | Supported File Types | Useful Links | Notes |
    |---|---|---|---|---|
    | AWS S3 | Remote object storage | CSV, Parquet | https://aws.amazon.com/s3/ | |
    | Google Cloud Storage | Remote object storage | CSV, Parquet | https://cloud.google.com/storage | |
    | Azure Blob Storage | Remote object storage | CSV, Parquet | https://azure.microsoft.com/en-us/services/storage/blobs/ | |
    | File Upload | Local | CSV | - | Maximum file size is 220MB. Bigger files should be uploaded and read from remote object storages |
    | MySQL | RDBMS | Not applicable | https://www.mysql.com/ | Supports reading whole schemas or specifying a query |
    | Azure SQL Server | RDBMS | Not applicable | https://azure.microsoft.com/en-us/services/sql-database/campaign/ | Supports reading whole schemas or specifying a query |
    | PostgreSQL | RDBMS | Not applicable | https://www.postgresql.org/ | Supports reading whole schemas or specifying a query |
    | Snowflake | RDBMS | Not applicable | https://docs.snowflake.com/en/sql-reference-commands | Supports reading whole schemas or specifying a query |
    | Google BigQuery | Data warehouse | Not applicable | https://cloud.google.com/bigquery | |
    | Azure Data Lake | Data lake | CSV, Parquet | https://azure.microsoft.com/en-us/services/storage/data-lake-storage/ | |

    More details can be found in the Connectors API Reference Docs.
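
    For illustration, the sketch below shows how one of these connectors could be created through the SDK's Connector.create method (documented in the API reference further below). The credential keys are placeholders, since the exact credential structure depends on the connector type, and a path to a JSON credentials file can be passed instead of a dictionary.

    import os\n\nfrom ydata.sdk.connectors import Connector, ConnectorType\n\n# Do not forget to add your token as an environment variable\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'\n\n# Illustrative placeholders: the exact credential keys depend on the connector type,\n# and a path to a JSON credentials file can be passed instead of a dictionary\ncredentials = {\n    \"access_key_id\": \"<AWS_ACCESS_KEY_ID>\",\n    \"secret_access_key\": \"<AWS_SECRET_ACCESS_KEY>\",\n    \"region\": \"<AWS_REGION>\"\n}\n\n# Create an AWS S3 connector; the type can also be passed as the string 'aws-s3'\nconnector = Connector.create(\n    connector_type=ConnectorType.AWS_S3,\n    credentials=credentials,\n    name=\"my-s3-connector\"\n)\nprint(connector.uid)\n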

    "},{"location":"sdk/modules/synthetic_data/","title":"Synthetic data generation","text":""},{"location":"sdk/modules/synthetic_data/#data-formats","title":"Data formats","text":""},{"location":"sdk/modules/synthetic_data/#tabular-data","title":"Tabular data","text":""},{"location":"sdk/modules/synthetic_data/#time-series-data","title":"Time-series data","text":""},{"location":"sdk/modules/synthetic_data/#transactions-data","title":"Transactions data","text":""},{"location":"sdk/modules/synthetic_data/#best-practices","title":"Best practices","text":""},{"location":"sdk/reference/api/common/client/","title":"Get client","text":"

    Deduce how to initialize or retrieve the client.

    This is meant to be a zero-configuration experience for the user.

    Create and set a client globally
    from ydata.sdk.client import get_client\nget_client(set_as_global=True)\n

    Parameters:

    Name Type Description Default client_or_creds Optional[Union[Client, dict, str, Path]]

    Client to forward or credentials for initialization

    None set_as_global bool

    If True, set client as global

    False wait_for_auth bool

    If True, wait for the user to authenticate

    True

    Returns:

    Type Description Client

    Client instance

    Source code in ydata/sdk/common/client/utils.py
    def get_client(client_or_creds: Optional[Union[Client, Dict, str, Path]] = None, set_as_global: bool = False, wait_for_auth: bool = True) -> Client:\n    \"\"\"Deduce how to initialize or retrieve the client.\n\n    This is meant to be a zero configuration for the user.\n\n    Example: Create and set a client globally\n            ```py\n            from ydata.sdk.client import get_client\n            get_client(set_as_global=True)\n            ```\n\n    Args:\n        client_or_creds (Optional[Union[Client, dict, str, Path]]): Client to forward or credentials for initialization\n        set_as_global (bool): If `True`, set client as global\n        wait_for_auth (bool): If `True`, wait for the user to authenticate\n\n    Returns:\n        Client instance\n    \"\"\"\n    client = None\n    global WAITING_FOR_CLIENT\n    try:\n\n        # If a client instance is set globally, return it\n        if not set_as_global and Client.GLOBAL_CLIENT is not None:\n            return Client.GLOBAL_CLIENT\n\n        # Client exists, forward it\n        if isinstance(client_or_creds, Client):\n            return client_or_creds\n\n        # Explicit credentials\n        ''' # For the first version, we deactivate explicit credentials via string or file for env var only\n        if isinstance(client_or_creds, (dict, str, Path)):\n            if isinstance(client_or_creds, str):  # noqa: SIM102\n                if Path(client_or_creds).is_file():\n                    client_or_creds = Path(client_or_creds)\n\n            if isinstance(client_or_creds, Path):\n                client_or_creds = json.loads(client_or_creds.open().read())\n\n            return Client(credentials=client_or_creds)\n\n        # Last try with environment variables\n        #if client_or_creds is None:\n        client = _client_from_env(wait_for_auth=wait_for_auth)\n        '''\n        credentials = environ.get(TOKEN_VAR)\n        if credentials is not None:\n            client = Client(credentials=credentials)\n\n    except ClientHandshakeError as e:\n        wait_for_auth = False  # For now deactivate wait_for_auth until the backend is ready\n        if wait_for_auth:\n            WAITING_FOR_CLIENT = True\n            start = time()\n            login_message_printed = False\n            while client is None:\n                if not login_message_printed:\n                    print(\n                        f\"The token needs to be refreshed - please validate your token by browsing at the following URL:\\n\\n\\t{e.auth_link}\")\n                    login_message_printed = True\n                with suppress(ClientCreationError):\n                    sleep(BACKOFF)\n                    client = get_client(wait_for_auth=False)\n                now = time()\n                if now - start > CLIENT_INIT_TIMEOUT:\n                    WAITING_FOR_CLIENT = False\n                    break\n\n    if client is None and not WAITING_FOR_CLIENT:\n        sys.tracebacklimit = None\n        raise ClientCreationError\n    return client\n

    Main Client class used to abstract the connection to the backend.

    A normal user should not have to instantiate a Client by themselves. However, in the future it will be useful for power users to manage projects and connections.

    Parameters:

    Name Type Description Default credentials Optional[dict]

    (optional) Credentials to connect

    None project Optional[Project]

    (optional) Project to connect to. If not specified, the client will connect to the user's default project.

    None Source code in ydata/sdk/common/client/client.py
    @typechecked\nclass Client(metaclass=SingletonClient):\n    \"\"\"Main Client class used to abstract the connection to the backend.\n\n    A normal user should not have to instanciate a [`Client`][ydata.sdk.common.client.Client] by itself.\n    However, in the future it will be useful for power-users to manage projects and connections.\n\n    Args:\n        credentials (Optional[dict]): (optional) Credentials to connect\n        project (Optional[Project]): (optional) Project to connect to. If not specified, the client will connect to the default user's project.\n    \"\"\"\n\n    codes = codes\n\n    DEFAULT_PROJECT: Optional[Project] = environ.get(\"DEFAULT_PROJECT\", None)\n\n    def __init__(self, credentials: Optional[Union[str, Dict]] = None, project: Optional[Project] = None, set_as_global: bool = False):\n        self._base_url = environ.get(\"YDATA_BASE_URL\", DEFAULT_URL).removesuffix('/')\n        self._verify_ssl = bool(int(environ.get('YDATA_VERIFY_SSL', 1)))\n        self._headers = {'Authorization': credentials}\n\n        if self._verify_ssl is False:\n            self._http_client = httpClient(\n                headers=self._headers, timeout=Timeout(10, read=None), verify=self._verify_ssl)\n        else:\n            self._http_client = httpClient(\n                headers=self._headers, timeout=Timeout(10, read=None))\n\n        self._handshake()\n\n        self._default_project = project or Client.DEFAULT_PROJECT or self._get_default_project(\n            credentials)\n        if set_as_global:\n            self.__set_global()\n\n    @property\n    def project(self) -> Project:\n        return Client.DEFAULT_PROJECT or self._default_project\n\n    @project.setter\n    def project(self, value: Project):\n        self._default_project = value\n\n    def post(\n        self, endpoint: str, content: Optional[RequestContent] = None, data: Optional[Dict] = None,\n        json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n        raise_for_status: bool = True\n    ) -> Response:\n        \"\"\"POST request to the backend.\n\n        Args:\n            endpoint (str): POST endpoint\n            content (Optional[RequestContent])\n            data (Optional[dict]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            raise_for_status (bool): raise an exception on error\n\n        Returns:\n            Response object\n        \"\"\"\n        url_data = self.__build_url(\n            endpoint, data=data, json=json, files=files, project=project)\n        response = self._http_client.post(**url_data)\n\n        if response.status_code != Client.codes.OK and raise_for_status:\n            self.__raise_for_status(response)\n\n        return response\n\n    def patch(\n        self, endpoint: str, content: Optional[RequestContent] = None, data: Optional[Dict] = None,\n        json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n        raise_for_status: bool = True\n    ) -> Response:\n        \"\"\"PATCH request to the backend.\n\n        Args:\n            endpoint (str): POST endpoint\n            content (Optional[RequestContent])\n            data (Optional[dict]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            raise_for_status (bool): raise an exception on 
error\n\n        Returns:\n            Response object\n        \"\"\"\n        url_data = self.__build_url(\n            endpoint, data=data, json=json, files=files, project=project)\n        response = self._http_client.patch(**url_data, content=content)\n\n        if response.status_code != Client.codes.OK and raise_for_status:\n            self.__raise_for_status(response)\n\n        return response\n\n    def get(\n        self, endpoint: str, params: Optional[Dict] = None, project: Optional[Project] = None,\n        cookies: Optional[Dict] = None, raise_for_status: bool = True\n    ) -> Response:\n        \"\"\"GET request to the backend.\n\n        Args:\n            endpoint (str): GET endpoint\n            cookies (Optional[dict]): (optional) cookies data\n            raise_for_status (bool): raise an exception on error\n\n        Returns:\n            Response object\n        \"\"\"\n        url_data = self.__build_url(endpoint, params=params,\n                                    cookies=cookies, project=project)\n        response = self._http_client.get(**url_data)\n\n        if response.status_code != Client.codes.OK and raise_for_status:\n            self.__raise_for_status(response)\n\n        return response\n\n    def get_static_file(\n        self, endpoint: str, project: Optional[Project] = None, raise_for_status: bool = True\n    ) -> Response:\n        \"\"\"Retrieve a static file from the backend.\n\n        Args:\n            endpoint (str): GET endpoint\n            raise_for_status (bool): raise an exception on error\n\n        Returns:\n            Response object\n        \"\"\"\n        from urllib.parse import urlparse\n        url_data = self.__build_url(endpoint, project=project)\n        url_parse = urlparse(self._base_url)\n        url_data['url'] = f\"\"\"{\n            url_parse.scheme}://{url_parse.netloc}/static-content{endpoint}\"\"\"\n        response = self._http_client.get(**url_data)\n\n        if response.status_code != Client.codes.OK and raise_for_status:\n            self.__raise_for_status(response)\n\n        return response\n\n    def _handshake(self):\n        \"\"\"Client handshake.\n\n        It is used to determine is the client can connect with its\n        current authorization token.\n        \"\"\"\n        response = self.get('/profiles', params={}, raise_for_status=False)\n        if response.status_code == Client.codes.FOUND:\n            parser = LinkExtractor()\n            parser.feed(response.text)\n            raise ClientHandshakeError(auth_link=parser.link)\n\n    def _get_default_project(self, token: str):\n        response = self.get('/profiles/me', params={}, cookies={'access_token': token})\n        data: Dict = response.json()\n        return data['myWorkspace']\n\n    def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,\n                    json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n                    cookies: Optional[Dict] = None) -> Dict:\n        \"\"\"Build a request for the backend.\n\n        Args:\n            endpoint (str): backend endpoint\n            params (Optional[dict]): URL parameters\n            data (Optional[Project]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            cookies (Optional[dict]): (optional) cookies data\n\n        Returns:\n            dictionary containing the information 
to perform a request\n        \"\"\"\n        _params = params if params is not None else {\n            'ns': project or self._default_project\n        }\n\n        url_data = {\n            'url': f\"\"\"{self._base_url}/{endpoint.removeprefix(\"/\")}\"\"\",\n            'headers': self._headers,\n            'params': _params,\n        }\n\n        if data is not None:\n            url_data['data'] = data\n\n        if json is not None:\n            url_data['json'] = json\n\n        if files is not None:\n            url_data['files'] = files\n\n        if cookies is not None:\n            url_data['cookies'] = cookies\n\n        return url_data\n\n    def __set_global(self) -> None:\n        \"\"\"Sets a client instance as global.\"\"\"\n        # If the client is stateful, close it gracefully!\n        Client.GLOBAL_CLIENT = self\n\n    def __raise_for_status(self, response: Response) -> None:\n        \"\"\"Raise an exception if the response is not OK.\n\n        When an exception is raised, we try to convert it to a ResponseError which is\n        a wrapper around a backend error. This usually gives enough context and provides\n        nice error message.\n\n        If it cannot be converted to ResponseError, it is re-raised.\n\n        Args:\n            response (Response): response to analyze\n        \"\"\"\n        try:\n            response.raise_for_status()\n        except HTTPStatusError as e:\n            with suppress(Exception):\n                e = ResponseError(**response.json())\n            raise e\n
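
    For reference, here is a minimal sketch of obtaining a client, assuming the Client class is importable from ydata.sdk.common.client as the module reference suggests. Most users only need to set the YDATA_TOKEN environment variable and never instantiate the Client directly.

    import os\n\nfrom ydata.sdk.client import get_client\nfrom ydata.sdk.common.client import Client\n\n# Zero-configuration path: set the token and let the SDK deduce the client\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'\nclient = get_client(set_as_global=True)\n\n# Power users may instantiate the Client directly with a token string\nclient = Client(credentials=os.environ[\"YDATA_TOKEN\"], set_as_global=True)\n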
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__build_url","title":"__build_url(endpoint, params=None, data=None, json=None, project=None, files=None, cookies=None)","text":"

    Build a request for the backend.

    Parameters:

    Name Type Description Default endpoint str

    backend endpoint

    required params Optional[dict]

    URL parameters

    None data Optional[Project]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None cookies Optional[dict]

    (optional) cookies data

    None

    Returns:

    Type Description Dict

    dictionary containing the information to perform a request

    Source code in ydata/sdk/common/client/client.py
    def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,\n                json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n                cookies: Optional[Dict] = None) -> Dict:\n    \"\"\"Build a request for the backend.\n\n    Args:\n        endpoint (str): backend endpoint\n        params (Optional[dict]): URL parameters\n        data (Optional[Project]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        cookies (Optional[dict]): (optional) cookies data\n\n    Returns:\n        dictionary containing the information to perform a request\n    \"\"\"\n    _params = params if params is not None else {\n        'ns': project or self._default_project\n    }\n\n    url_data = {\n        'url': f\"\"\"{self._base_url}/{endpoint.removeprefix(\"/\")}\"\"\",\n        'headers': self._headers,\n        'params': _params,\n    }\n\n    if data is not None:\n        url_data['data'] = data\n\n    if json is not None:\n        url_data['json'] = json\n\n    if files is not None:\n        url_data['files'] = files\n\n    if cookies is not None:\n        url_data['cookies'] = cookies\n\n    return url_data\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__raise_for_status","title":"__raise_for_status(response)","text":"

    Raise an exception if the response is not OK.

    When an exception is raised, we try to convert it to a ResponseError, which is a wrapper around a backend error. This usually gives enough context and provides a nice error message.

    If it cannot be converted to ResponseError, it is re-raised.

    Parameters:

    Name Type Description Default response Response

    response to analyze

    required Source code in ydata/sdk/common/client/client.py
    def __raise_for_status(self, response: Response) -> None:\n    \"\"\"Raise an exception if the response is not OK.\n\n    When an exception is raised, we try to convert it to a ResponseError which is\n    a wrapper around a backend error. This usually gives enough context and provides\n    nice error message.\n\n    If it cannot be converted to ResponseError, it is re-raised.\n\n    Args:\n        response (Response): response to analyze\n    \"\"\"\n    try:\n        response.raise_for_status()\n    except HTTPStatusError as e:\n        with suppress(Exception):\n            e = ResponseError(**response.json())\n        raise e\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__set_global","title":"__set_global()","text":"

    Sets a client instance as global.

    Source code in ydata/sdk/common/client/client.py
    def __set_global(self) -> None:\n    \"\"\"Sets a client instance as global.\"\"\"\n    # If the client is stateful, close it gracefully!\n    Client.GLOBAL_CLIENT = self\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.get","title":"get(endpoint, params=None, project=None, cookies=None, raise_for_status=True)","text":"

    GET request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    GET endpoint

    required cookies Optional[dict]

    (optional) cookies data

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def get(\n    self, endpoint: str, params: Optional[Dict] = None, project: Optional[Project] = None,\n    cookies: Optional[Dict] = None, raise_for_status: bool = True\n) -> Response:\n    \"\"\"GET request to the backend.\n\n    Args:\n        endpoint (str): GET endpoint\n        cookies (Optional[dict]): (optional) cookies data\n        raise_for_status (bool): raise an exception on error\n\n    Returns:\n        Response object\n    \"\"\"\n    url_data = self.__build_url(endpoint, params=params,\n                                cookies=cookies, project=project)\n    response = self._http_client.get(**url_data)\n\n    if response.status_code != Client.codes.OK and raise_for_status:\n        self.__raise_for_status(response)\n\n    return response\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.get_static_file","title":"get_static_file(endpoint, project=None, raise_for_status=True)","text":"

    Retrieve a static file from the backend.

    Parameters:

    Name Type Description Default endpoint str

    GET endpoint

    required raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def get_static_file(\n    self, endpoint: str, project: Optional[Project] = None, raise_for_status: bool = True\n) -> Response:\n    \"\"\"Retrieve a static file from the backend.\n\n    Args:\n        endpoint (str): GET endpoint\n        raise_for_status (bool): raise an exception on error\n\n    Returns:\n        Response object\n    \"\"\"\n    from urllib.parse import urlparse\n    url_data = self.__build_url(endpoint, project=project)\n    url_parse = urlparse(self._base_url)\n    url_data['url'] = f\"\"\"{\n        url_parse.scheme}://{url_parse.netloc}/static-content{endpoint}\"\"\"\n    response = self._http_client.get(**url_data)\n\n    if response.status_code != Client.codes.OK and raise_for_status:\n        self.__raise_for_status(response)\n\n    return response\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.patch","title":"patch(endpoint, content=None, data=None, json=None, project=None, files=None, raise_for_status=True)","text":"

    PATCH request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    POST endpoint

    required data Optional[dict]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def patch(\n    self, endpoint: str, content: Optional[RequestContent] = None, data: Optional[Dict] = None,\n    json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n    raise_for_status: bool = True\n) -> Response:\n    \"\"\"PATCH request to the backend.\n\n    Args:\n        endpoint (str): POST endpoint\n        content (Optional[RequestContent])\n        data (Optional[dict]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        raise_for_status (bool): raise an exception on error\n\n    Returns:\n        Response object\n    \"\"\"\n    url_data = self.__build_url(\n        endpoint, data=data, json=json, files=files, project=project)\n    response = self._http_client.patch(**url_data, content=content)\n\n    if response.status_code != Client.codes.OK and raise_for_status:\n        self.__raise_for_status(response)\n\n    return response\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.post","title":"post(endpoint, content=None, data=None, json=None, project=None, files=None, raise_for_status=True)","text":"

    POST request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    POST endpoint

    required data Optional[dict]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def post(\n    self, endpoint: str, content: Optional[RequestContent] = None, data: Optional[Dict] = None,\n    json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n    raise_for_status: bool = True\n) -> Response:\n    \"\"\"POST request to the backend.\n\n    Args:\n        endpoint (str): POST endpoint\n        content (Optional[RequestContent])\n        data (Optional[dict]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        raise_for_status (bool): raise an exception on error\n\n    Returns:\n        Response object\n    \"\"\"\n    url_data = self.__build_url(\n        endpoint, data=data, json=json, files=files, project=project)\n    response = self._http_client.post(**url_data)\n\n    if response.status_code != Client.codes.OK and raise_for_status:\n        self.__raise_for_status(response)\n\n    return response\n
    "},{"location":"sdk/reference/api/common/types/","title":"Types","text":""},{"location":"sdk/reference/api/connectors/connector/","title":"Connector","text":"

    Bases: ModelFactoryMixin

    A Connector allows you to connect to and access data stored in various places. The list of available connectors can be found here.

    Parameters:

    Name Type Description Default connector_type Union[ConnectorType, str]

    Type of the connector to be created

    None credentials dict

    Connector credentials

    None name Optional[str]

    (optional) Connector name

    None project Optional[Project]

    (optional) Project name for this Connector

    None client Client

    (optional) Client to connect to the backend

    None

    Attributes:

    Name Type Description uid UID

    UID of the connector instance (created internally)

    type ConnectorType

    Type of the connector

    Source code in ydata/sdk/connectors/connector.py
    class Connector(ModelFactoryMixin):\n    \"\"\"A [`Connector`][ydata.sdk.connectors.Connector] allows to connect and\n    access data stored in various places. The list of available connectors can\n    be found [here][ydata.sdk.connectors.ConnectorType].\n\n    Arguments:\n        connector_type (Union[ConnectorType, str]): Type of the connector to be created\n        credentials (dict): Connector credentials\n        name (Optional[str]): (optional) Connector name\n        project (Optional[Project]): (optional) Project name for this Connector\n        client (Client): (optional) Client to connect to the backend\n\n    Attributes:\n        uid (UID): UID fo the connector instance (creating internally)\n        type (ConnectorType): Type of the connector\n    \"\"\"\n\n    _MODEL_CLASS = mConnector\n\n    _model: Optional[mConnector]\n\n    def __init__(\n            self, connector_type: Union[ConnectorType, str, None] = None, credentials: Optional[Dict] = None,\n            name: Optional[str] = None, project: Optional[Project] = None, client: Optional[Client] = None):\n        self._init_common(client=client)\n        self._model = _connector_type_to_model(ConnectorType._init_connector_type(connector_type))._create_model(\n            connector_type, credentials, name, client=client)\n\n        self._project = project\n\n    @init_client\n    def _init_common(self, client: Optional[Client] = None):\n        self._client = client\n        self._logger = create_logger(__name__, level=LOG_LEVEL)\n\n    @property\n    def uid(self) -> UID:\n        return self._model.uid\n\n    @property\n    def name(self) -> str:\n        return self._model.name\n\n    @property\n    def type(self) -> ConnectorType:\n        return ConnectorType(self._model.type)\n\n    @property\n    def project(self) -> Project:\n        return self._project or self._client.project\n\n    @staticmethod\n    @init_client\n    def get(\n        uid: UID, project: Optional[Project] = None, client: Optional[Client] = None\n    ) -> _T:\n        \"\"\"Get an existing connector.\n\n        Arguments:\n            uid (UID): Connector identifier\n            project (Optional[Project]): (optional) Project name from where to get the connector\n            client (Optional[Client]): (optional) Client to connect to the backend\n\n        Returns:\n            Connector\n        \"\"\"\n        response = client.get(f'/connector/{uid}', project=project)\n        data = response.json()\n        data_type = data[\"type\"]\n        connector_class = _connector_type_to_model(\n            ConnectorType._init_connector_type(data_type))\n        connector = connector_class._init_from_model_data(\n            connector_class._MODEL_CLASS(**data))\n        connector._project = project\n\n        return connector\n\n    @staticmethod\n    def _init_credentials(\n        connector_type: ConnectorType, credentials: Union[str, Path, Dict, Credentials]\n    ) -> Credentials:\n        _credentials = None\n\n        if isinstance(credentials, str):\n            credentials = Path(credentials)\n\n        if isinstance(credentials, Path):\n            try:\n                _credentials = json_loads(credentials.open().read())\n            except Exception:\n                raise CredentialTypeError(\n                    'Could not read the credentials. 
Please, check your path or credentials structure.')\n\n        try:\n            from ydata.sdk.connectors._models.connector_map import TYPE_TO_CLASS\n            credential_cls = TYPE_TO_CLASS.get(connector_type.value)\n            _credentials = credential_cls(**_credentials)\n        except Exception:\n            raise CredentialTypeError(\n                \"Could not create the credentials. Verify the path or the structure your credentials.\")\n\n        return _credentials\n\n    @staticmethod\n    def create(\n        connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials],\n        name: Optional[str] = None, project: Optional[Project] = None, client: Optional[Client] = None\n    ) -> _T:\n        \"\"\"Create a new connector.\n\n        Arguments:\n            connector_type (Union[ConnectorType, str]): Type of the connector to be created\n            credentials (dict): Connector credentials\n            name (Optional[str]): (optional) Connector name\n            project (Optional[Project]): (optional) Project where to create the connector\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            New connector\n        \"\"\"\n        connector_type = ConnectorType._init_connector_type(connector_type)\n        connector_class = _connector_type_to_model(connector_type)\n\n        payload = {\n            \"type\": connector_type.value,\n            \"credentials\": credentials.dict(by_alias=True)\n        }\n        model = connector_class._create(payload, name, project, client)\n\n        connector = connector_class._init_from_model_data(model)\n        connector._project = project\n        return connector\n\n    @classmethod\n    @init_client\n    def _create(\n        cls, payload: dict, name: Optional[str] = None, project: Optional[Project] = None,\n        client: Optional[Client] = None\n    ) -> _MODEL_CLASS:\n        _name = name if name is not None else str(uuid4())\n        payload[\"name\"] = _name\n        response = client.post('/connector/', project=project, json=payload)\n        data = response.json()\n\n        return cls._MODEL_CLASS(**data)\n\n    @staticmethod\n    @init_client\n    def list(project: Optional[Project] = None, client: Optional[Client] = None) -> ConnectorsList:\n        \"\"\"List the connectors instances.\n\n        Arguments:\n            project (Optional[Project]): (optional) Project name from where to list the connectors\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            List of connectors\n        \"\"\"\n        response = client.get('/connector', project=project)\n        data: list = response.json()\n        return ConnectorsList(data)\n\n    def __repr__(self):\n        return self._model.__repr__()\n
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.create","title":"create(connector_type, credentials, name=None, project=None, client=None) staticmethod","text":"

    Create a new connector.

    Parameters:

    Name Type Description Default connector_type Union[ConnectorType, str]

    Type of the connector to be created

    required credentials dict

    Connector credentials

    required name Optional[str]

    (optional) Connector name

    None project Optional[Project]

    (optional) Project where to create the connector

    None client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description _T

    New connector

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\ndef create(\n    connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials],\n    name: Optional[str] = None, project: Optional[Project] = None, client: Optional[Client] = None\n) -> _T:\n    \"\"\"Create a new connector.\n\n    Arguments:\n        connector_type (Union[ConnectorType, str]): Type of the connector to be created\n        credentials (dict): Connector credentials\n        name (Optional[str]): (optional) Connector name\n        project (Optional[Project]): (optional) Project where to create the connector\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        New connector\n    \"\"\"\n    connector_type = ConnectorType._init_connector_type(connector_type)\n    connector_class = _connector_type_to_model(connector_type)\n\n    payload = {\n        \"type\": connector_type.value,\n        \"credentials\": credentials.dict(by_alias=True)\n    }\n    model = connector_class._create(payload, name, project, client)\n\n    connector = connector_class._init_from_model_data(model)\n    connector._project = project\n    return connector\n
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.get","title":"get(uid, project=None, client=None) staticmethod","text":"

    Get an existing connector.

    Parameters:

    Name Type Description Default uid UID

    Connector identifier

    required project Optional[Project]

    (optional) Project name from where to get the connector

    None client Optional[Client]

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description _T

    Connector

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\n@init_client\ndef get(\n    uid: UID, project: Optional[Project] = None, client: Optional[Client] = None\n) -> _T:\n    \"\"\"Get an existing connector.\n\n    Arguments:\n        uid (UID): Connector identifier\n        project (Optional[Project]): (optional) Project name from where to get the connector\n        client (Optional[Client]): (optional) Client to connect to the backend\n\n    Returns:\n        Connector\n    \"\"\"\n    response = client.get(f'/connector/{uid}', project=project)\n    data = response.json()\n    data_type = data[\"type\"]\n    connector_class = _connector_type_to_model(\n        ConnectorType._init_connector_type(data_type))\n    connector = connector_class._init_from_model_data(\n        connector_class._MODEL_CLASS(**data))\n    connector._project = project\n\n    return connector\n
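
    A short usage sketch of this method, assuming an existing connector whose uid was copied from the Fabric UI:

    from ydata.sdk.connectors import Connector\n\n# Retrieve an existing connector by its uid (copied from the Fabric UI)\nconnector = Connector.get(uid='<CONNECTOR_UID>')\nprint(connector.name, connector.type)\n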
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.list","title":"list(project=None, client=None) staticmethod","text":"

    List the connectors instances.

    Parameters:

    Name Type Description Default project Optional[Project]

    (optional) Project name from where to list the connectors

    None client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description ConnectorsList

    List of connectors

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\n@init_client\ndef list(project: Optional[Project] = None, client: Optional[Client] = None) -> ConnectorsList:\n    \"\"\"List the connectors instances.\n\n    Arguments:\n        project (Optional[Project]): (optional) Project name from where to list the connectors\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        List of connectors\n    \"\"\"\n    response = client.get('/connector', project=project)\n    data: list = response.json()\n    return ConnectorsList(data)\n
    "},{"location":"sdk/reference/api/connectors/connector/#connectortype","title":"ConnectorType","text":"

    Bases: str, Enum

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AWS_S3","title":"AWS_S3 = 'aws-s3' class-attribute instance-attribute","text":"

    AWS S3 connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AZURE_BLOB","title":"AZURE_BLOB = 'azure-blob' class-attribute instance-attribute","text":"

    Azure Blob connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AZURE_SQL","title":"AZURE_SQL = 'azure-sql' class-attribute instance-attribute","text":"

    AzureSQL connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.BIGQUERY","title":"BIGQUERY = 'google-bigquery' class-attribute instance-attribute","text":"

    BigQuery connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.FILE","title":"FILE = 'file' class-attribute instance-attribute","text":"

    File connector (placeholder)

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.GCS","title":"GCS = 'gcs' class-attribute instance-attribute","text":"

    Google Cloud Storage connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.MYSQL","title":"MYSQL = 'mysql' class-attribute instance-attribute","text":"

    MySQL connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.SNOWFLAKE","title":"SNOWFLAKE = 'snowflake' class-attribute instance-attribute","text":"

    Snowflake connector

    "},{"location":"sdk/reference/api/datasources/datasource/","title":"DataSource","text":"

    Bases: ModelFactoryMixin

    A DataSource represents a dataset to be used by a Synthesizer as training data.

    Parameters:

    Name Type Description Default connector Connector

    Connector from which the datasource is created

    required datatype Optional[Union[DataSourceType, str]]

    (optional) DataSource type

    TABULAR name Optional[str]

    (optional) DataSource name

    None project Optional[Project]

    (optional) Project name for this datasource

    None wait_for_metadata bool

    If True, wait until the metadata is fully calculated

    True client Client

    (optional) Client to connect to the backend

    None **config

    Datasource specific configuration

    {}

    Attributes:

    Name Type Description uid UID

    UID of the datasource instance

    datatype DataSourceType

    Data source type

    status Status

    Status of the datasource

    metadata Metadata

    Metadata associated to the datasource

    Source code in ydata/sdk/datasources/datasource.py
    class DataSource(ModelFactoryMixin):\n    \"\"\"A [`DataSource`][ydata.sdk.datasources.DataSource] represents a dataset\n    to be used by a Synthesizer as training data.\n\n    Arguments:\n        connector (Connector): Connector from which the datasource is created\n        datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n        name (Optional[str]): (optional) DataSource name\n        project (Optional[Project]): (optional) Project name for this datasource\n        wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n        client (Client): (optional) Client to connect to the backend\n        **config: Datasource specific configuration\n\n    Attributes:\n        uid (UID): UID fo the datasource instance\n        datatype (DataSourceType): Data source type\n        status (Status): Status of the datasource\n        metadata (Metadata): Metadata associated to the datasource\n    \"\"\"\n\n    def __init__(\n        self, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR,\n        name: Optional[str] = None, project: Optional[Project] = None, wait_for_metadata: bool = True,\n        client: Optional[Client] = None, **config\n    ):\n        datasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\n        self._init_common(client=client)\n        self._model: Optional[mDataSource] = self._create_model(\n            connector=connector, datasource_type=datasource_type, datatype=datatype,\n            config=config, name=name, client=self._client)\n\n        if wait_for_metadata:\n            self._model = DataSource._wait_for_metadata(self)._model\n\n        self._project = project\n\n    @init_client\n    def _init_common(self, client: Optional[Client] = None):\n        self._client = client\n        self._logger = create_logger(__name__, level=LOG_LEVEL)\n\n    @property\n    def uid(self) -> UID:\n        return self._model.uid\n\n    @property\n    def datatype(self) -> DataSourceType:\n        return self._model.datatype\n\n    @property\n    def project(self) -> Project:\n        return self._project or self._client.project\n\n    @property\n    def status(self) -> Status:\n        try:\n            self._model = self.get(uid=self._model.uid,\n                                   project=self.project, client=self._client)._model\n            return self._model.status\n        except Exception:  # noqa: PIE786\n            return Status.unknown()\n\n    @property\n    def metadata(self) -> Optional[Metadata]:\n        return self._model.metadata\n\n    @staticmethod\n    @init_client\n    def list(project: Optional[Project] = None, client: Optional[Client] = None) -> DataSourceList:\n        \"\"\"List the  [`DataSource`][ydata.sdk.datasources.DataSource]\n        instances.\n\n        Arguments:\n            project (Optional[Project]): (optional) Project name from where to list the datasources\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            List of datasources\n        \"\"\"\n        def __process_data(data: list) -> list:\n            to_del = ['metadata']\n            for e in data:\n                for k in to_del:\n                    e.pop(k, None)\n            return data\n\n        response = client.get('/datasource', project=project)\n        data: list = response.json()\n        data = __process_data(data)\n\n        return DataSourceList(data)\n\n    @staticmethod\n    @init_client\n    def get(uid: UID, 
project: Optional[Project] = None, client: Optional[Client] = None) -> \"DataSource\":\n        \"\"\"Get an existing [`DataSource`][ydata.sdk.datasources.DataSource].\n\n        Arguments:\n            uid (UID): DataSource identifier\n            project (Optional[Project]): (optional) Project name from where to get the connector\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            DataSource\n        \"\"\"\n        response = client.get(f'/datasource/{uid}', project=project)\n        data: list = response.json()\n        datasource_type = CONNECTOR_TO_DATASOURCE.get(\n            ConnectorType(data['connector']['type']))\n        model = DataSource._model_from_api(data, datasource_type)\n        datasource = DataSource._init_from_model_data(model)\n        datasource._project = project\n        return datasource\n\n    @classmethod\n    def create(\n        cls, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR,\n        name: Optional[str] = None, project: Optional[Project] = None, wait_for_metadata: bool = True,\n        client: Optional[Client] = None, **config\n    ) -> \"DataSource\":\n        \"\"\"Create a new [`DataSource`][ydata.sdk.datasources.DataSource].\n\n        Arguments:\n            connector (Connector): Connector from which the datasource is created\n            datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n            name (Optional[str]): (optional) DataSource name\n            project (Optional[Project]): (optional) Project name for this datasource\n            wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n            client (Client): (optional) Client to connect to the backend\n            **config: Datasource specific configuration\n\n        Returns:\n            DataSource\n        \"\"\"\n        datasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\n        return cls._create(\n            connector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name,\n            project=project, wait_for_metadata=wait_for_metadata, client=client)\n\n    @classmethod\n    def _create(\n        cls, connector: Connector, datasource_type: Type[mDataSource],\n        datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, config: Optional[Dict] = None,\n        name: Optional[str] = None, project: Optional[Project] = None, wait_for_metadata: bool = True,\n        client: Optional[Client] = None\n    ) -> \"DataSource\":\n        model = DataSource._create_model(\n            connector, datasource_type, datatype, config, name, project, client)\n        datasource = DataSource._init_from_model_data(model)\n\n        if wait_for_metadata:\n            datasource._model = DataSource._wait_for_metadata(datasource)._model\n\n        datasource._project = project\n\n        return datasource\n\n    @classmethod\n    @init_client\n    def _create_model(\n        cls, connector: Connector, datasource_type: Type[mDataSource],\n        datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, config: Optional[Dict] = None,\n        name: Optional[str] = None, project: Optional[Project] = None, client: Optional[Client] = None\n    ) -> mDataSource:\n        _name = name if name is not None else str(uuid4())\n        _config = config if config is not None else {}\n        payload = {\n            \"name\": _name,\n            \"connector\": {\n         
       \"uid\": connector.uid,\n                \"type\": ConnectorType(connector.type).value\n            },\n            \"dataType\": DataSourceType(datatype).value\n        }\n        if connector.type != ConnectorType.FILE:\n            _config = datasource_type(**config).to_payload()\n        payload.update(_config)\n        response = client.post('/datasource/', project=project, json=payload)\n        data: list = response.json()\n        return DataSource._model_from_api(data, datasource_type)\n\n    @staticmethod\n    def _wait_for_metadata(datasource):\n        logger = create_logger(__name__, level=LOG_LEVEL)\n        while State(datasource.status.state) not in [State.AVAILABLE, State.FAILED, State.UNAVAILABLE]:\n            logger.info(f'Calculating metadata [{datasource.status}]')\n            datasource = DataSource.get(uid=datasource.uid, client=datasource._client)\n            sleep(BACKOFF)\n        return datasource\n\n    @staticmethod\n    def _model_from_api(data: Dict, datasource_type: Type[mDataSource]) -> mDataSource:\n        data['datatype'] = data.pop('dataType', None)\n        data = filter_dict(datasource_type, data)\n        model = datasource_type(**data)\n        return model\n\n    def __repr__(self):\n        return self._model.__repr__()\n
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.create","title":"create(connector, datatype=DataSourceType.TABULAR, name=None, project=None, wait_for_metadata=True, client=None, **config) classmethod","text":"

    Create a new DataSource.

    Parameters:

    Name Type Description Default connector Connector

    Connector from which the datasource is created

    required datatype Optional[Union[DataSourceType, str]]

    (optional) DataSource type

    TABULAR name Optional[str]

    (optional) DataSource name

    None project Optional[Project]

    (optional) Project name for this datasource

    None wait_for_metadata bool

    If True, wait until the metadata is fully calculated

    True client Client

    (optional) Client to connect to the backend

    None **config

    Datasource specific configuration

    {}

    Returns:

    Type Description DataSource

    DataSource

    Source code in ydata/sdk/datasources/datasource.py
    @classmethod\ndef create(\n    cls, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR,\n    name: Optional[str] = None, project: Optional[Project] = None, wait_for_metadata: bool = True,\n    client: Optional[Client] = None, **config\n) -> \"DataSource\":\n    \"\"\"Create a new [`DataSource`][ydata.sdk.datasources.DataSource].\n\n    Arguments:\n        connector (Connector): Connector from which the datasource is created\n        datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n        name (Optional[str]): (optional) DataSource name\n        project (Optional[Project]): (optional) Project name for this datasource\n        wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n        client (Client): (optional) Client to connect to the backend\n        **config: Datasource specific configuration\n\n    Returns:\n        DataSource\n    \"\"\"\n    datasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\n    return cls._create(\n        connector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name,\n        project=project, wait_for_metadata=wait_for_metadata, client=client)\n
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.get","title":"get(uid, project=None, client=None) staticmethod","text":"

    Get an existing DataSource.

    Parameters:

    • uid (UID): DataSource identifier. Required.
    • project (Optional[Project]): (optional) Project name from where to get the connector. Default: None.
    • client (Client): (optional) Client to connect to the backend. Default: None.

    Returns:
    • DataSource: DataSource

    Source code in ydata/sdk/datasources/datasource.py
    @staticmethod\n@init_client\ndef get(uid: UID, project: Optional[Project] = None, client: Optional[Client] = None) -> \"DataSource\":\n    \"\"\"Get an existing [`DataSource`][ydata.sdk.datasources.DataSource].\n\n    Arguments:\n        uid (UID): DataSource identifier\n        project (Optional[Project]): (optional) Project name from where to get the connector\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        DataSource\n    \"\"\"\n    response = client.get(f'/datasource/{uid}', project=project)\n    data: list = response.json()\n    datasource_type = CONNECTOR_TO_DATASOURCE.get(\n        ConnectorType(data['connector']['type']))\n    model = DataSource._model_from_api(data, datasource_type)\n    datasource = DataSource._init_from_model_data(model)\n    datasource._project = project\n    return datasource\n
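    A short sketch of get, assuming a datasource with the placeholder UID below already exists in the active project:

        from ydata.sdk.datasources import DataSource

        # Placeholder UID of an existing datasource
        datasource = DataSource.get('<DATASOURCE_UID>')
        print(datasource.status)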
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.list","title":"list(project=None, client=None) staticmethod","text":"

    List the DataSource instances.

    Parameters:

    • project (Optional[Project]): (optional) Project name from where to list the datasources. Default: None.
    • client (Client): (optional) Client to connect to the backend. Default: None.

    Returns:
    • DataSourceList: List of datasources

    Source code in ydata/sdk/datasources/datasource.py
    @staticmethod\n@init_client\ndef list(project: Optional[Project] = None, client: Optional[Client] = None) -> DataSourceList:\n    \"\"\"List the  [`DataSource`][ydata.sdk.datasources.DataSource]\n    instances.\n\n    Arguments:\n        project (Optional[Project]): (optional) Project name from where to list the datasources\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        List of datasources\n    \"\"\"\n    def __process_data(data: list) -> list:\n        to_del = ['metadata']\n        for e in data:\n            for k in to_del:\n                e.pop(k, None)\n        return data\n\n    response = client.get('/datasource', project=project)\n    data: list = response.json()\n    data = __process_data(data)\n\n    return DataSourceList(data)\n
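    A short sketch of list; it returns a DataSourceList that can be printed or iterated:

        from ydata.sdk.datasources import DataSource

        datasources = DataSource.list()
        print(datasources)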
    "},{"location":"sdk/reference/api/datasources/datasource/#status","title":"Status","text":"

    Bases: BaseModel

    "},{"location":"sdk/reference/api/datasources/datasource/#datasourcetype","title":"DataSourceType","text":"

    Bases: StringEnum

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.MULTITABLE","title":"MULTITABLE = 'multiTable' class-attribute instance-attribute","text":"

    The DataSource is a multi-table RDBMS.

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.TABULAR","title":"TABULAR = 'tabular' class-attribute instance-attribute","text":"

    The DataSource is tabular (i.e. it does not have a temporal dimension).

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.TIMESERIES","title":"TIMESERIES = 'timeseries' class-attribute instance-attribute","text":"

    The DataSource has a temporal dimension.

    "},{"location":"sdk/reference/api/datasources/metadata/","title":"Metadata","text":"

    Bases: BaseModel

    The Metadata object contains descriptive information about a DataSource.

    Attributes:
    • columns (List[Column]): columns information
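    A short sketch of inspecting the metadata of an available datasource; the column attribute names (name, datatype, vartype) are inferred from how the SDK source shown later in this reference reads them, and the UID is a placeholder:

        from ydata.sdk.datasources import DataSource

        datasource = DataSource.get('<DATASOURCE_UID>')
        for column in datasource.metadata.columns:
            print(column.name, column.datatype, column.vartype)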

    "},{"location":"sdk/reference/api/synthesizers/base/","title":"Synthesizer","text":"

    Bases: ABC, ModelFactoryMixin

    Main synthesizer class.

    This class cannot be directly instantiated because of the specificities among the RegularSynthesizer, TimeSeriesSynthesizer, and MultiTableSynthesizer sample methods.

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer--methods","title":"Methods","text":"
    • fit: train a synthesizer instance.
    • sample: request synthetic data.
    • status: current status of the synthesizer instance.
    Note

    The synthesizer instance is created in the backend only when the fit method is called.

    Parameters:

    • client (Client): (optional) Client to connect to the backend. Default: None.

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @typechecked\nclass BaseSynthesizer(ABC, ModelFactoryMixin):\n    \"\"\"Main synthesizer class.\n\n    This class cannot be directly instanciated because of the specificities between [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer], [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] or [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer] `sample` methods.\n\n    Methods\n    -------\n    - `fit`: train a synthesizer instance.\n    - `sample`: request synthetic data.\n    - `status`: current status of the synthesizer instance.\n\n    Note:\n            The synthesizer instance is created in the backend only when the `fit` method is called.\n\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    \"\"\"\n\n    def __init__(\n            self, uid: Optional[UID] = None, name: Optional[str] = None,\n            project: Optional[Project] = None, client: Optional[Client] = None):\n        self._init_common(client=client)\n        self._model = mSynthesizer(uid=uid, name=name or str(uuid4()))\n        self._project = project\n\n    @init_client\n    def _init_common(self, client: Optional[Client] = None):\n        self._client = client\n        self._logger = create_logger(__name__, level=LOG_LEVEL)\n\n    @property\n    def project(self) -> Project:\n        return self._project or self._client.project\n\n    def fit(self, X: Union[DataSource, pdDataFrame],\n            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n            datatype: Optional[Union[DataSourceType, str]] = None,\n            sortbykey: Optional[Union[str, List[str]]] = None,\n            entities: Optional[Union[str, List[str]]] = None,\n            generate_cols: Optional[List[str]] = None,\n            exclude_cols: Optional[List[str]] = None,\n            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n            target: Optional[str] = None,\n            anonymize: Optional[dict] = None,\n            condition_on: Optional[List[str]] = None) -> None:\n        \"\"\"Fit the synthesizer.\n\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.\n\n        The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].\n\n        By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.\n        The argument `exclude_cols` has precedence over `generate_cols`, i.e. 
a column `col` will not be generated if it is in both list.\n\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]\n            sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Target for the dataset\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\n        if self._already_fitted():\n            raise AlreadyFittedError()\n\n        datatype = DataSourceType(datatype)\n\n        dataset_attrs = self._init_datasource_attributes(\n            sortbykey, entities, generate_cols, exclude_cols, dtypes)\n        self._validate_datasource_attributes(X, dataset_attrs, datatype, target)\n\n        # If the training data is a pandas dataframe, we first need to create a data source and then the instance\n        if isinstance(X, pdDataFrame):\n            if X.empty:\n                raise EmptyDataError(\"The DataFrame is empty\")\n            self._logger.info('creating local connector with pandas dataframe')\n            connector = LocalConnector.create(\n                source=X, project=self._project, client=self._client)\n            self._logger.info(\n                f'created local connector. 
creating datasource with {connector}')\n            _X = LocalDataSource(connector=connector, project=self._project,\n                                 datatype=datatype, client=self._client)\n            self._logger.info(f'created datasource {_X}')\n        else:\n            _X = X\n\n        if dsState(_X.status.state) != dsState.AVAILABLE:\n            raise DataSourceNotAvailableError(\n                f\"The datasource '{_X.uid}' is not available (status = {_X.status})\")\n\n        if isinstance(dataset_attrs, dict):\n            dataset_attrs = DataSourceAttrs(**dataset_attrs)\n\n        self._fit_from_datasource(\n            X=_X, datatype=datatype, dataset_attrs=dataset_attrs, target=target,\n            anonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)\n\n    @staticmethod\n    def _init_datasource_attributes(\n            sortbykey: Optional[Union[str, List[str]]],\n            entities: Optional[Union[str, List[str]]],\n            generate_cols: Optional[List[str]],\n            exclude_cols: Optional[List[str]],\n            dtypes: Optional[Dict[str, Union[str, DataType]]]) -> DataSourceAttrs:\n        dataset_attrs = {\n            'sortbykey': sortbykey if sortbykey is not None else [],\n            'entities': entities if entities is not None else [],\n            'generate_cols': generate_cols if generate_cols is not None else [],\n            'exclude_cols': exclude_cols if exclude_cols is not None else [],\n            'dtypes': {k: DataType(v) for k, v in dtypes.items()} if dtypes is not None else {}\n        }\n        return DataSourceAttrs(**dataset_attrs)\n\n    @staticmethod\n    def _validate_datasource_attributes(X: Union[DataSource, pdDataFrame], dataset_attrs: DataSourceAttrs, datatype: DataSourceType, target: Optional[str]):\n        columns = []\n        if isinstance(X, pdDataFrame):\n            columns = X.columns\n            if datatype is None:\n                raise DataTypeMissingError(\n                    \"Argument `datatype` is mandatory for pandas.DataFrame training data\")\n        else:\n            columns = [c.name for c in X.metadata.columns]\n\n        if target is not None and target not in columns:\n            raise DataSourceAttrsError(\n                \"Invalid target: column '{target}' does not exist\")\n\n        if datatype == DataSourceType.TIMESERIES:\n            if not dataset_attrs.sortbykey:\n                raise DataSourceAttrsError(\n                    \"The argument `sortbykey` is mandatory for timeseries datasource.\")\n\n        invalid_fields = {}\n        for field, v in dataset_attrs.dict().items():\n            field_columns = v if field != 'dtypes' else v.keys()\n            not_in_cols = [c for c in field_columns if c not in columns]\n            if len(not_in_cols) > 0:\n                invalid_fields[field] = not_in_cols\n\n        if len(invalid_fields) > 0:\n            error_msgs = [\"\\t- Field '{}': columns {} do not exist\".format(\n                f, ', '.join(v)) for f, v in invalid_fields.items()]\n            raise DataSourceAttrsError(\n                \"The dataset attributes are invalid:\\n {}\".format('\\n'.join(error_msgs)))\n\n    @staticmethod\n    def _metadata_to_payload(\n        datatype: DataSourceType, ds_metadata: Metadata,\n        dataset_attrs: Optional[DataSourceAttrs] = None, target: Optional[str] = None\n    ) -> dict:\n        \"\"\"Transform a the metadata and dataset attributes into a valid\n        payload.\n\n        Arguments:\n            
datatype (DataSourceType): datasource type\n            ds_metadata (Metadata): datasource metadata object\n            dataset_attrs ( Optional[DataSourceAttrs] ): (optional) Dataset attributes\n            target (Optional[str]): (optional) target column name\n\n        Returns:\n            metadata payload dictionary\n        \"\"\"\n\n        columns = [\n            {\n                'name': c.name,\n                'generation': True and c.name not in dataset_attrs.exclude_cols,\n                'dataType': DataType(dataset_attrs.dtypes[c.name]).value if c.name in dataset_attrs.dtypes else c.datatype,\n                'varType': c.vartype,\n            }\n            for c in ds_metadata.columns]\n\n        metadata = {\n            'columns': columns,\n            'target': target\n        }\n\n        if dataset_attrs is not None:\n            if datatype == DataSourceType.TIMESERIES:\n                metadata['sortBy'] = [c for c in dataset_attrs.sortbykey]\n                metadata['entity'] = [c for c in dataset_attrs.entities]\n\n        return metadata\n\n    def _fit_from_datasource(\n        self,\n        X: DataSource,\n        datatype: DataSourceType,\n        privacy_level: Optional[PrivacyLevel] = None,\n        dataset_attrs: Optional[DataSourceAttrs] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None\n    ) -> None:\n        payload = self._create_payload()\n\n        payload['dataSourceUID'] = X.uid\n\n        if privacy_level:\n            payload['privacyLevel'] = privacy_level.value\n\n        if X.metadata is not None:\n            payload['metadata'] = self._metadata_to_payload(\n                datatype, X.metadata, dataset_attrs, target)\n\n        payload['type'] = str(datatype.value)\n\n        if anonymize is not None:\n            # process and validated the anonymization config shared by the end user\n            anonymize = build_and_validate_anonimization(\n                anonimyze=anonymize, cols=[col.name for col in X.metadata.columns])\n            payload[\"extraData\"][\"anonymize\"] = anonymize\n        if condition_on is not None:\n            payload[\"extraData\"][\"condition_on\"] = condition_on\n\n        response = self._client.post(\n            '/synthesizer/', json=payload, project=self._project)\n        data = response.json()\n        self._model = mSynthesizer(**data)\n        while self._check_fitting_not_finished(self.status):\n            self._logger.info('Training the synthesizer...')\n            sleep(BACKOFF)\n\n    def _create_payload(self) -> dict:\n        payload = {\n            'extraData': {}\n        }\n\n        if self._model and self._model.name:\n            payload['name'] = self._model.name\n\n        return payload\n\n    def _check_fitting_not_finished(self, status: Status) -> bool:\n        self._logger.debug(f'checking status {status}')\n\n        if Status.State(status.state) in [Status.State.READY, Status.State.REPORT]:\n            return False\n\n        self._logger.debug(f'status not ready yet {status.state}')\n\n        if status.prepare and PrepareState(status.prepare.state) == PrepareState.FAILED:\n            raise FittingError('Could not train the synthesizer')\n\n        if status.training and TrainingState(status.training.state) == TrainingState.FAILED:\n            raise FittingError('Could not train the synthesizer')\n\n        return True\n\n    @abstractmethod\n    def sample(self) -> pdDataFrame:\n    
    \"\"\"Abstract method to sample from a synthesizer.\"\"\"\n\n    def _sample(self, payload: Dict) -> pdDataFrame:\n        \"\"\"Sample from a synthesizer.\n\n        Arguments:\n            payload (dict): payload configuring the sample request\n\n        Returns:\n            pandas `DataFrame`\n        \"\"\"\n        response = self._client.post(\n            f\"/synthesizer/{self.uid}/sample\", json=payload, project=self._project)\n\n        data: Dict = response.json()\n        sample_uid = data.get('uid')\n        sample_status = None\n        while sample_status not in ['finished', 'failed']:\n            self._logger.info('Sampling from the synthesizer...')\n            response = self._client.get(\n                f'/synthesizer/{self.uid}/history', project=self._project)\n            history: Dict = response.json()\n            sample_data = next((s for s in history if s.get('uid') == sample_uid), None)\n            sample_status = sample_data.get('status', {}).get('state')\n            sleep(BACKOFF)\n\n        response = self._client.get_static_file(\n            f'/synthesizer/{self.uid}/sample/{sample_uid}/sample.csv', project=self._project)\n        data = StringIO(response.content.decode())\n        return read_csv(data)\n\n    @property\n    def uid(self) -> UID:\n        \"\"\"Get the status of a synthesizer instance.\n\n        Returns:\n            Synthesizer status\n        \"\"\"\n        if not self._is_initialized():\n            return Status.State.NOT_INITIALIZED\n\n        return self._model.uid\n\n    @property\n    def status(self) -> Status:\n        \"\"\"Get the status of a synthesizer instance.\n\n        Returns:\n            Synthesizer status\n        \"\"\"\n        if not self._is_initialized():\n            return Status.not_initialized()\n\n        try:\n            self = self.get()\n            return self._model.status\n        except Exception:  # noqa: PIE786\n            return Status.unknown()\n\n    def get(self):\n        assert self._is_initialized() and self._model.uid, InputError(\n            \"Please provide the synthesizer `uid`\")\n\n        response = self._client.get(f'/synthesizer/{self.uid}', project=self._project)\n        data = response.json()\n        self._model = mSynthesizer(**data)\n\n        return self\n\n    @staticmethod\n    @init_client\n    def list(client: Optional[Client] = None) -> SynthesizersList:\n        \"\"\"List the synthesizer instances.\n\n        Arguments:\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            List of synthesizers\n        \"\"\"\n        def __process_data(data: list) -> list:\n            to_del = ['metadata', 'report', 'mode']\n            for e in data:\n                for k in to_del:\n                    e.pop(k, None)\n            return data\n\n        response = client.get('/synthesizer')\n        data: list = response.json()\n        data = __process_data(data)\n\n        return SynthesizersList(data)\n\n    def _is_initialized(self) -> bool:\n        \"\"\"Determine if a synthesizer is instanciated or not.\n\n        Returns:\n            True if the synthesizer is instanciated\n        \"\"\"\n        return self._model is not None\n\n    def _already_fitted(self) -> bool:\n        \"\"\"Determine if a synthesizer is already fitted.\n\n        Returns:\n            True if the synthesizer is instanciated\n        \"\"\"\n\n        return self._is_initialized() and \\\n            (self._model.status is not None\n    
         and self._model.status.training is not None\n             and self._model.status.training.state is not [TrainingState.PREPARING])\n\n    @staticmethod\n    def _resolve_api_status(api_status: Dict) -> Status:\n        \"\"\"Determine the status of the Synthesizer.\n\n        The status of the synthesizer instance is determined by the state of\n        its different components.\n\n        Arguments:\n            api_status (dict): json from the endpoint GET /synthesizer\n\n        Returns:\n            Synthesizer Status\n        \"\"\"\n        status = Status(api_status.get('state', Status.UNKNOWN.name))\n        if status == Status.PREPARE:\n            if PrepareState(api_status.get('prepare', {}).get(\n                    'state', PrepareState.UNKNOWN.name)) == PrepareState.FAILED:\n                return Status.FAILED\n        elif status == Status.TRAIN:\n            if TrainingState(api_status.get('training', {}).get(\n                    'state', TrainingState.UNKNOWN.name)) == TrainingState.FAILED:\n                return Status.FAILED\n        elif status == Status.REPORT:\n            return Status.READY\n        return status\n
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.status","title":"status: Status property","text":"

    Get the status of a synthesizer instance.

    Returns:

    • Status: Synthesizer status

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.uid","title":"uid: UID property","text":"

    Get the UID of a synthesizer instance.

    Returns:

    • UID: Synthesizer UID

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, datatype=None, sortbykey=None, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource. When the training dataset is a pandas DataFrame, the argument datatype is required as it cannot be deduced.

    The argument sortbykey is mandatory for TimeSeries.

    By default, if generate_cols or exclude_cols are not specified, all columns are generated by the synthesizer. The argument exclude_cols has precedence over generate_cols, i.e. a column col will not be generated if it appears in both lists.

    Parameters:

    • X (Union[DataSource, DataFrame]): Training dataset. Required.
    • privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity). Default: HIGH_FIDELITY.
    • datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if X is a pandas.DataFrame. Default: None.
    • sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets. Default: None.
    • entities (Union[str, List[str]]): (optional) columns representing entities ID. Default: None.
    • generate_cols (List[str]): (optional) columns that should be synthesized. Default: None.
    • exclude_cols (List[str]): (optional) columns that should not be synthesized. Default: None.
    • dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes. Default: None.
    • target (Optional[str]): (optional) Target for the dataset. Default: None.
    • name (Optional[str]): (optional) Synthesizer instance name. Required.
    • anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy. Default: None.
    • condition_on (Optional[List[str]]): (optional) list of features to condition upon. Default: None.

    Source code in ydata/sdk/synthesizers/synthesizer.py
    def fit(self, X: Union[DataSource, pdDataFrame],\n        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n        datatype: Optional[Union[DataSourceType, str]] = None,\n        sortbykey: Optional[Union[str, List[str]]] = None,\n        entities: Optional[Union[str, List[str]]] = None,\n        generate_cols: Optional[List[str]] = None,\n        exclude_cols: Optional[List[str]] = None,\n        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None) -> None:\n    \"\"\"Fit the synthesizer.\n\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.\n\n    The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].\n\n    By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.\n    The argument `exclude_cols` has precedence over `generate_cols`, i.e. a column `col` will not be generated if it is in both list.\n\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]\n        sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Target for the dataset\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\n    if self._already_fitted():\n        raise AlreadyFittedError()\n\n    datatype = DataSourceType(datatype)\n\n    dataset_attrs = self._init_datasource_attributes(\n        sortbykey, entities, generate_cols, exclude_cols, dtypes)\n    self._validate_datasource_attributes(X, dataset_attrs, datatype, target)\n\n    # If the training data is a pandas dataframe, we first need to create a data source and then the instance\n    if isinstance(X, pdDataFrame):\n        if X.empty:\n            raise EmptyDataError(\"The DataFrame is empty\")\n        self._logger.info('creating local connector with pandas dataframe')\n        connector = LocalConnector.create(\n            source=X, project=self._project, client=self._client)\n        self._logger.info(\n            f'created local connector. 
creating datasource with {connector}')\n        _X = LocalDataSource(connector=connector, project=self._project,\n                             datatype=datatype, client=self._client)\n        self._logger.info(f'created datasource {_X}')\n    else:\n        _X = X\n\n    if dsState(_X.status.state) != dsState.AVAILABLE:\n        raise DataSourceNotAvailableError(\n            f\"The datasource '{_X.uid}' is not available (status = {_X.status})\")\n\n    if isinstance(dataset_attrs, dict):\n        dataset_attrs = DataSourceAttrs(**dataset_attrs)\n\n    self._fit_from_datasource(\n        X=_X, datatype=datatype, dataset_attrs=dataset_attrs, target=target,\n        anonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)\n
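    As a concrete illustration of the rules above, here is a hedged sketch that fits a concrete subclass (the RegularSynthesizer, which inherits this behaviour) directly from a pandas DataFrame; the column names are hypothetical:

        import pandas as pd
        from ydata.sdk.synthesizers import RegularSynthesizer

        # Hypothetical training data
        df = pd.DataFrame({
            'age': [25, 32, 41, 29],
            'income': [30000, 45000, 52000, 39000],
            'id': [1, 2, 3, 4],
        })

        synth = RegularSynthesizer(name='demo-synth')
        # RegularSynthesizer sets datatype=TABULAR internally, so it is not passed here.
        # exclude_cols has precedence, so 'id' is never generated even though all
        # columns would be generated by default.
        synth.fit(df, exclude_cols=['id'])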
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.list","title":"list(client=None) staticmethod","text":"

    List the synthesizer instances.

    Parameters:

    • client (Client): (optional) Client to connect to the backend. Default: None.

    Returns:
    • SynthesizersList: List of synthesizers

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> SynthesizersList:\n    \"\"\"List the synthesizer instances.\n\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        List of synthesizers\n    \"\"\"\n    def __process_data(data: list) -> list:\n        to_del = ['metadata', 'report', 'mode']\n        for e in data:\n            for k in to_del:\n                e.pop(k, None)\n        return data\n\n    response = client.get('/synthesizer')\n    data: list = response.json()\n    data = __process_data(data)\n\n    return SynthesizersList(data)\n
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.sample","title":"sample() abstractmethod","text":"

    Abstract method to sample from a synthesizer.

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @abstractmethod\ndef sample(self) -> pdDataFrame:\n    \"\"\"Abstract method to sample from a synthesizer.\"\"\"\n
    "},{"location":"sdk/reference/api/synthesizers/base/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy

    "},{"location":"sdk/reference/api/synthesizers/multitable/","title":"MultiTable","text":"

    Bases: BaseSynthesizer

    MultiTable synthesizer class.

    "},{"location":"sdk/reference/api/synthesizers/multitable/#ydata.sdk.synthesizers.multitable.MultiTableSynthesizer--methods","title":"Methods","text":"
    • fit: train a synthesizer instance.
    • sample: request synthetic data.
    • status: current status of the synthesizer instance.
    Note

    The synthesizer instance is created in the backend only when the fit method is called.

    Parameters:

    • write_connector (UID | Connector): Connector of type RDBMS to be used to write the samples. Required.
    • uid (UID): (optional) UID to identify this synthesizer. Default: None.
    • name (str): (optional) Name to be used when creating the synthesizer. Calculated internally if not provided. Default: None.
    • client (Client): (optional) Client to connect to the backend. Default: None.

    Source code in ydata/sdk/synthesizers/multitable.py
    class MultiTableSynthesizer(BaseSynthesizer):\n    \"\"\"MultiTable synthesizer class.\n\n    Methods\n    -------\n    - `fit`: train a synthesizer instance.\n    - `sample`: request synthetic data.\n    - `status`: current status of the synthesizer instance.\n\n    Note:\n            The synthesizer instance is created in the backend only when the `fit` method is called.\n\n    Arguments:\n        write_connector (UID | Connector): Connector of type RDBMS to be used to write the samples\n        uid (UID): (optional) UID to identify this synthesizer\n        name (str): (optional) Name to be used when creating the synthesizer. Calculated internally if not provided\n        client (Client): (optional) Client to connect to the backend\n    \"\"\"\n\n    def __init__(\n            self, write_connector: Union[Connector, UID], uid: Optional[UID] = None, name: Optional[str] = None,\n            project: Optional[Project] = None, client: Optional[Client] = None):\n\n        super().__init__(uid, name, project, client)\n\n        connector = self._check_or_fetch_connector(write_connector)\n        self.__write_connector = connector.uid\n\n    def fit(self, X: DataSource,\n            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n            datatype: Optional[Union[DataSourceType, str]] = None,\n            sortbykey: Optional[Union[str, List[str]]] = None,\n            entities: Optional[Union[str, List[str]]] = None,\n            generate_cols: Optional[List[str]] = None,\n            exclude_cols: Optional[List[str]] = None,\n            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n            target: Optional[str] = None,\n            anonymize: Optional[dict] = None,\n            condition_on: Optional[List[str]] = None) -> None:\n        \"\"\"Fit the synthesizer.\n\n        The synthesizer accepts as training dataset a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        Except X, all the other arguments are for now ignored until they are supported.\n\n        Arguments:\n            X (DataSource): DataSource to Train\n        \"\"\"\n\n        self._fit_from_datasource(X, datatype=DataSourceType.MULTITABLE)\n\n    def sample(self, frac: Union[int, float] = 1, write_connector: Optional[Union[Connector, UID]] = None) -> None:\n        \"\"\"Sample from a [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer]\n        instance.\n        The sample is saved in the connector that was provided in the synthesizer initialization\n        or in the\n\n        Arguments:\n            frac (int | float): fraction of the sample to be returned\n        \"\"\"\n\n        assert frac >= 0.1, InputError(\n            \"It is not possible to generate an empty synthetic data schema. Please validate the input provided. \")\n        assert frac <= 5, InputError(\n            \"It is not possible to generate a database that is 5x bigger than the original dataset. 
Please validate the input provided.\")\n\n        payload = {\n            'fraction': frac,\n        }\n\n        if write_connector is not None:\n            connector = self._check_or_fetch_connector(write_connector)\n            payload['writeConnector'] = connector.uid\n\n        response = self._client.post(\n            f\"/synthesizer/{self.uid}/sample\", json=payload, project=self._project)\n\n        data = response.json()\n        sample_uid = data.get('uid')\n        sample_status = None\n        while sample_status not in ['finished', 'failed']:\n            self._logger.info('Sampling from the synthesizer...')\n            response = self._client.get(\n                f'/synthesizer/{self.uid}/history', project=self._project)\n            history = response.json()\n            sample_data = next((s for s in history if s.get('uid') == sample_uid), None)\n            sample_status = sample_data.get('status', {}).get('state')\n            sleep(BACKOFF)\n\n        print(\n            f\"Sample created and saved into connector with ID {self.__write_connector or write_connector}\")\n\n    def _create_payload(self) -> dict:\n        payload = super()._create_payload()\n        payload['writeConnector'] = self.__write_connector\n\n        return payload\n\n    def _check_or_fetch_connector(self, write_connector: Union[Connector, UID]) -> Connector:\n        self._logger.debug(f'Write connector is {write_connector}')\n        if isinstance(write_connector, str):\n            self._logger.debug(f'Write connector is of type `UID` {write_connector}')\n            write_connector = Connector.get(write_connector)\n            self._logger.debug(f'Using fetched connector {write_connector}')\n\n        if write_connector.uid is None:\n            raise InputError(\"Invalid connector provided as input for write\")\n\n        if write_connector.type not in [ConnectorType.AZURE_SQL, ConnectorType.MYSQL, ConnectorType.SNOWFLAKE]:\n            raise ConnectorError(\n                f\"Invalid type `{write_connector.type}` for the provided connector\")\n\n        return write_connector\n
    "},{"location":"sdk/reference/api/synthesizers/multitable/#ydata.sdk.synthesizers.multitable.MultiTableSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, datatype=None, sortbykey=None, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset a YData DataSource. Except for X, all other arguments are currently ignored until they are supported.

    Parameters:

    • X (DataSource): DataSource to train. Required.

    Source code in ydata/sdk/synthesizers/multitable.py
    def fit(self, X: DataSource,\n        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n        datatype: Optional[Union[DataSourceType, str]] = None,\n        sortbykey: Optional[Union[str, List[str]]] = None,\n        entities: Optional[Union[str, List[str]]] = None,\n        generate_cols: Optional[List[str]] = None,\n        exclude_cols: Optional[List[str]] = None,\n        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None) -> None:\n    \"\"\"Fit the synthesizer.\n\n    The synthesizer accepts as training dataset a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    Except X, all the other arguments are for now ignored until they are supported.\n\n    Arguments:\n        X (DataSource): DataSource to Train\n    \"\"\"\n\n    self._fit_from_datasource(X, datatype=DataSourceType.MULTITABLE)\n
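    A minimal sketch, assuming a multi-table DataSource already exists and an RDBMS write connector is available (both UIDs below are placeholders):

        from ydata.sdk.datasources import DataSource
        from ydata.sdk.synthesizers import MultiTableSynthesizer

        # Placeholder UID of an existing multi-table datasource
        datasource = DataSource.get('<MULTITABLE_DATASOURCE_UID>')

        # Placeholder UID of the RDBMS connector where samples will be written
        synth = MultiTableSynthesizer(write_connector='<WRITE_CONNECTOR_UID>')
        synth.fit(datasource)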
    "},{"location":"sdk/reference/api/synthesizers/multitable/#ydata.sdk.synthesizers.multitable.MultiTableSynthesizer.sample","title":"sample(frac=1, write_connector=None)","text":"

    Sample from a MultiTableSynthesizer instance. The sample is saved in the connector that was provided at synthesizer initialization or, if given, in the connector passed through the write_connector argument.

    Parameters:

    • frac (int | float): fraction of the sample to be returned. Default: 1.

    Source code in ydata/sdk/synthesizers/multitable.py
    def sample(self, frac: Union[int, float] = 1, write_connector: Optional[Union[Connector, UID]] = None) -> None:\n    \"\"\"Sample from a [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer]\n    instance.\n    The sample is saved in the connector that was provided in the synthesizer initialization\n    or in the\n\n    Arguments:\n        frac (int | float): fraction of the sample to be returned\n    \"\"\"\n\n    assert frac >= 0.1, InputError(\n        \"It is not possible to generate an empty synthetic data schema. Please validate the input provided. \")\n    assert frac <= 5, InputError(\n        \"It is not possible to generate a database that is 5x bigger than the original dataset. Please validate the input provided.\")\n\n    payload = {\n        'fraction': frac,\n    }\n\n    if write_connector is not None:\n        connector = self._check_or_fetch_connector(write_connector)\n        payload['writeConnector'] = connector.uid\n\n    response = self._client.post(\n        f\"/synthesizer/{self.uid}/sample\", json=payload, project=self._project)\n\n    data = response.json()\n    sample_uid = data.get('uid')\n    sample_status = None\n    while sample_status not in ['finished', 'failed']:\n        self._logger.info('Sampling from the synthesizer...')\n        response = self._client.get(\n            f'/synthesizer/{self.uid}/history', project=self._project)\n        history = response.json()\n        sample_data = next((s for s in history if s.get('uid') == sample_uid), None)\n        sample_status = sample_data.get('status', {}).get('state')\n        sleep(BACKOFF)\n\n    print(\n        f\"Sample created and saved into connector with ID {self.__write_connector or write_connector}\")\n
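    Continuing the sketch above: sampling does not return a DataFrame; the synthetic database is written to the configured connector. The second connector UID below is a placeholder:

        # Write a synthetic database roughly the same size as the original (frac=1)
        synth.sample(frac=1)

        # Or write a smaller sample to a different RDBMS connector
        synth.sample(frac=0.5, write_connector='<OTHER_CONNECTOR_UID>')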
    "},{"location":"sdk/reference/api/synthesizers/regular/","title":"Regular","text":"

    Bases: BaseSynthesizer

    Source code in ydata/sdk/synthesizers/regular.py
    class RegularSynthesizer(BaseSynthesizer):\n\n    def sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:\n        \"\"\"Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]\n        instance.\n\n        Arguments:\n            n_samples (int): number of rows in the sample\n            condition_on: (Optional[dict]): (optional) conditional sampling parameters\n\n        Returns:\n            synthetic data\n        \"\"\"\n        if n_samples < 1:\n            raise InputError(\"Parameter 'n_samples' must be greater than 0\")\n\n        payload = {\"numberOfRecords\": n_samples}\n        if condition_on is not None:\n            payload[\"extraData\"] = {\n                \"condition_on\": condition_on\n            }\n        return self._sample(payload=payload)\n\n    def fit(self, X: Union[DataSource, pdDataFrame],\n            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n            entities: Optional[Union[str, List[str]]] = None,\n            generate_cols: Optional[List[str]] = None,\n            exclude_cols: Optional[List[str]] = None,\n            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n            target: Optional[str] = None,\n            anonymize: Optional[dict] = None,\n            condition_on: Optional[List[str]] = None) -> None:\n        \"\"\"Fit the synthesizer.\n\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Target column\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\n        BaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TABULAR, entities=entities,\n                            generate_cols=generate_cols, exclude_cols=exclude_cols, dtypes=dtypes,\n                            target=target, anonymize=anonymize, privacy_level=privacy_level,\n                            condition_on=condition_on)\n\n    def __repr__(self):\n        if self._model is not None:\n            return self._model.__repr__()\n        else:\n            return \"RegularSynthesizer(Not Initialized)\"\n
    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.regular.RegularSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource.

    Parameters:

    • X (Union[DataSource, DataFrame]): Training dataset. Required.
    • privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity). Default: HIGH_FIDELITY.
    • entities (Union[str, List[str]]): (optional) columns representing entities ID. Default: None.
    • generate_cols (List[str]): (optional) columns that should be synthesized. Default: None.
    • exclude_cols (List[str]): (optional) columns that should not be synthesized. Default: None.
    • dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes. Default: None.
    • target (Optional[str]): (optional) Target column. Default: None.
    • name (Optional[str]): (optional) Synthesizer instance name. Required.
    • anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy. Default: None.
    • condition_on (Optional[List[str]]): (optional) list of features to condition upon. Default: None.

    Source code in ydata/sdk/synthesizers/regular.py
    def fit(self, X: Union[DataSource, pdDataFrame],\n        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n        entities: Optional[Union[str, List[str]]] = None,\n        generate_cols: Optional[List[str]] = None,\n        exclude_cols: Optional[List[str]] = None,\n        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None) -> None:\n    \"\"\"Fit the synthesizer.\n\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Target column\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\n    BaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TABULAR, entities=entities,\n                        generate_cols=generate_cols, exclude_cols=exclude_cols, dtypes=dtypes,\n                        target=target, anonymize=anonymize, privacy_level=privacy_level,\n                        condition_on=condition_on)\n
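    A hedged sketch of fitting from an existing tabular DataSource with a stricter privacy setting (the UID and the synthesizer name are placeholders):

        from ydata.sdk.datasources import DataSource
        from ydata.sdk.synthesizers import RegularSynthesizer, PrivacyLevel

        # Placeholder UID of an AVAILABLE tabular datasource
        datasource = DataSource.get('<DATASOURCE_UID>')

        synth = RegularSynthesizer(name='census-synth')
        synth.fit(datasource, privacy_level=PrivacyLevel.HIGH_PRIVACY)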
    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.regular.RegularSynthesizer.sample","title":"sample(n_samples=1, condition_on=None)","text":"

    Sample from a RegularSynthesizer instance.

    Parameters:

    • n_samples (int): number of rows in the sample. Default: 1.
    • condition_on (Optional[dict]): (optional) conditional sampling parameters. Default: None.

    Returns:
    • DataFrame: synthetic data

    Source code in ydata/sdk/synthesizers/regular.py
    def sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:\n    \"\"\"Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]\n    instance.\n\n    Arguments:\n        n_samples (int): number of rows in the sample\n        condition_on: (Optional[dict]): (optional) conditional sampling parameters\n\n    Returns:\n        synthetic data\n    \"\"\"\n    if n_samples < 1:\n        raise InputError(\"Parameter 'n_samples' must be greater than 0\")\n\n    payload = {\"numberOfRecords\": n_samples}\n    if condition_on is not None:\n        payload[\"extraData\"] = {\n            \"condition_on\": condition_on\n        }\n    return self._sample(payload=payload)\n
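    Once training has finished, sampling returns a pandas DataFrame. A minimal sketch (condition_on accepts a dict of conditional sampling parameters whose exact schema is not covered here):

        # Returns a pandas DataFrame with 1000 synthetic rows
        sample = synth.sample(n_samples=1000)
        print(sample.head())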
    "},{"location":"sdk/reference/api/synthesizers/regular/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy

    "},{"location":"sdk/reference/api/synthesizers/timeseries/","title":"TimeSeries","text":"

    Bases: BaseSynthesizer

    Source code in ydata/sdk/synthesizers/timeseries.py
    class TimeSeriesSynthesizer(BaseSynthesizer):\n\n    def sample(self, n_entities: int, condition_on: Optional[dict] = None) -> pdDataFrame:\n        \"\"\"Sample from a [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] instance.\n\n        If a training dataset was not using any `entity` column, the Synthesizer assumes a single entity.\n        A [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] always sample the full trajectory of its entities.\n\n        Arguments:\n            n_entities (int): number of entities to sample\n            condition_on: (Optional[dict]): (optional) conditional sampling parameters\n\n        Returns:\n            synthetic data\n        \"\"\"\n        if n_entities is not None and n_entities < 1:\n            raise InputError(\"Parameter 'n_entities' must be greater than 0\")\n\n        payload = {\"numberOfRecords\": n_entities}\n        if condition_on is not None:\n            payload[\"extraData\"] = {\n                \"condition_on\": condition_on\n            }\n        return self._sample(payload=payload)\n\n    def fit(self, X: Union[DataSource, pdDataFrame],\n            sortbykey: Optional[Union[str, List[str]]],\n            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n            entities: Optional[Union[str, List[str]]] = None,\n            generate_cols: Optional[List[str]] = None,\n            exclude_cols: Optional[List[str]] = None,\n            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n            target: Optional[str] = None,\n            anonymize: Optional[dict] = None,\n            condition_on: Optional[List[str]] = None) -> None:\n        \"\"\"Fit the synthesizer.\n\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Metadata associated to the datasource\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\n        BaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TIMESERIES, sortbykey=sortbykey,\n                            entities=entities, generate_cols=generate_cols, exclude_cols=exclude_cols,\n                            dtypes=dtypes, target=target, anonymize=anonymize, privacy_level=privacy_level,\n                            condition_on=condition_on)\n\n    def __repr__(self):\n        if self._model is not None:\n            return self._model.__repr__()\n        else:\n            return \"TimeSeriesSynthesizer(Not Initialized)\"\n
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.timeseries.TimeSeriesSynthesizer.fit","title":"fit(X, sortbykey, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource.

    Parameters:

    • X (Union[DataSource, DataFrame]): Training dataset. Required.
    • sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets. Required.
    • privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity). Default: HIGH_FIDELITY.
    • entities (Union[str, List[str]]): (optional) columns representing entities ID. Default: None.
    • generate_cols (List[str]): (optional) columns that should be synthesized. Default: None.
    • exclude_cols (List[str]): (optional) columns that should not be synthesized. Default: None.
    • dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes. Default: None.
    • target (Optional[str]): (optional) Metadata associated to the datasource. Default: None.
    • name (Optional[str]): (optional) Synthesizer instance name. Required.
    • anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy. Default: None.
    • condition_on (Optional[List[str]]): (optional) list of features to condition upon. Default: None.

    Source code in ydata/sdk/synthesizers/timeseries.py
    def fit(self, X: Union[DataSource, pdDataFrame],\n        sortbykey: Optional[Union[str, List[str]]],\n        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n        entities: Optional[Union[str, List[str]]] = None,\n        generate_cols: Optional[List[str]] = None,\n        exclude_cols: Optional[List[str]] = None,\n        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None) -> None:\n    \"\"\"Fit the synthesizer.\n\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Metadata associated to the datasource\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\n    BaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TIMESERIES, sortbykey=sortbykey,\n                        entities=entities, generate_cols=generate_cols, exclude_cols=exclude_cols,\n                        dtypes=dtypes, target=target, anonymize=anonymize, privacy_level=privacy_level,\n                        condition_on=condition_on)\n
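    A hedged sketch of fitting a time-series synthesizer from a pandas DataFrame; the column names and values are hypothetical, and sortbykey is mandatory as noted above:

        import pandas as pd
        from ydata.sdk.synthesizers import TimeSeriesSynthesizer

        # Hypothetical multi-entity time-series data (two sensors, six daily readings each)
        df = pd.DataFrame({
            'timestamp': pd.date_range('2024-01-01', periods=6, freq='D').repeat(2),
            'sensor_id': [1, 2] * 6,
            'reading': [0.1, 0.4, 0.2, 0.5, 0.3, 0.6, 0.2, 0.4, 0.1, 0.5, 0.3, 0.7],
        })

        synth = TimeSeriesSynthesizer(name='sensors-synth')
        # sortbykey orders each trajectory; entities identifies the entity column
        synth.fit(df, sortbykey='timestamp', entities='sensor_id')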
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.timeseries.TimeSeriesSynthesizer.sample","title":"sample(n_entities, condition_on=None)","text":"

    Sample from a TimeSeriesSynthesizer instance.

If a training dataset was not using any entity column, the Synthesizer assumes a single entity. A TimeSeriesSynthesizer always samples the full trajectory of its entities.

    Parameters:

    Name Type Description Default n_entities int

    number of entities to sample

    required condition_on Optional[dict]

(optional) conditional sampling parameters

    None

    Returns:

    Type Description DataFrame

    synthetic data

    Source code in ydata/sdk/synthesizers/timeseries.py
    def sample(self, n_entities: int, condition_on: Optional[dict] = None) -> pdDataFrame:\n    \"\"\"Sample from a [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] instance.\n\n    If a training dataset was not using any `entity` column, the Synthesizer assumes a single entity.\n    A [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] always sample the full trajectory of its entities.\n\n    Arguments:\n        n_entities (int): number of entities to sample\n        condition_on: (Optional[dict]): (optional) conditional sampling parameters\n\n    Returns:\n        synthetic data\n    \"\"\"\n    if n_entities is not None and n_entities < 1:\n        raise InputError(\"Parameter 'n_entities' must be greater than 0\")\n\n    payload = {\"numberOfRecords\": n_entities}\n    if condition_on is not None:\n        payload[\"extraData\"] = {\n            \"condition_on\": condition_on\n        }\n    return self._sample(payload=payload)\n
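A short, hedged continuation of the fit() sketch above, showing how sampling might be invoked once the synthesizer has been trained.

```python
# Continuing the fit() sketch above: sample the full trajectories of 5 synthetic entities
synthetic_df = synth.sample(n_entities=5)
print(synthetic_df.head())
```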
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy

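As an illustration of how a privacy level might be selected during training, here is a minimal sketch reusing the placeholder dataset and columns from the fit() example above.

```python
import pandas as pd
from ydata.sdk.synthesizers import PrivacyLevel, TimeSeriesSynthesizer

df = pd.read_csv("sensor_readings.csv")   # placeholder dataset, as in the fit() sketch above

# Favour privacy over fidelity for this hypothetical training run
synth = TimeSeriesSynthesizer()
synth.fit(X=df, sortbykey="timestamp", privacy_level=PrivacyLevel.HIGH_PRIVACY)
```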
    "},{"location":"support/help-troubleshooting/","title":"Help & Troubleshooting","text":""},{"location":"synthetic_data/","title":"Synthetic Data generation","text":"

YData Fabric's Synthetic Data Generation capabilities leverage the latest generative models to create high-quality artificial data that replicates real-world data properties. Whether it is a table, a database, or a text corpus, this powerful capability ensures privacy, enhances data availability, and boosts model performance across various industries. In this section, discover how YData Fabric's synthetic data solutions can transform your data-driven initiatives.

    "},{"location":"synthetic_data/#what-is-synthetic-data","title":"What is Synthetic Data?","text":"

    Synthetic data is artificially generated data that mimics the statistical properties and structure of real-world data without directly copying it. It is created using algorithms and models designed to replicate the characteristics of actual data sets. This process ensures that synthetic data retains the essential patterns and relationships present in the original data, making it a valuable asset for various applications, particularly in situations where using real data might pose privacy, security, or availability concerns. It can be used for:

    • Guaranteeing privacy and compliance when sharing datasets (for quality assurance, product development and other analytics teams)
    • Removing bias by upsampling rare events
    • Balancing datasets
• Augmenting existing datasets to improve the performance of machine learning models or for use in stress testing
• Smartly filling in missing values based on context
• Simulating new scenarios and hypotheses
    "},{"location":"synthetic_data/#the-benefits-of-synthetic-data","title":"The benefits of Synthetic Data","text":"

    Leveraging synthetic data offers numerous benefits:

    • Privacy and Security: Synthetic data eliminates the risk of exposing sensitive information, making it an ideal solution for industries handling sensitive data, such as healthcare, finance, and telecommunications.
    • Data Augmentation: It enables organizations to augment existing data sets, enhancing model training by providing diverse and representative samples, thereby improving model accuracy and robustness.
    • Cost Efficiency: Generating synthetic data can be more cost-effective than collecting and labeling large volumes of real data, particularly for rare events or scenarios that are difficult to capture.
    • Testing and Development: Synthetic data provides a safe environment for testing and developing algorithms, ensuring that models are robust before deployment in real-world scenarios.
    "},{"location":"synthetic_data/#synthetic-data-in-fabric","title":"Synthetic Data in Fabric","text":"

    YData Fabric offers robust support for creating high-quality synthetic data using generative models and/or through bootstrapping. The platform is designed to address the diverse needs of data scientists, engineers, and analysts by providing a comprehensive set of tools and features.

    "},{"location":"synthetic_data/#data-types-supported","title":"Data Types Supported:","text":"

    YData Fabric supports the generation of various data types, including:

    • Tabular Data: Generate synthetic versions of structured data typically found in spreadsheets and databases, with support for categorical, numerical, and mixed data types.
    • Time Series Data: Create synthetic time series data that preserves the temporal dependencies and trends, useful for applications like financial forecasting and sensor data analysis.
    • Multi-Table or Database Synthesis: Synthesize complex databases with multiple interrelated tables, maintaining the relational integrity and dependencies, which is crucial for comprehensive data analysis and testing applications.
    • Text Data: Produce synthetic text data for natural language processing (NLP) tasks, ensuring the generated text maintains the linguistic properties and context of the original data.
    "},{"location":"synthetic_data/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 The 5 Benefits of Synthetic data generation for modern AI
    • \ud83d\udcd6 The role of Synthetic data in Healthcare
    • \ud83d\udcd6 The role of Synthetic data to overcome Bias
    "},{"location":"synthetic_data/best_practices/","title":"Best practices for optimal synthetic data generation","text":""},{"location":"synthetic_data/best_practices/#overview","title":"Overview","text":"

    This document outlines the best practices for generating structured synthetic data, focusing on ensuring data quality, privacy, and utility. Synthetic data generation is a sophisticated process involving the training of generative models to produce artificial datasets that mimic real-world data. This documentation is intended to guide data scientists, engineers, and analysts in configuring and refining the synthetic data generation process, with a focus on avoiding common pitfalls.

    "},{"location":"synthetic_data/best_practices/#1-understanding-the-use-case","title":"1. Understanding the Use Case","text":"

    Before beginning the synthetic data generation process, it is essential to clearly define the use case. The purpose of the synthetic data\u2014whether for training machine learning models, testing algorithms, or validating data pipelines\u2014will influence the structure, scale, and fidelity required.

    Key Considerations:

    Understand and know your data: Deeply understanding the characteristics and behaviors of the original dataset is crucial for configuring the synthetic data generation process to optimize outcomes. This understanding is also essential for validating and assessing the quality of the synthetic data. If your synthetic data fails to represent all classes from the original dataset, it could indicate that the original data lacks sufficient records for those particular behaviors.

    • Data Characteristics: Identify the necessary size, format, and distribution of the data.

    • Privacy Concerns: Determine if there are specific regulations or privacy requirements to be met.

    • Critical Variables: Identify the key variables and relationships that must be preserved in the synthetic data.

    "},{"location":"synthetic_data/best_practices/#2-configuring-the-data-schema-relations","title":"2. Configuring the Data Schema & Relations","text":"

Setting and configuring a concise and business-aligned dataset schema is crucial for generating high-quality synthetic data. The schema should mirror the structure of the real-world data you aim to emulate, while ensuring the selected PII types and data types are aligned with the use case and applications.

    Key Considerations:

• Data Types: Make sure to always verify the configured data types. After all, learning a "Category" is different from learning the distribution of a numerical variable.

• Unique Identifiers: Exclude unique identifiers (e.g., user IDs, transaction IDs) from the data generation process. These identifiers are typically arbitrary and do not carry meaningful information for the generative model to learn. Instead, generate them separately or replace them with randomized values (see the sketch after this list).

• Documentation: Thoroughly document the schema, including all constraints and relationships, for future reference and reproducibility.

• Data Constraints: Include constraints such as primary keys, foreign keys, and data types to maintain data integrity. Also, make sure to configure relations between variables (e.g., x = a + b), as this ensures the model treats the outcome of variable x as a deterministic process.

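A minimal sketch of how these considerations might translate into SDK calls, assuming a RegularSynthesizer and hypothetical column names (transaction_id, customer_segment, amount); the parameters mirror the fit() arguments documented in the SDK reference.

```python
import pandas as pd
from ydata.sdk.synthesizers import RegularSynthesizer

# Hypothetical dataset and column names, used purely for illustration
df = pd.read_csv("transactions.csv")

synth = RegularSynthesizer()
synth.fit(
    X=df,
    exclude_cols=["transaction_id"],              # unique identifiers carry no learnable signal
    dtypes={"customer_segment": "categorical",    # learn it as a category, not free text
            "amount": "numerical"},               # learn a numerical distribution
)
```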
    "},{"location":"synthetic_data/best_practices/#3-avoiding-overfitting-to-the-original-data","title":"3. Avoiding Overfitting to the Original Data","text":"

To ensure that the synthetic data is useful and generalizable, it is important to avoid overfitting the generative model to the original dataset. YData Fabric's synthetic data generation process leverages the concept of a holdout in order to avoid overfitting, but the effectiveness of the holdout might vary depending on the dataset's behaviour and size.

    Key Considerations:

    • Excessive Fine-Tuning: Avoid overly fine-tuning the generative model on your whole dataset, as this can lead to synthetic data that is too similar to the original, reducing its utility.

    • Ignoring Variability: Ensure that the synthetic data introduces enough variability to cover edge cases and rare events, rather than merely replicating common patterns from the training data.

    "},{"location":"synthetic_data/best_practices/#4-ensuring-data-privacy","title":"4. Ensuring Data Privacy","text":"

    One of the key benefits of synthetic data is the ability to mitigate privacy risks. However, careful attention must be paid to ensure that the synthetic data does not inadvertently reveal sensitive information from the original dataset.

    Key Considerations:

    • Reusing Identifiable Information: Do not include direct identifiers (such as names, addresses, etc.) in the synthetic data.

Having a true identifier among the synthetic data might not only hinder the quality of the synthetic data but also compromise its capacity to remain anonymous.

    "},{"location":"synthetic_data/best_practices/#5-validating-the-synthetic-data","title":"5. Validating the Synthetic Data","text":"

    Validation is a critical step in the synthetic data generation process. The synthetic data must be rigorously tested to ensure that it meets the necessary criteria for its intended use.

    Key Considerations:

    • Skipping Statistical Validation: Do not skip the step of comparing the statistical properties of the synthetic data against the real data. This is essential to ensure that the synthetic data is both realistic and useful.

    • Using a Single Metric: Avoid relying on a single validation metric. Validate the synthetic data across multiple dimensions, such as distribution, correlation, and predictive performance, to get a comprehensive view of its quality.

YData Fabric's synthetic data generation process offers an extensive and automated synthetic data quality report and a profiling comparison to help with data quality validation.

    "},{"location":"synthetic_data/best_practices/#6-iterating-and-refining-the-process","title":"6. Iterating and Refining the Process","text":"

    Synthetic data generation is inherently iterative. The initial datasets may require refinement to improve their accuracy, utility, or realism.

    Key Considerations:

    • Treating the First Version as Final: The first generated dataset is rarely perfect. Continuous iteration and refinement are key to achieving high-quality synthetic data.

    • Ignoring Feedback: Feedback from domain experts and end-users is invaluable. Do not disregard this input, as it can significantly improve the relevance and utility of the synthetic data.

    "},{"location":"synthetic_data/best_practices/#7-documenting-and-sharing-the-process","title":"7. Documenting and Sharing the Process","text":"

    Thorough documentation is essential for transparency, reproducibility, and collaboration in synthetic data generation.

    Key Considerations:

    • Skipping Documentation: Failing to document the synthetic data generation process can make it difficult to reproduce results or understand the rationale behind certain decisions.

    • Keeping the Process Opaque: Transparency is crucial, especially when synthetic data is used in critical applications. Ensure that all relevant details, including methodologies, parameters, and assumptions, are clearly documented and accessible to stakeholders.

    Before diving into complex applications, ensure you're thoroughly familiar with synthetic data by starting small and gradually increasing complexity. Build your understanding step by step, and only proceed to more advanced use cases once you're confident in the quality and reliability of the synthetic data. Know your data and ensure that your synthetic data matches your expectations fully before leveraging it for downstream applications.

    "},{"location":"synthetic_data/relational_database/","title":"Multi-Table Synthetic data generation","text":"

Multi-table or database synthetic data generation is a powerful method to create high-quality artificial datasets that mirror the statistical properties and relational structures of original multi-table databases. A multi-table database consists of multiple interrelated tables, often with various data types (dates, categorical, numerical, etc.) and complex relationships between records. Key use cases include privacy-preserving access to full production databases and the creation of realistic test environments. Synthetic data allows organizations to share and analyze full production databases without exposing sensitive information, ensuring compliance with data privacy regulations. Additionally, it is invaluable for creating realistic test environments, enabling developers and testers to simulate real-world scenarios, identify potential issues, and validate database applications without risking data breaches. By leveraging synthetic multi-table data, organizations can simulate complex relational data environments, enhance the robustness of database applications, and ensure data privacy, making it a valuable tool for industries that rely on intricate data structures and interdependencies.

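A minimal sketch of how a database synthesis run might look with the SDK, assuming an existing multi-table DataSource and a write connector for the destination database; the uids are placeholders.

```python
from ydata.sdk.datasources import DataSource
from ydata.sdk.synthesizers import MultiTableSynthesizer

# Placeholders: replace with the uids shown in your Data Catalog and Connectors pages
database = DataSource.get(uid="<datasource-uid>")

synth = MultiTableSynthesizer(write_connector="<connector-uid>")
synth.fit(database)
synth.sample(frac=1.0)   # write a full-size synthetic replica through the connector
```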
    "},{"location":"synthetic_data/relational_database/#tutorials-recipes","title":"Tutorials & Recipes","text":"

To get started with Synthetic Data Generation, you can follow our quickstart guide.

For more tutorials and recipes, follow the link to YData's Academy.

    "},{"location":"synthetic_data/relational_database/#related-materials","title":"Related Materials","text":"
    • How to generate Synthetic Data from a Database
    • How to generate Multi-Table step-by-step
    • How to generate Multi-Table synthetic data in Google Colab
    "},{"location":"synthetic_data/single_table/","title":"Tabular synthetic data generation","text":"

Tabular synthetic data generation is a powerful method to create high-quality artificial datasets that mirror the statistical properties of original tabular data. A tabular dataset is usually composed of several columns of structured data with mixed data types (dates, categorical, numerical, etc.) and no time dependence between records. The ability to generate synthetic data from this type of dataset is essential for a wide range of applications, from data augmentation to privacy preservation, and is particularly useful in scenarios where obtaining or using real data is challenging.

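A minimal sketch of tabular synthesis with the SDK, assuming a placeholder CSV file and that authentication (e.g., a Fabric token in the environment) is already configured.

```python
import pandas as pd
from ydata.sdk.synthesizers import RegularSynthesizer

real = pd.read_csv("credit_scoring.csv")   # placeholder dataset

synth = RegularSynthesizer()
synth.fit(real)
synthetic = synth.sample(n_samples=len(real))   # generate a synthetic table of the same size
```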
    "},{"location":"synthetic_data/single_table/#tutorials-recipes","title":"Tutorials & Recipes","text":"

To get started with Synthetic Data Generation, you can follow our quickstart guide.

For more tutorials and recipes, follow the link to YData's Academy.

    "},{"location":"synthetic_data/single_table/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 Generating Synthetic data from a Tabular dataset with a large number of columns
    • \ud83d\udcd6 Synthetic data to improve Credit Scoring models
    • Generate Synthetic data with Python code
    • Synthetic data generation with API
    "},{"location":"synthetic_data/text/","title":"Text Synthetic Data generation","text":"

Synthetic data generation for text creates high-quality artificial text datasets that mimic the properties and patterns of original text data, playing a crucial role in Generative AI applications. This technique enhances the performance of large language models (LLMs) by providing extensive training datasets, which improve model accuracy and robustness. It addresses data scarcity by generating text for specialized domains or languages where data is limited. Additionally, synthetic text generation ensures privacy preservation, allowing organizations to create useful datasets without compromising sensitive information, thereby complying with data privacy regulations while enabling comprehensive data analysis and model training.

    Feature in Preview

    This feature is in preview and not available for all users. Contact us if you are interested in giving it a try!

    "},{"location":"synthetic_data/text/#related-materials","title":"Related Materials","text":"
    • How to generate Synthetic Text Data?
    "},{"location":"synthetic_data/timeseries/","title":"Time-series synthetic data generation","text":"

    Time-series synthetic data generation is a powerful method to create high-quality artificial datasets that mirror the statistical properties of original time-series data. A time-series dataset is composed of sequential data points recorded at specific time intervals, capturing trends, patterns, and temporal dependencies. This ability to generate synthetic data from time-series datasets is essential for a wide range of applications, from data augmentation to privacy preservation, and is particularly useful in scenarios where obtaining or using real data is challenging. By leveraging synthetic time-series data, organizations can simulate various conditions and events, enhance model robustness, and ensure data privacy, making it a valuable tool for industries reliant on temporal data analysis. This type of data is prevalent in various fields, including finance, healthcare, energy, and IoT (Internet of Things).

    "},{"location":"synthetic_data/timeseries/#tutorials-recipes","title":"Tutorials & Recipes","text":"

To get started with Synthetic Data Generation, you can follow our quickstart guide.

For more tutorials and recipes, follow the link to YData's Academy.

    "},{"location":"synthetic_data/timeseries/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 Understanding the structure of a time-series dataset
    • \ud83d\udcd6 Time-series synthetic data generation
    • \ud83d\udcd6 Synthetic multivariate time-series data
    • How to generate time-series synthetic data?
    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Welcome","text":"

    YData Fabric is a Data-Centric AI development platform that accelerates AI development by helping data practitioners achieve production-quality data.

Much like the quality of code is a must for the success of software development, Fabric accounts for the data quality requirements of data-driven applications. It introduces standards, processes, and acceleration to empower data science, analytics, and data engineering teams.

    \ud83d\ude80 YData SDK Version 1.0 Released! \ud83c\udf89

    We are excited to announce the release of YData Fabric SDK v1.0! This major release marks the beginning of long-term support for the package, ensuring stability, continuous improvements, and ongoing support for all users. YData SDK empowers developers with easy access to state-of-the-art data quality tools and generative AI capabilities. Stay tuned for more updates and new features!

    "},{"location":"#try-fabric","title":"Try Fabric","text":"
    • Get started with Fabric Community
    "},{"location":"#why-adopt-ydata-fabric","title":"Why adopt YData Fabric?","text":"

With Fabric, you can standardize the understanding of your data, quickly identify data quality issues, streamline and version your data preparation workflows, and finally leverage synthetic data for privacy compliance or as a tool to boost ML performance. Fabric is a development environment that supports a faster and easier process of preparing data for AI development. Data practitioners are using Fabric to:

    • Establish a centralized and collaborative repository for data projects.
    • Create and share comprehensive documentation of data, encompassing data schema, structure, and personally identifiable information (PII).
    • Prevent data quality issues with standardized data quality profiling, providing visual understanding and warnings on potential issues.
    • Accelerate data preparation with customizable recipes.
    • Improve machine learning performance with optimal data preparation through solutions such as synthetic data.
• Shorten access to data with privacy-compliant synthetic data generation.
    • Build and streamline data preparation workflows effortlessly through a user-friendly drag-and-drop interface.
    • Efficiently manage business rules, conduct comparisons, and implement version control for data workflows using pipelines.
    "},{"location":"#key-features","title":"\ud83d\udcdd Key features","text":""},{"location":"#data-catalog","title":"Data Catalog","text":"

    Fabric Data Catalog provides a centralized perspective on datasets within a project-basis, optimizing data management through seamless integration with the organization's existing data architectures via scalable connectors (e.g., MySQL, Google Cloud Storage, AWS S3). It standardizes data quality profiling, streamlining the processes of efficient data cleaning and preparation, while also automating the identification of Personally Identifiable Information (PII) to facilitate compliance with privacy regulations.

Explore how to use the Data Catalog through a centralized repository of your datasets, schema validation, and automated data profiling.

    "},{"location":"#labs","title":"Labs","text":"

Fabric's Labs environments provide collaborative, scalable, and secure workspaces layered on a flexible infrastructure, enabling users to seamlessly switch between CPUs and GPUs based on their computational needs. Labs are familiar environments that empower data developers with powerful IDEs (Jupyter Notebooks, Visual Studio Code, or H2O Flow) and a seamless experience with the tools they already love, combined with YData's cutting-edge SDK for data preparation.

    Learn how to use the Labs to generate synthetic data in a familiar Python interface.

    "},{"location":"#synthetic-data","title":"Synthetic data","text":"

Synthetic data, enabled by YData Fabric, provides data developers with user-friendly interfaces (UI and code) for generating artificial datasets, offering a versatile solution across formats like tabular, time-series, and multi-table datasets. The generated synthetic data holds the same value as the original and aligns intricately with specific business rules, contributing to machine learning model enhancement, mitigation of privacy concerns, and more robust data development. Fabric's synthetic data is easy to adapt and configure, and allows customization of privacy-utility trade-offs.

Learn how to create high-quality synthetic data within a user-friendly UI using Fabric\u2019s data synthesis flow.

    "},{"location":"#pipelines","title":"Pipelines","text":"

    Fabric Pipelines streamlines data preparation workflows by automating, orchestrating, and optimizing data pipelines, providing benefits such as flexibility, scalability, monitoring, and reproducibility for efficient and reliable data processing. The intuitive drag-and-drop interface, leveraging Jupyter notebooks or Python scripts, expedites the pipeline setup process, providing data developers with a quick and user-friendly experience.

    Explore how you can leverage Fabric Pipelines to build versionable and reproducible data preparation workflows for ML development.

    "},{"location":"#tutorials","title":"Tutorials","text":"

    To understand how to best apply Fabric to your use cases, start by exploring the following tutorials:

• Handling Imbalanced Data for Improved Fraud Detection: Learn how to implement high-performing fraud detection models by incorporating synthetic data to balance your datasets.

• Prediction with Quality Inspection: Learn how to develop data preparation workflows with automated data quality checks and Pipelines.

• Generating Synthetic Data for Financial Transactions: Learn how to use synthetic data generation to replicate your existing relational databases while ensuring referential integrity.

    You can find additional examples and use cases at YData Academy GitHub Repository.

    "},{"location":"#support","title":"\ud83d\ude4b Support","text":"

    Facing an issue? We\u2019re committed to providing all the support you need to ensure a smooth experience using Fabric:

    • Create a support ticket: our team will help you move forward!
    • Contact a Fabric specialist: for personalized guidance or full access to the platform
    "},{"location":"data_catalog/","title":"Data Catalog","text":"

    In the realm of data management and analysis, the ability to efficiently discover, understand, and access data is crucial. Fabric's Data Catalog emerges as a pivotal solution in this context, designed to facilitate an organized, searchable, and accessible repository of metadata. This chapter introduces the concept, functionality, and advantages of the Data Catalog within Fabric's ecosystem, offering developers a comprehensive overview of its significance and utility.

To ensure that large volumes of data can be processed through the entire data pipeline, Fabric is equipped with integrated connectors for various types of storage (from RDBMS to cloud object storage), guaranteeing the data never leaves your premises. Furthermore, Fabric's Catalog ensures timely and scalable data analysis, as it runs on top of a distributed architecture powered by Kubernetes and Dask.

    The benefits of Fabric's Data Catalog for data teams are manifold, enhancing not only the efficiency but also the effectiveness of data understanding operations:

• Improved Data Accessibility: With the Data Catalog, developers can consume the data they need for a certain project through a user-friendly interface, significantly reducing the time spent searching for data across disparate sources. This enhanced discoverability makes it easier to initiate data analysis, machine learning projects, or any other data-driven tasks.

• Enhanced Data Governance and Quality: Fabric's Data Catalog provides comprehensive tools for governing data assets in data-driven projects, including data quality profiling and metadata management. These tools help maintain high data quality and compliance with regulatory standards, ensuring that developers work with reliable and standardized information throughout the project.

• Knowledge and Insight Sharing: Through detailed metadata, data quality warnings, and profiling, Fabric's Data Catalog enhances the understanding of the data's context and behaviour. This shared knowledge base supports better decision-making and innovation in a data-driven project.

    "},{"location":"data_catalog/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 Data Catalogs in the modern data stack
    • How to create your first Datasource from a CSV file?
    • How to create a Database in the Data Catalog?
    • How to automate data quality profiling?
    "},{"location":"data_catalog/connectors/","title":"Connectors","text":"

Fabric connectors play an important role in the landscape of data-driven projects, acting as essential components that facilitate the movement and integration of data across different systems, platforms, and applications. Fabric connectors were designed to offer seamless and easy connectivity for data exchange between disparate data sources (such as databases, cloud storage systems, etc.).

    "},{"location":"data_catalog/connectors/#benefits","title":"Benefits","text":"
• Data Integration: Fabric Connectors are primarily used to consume and integrate data from a variety of different sources into a single project, ensuring that data can be easily combined, transformed, and made ready for analysis or operational use.
• Automation of data flows: They automate the process of data extraction, transformation, and loading (ETL), which is crucial for keeping the data used in a given project up to date and accurate.
• Simplification of data access: Fabric connectors simplify the process of accessing and using data from specialized or complex systems, making it easier for users without deep technical expertise to leverage data for insights.
• Enhancement of Data Security: Designed to securely manage the credentials and access to your different storages.
    "},{"location":"data_catalog/connectors/#get-started-with-fabric-connectors","title":"Get started with Fabric Connectors","text":"
    • How to create a connector in Fabric?
    • How to use Object Storage Connectors through Labs?
    • How to use RDBMS connectors through Labs?
    "},{"location":"data_catalog/connectors/create_connector/","title":"How to create a connector in Fabric's Data Catalog?","text":"

    How to create a connector to an RDBMS in Fabric?

    To create a connector in YData Fabric, select the \"Connectors\" page from the left side menu, as illustrated in the image below.

Click "Add Connector" and a list of connector types to choose from will be shown to you.

For the purpose of this example, we will create a connector to our AWS S3 storage. The credentials/secrets for your storage will be requested. After adding them, you can "Test connection" to ensure that all the details are correct. A confirmation message, similar to the one shown in the image below, should appear on your screen, letting you know that you can now save your connector successfully!

    Congrats! \ud83d\ude80 You have now created your first Connector! You can now create different Datasources in your project's Data Catalog. Get ready for your journey of improved quality data for AI.

    "},{"location":"data_catalog/connectors/supported_connections/","title":"Supported connections","text":"

    Fabric can read and write data from a variety of data sources.

    "},{"location":"data_catalog/connectors/supported_connections/#connectors","title":"Connectors","text":"

    Here is the list of the available connectors in Fabric.

| Connector Name | Type | Supported file types | Notes |
|---|---|---|---|
| AWS S3 | Object Storage | Parquet, CSV | |
| Azure Blob Storage | Object Storage | Parquet, CSV | |
| Azure Data Lake | Object Storage | Parquet, CSV | |
| Google Cloud Storage | Object Storage | Parquet, CSV | |
| Upload file | File | Parquet, CSV | Maximum file size is 700MB. Bigger files should be uploaded and read from remote object storages |
| Google BigQuery | Big Table | Not applicable | |
| MySQL | RDBMS | Not applicable | Supports reading whole schemas or specifying a query |
| Azure SQL Server | RDBMS | Not applicable | Supports reading whole schemas or specifying a query |
| PostgreSQL | RDBMS | Not applicable | Supports reading whole schemas or specifying a query |
| Snowflake | RDBMS | Not applicable | Supports reading whole schemas or specifying a query |
| Oracle DB | RDBMS | Not applicable | Supports reading whole schemas or specifying a query |
| Databricks Unity Catalog | Catalog | Not applicable | Supports reading a table |
| Databricks Delta Lake | Lakehouse | Not applicable | Supports reading a table |
"},{"location":"data_catalog/connectors/supported_connections/#havent-found-your-storage","title":"Haven't found your storage?","text":"

To understand our development roadmap or to request prioritization of a new data connector, reach out to us at ydata.ai/contact-us.

    "},{"location":"data_catalog/connectors/use_in_labs/","title":"Use connectors in Lab","text":""},{"location":"data_catalog/connectors/use_in_labs/#create-a-lab-environment","title":"Create a lab environment","text":""},{"location":"data_catalog/datasources/","title":"Overview","text":"

    YData Fabric Datasources are entities that represent specific data sets such as tables, file sets, or other structured formats within the YData Fabric platform. They offer a centralized framework for managing, cataloging, and profiling data, enhancing data management and quality.

    "},{"location":"data_catalog/datasources/#benefits","title":"Benefits","text":"
    • Summarized metadata information: Fabric Datasources provide comprehensive metadata management, offering detailed information about each datasource, including schema details, descriptions, tags, and data lineage. This metadata helps users understand the structure and context of their data.

    • Data Quality Management: Users can find data quality warnings, validation results, cleansing suggestions, and quality scores. These features help in identifying and addressing data quality issues automatically, ensuring reliable data for analysis and decision-making.

    • Data Profiling: Data profiling tools analyze the content and structure of datasources, providing statistical summaries, detecting patterns, assessing completeness, and evaluating data uniqueness. These insights help in understanding and improving data quality.

    • PII Identification and Management: Fabric detects and manages Personally Identifiable Information (PII) within datasources. It includes automatic PII detection, masking tools, and compliance reporting to protect sensitive data and ensure regulatory compliance.

    • Centralized Repository: Fabric Datasources serve as a centralized repository for data quality discovery and management. They provide a single point of access for all data assets, simplifying discovery, monitoring, and governance, and improving overall data management efficiency.

    "},{"location":"data_catalog/datasources/pii/","title":"PII identification","text":"

    To overcome the concerns around data privacy and enable secure data sharing, Fabric incorporates an automated Personal Identifiable Information (PII) identification engine to help detect and handle potential PII.

    What can be considered Personal Identifiable Information (PII)?

PII is information that, when used alone or with other relevant data, can uniquely identify an individual. PII may contain direct identifiers (e.g., ID, VAT, Credit Card Number) and/or quasi-identifiers (e.g., age, gender, race, occupation). Correctly classifying these is crucial to reduce the risk of re-identification. Learn more about how Fabric mitigates the risk of re-identification using synthetic data.

    Fabric offers a standardized classification of PII that automatically highlights and tags potential PII. The automatic detection of PII can be enabled during the loading process of your datasets and can be leveraged to generate privacy-preserving synthetic data.

    After the detection, the PII information will be available through the Metadata > PII Types, where each column that may represent potential PII is associated to one or several tags that identify the type of information it might be leaking.

You can review the automatic PII classification and add additional PII tags of your own by editing the metadata and selecting additional tags available in a pre-defined list of values, which contains the most common types of potential PII information: email, phone, VAT, zip code, among others.

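As an illustration of how detected PII can feed into privacy-preserving generation, the sketch below passes a hypothetical column-to-strategy mapping to the synthesizer's anonymize argument; the column names and strategy labels are placeholders, not a definitive format.

```python
import pandas as pd
from ydata.sdk.synthesizers import RegularSynthesizer

df = pd.read_csv("customers.csv")   # placeholder dataset

synth = RegularSynthesizer()
synth.fit(
    X=df,
    anonymize={"email": "email", "full_name": "name"},   # hypothetical fields -> anonymization strategy
)
```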
    Need a solution to enable data sharing and comply with GDPR and CCPA regulations?

    Using synthetic data has proven to foster a culture of data-sharing within organizations, overcoming the limitations of traditional privacy methods and maximizing data value. Try Fabric Community Version to enable secure data sharing.

    "},{"location":"data_catalog/datasources/profiling/","title":"Profiling","text":"Profiling sections"},{"location":"data_catalog/datasources/profiling/#data-structures-supported","title":"Data Structures supported","text":"

The profiling offers comprehensive insights into various types of data, including tabular, time-series, text, and image data.

    • Tabular data: when dealing with tabular data, such as spreadsheets or databases, the profiling provides valuable statistics on data distribution, central tendencies, and categorical variable frequencies. It identifies multivariate relations such as correlations and interactions in a visual manner. It also identifies missing data.
    • Time-series data: when dealing with data with temporal dimensions, the profiling extends its capabilities to capture trends, seasonality, cyclic patterns and missing data gaps. It can reveal information about data volatility, periodicity, and anomalies, facilitating a deeper understanding of time-dependent trends.
    • Text: when it comes to text data, such as strings or documents, the profiling offers insightful statistics on the distribution of word frequencies, common phrases, and unique words.
    "},{"location":"data_catalog/datasources/profiling/#data-types","title":"Data types","text":"

Types are a powerful abstraction for effective data analysis, allowing analysis under higher-level lenses. Fabric Profiling is backed by a powerful type system developed specifically for data analysis that allows automated detection of different data types. Currently, the following types are recognized:

    • Numerical
    • Categorical
    • Date (and Datetime)
    • String
    • Time-series
    • LongText
    "},{"location":"data_catalog/datasources/profiling/#univariate-profiling","title":"Univariate profiling","text":"

This section provides a comprehensive overview of individual variables within a given dataset. This feature is particularly useful for exploratory data analysis (EDA), as it automatically calculates detailed statistics, visualizations, and insights for each variable in the dataset. It offers information such as data type, missing values, unique values, basic descriptive statistics, histogram plots, and distribution plots. This allows data analysts and scientists to quickly understand the characteristics of each variable, identify potential data quality issues, and gain initial insights into the data's distribution and variability.

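Outside of the Fabric UI, a comparable univariate summary can be sketched with the open-source ydata-profiling package (shown here only as an illustration; Fabric's built-in profiling does not require any code, and the file path is a placeholder).

```python
import pandas as pd
from ydata_profiling import ProfileReport

df = pd.read_csv("my_dataset.csv")   # placeholder dataset

# Per-variable statistics, missing values, histograms and distribution plots
report = ProfileReport(df, title="Univariate profiling sketch")
report.to_file("profiling_report.html")
```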
    "},{"location":"data_catalog/datasources/profiling/#multivariate-profiling","title":"Multivariate profiling","text":"Multivariate profiling metrics and visualization

This section provides essential insights into the relationships between variables through correlation matrices and interactions. The correlation view computes and presents correlation coefficients between pairs of numerical variables, helping to identify potential linear relationships. This assists data analysts and scientists in understanding how variables change together and highlights possible multicollinearity issues.

    On the other hand, the interactions section goes beyond correlation by exploring potential nonlinear relationships and interactions between variables, providing a more comprehensive understanding of how variables interact with one another. This can be crucial in identifying hidden patterns that might not be captured through traditional correlation analysis.

    "},{"location":"data_catalog/datasources/profiling/#correlations","title":"Correlations","text":"Correlations section

    Fabric's intuitive correlation matrix and heatmap visualizations empower users to drill down into specific variable interactions and understand their dependencies. Additionally, Fabric\u2019s real-time interactivity allows users to adjust filters and refine their analysis dynamically, supporting deeper insights into correlations across complex datasets.

Fabric correlations are calculated pairwise, depending on the type schema:

• Numerical to numerical variable: Spearman correlation coefficient
• Categorical to categorical variable: Cramér's V association coefficient
• Numerical to categorical variable: Cramér's V association coefficient, with the numerical variable discretized automatically

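For reference, a minimal sketch of the two coefficients named above, computed with pandas and scipy (the dataset and column names are illustrative; Fabric computes these automatically).

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, spearmanr

df = pd.read_csv("my_dataset.csv")   # placeholder dataset

# Numerical vs numerical: Spearman rank correlation
rho, _ = spearmanr(df["age"], df["income"])

# Categorical vs categorical: Cramer's V from a contingency table
def cramers_v(x: pd.Series, y: pd.Series) -> float:
    table = pd.crosstab(x, y)
    chi2, _, _, _ = chi2_contingency(table)
    n = table.values.sum()
    return float(np.sqrt(chi2 / (n * (min(table.shape) - 1))))

v = cramers_v(df["gender"], df["occupation"])
```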
    "},{"location":"data_catalog/datasources/profiling/#interactions","title":"Interactions","text":"

    The interactions plot visually represents how two variables influence each other across different sections of the data. It goes beyond simple correlations by providing an interactive view of how one variable changes in relation to another. This plot helps users detect non-linear relationships and complex patterns, allowing for deeper insights during Exploratory Data Analysis (EDA). By dynamically highlighting these variable pair interactions, the interactions profile enables users to refine their understanding of data relationships, guiding more informed decisions in the data preparation process.

    "},{"location":"data_catalog/datasources/profiling/#missing-data","title":"Missing data","text":"

This section offers valuable insights into the presence and distribution of missing data within a dataset. It can be particularly helpful for data preprocessing and quality assessment, as it provides a comprehensive summary of missing values across variables, indicating the percentage of missing data for each variable. Additionally, it displays a visual representation of missing data patterns through bar plots and heatmaps, allowing users to quickly identify which variables have the most significant amount of missing information.

    "},{"location":"data_catalog/datasources/profiling/#outliers","title":"Outliers **","text":"Outliers identification

This section provides comprehensive profiling of potential dataset outliers. You can validate and observe the presence of outliers and their deviation from the general distribution of numerical variables based on the observed variance. The identification of outliers allows the data analyst or scientist to assess whether they are genuine data anomalies or erroneous entries, allowing for informed decisions on whether to retain, transform, or exclude these points in further analyses.

    "},{"location":"data_catalog/datasources/warnings/","title":"Warnings","text":"

    The first technical step in any data science project is to examine the data and understand its quality, value and fitness for purpose. For this reason, Fabric\u2019s Data Catalog includes an Overview and Warnings module for a better understanding of the available datasets.

    "},{"location":"data_catalog/datasources/warnings/#datasets","title":"Datasets","text":""},{"location":"data_catalog/datasources/warnings/#overview","title":"Overview","text":"

    When clicking on a Dataset available from the Data Catalog, it will show its details page, revealing an Overview and Warnings section.

    In the Overview, you\u2019ll get an overall perspective of your dataset\u2019s characteristics, where descriptive statistics will be presented, including:

    • Basic description and tags/concepts associated to the dataset
    • Memory consumption
    • Number of rows
    • Duplicate rows (percentage / number of records)
    • Number of columns
    • Total data types (numeric, categorical, string, long text, ID, date)
    • Missing data (percentage / number of cells)
    • Main data quality warnings
    "},{"location":"data_catalog/datasources/warnings/#data-quality-warnings","title":"Data Quality Warnings","text":"

    To enable data-centric development, Fabric automatically detects and signals potential data quality warnings. Warnings highlight certain peculiarities of data that might require further investigation prior to model development and deployment. However, the validity of each issued warning and whether follow-up mitigation work is needed will depend on the specific use case and on domain knowledge.

    Fabric currently supports the following warnings:

    • Constant: the column presents the same value for all observations
• High: A high warning is raised whenever all the values in a column are the same
    • Zeros: the column presents the value \u201c0\u201d for several observations
    • Moderate: A moderate warning is raised if a column has between 10% and 25% of zeros
    • High: A high warning is raised if a column has more than 50% records as zeros
    • Unique: the column contains only unique/distinct values
• High: A high warning is raised if all the values of a column are different
• Cardinality: the column (categorical) has a large number of distinct values
    • Moderate: A moderate warning is raised if a column has a cardinality equivalent to between 75% and 90% of the number of rows
    • High: A high warning is raised if a column has a cardinality equivalent to more than 90% of the number of rows
    • Infinity: the column presents infinite (\\(\\inf\\)) values
• High: A high warning is raised if all the values of a column are inf
    • Constant_length: the column (text) has constant length
• High: A high warning is raised if all the values of a column have the same string length
• Correlation: the column is highly correlated with other(s)
• Skewness: the column distribution (numerical) is skewed
    • Moderate: A moderate warning is raised if the value for the calculated skewness is between [-1, -0.5] or [0.5, 1]
    • High: A high warning is raised if the value for the calculated skewness is lower than -1 or bigger than 1.
    • Missings: the column presents several missing values
• Moderate: A moderate warning is raised if a column has between 30% and 60% of missing values
• High: A high warning is raised if a column has more than 60% of missing values
    • Non-stationarity: the column (time series) presents statistical properties that change through time
    • Seasonal: the column (time series) exhibits a seasonal pattern
• Imbalance: the column (categorical) presents a high imbalance ratio between existing categories. The imbalance is calculated as imbalanced_score = 1 - (entropy(value_counts) / log2(number_categories)) (see the sketch after this list)
• Moderate: A moderate warning is raised if the imbalanced_score is between 0.15 and 0.5.
• High: A high warning is raised if the imbalanced_score is between 0.5 and 1.

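A minimal sketch of the imbalance score described above; the handling of the single-category edge case is an assumption.

```python
import numpy as np
import pandas as pd
from scipy.stats import entropy

def imbalanced_score(column: pd.Series) -> float:
    """1 - entropy(value_counts) / log2(number_categories), as documented above."""
    counts = column.value_counts()
    k = len(counts)
    if k <= 1:
        return 1.0   # assumption: a single category is treated as fully imbalanced
    return 1 - entropy(counts, base=2) / np.log2(k)

# Example: 95 "A" vs 5 "B" gives a score of about 0.71, which would raise a High warning
print(imbalanced_score(pd.Series(["A"] * 95 + ["B"] * 5)))
```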
    Fabric further enables the interactive exploration of warnings, filtering over specific warnings and severity types (i.e., Moderate and High):

    "},{"location":"data_catalog/datasources/warnings/#databases","title":"Databases","text":"

    When users create a database in YData Fabric's Data Catalog, they gain access to a powerful suite of tools designed to help them manage and understand the structure of their data. The Data Catalog provides a comprehensive view of each database, offering detailed insights into the schema and data quality. Here are the key features users can expect:

    "},{"location":"data_catalog/datasources/warnings/#schema-overview-with-database-specific-warnings","title":"Schema Overview with Database-Specific Warnings","text":"

    As soon as a database is added to the Fabric Data Catalog, users are presented with a complete overview of the schema. This includes a visual representation of the tables, columns, and relationships that exist within the database. In addition to the schema visualization, Fabric automatically scans the database for potential issues and displays database-specific warnings.

    These warnings help users identify problems that could impact data integrity, such as:

    • Self-references (columns that act as both primary and foreign keys)
    • Self-referencing columns, where a column serves as both a primary key and a foreign key in the same table, can create complexities for synthetic data generation. YData Fabric detects self-references and warns users when this relationship might lead to synthetic data inconsistency or improper referential integrity. The platform suggests creating a temporary schema or breaking these references into a more manageable structure to ensure clarity and accuracy in data generation.
    • Independent tables (tables with no defined relations)
    • Tables without relationships to other tables\u2014referred to as independent tables\u2014can result in isolated synthetic data that lacks the interdependencies present in the original data. YData Fabric flags such tables to alert users that these isolated data structures may need further inspection or modeling to ensure they align with the overall data environment.
    • Schemas with no defined relations (schemas missing foreign key constraints)
    • When a schema lacks defined relationships between tables, YData Fabric issues warnings to alert users of the absence of foreign key constraints or other relational ties. This warning is critical, as generating synthetic data without considering relationships can lead to inaccurate and fragmented datasets. Users are encouraged to define necessary relations or clarify dependencies to improve the quality of the synthetic data output.
    • Circular references (tables involved in a loop of dependencies)
    • Circular references occur when tables are interdependent in a closed loop (e.g., Table A references Table B, and Table B references Table A). These can cause significant complications during synthetic data generation, especially in maintaining referential integrity across the cycle. YData Fabric detects these loops and provides guidance on how to restructure the schema, such as breaking the cycle or temporarily isolating the tables, to avoid generating erroneous data.
    • Indirect relations between tables (complex chains of relationships)
    • YData Fabric also identifies indirect relationships between tables\u2014where two or more tables are connected via intermediary tables or columns. These complex relationships can introduce nuances that might not be immediately obvious during data modeling. The platform issues warnings to ensure that indirect relationships are clearly understood and accurately represented in synthetic data generation, preventing the loss of valuable data linkages.

    This automatic detection ensures that users can proactively address any schema complexities before they negatively impact data queries or synthetic data generation.

    "},{"location":"data_catalog/datasources/warnings/#table-level-navigation-with-detailed-warnings","title":"Table-Level Navigation with Detailed Warnings","text":"

Similarly to datasets consisting of a single table, YData Fabric provides users with the ability to navigate through each table within the database individually. For every table, users can view a detailed overview that includes:

    • Column names and types,
    • Statistical overview
    • Warnings (see the previous section for more details)
    "},{"location":"deployment_and_security/deployment/aws/bastion_host/","title":"Bastion host","text":"

During the installation, the user will be prompted with the possibility of allowing the creation of a bastion host. This bastion host is used by YData to give closer support to the users. If you allow the creation of this bastion host, an EC2 instance will be created during installation with NO ingress rules on its security group.

If needed, you will have to send the bastion host Elastic IP to YData Fabric and add an ingress rule to the security group, as explained below. In the CloudFormation outputs you can find the relevant information for the EC2 bastion host, such as the Elastic IP, the EC2 instance ID, and the security group ID:

    "},{"location":"deployment_and_security/deployment/aws/bastion_host/#setting-the-sg-ingress-rule","title":"Setting the SG ingress rule","text":"
    • To give access to the bastion host, please go to the EC2 service \u2192 Security Groups.
    • You can search for the security group ID provided on the template outputs:
    • Go to the \"Inbound rules\" tab and click \"Edit\" inbound rules.
    • You can then, add an inbound rule to allow the access to the bastion host and click Save rules, as per the image below.
• For a single IP source, an IP will be given to you at support time via email (an equivalent boto3 call is sketched below).
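For teams that prefer automation over the console, a comparable boto3 sketch is shown below; the region, security group ID, and CIDR are placeholder values, and the matching revoke_security_group_ingress call removes the rule once support ends.

```python
import boto3

ec2 = boto3.client("ec2", region_name="eu-west-1")   # region is an assumption

# Allow SSH from the single support IP shared by YData (placeholder values)
ec2.authorize_security_group_ingress(
    GroupId="sg-0123456789abcdef0",   # security group ID from the CloudFormation outputs
    IpPermissions=[{
        "IpProtocol": "tcp",
        "FromPort": 22,
        "ToPort": 22,
        "IpRanges": [{"CidrIp": "203.0.113.10/32", "Description": "YData support"}],
    }],
)
```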
    "},{"location":"deployment_and_security/deployment/aws/bastion_host/#removing-the-sg-ingress-rule","title":"Removing the SG ingress rule","text":"
• As soon as the support for the specific case ends, you must remove the SG ingress rule and click Save rules.
    "},{"location":"deployment_and_security/deployment/aws/billing/","title":"Billing","text":"

After the installation, the client will be billed for all the infrastructure costs plus the usage metrics described in the offer. Using a usage-based pricing model, you will only pay for what you use. The following metrics are calculated and sent to AWS in order to charge you at the current offer pricing:

    • CPU / Hour
    • Memory / Hour
    • GPU / Hour

    The following AWS services are mandatory for the platform to work and will be billed:

    • VPC
    • ACM
    • Secrets Manager
    • CloudWatch
    • EKS
    • EC2
    • EFS
    • RDS
    • Cognito
    • ECS
    • Lambda

    To check the infrastructure costs of the platform, you can use the AWS Cost Explorer and filter by the tag Environment = YData. This will aggregate all the resources deployed by the platform.

    "},{"location":"deployment_and_security/deployment/aws/billing/#cost-estimations","title":"Cost Estimations","text":"

YData Fabric's final cost can be estimated following the logic of a usage-based plan, since it depends on your users and data. The following table provides a guideline of how to compute the total cost for different usage scenarios based on the deployed infrastructure.

| EKS Nodes | Instance Type | vCPUs | Memory (GBi) | GPUs | Number of instances | % Usage/CPU/Hour | % Usage/Memory/Hour | % Usage/GPU/Hour | Cost AWS/Hour | Cost AWS/Day | Cost YData/Hour | Cost YData/Day |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| System | t3a.2xlarge | 8 | 32 | 0 | 2 | 20 | 20 | 0 | $0.30 | $14.44 | $0.38 | $9.22 |
| CPU Micro (labs) | t3a.large | 2 | 8 | 0 | 1 | 40 | 40 | 0 | $0.08 | $1.80 | $0.10 | $2.30 |
| CPU Small (labs) | t3a.xlarge | 4 | 16 | 0 | 1 | 20 | 20 | 0 | $0.15 | $3.61 | $0.10 | $2.30 |
| CPU Medium (labs) | t3a.2xlarge | 8 | 32 | 0 | 0 | 0 | 0 | 0 | $0.30 | $0.00 | $0.00 | $0.00 |
| CPU Large (labs) | m5a.4xlarge | 16 | 64 | 0 | 0 | 0 | 0 | 0 | $0.69 | $0.00 | $0.00 | $0.00 |
| CPU Compute Micro (computing) | r5a.4xlarge | 16 | 128 | 0 | 1 | 20 | 20 | 0 | $0.90 | $21.70 | $0.64 | $15.36 |
| GPU Micro (labs) | g4dn.xlarge | 4 | 16 | 1 | 0 | 0 | 0 | 0 | $0.53 | $0.00 | $0.00 | $0.00 |
| GPU Compute Micro (computing) | g3.4xlarge | 16 | 122 | 1 | 0 | 0 | 0 | 0 | $1.14 | $0.00 | $0.00 | $0.00 |

The example above illustrates a scenario where the Micro and Small instances are used. It also shows that even though the nodes are available, they are not necessarily being used, and hence not necessarily billed: only when the infrastructure is required and actually used is it measured and billed accordingly.
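As a worked example for the AWS side of the table, a minimal sketch follows; the hourly on-demand rate for t3a.2xlarge is an assumption and varies by region.

```python
# "System" node group: 2 x t3a.2xlarge running 24 hours
hourly_rate = 0.3008          # assumed on-demand USD/hour for t3a.2xlarge
instances = 2
daily_aws_cost = hourly_rate * instances * 24
print(f"~${daily_aws_cost:.2f}/day")   # ~$14.44/day, matching the Cost AWS/Day column
```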

    "},{"location":"deployment_and_security/deployment/aws/clean/","title":"Clean","text":"

The following procedure explains how to delete the platform. The full procedure takes around 45 minutes to 1 hour to be completed. To clean up YData Fabric, you will need to delete the CloudFormation stack and remove the subscription.

Please take into consideration that this will delete everything associated with the installation.

    "},{"location":"deployment_and_security/deployment/aws/clean/#deleting-the-stacks","title":"Deleting the stacks","text":"
• Go to the region where the product is installed
• Go to the CloudFormation service
• Select the ydata stack
• Click the Delete button
• Select the Extension stack and click the Delete button.

    Note

    This will disable the extension. If you are using this extension for any other project, please do not delete this stack.
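If you prefer the CLI, a minimal sketch of deleting the main stack, assuming the default stack name ydata-platform (do not delete the Extension stack this way if it is shared with other projects):

    aws cloudformation delete-stack --stack-name ydata-platform
    # Wait until the deletion finishes before cancelling the subscription
    aws cloudformation wait stack-delete-complete --stack-name ydata-platform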

    "},{"location":"deployment_and_security/deployment/aws/clean/#deleting-the-subscription","title":"Deleting the subscription","text":"
    • Go to the **AWS Marketplace Subscriptions** \u2192 Manage subscriptions
    • Click the YData product
    • Actions \u2192 Cancel subscription
    • Click the checkbox and click Yes, cancel subscription

    Following the above steps completes the process of deleting YData Fabric from your AWS Cloud instance.

    "},{"location":"deployment_and_security/deployment/aws/deploy/","title":"Deploy","text":""},{"location":"deployment_and_security/deployment/aws/deploy/#installation-process","title":"Installation process","text":"

The following procedure explains how to install the platform using the CloudFormation template and how to connect to the platform after the installation. The full procedure takes around 45 minutes to 1 hour to be completed. In order to install the platform in your account, the user must have basic knowledge of the tools used, such as CloudFormation, Route53 and Cognito.

    "},{"location":"deployment_and_security/deployment/aws/deploy/#configure-the-product","title":"Configure the product","text":"

    Make sure that you comply with the pre-flight checks

    You can check the prerequisites and pre-deploy checks.

    Start with the basic configuration for the app installation:

    • Ensure you are in the right region.
• Choose the stack name ("ydata-platform" is the default name)
    "},{"location":"deployment_and_security/deployment/aws/deploy/#network","title":"Network","text":"

Define your network configuration to access the platform. Using the ACM Certificate ARN OR the Hosted Zone ID and the Domain chosen from the preflight checklist, fill in the following parameters:

    "},{"location":"deployment_and_security/deployment/aws/deploy/#oauth","title":"OAuth","text":"

    Define how your users will authenticate in the platform (you can use multiple providers).

    "},{"location":"deployment_and_security/deployment/aws/deploy/#analytics","title":"Analytics","text":"

You can opt in or out of the collection of metrics that help us understand how users interact with the product. No user data is collected at any point. You can find our privacy policy at ydata.ai/privacy.

    "},{"location":"deployment_and_security/deployment/aws/deploy/#bastion-host","title":"Bastion host","text":"

A bastion host is created and used to provide closer support to the users. The bastion host is only accessible on user demand, by granting us access to the EC2 instance through an SG ingress rule. Set it to "Allow" to have it available. More information here.

    "},{"location":"deployment_and_security/deployment/aws/deploy/#create","title":"Create","text":"
    • Check the \u201cI acknowledge that AWS CloudFormation might create IAM resources with custom names.\u201d
    • Click Create Stack
    "},{"location":"deployment_and_security/deployment/aws/deploy/#2-following-the-installation-process","title":"2. Following the installation process","text":"

    Now we can follow the step-by-step for the installation of YData Fabric.

• Click the "Create" button; the installation of the platform will start:

    The process will take approximately 45-60 minutes.

    • If the installation process occurs without any issues, you will see the CREATE_COMPLETE status in the stack:

• If any error occurs during installation, please open a support case at support.ydata.ai.
    "},{"location":"deployment_and_security/deployment/aws/deploy/#3-post-installation-configuration","title":"3. Post installation configuration","text":""},{"location":"deployment_and_security/deployment/aws/deploy/#dns-configuration","title":"DNS Configuration","text":"

    If you have your domain registered in Route53, you can check the CF Outputs, and click the domain name to access the platform:

    If you are using another DNS provider or a Route53 in another account, you will need to create a CNAME record pointing to the ALB endpoint (ALBDNSName). As an example: CNAME \u2192 ydata-alb-xxxxxxxxx.eu-west-1.elb.amazonaws.com
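For reference, if the zone lives in Route53 in another account, a minimal CLI sketch of creating such a CNAME record, with placeholder values for the hosted zone ID and domain name:

    aws route53 change-resource-record-sets \
      --hosted-zone-id Z0123456789EXAMPLE \
      --change-batch '{
        "Changes": [{
          "Action": "UPSERT",
          "ResourceRecordSet": {
            "Name": "platform.example.com",
            "Type": "CNAME",
            "TTL": 300,
            "ResourceRecords": [{"Value": "ydata-alb-xxxxxxxxx.eu-west-1.elb.amazonaws.com"}]
          }
        }]
      }'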

    "},{"location":"deployment_and_security/deployment/aws/deploy/#4-connecting-to-the-platform","title":"4. Connecting to the platform","text":"

To connect to the platform, please allow 20-30 minutes so that the platform is completely initialised, and access it using the URL displayed in the CF Outputs. For the login process, if you chose a custom login provider, you need to ensure that the users are created.

Otherwise, you will need to create the users in the Cognito user pool generated by the CloudFormation stack.

More information can be found under Login providers.

    \ud83d\ude80 Congratulations you are now ready to start exploring your data with YData Fabric!

    "},{"location":"deployment_and_security/deployment/aws/instance_types/","title":"Instance types","text":"Name ID System Pool CPU MIcro Pool CPU Small Pool CPU Medium Pool CPU Large Pool CPU Compute Micro Pool GPU MIcro Pool GPU Compute Micro Pool Bastion Host N. Virginia us-east-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Ohio us-east-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano N. California us-west-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Oregon us-west-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Cape Town af-south-1 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano Hong Kong ap-east-1 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano Mumbai ap-south-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g4dn.2xlarge t3a.nano Osaka ap-northeast-3 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano Seoul ap-northeast-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Singapore ap-southeast-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Sydney ap-southeast-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Tokyo ap-northeast-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Canada Central ca-central-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Frankfurt eu-central-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Ireland eu-west-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano London eu-west-2 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g3.4xlarge t3a.nano Milan eu-south-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g4dn.2xlarge t3a.nano Paris eu-west-3 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g4dn.2xlarge t3a.nano Stockholm eu-north-1 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano Bahrain me-south-1 t3.xlarge t3.large t3.xlarge t3.2xlarge m5.4xlarge r5.4xlarge g4dn.xlarge g4dn.2xlarge t3.nano S\u00e3o Paulo sa-east-1 t3a.xlarge t3a.large t3a.xlarge t3a.2xlarge m5a.4xlarge r5a.4xlarge g4dn.xlarge g4dn.2xlarge t3a.nano"},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/","title":"Checklist and Prerequisites","text":"

    Deploying YData Fabric in the AWS cloud offers a scalable and efficient solution for managing and generating synthetic data. AWS provides a robust infrastructure that ensures high availability, security, and performance, making it an ideal platform for YData Fabric.

    This cloud deployment allows for rapid scaling of resources to meet varying workloads, ensuring optimal performance and cost-efficiency.

    With AWS's comprehensive security features, including data encryption, network firewalls, and identity management, your synthetic data and models are protected against unauthorized access and threats. Additionally, AWS's global infrastructure allows for the deployment of YData Fabric in multiple regions, ensuring low latency and high availability for users worldwide.

    Prerequisites

    If you don't have an AWS account, create a free account before you begin.

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#basic-configuration","title":"Basic Configuration","text":"
• Stack name: The name of the CloudFormation stack
• Location: where to install the platform and create the resources. You can check the available supported regions below.
• **Available regions:** You can find the AWS regions where YData Fabric is available here.
    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#permissions","title":"Permissions","text":"

    Check and add (if needed) the necessary permissions to the account and region where the platform will be installed.

    • Go to Identity and Access Management (IAM)
    • Select your user or role used for deployment
    • Under the permissions tab, check if you have the following permissions:
      • AdministratorAccess

*This will be updated in the future with only the necessary permissions to create and access the application.

    You can find AWS official documentation here.
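A minimal CLI sketch for verifying which identity and managed policies you are deploying with; the user name below is a placeholder (use list-attached-role-policies instead if you deploy with a role):

    aws sts get-caller-identity
    aws iam list-attached-user-policies --user-name deploy-user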

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#service-linked-roles","title":"Service Linked Roles","text":"

    During the deployment all the required Service-Linked Roles are created by AWS by default with the exception of the EKS Service-Linked Role.

Please go to IAM → Roles and verify that the following Service-Linked Role exists in IAM:

    • AWSServiceRoleForAmazonEKS

Otherwise, please create the missing Service-Linked Role:

    • Click \u201cCreate role\u201d
    • Choose AWS service and EKS:

    • Click \u201cNext\u201d \u2192 \u201cNext\u201d
    • Click \u201cCreate role\u201d

    *You can find AWS official documentation for service-linked roles.*
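Alternatively, a minimal CLI sketch for checking the role and, if it is missing, creating it:

    # Check whether the EKS service-linked role already exists
    aws iam get-role --role-name AWSServiceRoleForAmazonEKS

    # Create it if the previous command reports that the role does not exist
    aws iam create-service-linked-role --aws-service-name eks.amazonaws.com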

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#quotas","title":"Quotas","text":"

    Check and set (if needed) new quotas for the region where the application will be installed.

    • Go to Service Quotas (ensure that you are in the right region).
    • Select AWS Services \u2192 Amazon Elastic Compute Cloud (Amazon EC2)
    • Check for the following quota limits:
| Quota | Minimum | Recommended |
|---|---|---|
| Running On-Demand Standard (A, C, D, H, I, M, R, T, Z) instances | 50¹ | 100² |
| Running On-Demand G and VT instances | 0¹ | 20² |

¹ These limits are required only for the installation of the platform; usage is limited. ² Each limit will depend on the platform usage and each client's requirements.

If needed, request a new limit from the AWS support team. More on available instance types can be found here.
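A minimal CLI sketch for checking and requesting the Standard instances quota; the quota code shown is an assumption and should be confirmed in the Service Quotas console:

    # Check the current "Running On-Demand Standard (A, C, D, H, I, M, R, T, Z) instances" quota
    aws service-quotas get-service-quota --service-code ec2 --quota-code L-1216C47A

    # Request an increase to the recommended value
    aws service-quotas request-service-quota-increase \
      --service-code ec2 --quota-code L-1216C47A --desired-value 100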

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#network-configuration","title":"Network configuration","text":"

    Choose how you want to connect to the platform.

    The parameters below will be used during the deployment process.

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#dns-configuration","title":"DNS Configuration:","text":"

In AWS, you will connect to the platform using your own custom DNS domain, for example: platform.ydata.ai. For that, a registered domain is necessary.

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#domain-name-and-route53-hosted-zone-id","title":"Domain Name and Route53 Hosted Zone ID","text":"

    If you have your domain registered in Route53, you can pass the Route53 Hosted Zone ID and the Domain Name, and the CloudFormation template will create an ACM certificate and a Route53 record pointing to the ALB used to connect the platform. So no steps are required before or after the installation.

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#domain-name-and-acm-certificate-arn","title":"Domain Name and ACM Certificate ARN","text":"

Otherwise, if you have your domain registered in another provider or in Route53 in another account, you will need to do one of the following:

Create the certificate in ACM and validate it (request a public certificate and wait until it is granted).

    After the certificate is requested, copy the CNAME value and name, and create the record in your DNS provider so the certificate can be validated.

Or import an existing certificate into ACM.

    After the certificate is imported, ensure the certificate is validated.

    After the installation, you will need to create another CNAME record pointing to the ALB endpoint, available in the CF Outputs.

    For example: CNAME \u2192 ydata-alb-xxxxxxxxx.eu-west-1.elb.amazonaws.com

    "},{"location":"deployment_and_security/deployment/aws/pre_deploy_checklist/#login-provider","title":"Login Provider","text":"

In AWS you can use multiple providers to connect to the platform. In the parameters section you can choose to create a Cognito user pool or to use one of your own:

If you set this to True, you don't need to specify any other parameters under the OAuth Configuration, unless you want to use a custom one.

    You can only have one Cognito

    You can only choose one Cognito:

• The one created during the platform installation.
    • One created by you, where you need to pass the credentials parameters.

    If both are set, the provided parameters will be ignored and the one created during installation will be used.

    Some regions do not support Cognito

This is not currently supported for some regions! For these regions you will need to use the region-specific template and pass your own custom OAuth configuration!

    Check regions information here.

    You can log in to our app currently using the following providers - at least one is required, but you can choose multiple ones:

    • Google
    • Microsoft
    • Cognito (you own or the default created during installation)
    • GitHub

More detailed instructions for each login provider can be found here. If you require another authentication method, please open a support case at support.ydata.ai.

After configuring your login provider, please save the values. These values will be used during the deployment process.

    As soon as the above steps are all completed, you are ready to start the deployment.

    "},{"location":"deployment_and_security/deployment/aws/regions/","title":"\ud83c\udf10 Regions","text":"Name ID Supported Notes N. Virginia us-east-1 \u2705 \u2796 Ohio us-east-2 \u2705 \u2796 N. California us-west-1 \u2705 \u2796 Oregon us-west-2 \u2705 \u2796 Cape Town af-south-1 \u2705 \u2796 Melbourne ap-southeast-4 \ud83d\udd34 No GPU machine types available at the moment Hong Kong ap-east-1 \u2705 \u2796 Hyderabad ap-south-2 \ud83d\udd34 No GPU machine types available at the moment Jakarta ap-southeast-3 \ud83d\udd34 No GPU machine types available at the moment Mumbai ap-south-1 \u2705 \u2796 Osaka ap-northeast-3 \u2705 \u2796 Seoul ap-northeast-2 \u2705 \u2796 Singapore ap-southeast-1 \u2705 \u2796 Sydney ap-southeast-2 \u2705 \u2796 Tokyo ap-northeast-1 \u2705 \u2796 Canada Central ca-central-1 \u2705 \u2796 Frankfurt eu-central-1 \u2705 \u2796 Ireland eu-west-1 \u2705 \u2796 London eu-west-2 \u2705 \u2796 Milan eu-south-1 \u2705 \u2796 Paris eu-west-3 \u2705 \u2796 Spain eu-south-2 \ud83d\udd34 No GPU machine types available at the moment Stockholm eu-north-1 \u2705 \u2796 Zurich eu-central-2 \ud83d\udd34 No GPU machine types available at the moment Bahrain me-south-1 \u2705 \u2796 UAE me-central-1 \ud83d\udd34 No GPU machine types available at the moment Tel Aviv il-central-1 \ud83d\udd34 No GPU machine types available at the moment S\u00e3o Paulo sa-east-1 \u2705 \u2796"},{"location":"deployment_and_security/deployment/aws/update/","title":"Update Fabric","text":"

    YData is committed to providing our users with cutting-edge tools and features to enhance their data management and synthetic data generation capabilities. Our solution updates policy is designed to ensure that YData Fabric remains at the forefront of technological advancements while maintaining the highest standards of reliability, security, and user satisfaction.

    Key Aspects of Our Update Policy

    • Regular Updates: We release regular updates that include new features, performance improvements, and bug fixes. These updates are aimed at enhancing the overall functionality and user experience of YData Fabric.
    • User Feedback Integration: We actively seek and incorporate feedback from our user community. This ensures that our updates address real-world challenges and meet the evolving needs of our users.
    • Seamless Deployment: Updates are designed to be deployed seamlessly with minimal disruption to ongoing operations. Our team provides detailed documentation and support to facilitate smooth transitions.
    • Security Enhancements: We prioritize the security of our platform. Each update undergoes rigorous testing to ensure that it enhances the security posture of YData Fabric without introducing vulnerabilities.
    • Compatibility and Compliance: Updates are developed to ensure compatibility with existing systems and compliance with industry standards and regulations, safeguarding the integrity and continuity of user operations.

    By adhering to this policy, YData ensures that users consistently benefit from the latest advancements in data technology, reinforcing our commitment to innovation and excellence in the field of data science and synthetic data generation.

All updates to Fabric are user/organization triggered, by following the next steps to update your CloudFormation stack.

    "},{"location":"deployment_and_security/deployment/aws/update/#1-get-the-most-recent-version","title":"1. Get the most recent version","text":"
    • Go to the **AWS Marketplace Subscriptions** \u2192 Manage subscriptions
    • Click the YData Fabric subscription
    • Click Launch more software.
    • Check for new versions and click Continue to Launch. At this stage you will find the link for the new version.

    Click the deployment template associated with your installation.

    • Here you will have the new template URL. Copy the link as per the image below:

• Go to the deployed CloudFormation stack and click the "Update" button.
    • Choose \u201cReplace current template\u201d and provide the new stack URL.

• For the parameters, use the same parameters or change them if needed. Click Next → Next → Submit (an equivalent CLI sketch follows).
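For reference, a minimal CLI sketch of the same update, assuming the default stack name and a placeholder template URL:

    aws cloudformation update-stack \
      --stack-name ydata-platform \
      --template-url https://s3.amazonaws.com/placeholder-bucket/new-template.yaml \
      --capabilities CAPABILITY_NAMED_IAM
    # If required, add --parameters ParameterKey=<Name>,UsePreviousValue=true for each existing parameter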

2. Following the update process

Now you can follow the update process. Unlike the initial deployment, the update will only take approximately 15-60 minutes, depending on the update complexity.

🚀 Congratulations, you now have the latest version of YData Fabric!

    "},{"location":"deployment_and_security/deployment/azure/billing/","title":"Billing","text":"

After the installation, the client will be billed for all the infrastructure costs plus the usage metrics described in the offer.

    Using a usage-based pricing model you will only pay for what you use.

    The following metrics are calculated and sent to Azure in order to charge you at the current offer pricing:

    • CPU / Hour
    • Memory / Hour
    • GPU / Hour

    The following Azure services are mandatory for the platform to work and will be billed:

    • Virtual networks
    • IP Address
    • Private DNS Zones
    • Container Registry
    • Storage Account
    • MySQL Server
    • Deployment Scripts
    • Kubernetes Services
    • Key Vault
    • Container Instances

    To check the infrastructure costs of the platform, you can use the Azure Cost analysis (under the Cost Management + Billing service) and filter by the created resource groups during the deployment. This will aggregate all the resources deployed by the platform.

    "},{"location":"deployment_and_security/deployment/azure/billing/#cost-estimations","title":"Cost Estimations","text":"

    YData Fabric final cost can be estimated following the logic of a usage-based plan since it depends on your users and data. The following table provides a guideline of how to compute the total cost for different usage scenarios based on the deployed infrastructure.

| AKS Nodes | Instance Type | vCPUs | Memory (GBi) | GPUs | Number of instances | % Usage/ CPU/Hour | % Usage/ Memory/Hour | % Usage/ GPU/Hour | Cost Azure/Hour | Cost Azure/Day | Cost YData/Hour | Cost YData/Day |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| System | Standard_D2s_v3 | 8 | 32 | 0 | 2 | 30 | 30 | 0 | 0.4800 | 23.04 | 0.288 | 6.912 |
| CPU Micro (labs) | Standard_D2s_v3 | 2 | 8 | 0 | 1 | 50 | 50 | 0 | 0.1200 | 2.88 | 0.06 | 1.44 |
| CPU Small (labs) | Standard_D4s_v3 | 4 | 16 | 0 | 1 | 50 | 50 | 0 | 0.2400 | 5.76 | 0.12 | 2.88 |
| CPU Medium (labs) | Standard_D8s_v3 | 8 | 32 | 0 | 0 | 0 | 0 | 0 | 0.4800 | 0 | 0 | 0 |
| CPU Large (labs) | Standard_D16s_v3 | 16 | 64 | 0 | 0 | 0 | 0 | 0 | 0.9600 | 0 | 0 | 0 |
| CPU Compute Micro (computing) | Standard_D32s_v3 | 32 | 128 | 0 | 1 | 80 | 80 | 0 | 1.9200 | 46.08 | 1.536 | 36.864 |
| GPU Micro (labs) | Standard_NC6s_v3 | 6 | 112 | 1 | 0 | 0 | 0 | 0 | 3.8230 | 0 | 0 | 0 |
| GPU Compute Micro (computing) | Standard_NC6s_v3 | 6 | 112 | 1 | 0 | 0 | 0 | 0 | 3.8230 | 0 | 0 | 0 |

The example above illustrates a scenario where the Micro and Small instances are used. It also shows that although the nodes are available, they are not necessarily being used and therefore not billed: the infrastructure is only measured and billed when it is required and actually used.

    "},{"location":"deployment_and_security/deployment/azure/clean/","title":"Clean","text":"

The following procedure explains how to delete the platform. The full procedure takes around 45 minutes to 1 hour to be completed. To clean up YData Fabric, you will need to delete the managed app.

Please take into consideration that this will delete everything associated with the installation.

    • Start by opening the resource group where the managed app is installed, select the Managed Application and click \"Delete\".

    This will delete the managed app and the managed resource group where all the components are installed.

    "},{"location":"deployment_and_security/deployment/azure/deploy/","title":"Deploy","text":""},{"location":"deployment_and_security/deployment/azure/deploy/#installation-process","title":"Installation process","text":"

    Ensure that you have completed the pre-deploy checklist

    Validate if you have checked all the deploy requirements before moving forward with the deploy.

    "},{"location":"deployment_and_security/deployment/azure/deploy/#basic-configuration","title":"Basic configuration","text":"
    • Start by defining the basic configuration for the app installation.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#jit-access","title":"JIT Access","text":"
    • Enable the Just in Time (JIT) access for the app installation as shown in the image below. You can see more about JIT access in the pre-deploy checklist.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#network-configuration","title":"Network configuration","text":"
    • Define your network configuration to access YData Fabric.
You can use a new Public IP or an existing Public IP:
• If you choose a new Public IP, you can choose the name or leave the default (new), but the remaining properties are ignored, since the Standard SKU and static assignment are the ones recommended by Azure.
    • After that, choose a DNS label for the domain as shown below.
    • If you opt for an existing Public IP, you can choose that IP from the dropdown. The DNS Public Endpoint is automatically filled since this is configured on the IP Address level. If your IP is disabled, please ensure you have the DNS name label defined and the IP is not allocated to any other resource.

For the DNS Custom Domain, you can use a custom domain, for example platform.ydata.ai. After the installation process you will need to create a CNAME or an A record in your DNS provider. More information in the Post installation step.

    "},{"location":"deployment_and_security/deployment/azure/deploy/#oauth","title":"OAuth","text":"
    • Define how you will authenticate to the app after the deployment is completed.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#analytics","title":"Analytics","text":"
• You can opt in or out of the collection of metrics that help us understand how users interact with the product. No user data is collected at any point. Read more about YData's privacy policy.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#create","title":"Create","text":"
• Click "Next" and check the provided parameters.
• Insert the contact information.
• Read and accept the terms and conditions. Finally, click "Create".
    "},{"location":"deployment_and_security/deployment/azure/deploy/#following-the-installation-process","title":"Following the installation process","text":"
• After clicking the "Create" button, the installation of the managed app will start, as shown in the image below.

    The process will take approximately 45-60 minutes.

• If any error occurs during installation, please open a support case at support.ydata.ai.
    "},{"location":"deployment_and_security/deployment/azure/deploy/#post-installation-configuration","title":"Post installation configuration","text":""},{"location":"deployment_and_security/deployment/azure/deploy/#ip-configuration","title":"IP configuration","text":"

If you chose to use an existing IP for the platform, you will need to create a role assignment on the resource group where the IP is located. To do this, open your managed resource group (where the resources are created) and open the ydata-cluster-managed-identity Managed Identity.

    • Click \u201cAzure Role Assignments\u201d

    • Click \u201cAdd role assignment\u201d as shown in the image below.

    • Choose the Scope \u201cResource group\u201d.
    • Choose the subscription where the resource group is located.
    • Select the resource group where the IP is located.
    • Add the role \u201cNetwork Contributor\u201d and \"Save\".
    "},{"location":"deployment_and_security/deployment/azure/deploy/#dns-configuration","title":"DNS Configuration","text":"

    If you opt for the DNS Custom Domain, you will need to create a CNAME record pointing to the DNS Public Endpoint or an A record pointing to the IP. Example in Route53:

    "},{"location":"deployment_and_security/deployment/azure/deploy/#connecting-to-ydata-fabric","title":"Connecting to YData Fabric","text":"

    You can get the full URL in the Managed APP \u2192 \u201cParameters and Outputs\u201d tab \u2192 Outputs

    \ud83d\ude80 Congratulations you are now ready to start exploring your data with YData Fabric!

    "},{"location":"deployment_and_security/deployment/azure/instance_types/","title":"Instance types","text":"Name ID System Pool CPU MIcro Pool CPU Small Pool CPU Medium Pool CPU Large Pool CPU Compute Micro Pool GPU MIcro Pool GPU Compute Micro Pool West Europe westeurope Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 West US westus Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 West US 2 westus2 Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 Canada Central canadacentral Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 Sweden Central swedencentral Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3 Australia East australiaeast Standard_D2s_v3 Standard_D2s_v3 Standard_D4s_v3 Standard_D8s_v3 Standard_D16s_v3 Standard_D32s_v3 Standard_NC6s_v3 Standard_NC6s_v3"},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/","title":"Checklist and Prerequisites","text":"

Deploying YData Fabric in Microsoft Azure offers a scalable and efficient solution for managing and generating synthetic data. Azure provides a robust infrastructure that ensures high availability, security, and performance, making it an ideal platform for YData Fabric.

    This cloud deployment allows for rapid scaling of resources to meet varying workloads, ensuring optimal performance and cost-efficiency.

    With Microsoft's comprehensive security features, including data encryption, network firewalls, and identity management, your synthetic data and models are protected against unauthorized access and threats. Additionally, Azure's global infrastructure allows for the deployment of YData Fabric in multiple regions, ensuring low latency and high availability for users worldwide.

    Prerequisites

    If you don't have an Azure account, create a free account before you begin.

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#basic-configuration","title":"Basic Configuration","text":"
    • Subscription: where the platform will be installed
    • Resource group: where the managed app will be installed:

      • A new one is recommended and can be created automatically during the deployment.
• Location: where to install the Managed APP and create the resource groups. The available locations for now are:

      • West Europe - Netherlands [westeurope]
      • West US - California [westus]
      • West US - Washington [westus2]
      • Canada Central [canadacentral]
      • Sweden Central [swedencentral]*

If you need another region, please open a support case at support.ydata.ai.

*Regions without available GPU machine types at this time

    • Application Name: the Managed APP name

    • Managed Resource Group: the resource group created by the Managed APP and where all the infrastructure services will be created (this is created automatically).
    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#permissions","title":"Permissions","text":"

    Check and add (if needed) the necessary permissions to the subscription where the platform will be installed.

    • Go to Subscriptions.
    • Select the subscription where YData Fabric will be installed.
    • Click \u201cView my access\u201d as shown in the image below.

    • Check if you have at least the following configurations:

    • Contributor

    And the following permissions:

    • Microsoft.Authorization/roleAssignments/read

    • Microsoft.Authorization/roleAssignments/write

• If not, please create a custom role with these two permissions and create the role assignment for the user in the subscription (a CLI sketch follows below).

    For more information check Azure's official documentation on Azure custom roles and Azure built-in roles.
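A minimal CLI sketch under those assumptions; the role name, file name, subscription scope and account are placeholders:

    # Create a custom role definition file with the two required permissions
    cat > ydata-deploy-role.json <<'EOF'
    {
      "Name": "YData Fabric Deploy",
      "IsCustom": true,
      "Description": "Role assignment read/write needed for the YData Fabric deployment",
      "Actions": [
        "Microsoft.Authorization/roleAssignments/read",
        "Microsoft.Authorization/roleAssignments/write"
      ],
      "AssignableScopes": ["/subscriptions/<subscription-id>"]
    }
    EOF
    az role definition create --role-definition @ydata-deploy-role.json

    # Assign the custom role (in addition to Contributor) to the deploying user
    az role assignment create --assignee user@example.com \
      --role "YData Fabric Deploy" --scope "/subscriptions/<subscription-id>"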

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#resource-providers","title":"Resource Providers","text":"

    Check and activate (if needed) resource providers for the subscription where the YData platform will be installed following the next steps.

    • Go to Subscriptions
    • Select the subscription where YData Fabric will be installed
    • Go to Resource Providers
    • Using the filter, check if you have the following resource providers registered. If not, please click the resource provider and click \u201cRegister\u201d.

      • Microsoft.Compute
      • Microsoft.ContainerInstance

    For more information check Azure's official documentation on resource providers and Azure Resource Manager.
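Equivalently, a minimal CLI sketch for checking and registering them:

    az provider show --namespace Microsoft.Compute --query registrationState -o tsv
    az provider show --namespace Microsoft.ContainerInstance --query registrationState -o tsv

    az provider register --namespace Microsoft.Compute
    az provider register --namespace Microsoft.ContainerInstance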

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#register-features","title":"Register features","text":"

Check and register (if needed) the required features.

• Install and update the aks-preview extension:

    az extension add --name aks-preview
    az extension update --name aks-preview
• Register the 'EnableWorkloadIdentityPreview' feature flag:

    az feature register --namespace "Microsoft.ContainerService" --name "EnableWorkloadIdentityPreview"

• Wait until the feature is registered:

    az feature show --namespace "Microsoft.ContainerService" --name "EnableWorkloadIdentityPreview"

    {
        "id": "/subscriptions/xxxxx/providers/Microsoft.Features/providers/Microsoft.ContainerService/features/EnableWorkloadIdentityPreview",
        "name": "Microsoft.ContainerService/EnableWorkloadIdentityPreview",
        "properties": {
            "state": "Registered"
        },
        "type": "Microsoft.Features/providers/features"
    }

• After the feature status is "Registered", refresh the registration of the container service resource provider:

    az provider register --namespace Microsoft.ContainerService

    Read more in Azure's official documentation on Azure Kubernetes Services (AKS).

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#resource-compute-quotas","title":"Resource compute quotas","text":"

    Check and set (if needed) new quotas for the region where the managed app will be installed.

    • Go to Subscriptions.
    • Select the subscription where YData Fabric will be installed
    • Click \u201cUsage + quotas\u201d
    • Filter by the region where YData Fabric will be installed

    • Check for the following quota limits:
| Quota | Minimum | Recommended |
|---|---|---|
| Total Regional vCPUs | 16* | 100** |
| Standard DSv3 Family vCPUs | 16* | 100** |
| Standard NCSv3 Family vCPUs*** | 6* | 20** |
| Standard DDSv4 Family vCPUs | 10 | 10 |

*These limits are required only for the installation of the platform; usage is limited.

** Each limit will depend on the platform usage and each client's requirements.

*** Not available in the Sweden region

• If needed, request a new limit from the Azure support team as shown in the image below (a CLI sketch for checking current usage follows below).

    Check Azure's official documentation on quotas, increase regional vCPU quotas and increase VM-family quotas.
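A minimal CLI sketch for checking the current usage and limits in a region (westeurope is used here as an illustrative location):

    # List vCPU usage and limits for the chosen region, filtering the relevant families
    az vm list-usage --location westeurope -o table | grep -iE "dsv3|ncsv3|ddsv4|regional vcpus"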

    More on available instance types can be found here.

    "},{"location":"deployment_and_security/deployment/azure/pre_deploy_checklist/#jit-access","title":"JIT Access","text":"

    The JIT Access feature will prevent YData Fabric from having write access to the managed app at any time.

    • To use the just-in-time access, you must have an Azure Active Directory P2 license.
• Without this license and with JIT enabled, YData will not be able to provide closer support or make updates to the solution.

    To check your current license, go to the Azure Portal \u2192 Azure Active Directory \u2192 Licenses and check your license. To activate the P2 license, click the \u201cTry/Buy\u201d button.

For more information check Azure's official documentation on assigning and removing licenses for Azure Active Directory, and on how to enable JIT access and approve requests.

After accepting the request, the YData team will have access in order to make updates and give you closer support. For any other requests, open a support case at support.ydata.ai.

    "},{"location":"deployment_and_security/deployment/azure/regions/","title":"\ud83c\udf10 Regions","text":"Name ID Supported Notes West Europe westeurope \u2705 \u2796 West US westus \u2705 \u2796 West US 2 westus2 \u2705 \u2796 CanadaCentral canadacentral \u2705 \u2796 (Europe) Sweden Central swedencentral \u2705 \u2796 (Asia Pacific) Australia East australiaeast \u2705 \u2796

For more regions, please contact us through support@ydata.ai.

    "},{"location":"deployment_and_security/deployment/google/deploy/","title":"Deploy","text":""},{"location":"deployment_and_security/deployment/google/deploy/#installation-process","title":"Installation process","text":"

    The following information needs to be passed to YData team:

    • The SA JSON file generated in the preflight-checklist.
    • Project ID
    • Region
    • DNS Cloud Zone name
    • Domain name - the domain that will be used to connect to the platform
    • Login provider credentials (ex on google: Client ID, Client Secret, Domain)
    "},{"location":"deployment_and_security/deployment/google/deploy/#wait-for-the-installation-to-be-done","title":"Wait for the installation to be done","text":"

    YData team will take care of the deployment for you. As soon as it is finished the team will let you know.

    "},{"location":"deployment_and_security/deployment/google/deploy/#post-installation-configuration","title":"Post installation configuration","text":"

    A DNS configuration is needed. For that, if you opt for the IP, you will need to create a record pointing to the Load Balancer IP, as shown in the image below.

    "},{"location":"deployment_and_security/deployment/google/deploy/#connecting-to-ydata-fabric","title":"Connecting to YData Fabric","text":"

    YData team will share with you the link/URL that you can now use to access YData Fabric.

    \ud83d\ude80 Congratulations you are now ready to start exploring your data with YData Fabric!

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/","title":"Checklist and Prerequisites","text":"

    The deployment will be executed using terraform, and it is fully automated. It is triggered by YData\u2019s team and the progress can be monitored on the client side.

    As a pre-condition, the client must create a service account and share it with YData\u2019s team. The required permissions will be shared in this document.

    The bastion host will be used to provide technical support to the team in case of issues and troubleshooting with the usage of the platform, and this access will only be used for this purpose.

    Prerequisites

If you don't have a GCP subscription, create a free account before you begin.

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#observations-prerequisites","title":"Observations & prerequisites","text":"
    • The deployment will create one public and private key to establish the connection to the bastion host.
    • With this deployment, a security group allowing YData\u2019s IP to establish the connection to the bastion host via SSH will be created. This should be deleted after the deployment and added in case it is needed.
    • The Bastion host can be stopped after the deployment to prevent any charges and created/started to give support.
• The private subnets will have a NAT Gateway attached – this is needed since the GKE cluster needs access to the public internet to connect to the Data Sources and to pull images from the public registries.
    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#basic-configuration","title":"Basic Configuration","text":"
    • Project: where the platform will be installed.
    • Location: where to install the YData fabric inside the project.
    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#enable-apis","title":"Enable API's","text":"
• Please check if the following APIs are enabled for the chosen project (a CLI sketch for enabling them follows this list):
      • API Keys API
      • Artifact Registry API
      • Certificate Manager API
      • Cloud Resource Manager API
      • Cloud Key Management Service (KMS) API
      • Compute Engine API
      • Kubernetes Engine API
  • Cloud DNS API
      • Cloud Filestore API
      • Cloud Run API
      • Identity and Access Management (IAM) API
      • Services Networking API
      • Cloud SQL Admin API
      • Cloud Storage
      • Serverless VPC Access API
      • Secret Manager API
      • Cloud Scheduler API
      • Service Usage API
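A minimal sketch of enabling these with the gcloud CLI; the mapping from the display names above to service IDs is an assumption and should be double-checked against your project:

    gcloud services enable \
      apikeys.googleapis.com artifactregistry.googleapis.com certificatemanager.googleapis.com \
      cloudresourcemanager.googleapis.com cloudkms.googleapis.com compute.googleapis.com \
      container.googleapis.com dns.googleapis.com file.googleapis.com run.googleapis.com \
      iam.googleapis.com servicenetworking.googleapis.com sqladmin.googleapis.com \
      storage.googleapis.com vpcaccess.googleapis.com secretmanager.googleapis.com \
      cloudscheduler.googleapis.com serviceusage.googleapis.com \
      --project "$PROJECT_ID"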
    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#permissions","title":"Permissions","text":"

The following service account should be created and transferred to YData so the deployment can be triggered. It is recommended (but not required) that you create a new project for the YData platform. This will make it easier to control costs and to ensure that YData only has access to its resources. You can create the service account using the provided commands with the gcloud CLI (recommended) or create the service account manually using the Google Cloud UI.

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#gcloud-cli","title":"GCloud CLI","text":"

    The following commands will create a new service account with the required permissions to complete the deployment. The generated JSON file must be sent to YData.

    1. Download the following file: https://raw.githubusercontent.com/ydataai/gcp-deploy-permissions/main/clients_custom_role.yaml
    2. Create the new SA for the deployment
    export PROJECT_ID=
    export SERVICE_ACCOUNT_NAME=

    gcloud config set project $PROJECT_ID

• Create a new SA

    gcloud iam service-accounts create $SERVICE_ACCOUNT_NAME --display-name "GCP Service Account for the YData platform"

• Get the new key file for the created SA

    export SA_EMAIL=$(gcloud iam service-accounts list --filter $SERVICE_ACCOUNT_NAME --format 'value(email)')

    gcloud iam service-accounts keys create gcp-ydata-platform-service-account.json --iam-account $SA_EMAIL

• Create a new role and associate this role to the new SA

    gcloud iam roles create ydata_platform_gcp_iam_role --project $PROJECT_ID --file clients_custom_role.yaml

    gcloud projects add-iam-policy-binding $PROJECT_ID --member "serviceAccount:$SA_EMAIL" --role "projects/$PROJECT_ID/roles/ydata_platform_gcp_iam_role"

• Activate the new SA locally

    gcloud auth activate-service-account --project=$PROJECT_ID --key-file=gcp-ydata-platform-service-account.json

• Test the new SA by setting the new account

    gcloud config set account $SA_EMAIL
    gcloud config set project $PROJECT_ID

• Check if you are logged in with the new SA:

    gcloud auth list

• Try a command:

    gcloud container clusters list
    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#gcp-console","title":"GCP Console","text":"

Go to IAM → Service Accounts → Create Service Account. Choose a name for the service account and click "Create and Continue". For the Roles, add the following ones (you can search for these terms and select the resulting role):

    • roles/container.admin
    • roles/compute.admin
    • roles/iam.serviceAccountAdmin
    • roles/dns.admin
    • roles/iam.roleAdmin
    • roles/resourcemanager.projectIamAdmin
    • roles/cloudsql.admin
    • roles/servicenetworking.networksAdmin
    • roles/iam.serviceAccountKeyAdmin
    • roles/serviceusage.serviceUsageAdmin
    • roles/file.editor
    • roles/storage.admin
    • roles/cloudkms.admin
    • roles/serviceusage.apiKeysAdmin
    • roles/artifactregistry.admin
    • roles/secretmanager.admin
    • roles/vpcaccess.admin
    • roles/run.admin
    • roles/deploymentmanager.editor
    • roles/cloudscheduler.admin

After it is finished, click Continue and Done. Open the service account and create a new JSON key. The transferred key will be used by YData.

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#resource-compute-quotas","title":"Resource Compute Quotas","text":"

    Check and set (if needed) new quotas for the region where Fabric will be installed.

    • Go to IAM & Admin
    • Click \u201cQuotas & System Limits\u201d on the left
    • Filter by your region and check for the following quotas
| Quota | Recommended |
|---|---|
| CPUs (all regions) | >200** |
| C2D CPUs | 200** |
| N2D CPUs | 24** |
| Zonal & Regional 1-10 TiB (Enterprise) capacity (GB) per region | 1024 GiB |

** Each limit will depend on the platform usage and each client's requirements.
• If needed, request a new limit from Google's support team:

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#network-configuration","title":"Network configuration","text":"

    Choose how you want to connect to the platform.

    In GCP, it\u2019s possible to connect to YData Fabric using your own DNS custom domain, for example: ydatafabric.yourdomain.com. (It\u2019s necessary to have a domain registered).

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#domain-name-and-gcp-cloud-dns-zone","title":"Domain Name and GCP Cloud DNS Zone","text":"

    If you have your domain registered in GCP Cloud DNS, you can use the Zone Name and the Domain Name, and the Deployment will create a Managed Certificate and the Cloud DNS record pointing to the Load Balancer used to connect the platform.

Otherwise, if you have the domain registered in another provider, it is recommended to create a Public Cloud DNS Zone, create a new record in your provider pointing to Google's NS records, and pass this Zone Name and Domain Name, so the deployment occurs without any issues.

If you don't want to create the Public Cloud DNS Zone, you can point your domain to the IP available after the installation by creating an A record.
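For the Cloud DNS case, a minimal sketch of creating such a record with the gcloud CLI; the zone name, domain and IP are placeholders:

    gcloud dns record-sets create platform.example.com. \
      --zone="ydata-zone" --type="A" --ttl=300 \
      --rrdatas="203.0.113.20"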

    These parameters will be used during the deployment process.

    "},{"location":"deployment_and_security/deployment/google/pre_deploy_checklist/#login-provider","title":"Login Provider","text":"

Choose how you want to log in to the platform. You can currently log in to our app using the following providers - at least one is required, but you can choose multiple ones: Google, Microsoft, Cognito and GitHub.

You can find detailed instructions for each type of login provider in the Login Providers page. After configuring your login provider, please save the values. These values will be used during the deployment process.

If you require another authentication method, please open a support case at support.ydata.ai.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/","title":"Login Providers","text":"

    YData Fabric offers a flexible and secure authentication system, allowing users to log in using a variety of trusted identity providers. This technical documentation provides a comprehensive guide to configuring and managing login providers for YData Fabric, including Google, Microsoft, and Amazon Cognito. By leveraging these providers, users can benefit from seamless and secure access to YData Fabric, ensuring a smooth and efficient user experience.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#google","title":"Google","text":"
    1. Open the Google Cloud Console.
    2. At the top-left, click Menu>APIs & Services>Credentials.
    3. Click Create Credentials>OAuth client ID.
    4. Click Application type>Web application.
    5. In the \"Name\" field, type a name for the credential. This name is only shown in the Cloud Console.
6. Leave the "Authorized JavaScript origins" empty. Add a new "Authorized redirect URIs" with the platform endpoint with the suffix /dex/callback. For the provided example:

(the exact value depends on whether you are using the DNS Public Endpoint or the DNS Custom Domain)

    7. Click \u201cCreate\u201d

    8. Save the following credentials:

  • a. Client ID: the Client ID for the Web Application
  • b. Client Secret: the Client Secret for the Web Application
  • c. APP Hosted domain: Google supports whitelisting allowed domains when using G Suite. For example, for a company with emails like person@example.com, the APP Hosted domain is example.com.

    9. Use the credentials as inputs for YData Fabric.

    You can find more details in Google's official documentation.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#microsoft","title":"Microsoft","text":"
    1. Open the Azure Portal
    2. Go to \u201cEntra ID\u201d
    3. Click \u201cApp registrations\u201d
    4. Click \u201cNew registration\u201d
    5. Choose a name
6. For the supported account types, choose the most appropriate option for you.
    7. For the Redirect URI, choose \u201cWeb\u201d, and fill with the platform endpoint with a suffix */dex/callback*. For the provided example:

(the exact value depends on whether you are using the DNS Public Endpoint or the DNS Custom Domain)

    8. Click \u201cRegister\u201d

    9. Go to \u201cCertificates & Secrets\u201d, generate a new secret and save the value (not the secret id). Please choose a large expiration date. This value cannot be changed after the installation of the platform.
    10. Go to \u201cOverview\u201d and save the following credentials:

      • a. Client ID

        The Application (client) ID

      • b. Client Secret

        The secret generated in step 9 (not the secret id).

      • c. Tenant ID

        The Directory (tenant) ID

    11. Use the credentials as inputs for YData Fabric.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#consent-workflow","title":"Consent workflow","text":"

The admin consent workflow needs to be configured so that you can access the platform using the app registered above.

    1. Open the Azure Portal
    2. Go to \u201cAzure Active Directory\u201d
    3. Click \"Enterprise applications\u201d
    4. Open the \u201cConsent and permissions\u201d page \u2192 \u201cUser consent settings\u201d
    5. Check with the AD administrator if an administrator is required to login to the app, or if all users can consent for the apps.
    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#give-access-only-to-a-set-of-users-andor-groups","title":"Give access only to a set of users and/or groups","text":"
    1. In order to give access only to a set of users or groups, open your app and click the link \u201cManaged application in local directory\u201d on the right side:
2. Then click "Properties" and enable "Assignment required"
    3. To add users and/or groups, go to \u201cUsers and Groups\u201d and click \u201cAdd user/group\u201d.

With the above steps, only the users and groups listed here can access YData Fabric. For more information check Microsoft's official documentation for the Microsoft identity platform and Microsoft Entra.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#aws-cognito","title":"AWS Cognito","text":"
    1. Go to the Amazon Cognito console. If prompted, enter your AWS credentials.
    2. Choose User Pools. Create a new User Pool.
3. The "Configure security requirements", "Configure sign-up experience" and "Configure message delivery" tabs are up to your choices, or you can leave the defaults.
    4. In the \u201cIntegrate your app\u201d please set the attributes as the following:

      1. \u201cUser Pool Name\u201d - a name of your choice
      2. Tick the \u201cUse the Cognito Hosted UI\u201d check box.
      3. \u201cDomain type\u201d, you can use a cognito or a custom domain.
      4. \u201cInitial app client\u201d choose \u201cPublic client\u201d and set a \u201cApp client name\u201d
      5. For \u201cClient secret\u201d, choose \u201cGenerate a client secret\u201d
      6. In the \u201cAllowed callback URLs\u201d, set your callback URL with the platform endpoint with a suffix */dex/callback* For the provided example:
(the exact value depends on whether you are using the DNS Public Endpoint or the DNS Custom Domain)

  7. In the "Advanced app client settings" → "Authentication flows" step, choose "ALLOW_USER_PASSWORD_AUTH"
  8. For the "OpenID Connect scopes" choose: "Email", "OpenID" and "Profile".
  9. Review your settings, and "Create User Pool".
  10. Click your new user pool, go to the "App integration" tab and "App clients and analytics".
  11. Copy and save the Client ID and Client secret.
  12. For the "Issuer URL", get your URL by going to https://cognito-idp.[region].amazonaws.com/[user_pool_id]/.well-known/openid-configuration and copy and save the issuer URL (see the example after this list).
  13. Use these credentials as inputs for YData Fabric.
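A minimal sketch of retrieving the issuer, assuming a placeholder region and user pool ID and that jq is installed:

    curl -s "https://cognito-idp.eu-west-1.amazonaws.com/eu-west-1_EXAMPLE/.well-known/openid-configuration" \
      | jq -r .issuer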
    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#adding-new-users","title":"Adding new users","text":"
    1. Go to the Cognito service.
    2. Click the YData platform Cognito user pool.
    3. Go to the Users tab
    4. Click Create user
    5. Create the users:
    6. The user will receive an e-mail with the temporary credentials.

For more information check Amazon Cognito's official documentation on user pools and user pool app clients.

    "},{"location":"deployment_and_security/deployment/login_support/login_providers/#github","title":"Github","text":"
    1. Go to the GitHub OAuth Application page. If prompted, enter your GitHub credentials.
    2. For the \u201cApplication Name\u201d, choose anything.
3. For the "Homepage URL" and "Authorization callback URL", fill in the platform endpoint and the platform endpoint with the suffix /dex/callback, respectively. For the provided example:
(the exact value depends on whether you are using the DNS Public Endpoint or the DNS Custom Domain)
4. Open your new APP and generate a new secret
5. Save the Client ID and Client secret
6. For the org, use your GitHub organization name.

Finally, use these credentials as inputs to log in to YData Fabric. For more information check GitHub's official login documentation.

    "},{"location":"deployment_and_security/deployment/login_support/support/","title":"Support","text":"

    The YData Fabric support ticketing mechanism is designed to ensure that our users receive timely and efficient assistance for any issues they encounter while using our platform. This guide provides an in-depth overview of how the support ticketing system works, including how to submit a ticket and communicate with our support team.

    "},{"location":"deployment_and_security/deployment/login_support/support/#submitting-a-support-ticket","title":"Submitting a Support Ticket","text":"

    While logged into your YData Fabric instance, navigate to the Support section from the main dashboard, as shown in the image below.

    To create a new ticket, make sure to fill in the following fields:

    • Subject: The subject summary of your problem
• Description: The detailed description of your issue. Please make sure to be thorough in your description, as it will help the team provide you with better support. If possible, describe the steps you took until you found the issue or the blocker that you are asking support for.
• Fabric Modules: Optional, but highly recommended. If the issue happened while creating or interacting with the Data Catalog, Labs or Synthetic Data generation module, users can attach the operational logs (which the platform collects). The logs are purely operational, relate only to the selected component, and include no user data whatsoever (for instance, datasets are never sent). The files are uploaded in the background to a location accessible by YData's support team (a private Amazon S3 storage bucket in the eu-west-1 region).

Attaching these logs considerably increases the ability of YData's support team to offer timely and effective support. After receiving the ticket (and any attached logs), YData's support team will diagnose the issue and follow up via e-mail as soon as possible. E-mail is used as the default communication channel from that moment onwards.

    "},{"location":"deployment_and_security/security/","title":"Security","text":"

    This section describes YData\u2019s security measures to provide a best-in-class experience for its customers, ensuring not only a good product and service but also risk management and compliance.

    Visit YData's Trust page to check all the Policies, Controls and Monitoring in place.

    "},{"location":"deployment_and_security/security/#hosting-security","title":"Hosting security","text":"

    YData is not a cloud service provider; however, when the setup is not made on the customer premises, we use providers such as Google, Microsoft and Amazon Web Services, hosted in their data centers. They are leading cloud infrastructure providers with top-class safety standards, able to respond quickly to both operational and security incidents, with well-defined change management policies and procedures to determine when and how change occurs.

    "},{"location":"deployment_and_security/security/#clouds-compliance-standards","title":"Clouds compliance standards","text":"GoogleAWSMicrosoft Azure
    • CSA
    • ISO 27018
    • SOC 3
    • ISO 27001
    • SOC 1
    • ISO 27017
    • SOC 2
    • CSA
    • ISO 27017
    • SOC 2
    • ISO 9001
    • ISO 27018
    • SOC 3
    • ISO 27001
    • SOC 1
    • CSA
    • ISO 27017
    • ISO 22301
    • SOC
    • ISO 9001
    • ISO 27018
    • ISO 20000-1
    • ISO 27001
    • ISO 27701
    • WCAG

    Both physical access perimeters and entry points are strictly controlled by professional security personnel. Authorized personnel must pass at least a two-step verification to gain access to the data center floors.

    "},{"location":"deployment_and_security/security/#corporate-security","title":"Corporate security","text":"

    YData has applied internal security policies that are in line with the industry's ISO 27001 and SOC 2 standards. We regularly train our employees in security and privacy awareness, covering both technical and non-technical roles. Training materials are developed for individual roles so that employees can fulfill their responsibilities appropriately.

    • Two-step verification for all services is enforced
    • Encryption of our devices' hard drives is enforced
    • Strong password requirements and rotation are enforced
    "},{"location":"deployment_and_security/security/#verification-and-access-management","title":"Verification and Access Management","text":"

    Users can log in via a secured authentication provider, such as Security Assertion Markup Language (SAML), Microsoft Active Directory, Google Sign-In or OpenID services. All requests to any of YData\u2019s APIs must be authorized. Data writing requests require at least reporting access as well as an API key. Data reading requests require full user access as well as application keys. These keys act as bearer tokens to allow access to the YData service functionality. We also use Auth0 for user identification. Auth0 never stores a plaintext password: the password is encrypted when the user logs in and compared with Auth0's stored encrypted password to verify that the correct password is being used.

    Users can change and save their password as they wish, and can use all types of characters to strengthen it.

    "},{"location":"deployment_and_security/security/#certificate-management-communications","title":"Certificate Management & Communications","text":"

    All certificates are generated and used inside the Kubernetes cluster, using cert-manager. Exceptions for cloud-provider-specific certificates are described below. Every component inside the cluster uses its own certificate, sharing the same issuer, so all the components exchange encrypted communication between them.

    AWS

    \"The public certificate is generated using the AWS Certificate Manager service.\"

    Microsoft Azure

    \"During the deployment, a certificate is requested and provisioned by Let\u2019s Encrypt to the specified domain.\"

    "},{"location":"deployment_and_security/security/#protection-of-customer-data","title":"Protection of Customer Data","text":"

    User-uploaded information and data are considered confidential: they are stored in encrypted form, separated from other networks, including the public network where applicable, and are never exposed without a user request. All data in transit is protected with Transport Layer Security (TLS), and HTTP traffic sent by users is protected using HTTP Strict Transport Security (HSTS). The application cannot be used if encrypted communication is compromised. User-uploaded data is not transferred from one data center to another. Encryption is used in many places to protect customer information, such as AES-256 encryption at rest, PGP encryption for system backups, KMS-based key protection, and GPG encryption. Accessing stored data for business or administrative purposes requires passing through multiple security levels, including multi-factor authentication (MFA).

    "},{"location":"deployment_and_security/security/#secure-build-materials-sbom","title":"Secure Build Materials (SBOM)","text":"

    To enhance transparency and facilitate security assessments, we provide access to Secure Build Materials (SBOM) for our products and services. SBOM files offer detailed insights into the components, dependencies, and associated vulnerabilities within our software stack. These files enable stakeholders, including customers, auditors, and security researchers, to evaluate the security posture of our offerings comprehensively. For access to SBOM files and additional security-related information, please visit our Security Resources page.

    "},{"location":"deployment_and_security/security/#certification-attestation-and-framework","title":"Certification, Attestation and Framework","text":"

    YData uses the React frontend framework (originally maintained by Facebook), which combines the use of unique user tokens to protect your users against common threats such as cross-site scripting (XSS) and cross-site request forgery (CSRF / XSRF). This makes it impossible for a user to access data from another user's account.

    "},{"location":"deployment_and_security/security/#laws-and-regulations","title":"Laws and Regulations","text":"

    The cloud service providers used by YData are compliant with the General Data Protection Regulation (GDPR). YData is working to expand its products, methods and processes to fulfill its responsibilities as a data processor. YData's security and privacy teams have established a vendor management program that determines the approvals needed when YData involves third parties or external vendors. Our security team recognizes that the company\u2019s information resources and vendor reliance are critical to our continued activities and service delivery. These assessments are designed to evaluate technical, physical and administrative controls and ensure that they meet the expectations of YData and its customers. Infrastructure and applications are covered by a monitoring service. Our CCPA compliance process may provide additions so that our customers can fulfill their obligations under the CCPA if there is access to personal data, while we make no plans to transfer, process, use or store personal information.

    "},{"location":"deployment_and_security/security/#data-security","title":"Data Security","text":"
    • No data ever leaves the customer's cloud.
    • All the data is stored using cloud-specific services to ensure security, privacy and compliance with YData\u2019s customers' requirements.
    "},{"location":"deployment_and_security/security/#data-encryption","title":"Data Encryption","text":"

    YData\u2019s customers communicate with the servers through SSL/TLS connections, which are encrypted. YData protects the servers where YData Fabric is deployed from DDoS, SQL injection and other fraudulent activities. Anyone who intercepts the data transfer only sees a scrambled mixture of characters, which is not possible to decrypt. All data in databases is encrypted with the industry standard AES-256.
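
    For illustration only, the snippet below sketches the kind of AES-256 (GCM) symmetric encryption described above, using the Python cryptography package; it is a conceptual example, not Fabric's actual encryption implementation.

    Conceptual AES-256 encryption sketch
    # Conceptual illustration only - not Fabric's implementation\n    import os\n    from cryptography.hazmat.primitives.ciphers.aead import AESGCM\n\n    key = AESGCM.generate_key(bit_length=256)  # 256-bit key, i.e. AES-256\n    aesgcm = AESGCM(key)\n    nonce = os.urandom(12)  # a unique nonce per encryption operation\n\n    ciphertext = aesgcm.encrypt(nonce, b'customer record', None)\n    # Without the key, an interceptor only sees unintelligible bytes\n    assert aesgcm.decrypt(nonce, ciphertext, None) == b'customer record'\n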

    "},{"location":"deployment_and_security/security/#api-security","title":"API Security","text":"

    To use the API, the user needs a JWT token that is automatically generated by Fabric for a specific user. The token is signed and encrypted using a random key created during deployment and known only by the service responsible for its provisioning.
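
    As a purely illustrative sketch (the host and endpoint path below are hypothetical placeholders, not Fabric's documented REST API), a call authenticated with such a JWT follows the standard bearer-token pattern:

    Hypothetical bearer-token request
    import requests\n\n    FABRIC_HOST = 'https://fabric.example.com'  # hypothetical platform endpoint\n    JWT_TOKEN = '<JWT-ISSUED-BY-FABRIC>'  # token generated by Fabric for a specific user\n\n    # '/api/datasources' is a made-up path, used only to illustrate the header layout\n    response = requests.get(f'{FABRIC_HOST}/api/datasources',\n                            headers={'Authorization': f'Bearer {JWT_TOKEN}'},\n                            timeout=30)\n    response.raise_for_status()\n    print(response.json())\n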

    "},{"location":"deployment_and_security/security/#availability-and-disaster-recovery","title":"Availability and disaster recovery","text":"

    When using one of the cloud providers, the data stored in the bucket and database is distributed and copied to different servers. If a bucket or database fails, it is usually recovered from a different server without affecting other users. Databases are backed up on a daily basis and can be restored if the software or server fails significantly. Backups are stored in various European and North American data centers (depending on the customer location) for extra protection. It is not possible for YData to recover individual customer information - if you delete something in your account, it will be permanently deleted, and we will not be able to recover it.

    "},{"location":"deployment_and_security/security/#monitoring","title":"Monitoring","text":"

    The functionality of our applications and databases is monitored 24/7 through the built-in monitoring tools provided by Google, Azure and Amazon Web Services. Internal errors or failures of our various integrations trigger logging and notifications. This usually helps us to identify the problem very quickly and remedy the situation.

    "},{"location":"deployment_and_security/security/#full-disclosure-policy","title":"Full disclosure policy","text":"

    If something serious happens and your data is compromised (such as a data breach), we will disclose it in full, as required by GDPR. Transparency is important to us and we will provide you with all the necessary information to properly assess the situation and potential impact. So far no customer data has been compromised and we aim to keep it that way.

    "},{"location":"deployment_and_security/security/security_building_materials/","title":"Secure Build Materials (SBOM)","text":"

    To enhance transparency and facilitate security assessments, we provide access to Secure Build Materials (SBOM) for our products and services.

    SBOM files offer detailed insights into the components, dependencies, and associated vulnerabilities within our software stack. These files enable stakeholders, including customers, auditors, and security researchers, to evaluate the security posture of our offerings comprehensively.

    "},{"location":"deployment_and_security/security/security_building_materials/#all-files","title":"All files","text":"

    https://s3.console.aws.amazon.com/s3/buckets/repos-sboms?region=eu-west-1&bucketType=general&tab=objects

    "},{"location":"deployment_and_security/security/security_building_materials/#individual-raw-files","title":"Individual raw files","text":"
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/api-gateway/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/api-gateway/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/authentication-service/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/authentication-service/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-adapter/metering-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-adapter/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-adapter/quota-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-asg-tags-lambda/command-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-asg-tags-lambda/lambda-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/aws-asg-tags-lambda/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/azure-adapter/metering-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/azure-adapter/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/azure-adapter/quota-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/backoffice-console/command-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/backoffice-console/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/backoffice/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/backoffice/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dashboard-app/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dashboard-app/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/datasource-controller/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/datasource-controller/manager-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/datasource-controller/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dex-theme/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dex-theme/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/dask-gateway-scheduler/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/dask-gateway-worker/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/h2oflow/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/h2oflow/gpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_python/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_python_community/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_python_tensorflow/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_python_torch/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_r/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/jupyterlab_r/gpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/pipelines_python_tensorflow/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/pipelines_python_torch/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/pipelines_python_ydata/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/pipelines_ydata/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode/gpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode_tensorflow/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode_torch/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/visualcode_ydata/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/dockerfiles/ydata/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/gcp-adapter/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/gcp-adapter/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/laboratory-controller/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/laboratory-controller/manager-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/laboratory-controller/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/metering-service/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/metering-service/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/profile-controller/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/profile-controller/manager-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/profile-controller/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/quota-manager/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/quota-manager/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/static-content-server/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/static-content-server/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/synthesizer-controller/api-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/synthesizer-controller/manager-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/synthesizer-controller/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/uploader-service/docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/uploader-service/package-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/ydata-lib-platform-integration-tool/cpu-docker-sbom.cyclonedx.json
    • https://repos-sboms.s3.eu-west-1.amazonaws.com/ydata-lib-platform-integration-tool/package-sbom.cyclonedx.json
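
    As a minimal sketch (assuming the raw objects above are reachable over plain HTTPS from your environment), a CycloneDX SBOM file can also be inspected programmatically, for example to list component names and versions:

    Inspecting a CycloneDX SBOM
    import json\n    import urllib.request\n\n    # Any of the raw SBOM URLs listed above\n    SBOM_URL = 'https://repos-sboms.s3.eu-west-1.amazonaws.com/api-gateway/package-sbom.cyclonedx.json'\n\n    with urllib.request.urlopen(SBOM_URL) as response:\n        sbom = json.load(response)\n\n    # CycloneDX documents list their dependency inventory under 'components'\n    for component in sbom.get('components', []):\n        print(component.get('name'), component.get('version'))\n
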
    "},{"location":"get-started/","title":"Get started with Fabric","text":"

    This get started guide is here to help you if you are not yet familiar with YData Fabric or if you just want to learn more about data quality, data preparation workflows and how you can start leveraging synthetic data. Make sure to also check YData Fabric Community.

    "},{"location":"get-started/#create-your-first-dataset-with-the-data-catalog","title":"\ud83d\udcda Create your first Dataset with the Data Catalog","text":""},{"location":"get-started/#create-your-multi-table-dataset-with-the-data-catalog","title":"\ud83d\udcbe Create your Multi-Table Dataset with the Data Catalog","text":""},{"location":"get-started/#create-your-first-synthetic-data-generator","title":"\u2699\ufe0f Create your first Synthetic Data generator","text":""},{"location":"get-started/#create-a-relational-database-synthetic-data-generator","title":"\ud83d\uddc4\ufe0f Create a Relational Database Synthetic Data generator","text":""},{"location":"get-started/#create-your-first-lab","title":"\ud83e\uddea Create your first Lab","text":""},{"location":"get-started/#create-your-first-data-pipeline","title":"\ud83c\udf00 Create your first data Pipeline","text":""},{"location":"get-started/create_database_sd_generator/","title":"How to create your first Relational Database Synthetic Data generator","text":"

    Check this quickstart video on how to create your first Relational Database Synthetic Data generator.

    To generate your first synthetic relational database, you need to have a Multi-Dataset already available in your Data Catalog. Check this tutorial to see how you can add your first dataset to Fabric\u2019s Data Catalog.

    With your database created as a Datasource, you are now able to start configuring your Synthetic Data (SD) generator to create a replica of your database. You can either select \"Synthetic Data\" from your left side menu, or you can select \"Create Synthetic Data\" in your project Home as shown in the image below.

    You'll be asked to select the dataset you wish to generate synthetic data from and verify the tables you'd like to include in the synthesis process, validating their data types - Time-series or Tabular.

    Table data types are relevant for synthetic data quality

    In case some of your tables hold time-series information (meaning there is a time relation between records), it is very important that during the process of configuring your synthetic data generator you update your tables' data types accordingly. This will not only ensure the quality of that particular table, but also the overall database quality and relations.

    All the PKs and FKs identified based on the database schema definition have an automatically created anonymization setting defined. As a standard, an incremental integer will be used as the anonymization configuration, but users can change it to other pre-defined generation options or a regex-based option (users can provide the expected generation pattern).

    Finally, the last step of our process is the Synthetic Data generator's specific configuration; for this particular case we need to define both the Display Name and the Destination connector. The Destination connector is mandatory and allows you to select the database where the generated synthetic database will be written. After providing both inputs we can finish the process by clicking the \"Save\" button as per the image below.

    Your Synthetic Data generator is now training and listed under \"Synthetic Data\". While the model is being trained, the Status will be \ud83d\udfe1; as soon as the training is completed successfully it will transition to \ud83d\udfe2. Once the Synthetic Data generator has finished training, you're ready to start generating your first synthetic dataset. You can start by exploring an overview of the model configurations and even validate the quality of the synthetic data generator from a referential integrity point of view.

    Next, you can generate synthetic data samples by accessing the Generation tab or clicking \"Go to Generation\". In this section, you are able to generate as many synthetic samples as you want. For that you need to define the size of your database in comparison to the real one. This ratio is provided as a percentage. In the example below, we have requested a sample with 100% size, meaning a synthetic database with the same size as the original.

    A new line in your \"Sample History\" will be shown and as soon as the sample generation is completed you will be able to check the quality the synthetic data already available in your destination database.

    Congrats! \ud83d\ude80 You have now successfully created your first Relational Synthetic Database with Fabric. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_lab/","title":"How to create your first Lab environment","text":"

    Labs are code environments for a more flexible development of data-driven solutions while leveraging Fabric capabilities combined with already loved tools such as scikit-learn, numpy and pandas. To create your first Lab, you can use the \u201cCreate Lab\u201d from Fabric\u2019s home, or you can access it from the Labs module by selecting it on the left side menu, and clicking the \u201cCreate Lab\u201d button.

    Next, a menu with different IDEs will be shown. As a quickstart select Jupyter Lab. As labs are development environments you will be also asked what language you would prefer your environment to support: R or Python. Select Python.

    Select IDE Select language

    Bundles are environments with pre-installed packages. Select YData bundle, so we can leverage some other Fabric features such as Data Profiling, Synthetic Data and Pipelines.

    As a last step, you will be asked to configure the infrastructure resources for this new environment as well as giving it a Display Name. We will keep the defaults, but you have flexibility to select GPU acceleration or whether you need more computational resources for your developments.

    Finally, your Lab will be created and added to the \"Labs\" list, as per the image below. The status of the lab will be \ud83d\udfe1 while preparing, and this process takes a few minutes, as the infrastructure is being allocated to your development environment. As soon as the status changes to \ud83d\udfe2, you can open your lab by clicking the button as shown below:

    Create a new notebook in the JupyterLab and give it a name. You are now ready to start your developments!

    Create a new notebook Notebook created
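
    As a first experiment in your new notebook, you can run a quick automated profile of any tabular data. The sketch below assumes the YData bundle ships the ydata-profiling package and that a hypothetical data.csv file is available in your Lab:

    Profiling a dataset inside the Lab
    import pandas as pd\n    from ydata_profiling import ProfileReport\n\n    # Load any tabular data available in the Lab (the file name is hypothetical)\n    df = pd.read_csv('data.csv')\n\n    # Generate an automated data profiling report and save it as HTML\n    profile = ProfileReport(df, title='Quickstart profiling report')\n    profile.to_file('profiling_report.html')\n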

    Congrats! \ud83d\ude80 You have now successfully created your first Lab, a code environment where you can benefit from the most advanced Fabric features as well as compose complex data workflows. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_multitable_dataset/","title":"How to create your first Relational database in Fabric's Catalog","text":"

    To create your first multi-table dataset in the Data Catalog, you can start by clicking \"Add Dataset\" from the Home section, or go to the Data Catalog (on the left side menu) and click \u201cAdd Dataset\u201d.

    After that the below modal will be shown. You will need to select a connector. To create a multi-table dataset, we need to choose an RDBMS connector like Azure SQL, Snowflake or MySQL. In this case let's select MySQL.

    Once you've selected the \u201cMySQL\u201d connector, a new screen will appear, enabling you to introduce the connection details such as database username, host, password as well as the database name.

    With the Connector created, you'll be able to add a dataset and specify its properties:

    • Name: The name of your dataset;
    • Table: You can create a dataset with all the tables from the schema or select the tables that you need in your project.
    • Query: Create a single table dataset by providing a query

    Now both the Connector to the MySQL Berka database and the Berka dataset will be added to our Catalog. As soon as the status is green, you can navigate your Dataset. Click Open dataset as per the image below.

    Within the Dataset details, you can gain valuable insights like your database schema.

    For each and every table you can explore both an overview of the structure (number of columns, number of rows, etc.) and a useful summary of the quality and warnings regarding your dataset's behaviour.

    Congrats! \ud83d\ude80 You have now successfully created your first Connector and Multi-table Dataset in Fabric\u2019s Data Catalog. To get both the ID of your database and the ID of your project, you can decompose the URL from the Database schema overview page. The structure is as follows:

        https://fabric.ydata.ai/rdbms/{your-dataset-id}?ns={your-project-id}\n
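
    If you prefer to extract these IDs programmatically, a minimal sketch (the URL below is a hypothetical example) could be:

    Decomposing the URL
    from urllib.parse import urlparse, parse_qs\n\n    # Hypothetical URL copied from the Database schema overview page\n    url = 'https://fabric.ydata.ai/rdbms/your-dataset-id?ns=your-project-id'\n\n    parsed = urlparse(url)\n    dataset_id = parsed.path.rsplit('/', 1)[-1]  # {your-dataset-id}\n    project_id = parse_qs(parsed.query)['ns'][0]  # {your-project-id}\n    print(dataset_id, project_id)\n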

    Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_pipeline/","title":"How to create your first Pipeline","text":"

    Check this quickstart video on how to create your first Pipeline.

    The best way to get started with Pipelines is to use the interactive Pipeline editor available in the Labs with Jupyter Lab set as IDE. If you don't have a Lab yet, or you don't know how to create one, check our quickstart guide on how to create your first lab.

    Open an already existing lab.

    A Pipeline comprises one or more nodes that are connected (or not!) with each other to define execution dependencies. Each pipeline node is, and should be, implemented as a component that manages a single task, such as reading the data, profiling the data, training a model, or even publishing a model to production environments.

    In this tutorial we will build a simple and generic pipeline that uses a Dataset from Fabric's Data Catalog and profiles it to check its quality. The notebook templates are already available. To access them, open the \"Academy\" folder as per the image below.

    Make sure to copy all the files in the folder \"3 - Pipelines/quickstart\" to the root folder of your lab, as per the image below.

    Now that we have our notebooks, we need to make a small change in the notebook \"1. Read dataset\". Go back to your Data Catalog and, from one of the datasets in your Catalog list, select the three vertical dots and click \"Explore in Labs\" as shown in the image below.

    The following screen will be shown. Click Copy.

    Now that we have copied the code, let's get back to our \"1. Read data.ipynb\" notebook and replace the first code cell with the new code. This will allow us to use a dataset from the Data Catalog in our pipeline.

    Placeholder code Replaced with code snippet

    With our notebooks ready, we can now configure our Pipeline. For this quickstart we will be leveraging an already existing pipeline - double-click the file my_first_pipeline.pipeline. You should see a pipeline as depicted in the images below. To create a new Pipeline, you can open the lab launcher tab and select \"Pipeline Editor\".

    Open Pipeline My first pipeline

    Before running the pipeline, we need to check each component/step's properties and configurations. Right-click each one of the steps, select \"Open Properties\", and a menu will appear on your right side. Make sure that you have \"YData - CPU\" selected as the Runtime Image, as shown below.

    Open properties Runtime image

    We are now ready to create and run our first pipeline. In the top left corner of the pipeline editor, the run button will be available for you to click.

    Accept the default values shown in the run dialog and start the run.

    If the following message is shown, it means that you have created a run of your first pipeline.

    Now that you have created your first pipeline, you can select the Pipeline from Fabric's left side menu.

    Your most recent pipeline will be listed, as shown in below image.

    To check the run of your pipeline, jump into the \"Run\" tab. You will be able to see your first pipeline running!

    By clicking on top of the record you will be able to see the progress of the run step-by-step, and visualize the outputs of each and every step by clicking on each step and selecting the Visualizations tab.

    Congrats! \ud83d\ude80 You have now successfully created your first Pipeline in a code environment, so you can benefit from Fabric's orchestration engine to create scalable, versionable and comparable data workflows. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/create_syntheticdata_generator/","title":"How to create your first Synthetic Data generator","text":"

    Check this quickstart video on how to create your first Synthetic Data generator.

    To generate your first synthetic data, you need to have a Dataset already available in your Data Catalog. Check this tutorial to see how you can add your first dataset to Fabric\u2019s Data Catalog.

    With your first dataset created, you are now able to start the creation of your Synthetic Data generator. You can either select \"Synthetic Data\" from your left side menu, or you can select \"Create Synthetic Data\" in your project Home as shown in the image below.

    You'll be asked to select the dataset you wish to generate synthetic data from and verify the columns you'd like to include in the synthesis process, validating their Variable and Data Types.

    Data types are relevant for synthetic data quality

    Data Types are important to revisit and align with the objectives for the synthetic data, as they can highly impact the quality of the generated data. For example, let's say we have a column that is a \"Name\": while in some situations it would make sense to consider it a String, in the light of a dataset where \"Name\" refers to the name of the product purchased, it might be more beneficial to set it as a Category.

    Finally, the last step of our process is the Synthetic Data specific configuration; for this particular case we only need to define a Display Name, and we can finish the process by clicking the \"Save\" button as per the image below.

    Your Synthetic Data generator is now training and listed under \"Synthetic Data\". While the model is being trained, the Status will be \ud83d\udfe1; as soon as the training is completed successfully it will transition to \ud83d\udfe2 as per the image below.

    Once the Synthetic Data generator has finished training, you're ready to start generating your first synthetic dataset. You can start by exploring an overview of the model configurations and even download a PDF report with a comprehensive overview of your Synthetic Data Quality Metrics. Next, you can generate synthetic data samples by accessing the Generation tab or click on \"Go to Generation\".

    In this section, you are able to generate as many synthetic samples as you want. For that you need to define the number of rows to generate and click \"Generate\", as depicted in the image below.

    A new line in your \"Sample History\" will be shown and as soon as the sample generation is completed you will be able to \"Compare\" your synthetic data with the original data, add as a Dataset with \"Add to Data Catalog\" and last but not the least download it as a file with \"Download csv\".

    Congrats! \ud83d\ude80 You have now successfully created your first Synthetic Data generator with Fabric. Get ready for your journey of improved quality data for AI.

    "},{"location":"get-started/fabric_community/","title":"Get started with Fabric Community","text":"

    Fabric Community is a SaaS version that allows you to explore all the functionalities of Fabric first-hand: free, forever, for everyone. You\u2019ll be able to validate your data quality with automated profiling, unlock data sharing and improve your ML models with synthetic data, and increase your productivity with seamless integration:

    • Build 1 personal project;
    • Create your first Data Catalog and benefit from automated data profiling;
    • Train and generate synthetic data up to 2 models and datasets with 50 columns and 100K rows;
    • Optimize synthetic data quality for your use cases with an evaluation PDF report;
    • Create 1 development environment (Labs) and integrate it with your familiar ML packages and workflows.
    "},{"location":"get-started/fabric_community/#register","title":"Register","text":"

    To register for Fabric Community:

    • Access the Fabric Community Try Now and create your YData account by submitting the form
    • Check your email for your login credentials
    • Log in to fabric.ydata.ai and enjoy!

    Once you login, you'll access the Home page and get started with your data preparation!

    "},{"location":"get-started/upload_csv/","title":"How to create your first Dataset from a CSV file","text":"

    Check this quickstart video on how to create your first Dataset from a CSV file.

    To create your first dataset in the Data Catalog, you can start by clicking \"Add Dataset\" from the Home section, or go to the Data Catalog (on the left side menu) and click \u201cAdd Dataset\u201d.

    After that the below modal will be shown. You will need to select a connector. To upload a CSV file, we need to select \u201cUpload CSV\u201d.

    Once you've selected the \u201cUpload CSV\u201d connector, a new screen will appear, enabling you to upload your file and designate a name for your connector. This file upload connector will subsequently empower you to create one or more datasets from the same file at a later stage.

    Loading area Upload csv file

    With the Connector created, you'll be able to add a dataset and specify its properties:

    • Name: The name of your dataset;
    • Separator: This is an important parameter to make sure that we can parse your CSV correctly. The default value is \u201c,\u201d.
    • Data Type: Whether your dataset contains tabular or time-series (i.e., containing temporal dependency) data.

    Your created Connector (\u201cCensus File\u201d) and Dataset (\u201cCensus\u201d) will be added to the Data Catalog. As soon as the status is green, you can navigate your Dataset. Click in Open Dataset as per the image below.

    Within the Dataset details, you can gain valuable insights through our automated data quality profiling. This includes comprehensive metadata and an overview of your data, encompassing details like row count, identification of duplicates, and insights into the overall quality of your dataset.

    Or perhaps you want to further explore, through visualization, the profile of your data with both univariate and multivariate views.

    Congrats! \ud83d\ude80 You have now successfully created your first Connector and Dataset in Fabric\u2019s Data Catalog. Get ready for your journey of improved quality data for AI.

    "},{"location":"integrations/","title":"Integrations","text":"

    Recognizing the modern enterprise data stack comprises a vast array of services and tools, YData Fabric is augmented by a growing ecosystem of partners and integrations, acting both upstream and downstream in the lifecycle of an AI project.

    The list below is a non-exhaustive compilation of MLOps, Data and Cloud Providers which smoothly integrate with Fabric:

    • DVC: Enhancing data versioning
    • Databricks: Enhancing feature/data engineering before improving with YData

      • \ud83d\udcda Follow Databricks step-by-step tutorials
      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • Snowflake: Enhancing feature/data engineering before improving with YData

      • \ud83d\udcda Follow Snowflake step-by-step tutorials
      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • H2O: Framework available through code and Fabric Labs (H2O Flow)

    • Algorithmia: Integration for easy model deployment

      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • UbiOps: Integration for easy model deployment

      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • Great Expectations: Data profiling is integrated with Great Expectations

    • Azure ML: Integration for easy model deployment

      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • AWS SageMaker: Integration for easy model deployment

      • \ud83d\udc68\u200d\ud83d\udcbb Check code example in YData Academy
    • Google Vertex AI: Integration for easy model deployment

    Up-to-date examples

    \ud83d\udc49 For the most up-to-date examples and ready-to-use recipes of how to integrate with YData Fabric with some services above, check out the Integrations section of YData\u2019s Academy.

    "},{"location":"integrations/databricks/integration_connectors_catalog/","title":"Connectors & Catalog","text":"

    YData Fabric provides a seamless integration with Databricks, allowing you to connect, query, and manage your data in Databricks Unity Catalog and Delta Lake with ease. This section will guide you through the benefits, setup, and usage of the Databricks connectors available in Fabric.

    Prerequisites

    Before using the YData SDK in Databricks notebooks, ensure the following prerequisites are met:

    • Access to a Databricks workspace
    • A valid YData Fabric account and API key
    • Credentials for Databricks (tokens, Databricks host, warehouse, database, schema, etc.).
    "},{"location":"integrations/databricks/integration_connectors_catalog/#delta-lake","title":"Delta Lake","text":"

    Databricks Delta Lake is an open-source storage layer that brings reliability to data lakes. Built on top of Apache Spark, Delta Lake provides ACID (Atomicity, Consistency, Isolation, Durability) transaction guarantees, scalable metadata handling, and unifies streaming and batch data processing.

    In this tutorial it will be covered how you can leverage YData Fabric connectors to integrate with Databricks Delta Lake.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#setting-up-the-delta-lake-connector","title":"Setting Up the Delta Lake Connector","text":"

    To create a Delta Lake connector in the YData Fabric UI you need to meet the pre-requisites described above.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#step-by-step-creation-through-the-ui","title":"Step-by-step creation through the UI","text":"

    To create a connector in YData Fabric, select the \"Connectors\" page from the left side menu, as illustrated in the image below.

    Now, click in the \"Create Connector\" button and the following menu with the available connectors will be shown.

    Depending on the cloud vendor where your Databricks instance is deployed, select the Delta Lake connector for AWS or Azure. After selecting the connector type \"Databricks Delta Lake\", the menu below will be shown. This is where you can configure the connection to your Delta Lake. For that you will need the following information:

    • Databricks Host: The URL of your Databricks cluster
    • Access token: Your Databricks' user token
    • Catalog: The name of a Catalog that you want to connect to
    • Schema: The name of the schema that you want to connect to

    Depending on the cloud selected, you will be asked for the credentials to your staging storage (AWS S3 or Azure Blob Storage). In this example we are using AWS and for that reason the below inputs refer to AWS S3.

    • Key ID: The AWS access key ID used to access your S3 staging storage.
    • Key Secret: The corresponding AWS secret access key.

    And finally, the name for your connector:
    • Display name: A unique name for your connector.
    Test your connection and that's it! \ud83d\ude80

    You are now ready to create different Datasources using this connector - read the data from a table, evaluate the quality of the data or even read a full database and generate a synthetic replica of your data! Read more about Fabric Datasources in here.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#use-it-inside-the-labs","title":"Use it inside the Labs","text":"

    \ud83d\udc68\u200d\ud83d\udcbb Full code example and recipe can be found here.

    In case you prefer a Python interface, we also have connectors available through Fabric SDK inside the labs. For a seamless integration between the UI and the Labs environment, Fabric offers an SDK that allows you to re-use connectors, datasources and even synthesizers.

    Start by creating your code environment through the Labs. In case you need to get started with the Labs, check this step-by-step guide.

    # Importing YData's packages\n    from ydata.labs import Connectors\n    # Getting a previously created Connector\n    connector = Connectors.get(uid='insert-connector-id',\n                               namespace='insert-namespace-id')\n    print(connector)\n
    "},{"location":"integrations/databricks/integration_connectors_catalog/#read-from-your-delta-lake","title":"Read from your Delta Lake","text":"

    Using the Delta Lake connector it is possible to:

    • Get the data from a Delta Lake table
    • Get a sample from a Delta Lake table
    • Get the data from a query to a Delta Lake instance
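
    As a rough sketch of what this could look like from a Lab, reusing the connector object retrieved earlier (the method names and parameters below are assumptions for illustration; check the full code example and recipe linked above for the exact connector API):

    Reading from Delta Lake (illustrative only)
    # Illustrative sketch only: method names/parameters are assumptions,\n    # not the documented connector API - see the linked recipe for the exact calls\n    table = connector.read_table(table_name='insert-table-name')  # full table\n    sample = connector.read_table(table_name='insert-table-name', sample_size=100)  # sample\n    result = connector.query('SELECT * FROM my_catalog.my_schema.my_table LIMIT 10')  # query\n    print(table)\n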
    "},{"location":"integrations/databricks/integration_connectors_catalog/#unity-catalog","title":"Unity Catalog","text":"

    Databricks Unity Catalog is a unified governance solution for all data and AI assets within the Databricks Lakehouse Platform.

    Databricks Unity Catalog leverages the concept of Delta Sharing, meaning this is a great way not only to ensure alignment between Catalogs but also to limit access to data. This means that by leveraging the Unity Catalog connector, users can only access the set of data assets that were authorized for a given Share.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#step-by-step-creation-through-the-ui_1","title":"Step-by-step creation through the UI","text":"

    How to create a connector to Databricks Unity Catalog in Fabric?

    The process to create a new Databricks Unity Catalog connector in YData Fabric is similar to the one we have covered before.

    After selecting the connector \"Databricks Unity Catalog\", you will be requested to upload your Delta Sharing token as depicted in the image below.

    Test your connection and that's it! \ud83d\ude80

    "},{"location":"integrations/databricks/integration_connectors_catalog/#use-it-inside-the-labs_1","title":"Use it inside the Labs","text":"

    \ud83d\udc68\u200d\ud83d\udcbb Full code example and recipe can be found here.

    In case you prefer a Python interface, we also have connectors available through Fabric inside the labs. Start by creating your code environment through the Labs. In case you need to get started with the Labs, check this step-by-step guide.

    "},{"location":"integrations/databricks/integration_connectors_catalog/#navigate-your-delta-share","title":"Navigate your Delta Share","text":"

    With your connector created you are now able to explore the schemas and tables available in a Delta share.

    List available shares
        #List the available shares for the provided authentication\n    connector.list_shares()\n
    List available schemas
        #List the available schemas for a given share\n    connector.list_schemas(share_name='teste')\n
    List available tables
        #List the available tables for a given schema in a share\n    connector.list_tables(schema_name='berka',\n                           share_name='teste')\n\n    #List all the tables regardless of share and schema\n    connector.list_all_tables()\n
    "},{"location":"integrations/databricks/integration_connectors_catalog/#read-from-your-delta-share","title":"Read from your Delta Share","text":"

    Using the Delta Lake connector it is possible to:

    • Get the data from a Delta Lake table
    • Get a sample from a Delta Lake table
    Read the data from a table
        #This method reads all the data records in the table\n    table = connector.read_table(table_name='insert-table-name',\n                                 schema_name='insert-schema-name',\n                                 share_name='insert-share-name')\n    print(table)\n
    Read a data sample from a table
    #This method reads a sample of 100 records from the table\n    table = connector.read_table(table_name='insert-table-name',\n                                 schema_name='insert-schema-name',\n                                 share_name='insert-share-name',\n                                 sample_size=100)\n    print(table)\n

    I hope you enjoyed this quick tutorial on seamlessly integrating Databricks with your data preparation workflows. \ud83d\ude80

    "},{"location":"integrations/databricks/integration_with_sdk/","title":"YData SDK in Databricks Notebooks","text":"

    The YData Fabric SDK provides a powerful set of tools for integrating and enhancing data within Databricks notebooks. This guide covers the installation, basic usage, and advanced features of the Fabric SDK, helping users maximize the potential of their data for AI and machine learning applications.

    \ud83d\udc68\u200d\ud83d\udcbb Full code example and recipe can be found here.

    Prerequisites

    Before using the YData Fabric SDK in Databricks notebooks, ensure the following prerequisites are met:

    • Access to a Databricks workspace
    • A valid YData Fabric account and API key
    • Basic knowledge of Python and Databricks notebooks
    • A safe connection between your Databricks cluster and Fabric

    Best Practices

    • Data Security: Ensure API keys and sensitive data are securely managed.
    • Efficient Coding: Use vectorized operations for data manipulation where possible.
    • Resource Management: Monitor and manage the resources used by your Databricks and Fabric clusters to optimize performance.
    "},{"location":"integrations/databricks/integration_with_sdk/#installation","title":"Installation","text":"

    To install the YData SDK in a Databricks notebook, use the following command:

    %pip install ydata-sdk\ndbutils.library.restartPython()\n
    Ensure the installation is successful before proceeding to the next steps.

    "},{"location":"integrations/databricks/integration_with_sdk/#basic-usage-data-integration","title":"Basic Usage - data integration","text":"

    This section provides step-by-step instructions on connecting to YData Fabric and performing essential data operations using the YData SDK within Databricks notebooks. This includes establishing a secure connection to YData Fabric and accessing datasets.

    "},{"location":"integrations/databricks/integration_with_sdk/#connecting-to-ydata-fabric","title":"Connecting to YData Fabric","text":"

    First, establish a connection to YData Fabric using your API key:

    import os\n\n# Add your Fabric token as part of your environment variables for authentication\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'\n
    "},{"location":"integrations/databricks/integration_with_sdk/#data-access-manipulation","title":"Data access & manipulation","text":"

    Once connected, you can access and manipulate data within YData Fabric. For example, to list available datasets:

    from ydata.sdk.datasources import DataSource\n\n#return the list of available DataSources\nDataSource.list()\n

    To load a specific dataset into a Pandas DataFrame:

    #get the data from an existing datasource\ndataset = DataSource.get('<DATASOURCE-ID>')\n
    "},{"location":"integrations/databricks/integration_with_sdk/#advanced-usage-synthetic-data-generation","title":"Advanced Usage - Synthetic data generation","text":"

    This section explores one of the most powerful features of the Fabric SDK for enhancing and refining data within Databricks notebooks: generating synthetic data to augment datasets or to create privacy-preserving data. By leveraging these advanced capabilities, users can significantly enhance the robustness and performance of their AI and machine learning models, unlocking the full potential of their data.

    "},{"location":"integrations/databricks/integration_with_sdk/#privacy-preserving","title":"Privacy-preserving","text":"

    Leveraging synthetic data allows you to create privacy-preserving datasets that maintain real-world value, enabling users to work with sensitive information securely while retaining the utility of the real data.

    Check the SDK documentation for more information regarding privacy-controls and anonymization.

    "},{"location":"integrations/databricks/integration_with_sdk/#from-a-datasource-in-ydata-fabric","title":"From a datasource in YData Fabric","text":"

    Users can generate synthetic data from datasources existing in Fabric:

    Train a synthetic data generator
    # From an existing Fabric datasource\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n\nsynth = RegularSynthesizer(name='<NAME-YOUR-MODEL>')\nsynth.fit(X=dataset)\n

    After your synthetic data generator has been trained successfully, you can generate as many synthetic datasets as needed.
    Sampling from the model that we have just trained
    from ydata.sdk.synthesizers import RegularSynthesizer\nsample = synth.sample(100)\nsample.head()\n

    It is also possible to generate data from other synthetic data generation models previously trained:

    Generating synthetic data from a previously trained model
    from ydata.sdk.synthesizers import RegularSynthesizer\n\nexisting_synth = RegularSynthesizer('<INSERT-SYNTHETIC-DATA-GENERATOR-ID>').get()\nsample = existing_synth.sample(100)\n
    "},{"location":"integrations/databricks/integration_with_sdk/#from-a-datasource-in-databricks","title":"From a datasource in Databricks","text":"

    Another important integration is to train a synthetic data generator from a dataset that you are currently exploring in your notebook environment. In order to do so, we recommend that you create your dataset using the YData Fabric integration connector to your Delta Lake and follow the flow for the creation of a synthetic data generation model from Fabric's existing datasources.

    For a small dataset you can also follow this tutorial.

    "},{"location":"integrations/databricks/integration_with_sdk/#data-augmentation","title":"Data augmentation","text":"

    Another key focus is on generating synthetic data to augment existing datasets. This technique, particularly through conditional synthetic data generation, allows users to create targeted, realistic datasets. By addressing data imbalances and enriching the training data, conditional synthetic data generation significantly enhances the robustness and performance of machine learning (ML) models, leading to more accurate and reliable outcomes.

    Read data from a delta table
    # Read data from the catalog\ndf = spark.sql(\"SELECT * FROM ydata.default.credit_scoring_labeled\")\n\n# Display the dataframe\ndisplay(df)\n

    After reading the data we need to convert it to a pandas DataFrame in order to create our synthetic data generation model. For the augmentation use case we will be leveraging conditional synthetic data generation.

    Training a conditional synthetic data generator
    from ydata.sdk.synthesizers import RegularSynthesizer\n\n# Convert Spark dataframe to pandas dataframe\npandas_df = df.toPandas()\npandas_df = pandas_df.drop('ID', axis=1)\n\n# Train a synthetic data generator using ydata-sdk\nsynth = RegularSynthesizer(name='Synth credit scoring | Conditional')\nsynth.fit(pandas_df, condition_on='Label')\n\n# Display the synthesizer object\ndisplay(synth)\n

    Now that we have a trained conditional synthetic data generator, we are able to generate a few samples, controlling the population behaviour based on the columns on which we have conditioned the generation process.

    Generating a synthetic sample conditioned to column 'Label'
    #generate synthetic samples condition to Label\nsynthetic_sample = synth.sample(n_samples=len(pandas_df), condition_on={\n            \"Label\": {\n                        \"categories\": [{\n                            \"category\": 1,\n                            \"percentage\": 0.7\n                        }]\n        }\n    }\n)\n

    After generating the synthetic data we can combine it with our dataset.

    Convert the dataframe to Spark dataframe
    # Enable Arrow-based columnar data transfers\nspark.conf.set(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n\n#Create a spark dataframe from the synthetic dataframe\nsynthetic_df = spark.createDataFrame(synthetic_sample)\n\ndisplay(synthetic_df)\n
    Combining the datasets
    # Concatenate the original dataframe with the synthetic dataframe\n#removing the column ID as it is not used\ndf = df.drop('ID')\nconcatenated_df = df.union(synthetic_df)\n\n# Display the concatenated dataframe\ndisplay(concatenated_df)\n

    Afterwards you can use your augmented dataset to train a Machine Learning model using MLFlow.
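
    As a minimal sketch of that last step (the target column, feature handling and model choice below are assumptions for the credit-scoring example, not a prescribed recipe), training and tracking a model with MLflow could look like:

    Training a model with MLflow on the augmented data
    import mlflow\n    from sklearn.ensemble import RandomForestClassifier\n    from sklearn.model_selection import train_test_split\n\n    # Bring the augmented Spark dataframe back to pandas for scikit-learn\n    train_df = concatenated_df.toPandas()\n    X = train_df.drop('Label', axis=1)  # assumes 'Label' is the target column\n    y = train_df['Label']\n\n    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\n    mlflow.autolog()  # automatically logs parameters, metrics and the trained model\n    with mlflow.start_run(run_name='credit-scoring-augmented'):\n        model = RandomForestClassifier(n_estimators=100)\n        model.fit(X_train, y_train)\n        print('test accuracy:', model.score(X_test, y_test))\n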

    "},{"location":"integrations/databricks/overview/","title":"Overview","text":"

    This section provides a detailed guide on integrating YData Fabric with Databricks. By combining Databricks and YData Fabric, users gain a comprehensive AI solution. Fabric enables access to previously siloed data, enhances understanding, and improves data quality. Meanwhile, Databricks provides the scalability needed to deliver robust AI capabilities.

    "},{"location":"integrations/databricks/overview/#integration-benefits","title":"Integration benefits","text":"
    • Enhanced Data Accessibility: Seamlessly access and integrate previously siloed data.
    • Improved Data Quality: Use YData Fabric's tools to enhance the quality of your data through data preparation and augmentation.
    • Scalability: Leverage Databricks' robust infrastructure to scale data processing and AI workloads.
    • Streamlined Workflows: Simplify data workflows with connectors and SDKs, reducing manual effort and potential errors.
    • Comprehensive Support: Benefit from extensive documentation and support for both platforms, ensuring smooth integration and operation.
    "},{"location":"integrations/databricks/overview/#integration-methods","title":"Integration methods","text":""},{"location":"integrations/databricks/overview/#data-catalog-connectors","title":"Data Catalog - Connectors","text":"

    YData Fabric provides a range of connectors that enable direct integration with Databricks' Unity Catalog and Delta Lake. These connectors streamline data transfer and ensure seamless interoperability between the two platforms.

    Key Features:

    • Easy configuration
    • Secure data transfer
    • Data synchronization
    "},{"location":"integrations/databricks/overview/#sdk","title":"SDK","text":"

    The YData Fabric SDK offers a programmatic approach to integrating with Databricks. It provides developers with the tools and libraries needed to automate and customize data workflows between YData Fabric and Databricks.

    Key Features:

    • Python based interface
    • Flexible and customizable
    • Comprehensive documentation and support

    Find a comprehensive guideline on using YData Fabric SDK in Databricks Notebooks.

    "},{"location":"integrations/databricks/overview/#api","title":"API","text":"

    The YData Fabric API allows for integration via RESTful services, providing a versatile method to interact with Databricks. This approach is ideal for applications requiring direct API calls and custom integrations.

    Key Features:

    • RESTful architecture
    • Language-agnostic integration
    • Detailed API documentation
    • Support for a wide range of operations
    "},{"location":"integrations/databricks/overview/#integration-diagram","title":"Integration diagram","text":"

    The integration diagram below illustrates the interaction between YData Fabric and Databricks, highlighting the data flow and key components involved in the integration process.

    "},{"location":"integrations/snowflake/integration_snowflake/","title":"\u2744\ufe0f Integrate Fabric with Snowflake - from Analytics to Machine Learning","text":"

    YData Fabric provides a seamless integration with Snowflake, allowing you to connect, query, and manage your data in Snowflake with ease. This section will guide you through the benefits, setup, and usage of the Snowflake connector within YData Fabric.

    "},{"location":"integrations/snowflake/integration_snowflake/#benefits-of-integration","title":"Benefits of Integration","text":"

    Integrating YData Fabric with Snowflake offers several key benefits:

    • Scalability: Snowflake's architecture scales effortlessly with your data needs, while YData Fabric's tools ensure efficient data integration and management.
    • Performance: Leveraging Snowflake's high performance for data querying and YData Fabric's optimization techniques enhances overall data processing speed.
    • Security: Snowflake's robust security features, combined with YData Fabric's data governance capabilities, ensure your data remains secure and compliant.
    • Interoperability: YData Fabric simplifies the process of connecting to Snowflake, allowing you to quickly set up and start using the data without extensive configuration. Benefit from the unique Fabric functionalities like data preparation with Python, synthetic data generation and data profiling.
    "},{"location":"integrations/snowflake/integration_snowflake/#setting-up-the-snowflake-connector","title":"Setting Up the Snowflake Connector","text":"

    How to create a connector to Snowflake in Fabric?

    To create a Snowflake connector in the YData Fabric UI, you need to meet the following prerequisites and follow these steps:

    Prerequisites

    Before setting up the connector, ensure you have the following:

    • A Snowflake account with appropriate access permissions.
    • YData Fabric installed and running in your environment.
    • Credentials for Snowflake (username, password, account identifier, warehouse, database, schema).
    "},{"location":"integrations/snowflake/integration_snowflake/#step-by-step-creation-through-the-ui","title":"Step-by-step creation through the UI","text":"

    To create a connector in YData Fabric, select the \"Connectors\" page from the left side menu, as illustrated in the image below.

    Now, click the \"Create Connector\" button and the following menu with the available connectors will be shown.

    After selecting the connector type \"Snowflake\", the menu below will be shown. This is where you can configure the connection to your Snowflake instance. For that, you will need the following information:

    • Username: Your Snowflake username.
    • Password: Your Snowflake password.
    • Host/Account Identifier: Your Snowflake account identifier (e.g., xy12345.us-east-1).
    • Port: The Snowflake port number.
    • Database: The Snowflake database to connect to.
    • Schema: The schema within the database.
    • Warehouse: The Snowflake warehouse to use.
    • Display Name: A unique name for your connector.

    Test your connection and that's it! \ud83d\ude80

    You are now ready to create different Datasources using this connector - read the data from a query, evaluate the quality of the data from a table or even read a full database and generate a synthetic replica of your data! Read more about Fabric Datasources here.

    "},{"location":"integrations/snowflake/integration_snowflake/#use-it-inside-the-labs","title":"Use it inside the Labs","text":"

    \ud83d\udc68\u200d\ud83d\udcbb Full code example and recipe can be found here.

    In case you prefer a Python interface, we also have connectors available through Fabric SDK inside the labs. For a seamless integration between the UI and the Labs environment, Fabric offers an SDK that allows you to re-use connectors, datasources and even synthesizers.

    Start by creating your code environment through the Labs. In case you need to get started with the Labs, check this step-by-step guide.

    # Importing YData's packages\n    from ydata.labs import Connectors\n    # Getting a previously created Connector\n    connector = Connectors.get(uid='insert-connector-id',\n                               namespace='insert-namespace-id')\n    print(connector)\n
    "},{"location":"integrations/snowflake/integration_snowflake/#navigate-your-database","title":"Navigate your database","text":"

    With your connector created you are now able to explore your database and available datasets.

    List available schemas and get the metadata of a given schema
        # returns a list of schemas\n    schemas = connector.list_schemas()\n\n    # get the metadata of a database schema, including columns and relations between tables (PK and FK)\n    schema = connector.get_database_schema('PATIENTS')\n
    "},{"location":"integrations/snowflake/integration_snowflake/#read-from-a-snowflake-instance","title":"Read from a Snowflake instance","text":"

    Using the Snowflake connector it is possible to:

    • Get the data from a Snowflake table
    • Get a sample from a Snowflake table
    • Get the data from a query to a Snowflake instance
    • Get the full data from a selected database
    Read full and a sample from a table
        # returns the whole data from a given table\n    table = connector.get_table('cardio_test')\n    print(table)\n\n    # Get a sample with n rows from a given table\n    table_sample = connector.get_table_sample(table='cardio_test', sample_size=50)\n    print(table_sample)\n
    Get the data from a query
        # returns the whole data from a given table\n    query_output = connector.query('SELECT * FROM patients.cardio_test;')\n    print(query_output)\n
    "},{"location":"integrations/snowflake/integration_snowflake/#write-to-a-snowflake-instance","title":"Write to a Snowflake instance","text":"

    If you need to write your data into a Snowflake instance you can also leverage your Snowflake connector for the following actions:

    • Write the data into a table
    • Write a new database schema

    The if_exists parameter allows you to decide whether you want to append, replace or fail in case a table with the same name already exists in the schema.

    Writing a dataset to a table in a Snowflake schema
        connector.write_table(data=tables['cardio_test'],\n                          name='cardio',\n                          if_exists='fail')\n

    table_names allows you to define new names for the tables in the database. If not provided, the table names from your dataset will be used. Writing a full database to a Snowflake schema

        connector.write_database(data=database,\n                         schema_name='new_cardio',\n                         table_names={'cardio_test': 'cardio'})\n

    I hope you enjoyed this quick tutorial on seamlessly integrating Snowflake with your data preparation workflows. \u2744\ufe0f\ud83d\ude80

    "},{"location":"labs/","title":"Fabric coding environment","text":"

    YData Fabric Labs are on-demand, cloud-based data development environments with automatically provisioned hardware (multiple infrastructure configurations, including GPUs, are possible) and full platform integration via a Python interface (allowing access to Data Sources, Synthesizers, and the Workspace\u2019s shared files).

    With Labs, you can create environments with support for familiar IDEs like Visual Studio Code, Jupyter Lab and H2O Flow, and support for both Python and R is included.

    For Python specifically, pre-configured bundles including TensorFlow, PyTorch and/or the main popular data science libraries are also available, jumpstarting data development. Additional libraries can be easily installed with a simple !pip install.
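    For example, a minimal sketch of installing and importing an extra library inside a Lab notebook cell (the package chosen here is only an illustrative example):

    # Install an additional library from a notebook cell\n!pip install ydata-profiling\n\n# The library becomes available in the same Lab environment\nimport ydata_profiling\n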

    "},{"location":"labs/#get-started-with-your-first-lab","title":"Get started with your first lab","text":"

    \ud83e\uddea Follow this step-by-step guided tutorial to create your first Lab.

    "},{"location":"labs/#tutorials-recipes","title":"Tutorials & recipes","text":"

    Leverage YData's extensive collection of tutorials and recipes, available in YData Academy, to quickstart or accelerate your data developments with recipes and tutorial use cases.

    "},{"location":"labs/overview/","title":"Overview","text":"

    Labs exist for Data practitioners to tackle more complex use cases through a familiar environment supercharged with infrastructure, integration with other Fabric modules and access to advanced synthesis and profiling technology via a familiar python interface.

    It is the preferred environment for Data practitioners to express their domain expertise with all the required tools, technology and computational power at their fingertips. It is thus the natural continuation of the data understanding works which started in Data Sources.

    "},{"location":"labs/overview/#supported-ides-and-images","title":"Supported IDE's and images","text":""},{"location":"labs/overview/#ides","title":"IDEs","text":"

    YData Fabric supports integration with various Integrated Development Environments (IDEs) to enhance productivity and streamline workflows. The supported IDEs include:

    • Visual Studio Code (VS Code): A highly versatile and widely-used code editor that offers robust support for numerous programming languages and frameworks. Its integration with Git and extensions like GitLens makes it ideal for version control and collaborative development.
    • Jupyter Lab: An interactive development environment that allows for notebook-based data science and machine learning workflows. It supports seamless Git integration through extensions and offers a user-friendly interface for managing code, data, and visualizations.
    • H2O Flow: A web-based interface specifically designed for machine learning and data analysis with the H2O platform. It provides a flow-based, interactive environment for building and deploying machine learning models.
    "},{"location":"labs/overview/#labs-images","title":"Labs images","text":"

    In the Labs environment, users have access to the following default images, tailored to different computational needs:

    "},{"location":"labs/overview/#python","title":"Python","text":"

    All the images below support Python as the programming language. The current Python version is x

    • YData CPU: Optimized for general-purpose computing and data analysis tasks that do not require GPU acceleration. This image includes access to YData Fabric unique capabilities for data processing (profiling, constraints engine, synthetic data generation, etc).
    • YData GPU: Designed for tasks that benefit from GPU acceleration, providing enhanced performance for large-scale data processing and machine learning operations. Also includes access to YData Fabric unique capabilities for data processing.
    • YData GPU TensorFlow: Specifically configured for TensorFlow-based machine learning and deep learning applications, leveraging GPU capabilities to accelerate training and inference processes. These images ensure that users have the necessary resources and configurations to efficiently conduct their data science and machine learning projects within the Labs environment.
    • YData GPU Torch: Specifically configured for Torch-based machine learning and deep learning applications, leveraging GPU capabilities to accelerate training and inference processes. These images ensure that users have the necessary resources and configurations to efficiently conduct their data science and machine learning projects within the Labs environment.
    "},{"location":"labs/overview/#r","title":"R","text":"

    An image for R that allows you to leverage the latest version of the language as well as the most popular libraries.

    "},{"location":"labs/overview/#existing-labs","title":"Existing Labs","text":"

    Existing Labs appear in the Labs pane of the web application. Besides information about its settings and status, three buttons exist:

    • Open: Open the Lab\u2019s IDE in a new browser tab
    • Pause: Pause the Lab. When resumed, all data will be available.
    • Delete: Lab will be deleted. Data not saved in the workspace\u2019s shared folder (see below) will be deleted.

    The details list of a Lab, with the status and its main actions.

    The Status column indicates the Labs\u2019 status. A Lab can have 4 statuses:

    • \ud83d\udfe2 Lab is running
    • \ud83d\udfe1 Lab is being created (hardware is being provisioned) or is either pausing or starting
    • \ud83d\udd34 Lab was shutdown due to an error. A common error is the Lab going out-of-memory. Additional details are offered in the web application.
    • \u26ab Lab is paused
    "},{"location":"labs/overview/#git-integration","title":"Git integration","text":"

    Integrating Git with Jupyter Notebooks and Visual Studio Code (VS Code) streamlines version control and collaborative workflows for data developers. This integration allows you to track changes, manage project versions, and collaborate effectively within familiar interfaces.

    "},{"location":"labs/overview/#jupyter-lab","title":"Jupyter Lab","text":"

    Inside of Labs that use Jupyter Lab as IDE, you will find the jupyterlab-git extension installed in the environment.

    To create or clone a new repository you need to perform the following steps:

    Select the Jupyter Lab Git extension. Cloning a repository to your local environment.

    For more complex actions like forking and merging branches, see the gif below:

    "},{"location":"labs/overview/#visual-code-vs-code","title":"Visual Code (VS Code)","text":"

    To clone or create a new Git repository, click \"Clone Git Repository...\" and paste the repository URL in the text box in the top center area of the screen, as depicted in the image below.

    Clone Git repository Cloning a repository to your local env"},{"location":"labs/overview/#building-pipelines","title":"Building Pipelines","text":"

    Building data pipelines and breaking them down into modular components can be challenging. For instance, a typical machine learning or deep learning pipeline starts with a series of preprocessing steps, followed by experimentation and optimization, and finally deployment. Each of these stages presents unique challenges within the development lifecycle.

    Fabric Jupyter Labs simplifies this process by incorporating Elyra as the Pipeline Visual Editor. The visual editor enables users to build data pipelines from notebooks, Python scripts, and R scripts, making it easier to convert multiple notebooks or script files into batch jobs or workflows.

    Currently, these pipelines can be executed either locally in JupyterLab or on Kubeflow Pipelines, offering flexibility and scalability for various project needs. Read more about pipelines.
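    For illustration only, below is a minimal sketch of a pipeline defined fully in code with the Kubeflow Pipelines (kfp) v2 SDK; the component and pipeline names are placeholders and this is not Fabric-specific code:

    from kfp import compiler, dsl\n\n@dsl.component\ndef prepare_data() -> str:\n    # Placeholder preprocessing step\n    return \"prepared\"\n\n@dsl.component\ndef train_model(data: str) -> str:\n    # Placeholder training step that consumes the previous step's output\n    return f\"model trained on {data}\"\n\n@dsl.pipeline(name=\"example-data-pipeline\")\ndef example_pipeline():\n    prep_task = prepare_data()\n    train_model(data=prep_task.output)\n\n# Compile the pipeline definition so it can be uploaded and run on Kubeflow Pipelines\ncompiler.Compiler().compile(example_pipeline, \"example_pipeline.yaml\")\n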

    "},{"location":"pipelines/","title":"Pipelines","text":"

    The Pipelines module of YData Fabric is a general-purpose job orchestrator with built-in scalability and modularity plus reporting and experiment tracking capabilities. With automatic hardware provisioning, on-demand or scheduled execution, run fingerprinting and a UI interface for review and configuration, Pipelines equip the Fabric with operational capabilities for interfacing with up/downstream systems (for instance to automate data ingestion, synthesis and transfer workflows) and with the ability to experiment at scale (crucial during the iterative development process required to discover the data improvement pipeline yielding the highest quality datasets).

    YData Fabric's Pipelines are based on Kubeflow Pipelines and can be created via an interactive interface in Labs with Jupyter Lab as the IDE (recommended) or via Kubeflow Pipeline\u2019s Python SDK.

    With its full integration with Fabric's scalable architecture and the ability to leverage Fabric\u2019s Python interface, Pipelines are the recommended tool to scale up notebook work to experiment at scale or move from experimentation to production.

    "},{"location":"pipelines/#benefits","title":"Benefits","text":"

    Using Pipelines for data preparation offers several benefits, particularly in the context of data engineering, machine learning, and data science workflows. Here are some key advantages:

    • Modularity: they allow to break down data preparation into discrete, reusable steps. Each step can be independently developed, tested, and maintained, enhancing code modularity and readability.
    • Automation: they automate the data preparation process, reducing the need for manual intervention and ensuring that data is consistently processed. This leads to more efficient workflows and saves time.
    • Scalability: Fabric's distributed infrastructure combined with kubernetes based pipelines allows to handle large volumes of data efficiently, making them suitable for big data environments.
    • Reproducibility: By defining a series of steps that transform raw data into a ready-to-use format, pipelines ensure that the same transformations are applied every time. This reproducibility is crucial for maintaining data integrity and for validating results.
    • Maintainability: versioning and flexibility, described below, keep pipelines easy to evolve and maintain over time.
    • Versioning: support versioning of the data preparation steps. This versioning is crucial for tracking changes, auditing processes, and rolling back to previous versions if needed.
    • Flexibility: above all, they can be customized to fit the specific requirements of different projects. They can be adapted to include various preprocessing techniques, feature engineering steps, and data validation processes.
    "},{"location":"pipelines/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 How to create your first Pipeline
    • How to build a pipeline with YData Fabric
    "},{"location":"pipelines/concepts/","title":"Concepts","text":"

    An example pipeline (as seen in the Pipelines module of the dashboard), where each single-responsibility block corresponds to a step in a typical machine learning workflow

    Each Pipeline is a set of connected blocks. A block is a self-contained set of code, packaged as a container, that performs one step in the Pipeline. Usually, each Pipeline block corresponds to a single responsibility task in a workflow. In a machine learning workflow, each step would correspond to one block, i.e., data ingestion, data cleaning, pre-processing, ML model training, ML model evaluation.

    Each block is parametrized by:

    • code: the code it executes (for instance, a Jupyter Notebook, a Python file, an R script)
    • runtime: which specifies the container environment it runs in, allowing modularization and inter-step independence of software requirements (for instance, specific Python versions for different blocks)
    • hardware requirements: depending on the workload, a block may have different needs regarding CPU/GPU/RAM. These requirements are automatically matched with the hardware availability of the cluster the Platform\u2019s running in. This, combined with the modularity of each block, allows cost and efficiency optimizations by up/downscaling hardware according to the workload.
    • file dependencies: local files that need to be copied to the container environment
    • environment variables: useful, for instance, to apply specific settings or inject authentication credentials
    • output files: files generated during the block\u2019s workload, which will be made available to all subsequent Pipeline steps

    The hierarchy of a Pipeline, in an ascending manner, is as follows:

    • Run: A single execution of a Pipeline. Usually, Pipelines are run due to changes on the code, on the data sources or on its parameters (as Pipelines can have runtime parameters)
    • Experiment: Groups of runs of the same Pipeline (may have different parameters, code or settings, which are then easily comparable). All runs must have an Experiment. An Experiment can contain Runs from different Pipelines.
    • Pipeline Version: Pipeline definitions can be versioned (for instance, early iterations on the flow of operations; different versions for staging and production environments)
    • Pipeline

    \ud83d\udcd6 Get started with the concepts and a step-by-step tutorial

    "},{"location":"pipelines/concepts/#runs-recurring-runs","title":"Runs & Recurring Runs","text":"

    A run is a single execution of a pipeline. Runs comprise an immutable log of all experiments that you attempt, and are designed to be self-contained to allow for reproducibility. You can track the progress of a run by looking at its details page on the pipeline's UI, where you can see the runtime graph, output artifacts, and logs for each step in the run.

    A recurring run, or job in the backend APIs, is a repeatable run of a pipeline. The configuration for a recurring run includes a copy of a pipeline with all parameter values specified and a run trigger. You can start a recurring run inside any experiment, and it will periodically start a new copy of the run configuration. You can enable or disable the recurring run from the pipeline's UI. You can also specify the maximum number of concurrent runs to limit the number of runs launched in parallel. This can be helpful if the pipeline is expected to run for a long period and is triggered to run frequently.

    "},{"location":"pipelines/concepts/#experiment","title":"Experiment","text":"

    An experiment is a workspace where you can try different configurations of your pipelines. You can use experiments to organize your runs into logical groups. Experiments can contain arbitrary runs, including recurring runs.

    "},{"location":"pipelines/concepts/#pipeline-pipeline-version","title":"Pipeline & Pipeline Version","text":"

    A pipeline is a description of a workflow, which can include machine learning (ML) tasks, data preparation or even the generation of synthetic data. The pipeline outlines all the components involved in the workflow and illustrates how these components interrelate in the form of a graph. The pipeline configuration defines the inputs (parameters) required to run the pipeline and specifies the inputs and outputs of each component.

    When you run a pipeline, the system launches one or more Kubernetes Pods corresponding to the steps (components) in your workflow. The Pods start Docker containers, and the containers, in turn, start your programs.

    Pipelines can be easily versioned for reproducibility of results.

    "},{"location":"pipelines/concepts/#artifacts","title":"Artifacts","text":"

    For each block/step in a Run, Artifacts can be generated. Artifacts are raw output data which is automatically rendered in the Pipeline\u2019s UI in a rich manner - as formatted tables, text, charts, bar graphs/scatter plots/line graphs, ROC curves, confusion matrices or inline HTML.

    Artifacts are useful to attach, to each step/block of a data improvement workflow, relevant visualizations, summary tables, data profiling reports or text analyses. They are logged by creating a JSON file with a simple, pre-specified format (according to the output artifact type). Additional types of artifacts are supported (like binary files - models, datasets), yet will not benefit from rich visualizations in the UI.
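    As a rough sketch of how such a JSON file can be produced inside a pipeline step, the snippet below writes a metrics file following the Kubeflow Pipelines convention (the metric names are examples and the /mlpipeline-metrics.json path is the one that convention expects; adapt it to your own setup):

    import json\n\n# Example metrics logged by a pipeline step, rendered by the Pipelines UI\nmetrics = {\n    \"metrics\": [\n        {\"name\": \"accuracy\", \"numberValue\": 0.91, \"format\": \"PERCENTAGE\"},\n        {\"name\": \"row-count\", \"numberValue\": 15000, \"format\": \"RAW\"},\n    ]\n}\n\n# Write the file to the path the UI looks for under this convention\nwith open(\"/mlpipeline-metrics.json\", \"w\") as f:\n    json.dump(metrics, f)\n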

    Compare side-by-side

    \ud83d\udca1 Artifacts and Metrics can be compared side-by-side across runs, which makes them a powerful tool when doing iterative experimentation over data quality improvement pipelines.

    "},{"location":"pipelines/concepts/#pipelines-examples-in-ydata-academy","title":"Pipelines examples in YData Academy","text":"

    \ud83d\udc49 Use cases on YData\u2019s Academy contain examples of full use-cases as well as of the Pipelines interface used to log metrics and artifacts.

    "},{"location":"pipelines/runs/","title":"Creating & managing runs","text":""},{"location":"pipelines/runs/#viewing-run-details","title":"Viewing Run details","text":"

    To view a specific Run, go into the Experiments list and click on the desired Run. Alternatively, it is possible to access the Runs list and select the desired Run directly.

    Accessing Runs through their Experiment

    Viewing the full list of Runs, for all Pipelines and Experiments. Runs can be filtered and sorted based on different fields (including Metrics).

    Once a Run is selected, its graph can be viewed (in real time, if the Run is being executed). The graph shows the execution status of each block. Clicking on each block will reveal the block\u2019s details, including artifacts, various configuration details and logs (useful for troubleshooting).

    The details page of a step, showing a profiling report (as HTML) as an Artifact

    The Run Output tab includes outputs such as metrics or binary artifacts.

    "},{"location":"pipelines/runs/#creating-runs","title":"Creating Runs","text":"

    Besides triggering Execution via the pipeline editor in Jupyter Lab or the Python SDK, the Pipelines management UI can also be used.

    "},{"location":"pipelines/runs/#one-off","title":"One-off","text":"

    To create a one-off run of a Pipeline, choose a Pipeline in the Pipelines section (including the specific Pipeline version, in case there are multiple definitions) and click + Create Run.

    Creating a Run of a specific Pipeline

    To finish creating the Run, additional information is needed:

    • a Description (optional)
    • the Experiment (mandatory and can be chosen from the list of existing ones)
    • the Run Type (which should be one-off)
    • any eventual runtime parameters of the Pipeline.

    Clicking Start will trigger execution. Each Run will have a unique, automatically created ID.

    \ud83d\udca1 One-off runs are useful, for instance, for quickly trying out different parameters or for stable data pipelines where the input data has changed (unexpectedly) and the pipeline needs to be run again."},{"location":"pipelines/runs/#recurring","title":"Recurring","text":"

    To create a Recurring Run, the procedure shown above should be followed, but instead a Recurring Run Type should be chosen.

    The main configuration parameters of a Recurring Run are the frequency, start date and end date, as well as the maximum number of concurrent Runs of the Pipeline. The maximum number of concurrent Runs is a particularly relevant parameter for Pipelines whose execution time may stretch into the following scheduled Run\u2019s start time - it should be tweaked to avoid overwhelming the available infrastructure. Recurrence can also be configured via cron-like definitions.

    Configuring a Recurring Run

    The recurring run will keep on executing until its end date or until it is manually disabled. Configured Recurring Runs are listed in the Recurring Runs section.

    \ud83d\udca1 Recurring runs are useful in several situations: - determining the average execution time of a Pipeline (in case there are run-dependent time fluctuations) - when any of the inputs (for instance, input data read from a remote location) changes at a predictable pace"},{"location":"pipelines/runs/#creating-a-pipeline","title":"Creating a Pipeline","text":"

    The recommended way to create a Pipeline is to use the interactive Pipeline editor available on Labs with Jupyter Lab set as IDE. It allows the:

    • addition of blocks by dragging and dropping notebooks/Python scripts/R scripts (can be a mixture)
    • connecting blocks in linear and non-linear ways to define the execution sequence
    • configuring the parameters of each block in-line.

    Building a simple synthetic data generation pipeline in the interactive editor by dragging and dropping Jupyter Notebooks (Python/R files could also be dragged), leveraging input files for credentials, environment variables for workflow settings, software runtime specification and per-block hardware needs.


    The built Pipeline can be run directly from the editor. It will then be automatically available in the dashboard\u2019s web UI, where it can be viewed and managed.

    \ud83d\udc49 To build Pipelines fully via code (in any Python IDE), refer to the [Kubeflow Pipelines SDK](https://www.kubeflow.org/docs/components/pipelines/sdk/sdk-overview/)."},{"location":"pipelines/runs/#managing-pipelines","title":"Managing Pipelines","text":"

    The Pipelines management interface is accessible in the platform\u2019s dashboard, via the sidebar item Pipelines.

    The Pipelines management module

    It has 6 main sub-modules:

    • Pipelines: list of existing Pipelines, which can be further drilled-down into the versions of each Pipeline, as Pipeline definitions can be versioned.
    • Experiments: a list of all available Experiments (groups of Runs), regardless of their origin Pipeline.
    • Runs: a list of all available Runs, regardless of their origin Pipeline/Experiment.
    • Recurring Runs: an interface to view and configure the Runs triggered on a schedule.
    • Artifacts: list of Artifacts generated by all Runs of all Pipelines
    • Executions: a list of all executed blocks/steps across all Runs of all Pipelines
    \ud83d\udca1 Pipelines created via code can be compiled to a `.pipeline` file, which can then be submitted via the *+ Upload pipeline* button."},{"location":"pipelines/runs/#creating-a-new-experiment","title":"Creating a new Experiment","text":"

    An experiment is used to group together the runs of a single or different Pipelines. It is particularly useful for organization and Artifacts/Metrics comparison purposes.

    To create a new Experiment, access the Experiments section and click + Create Experiment. An Experiment requires a name and an optional description.

    "},{"location":"pipelines/runs/#comparing-runs","title":"Comparing Runs","text":"

    Comparing runs is particularly useful in iterative data improvement scenarios, as Artifacts, Metrics and Parameters can be directly compared side-by-side. Runs using different pre-processing techniques, settings, algorithms can be put against each other side-by-side in a visual and intuitive interface.

    To compare multiple Runs, select the Runs of interest (either from the Experiments or Runs pane) and select Compare runs:

    Selecting Runs to compare from the Experiments list

    In case of this particular data quality improvement Pipeline, the Metrics of each Run are shown side by side.

    Up to 10 runs can be selected for side-by-side comparison. In case any step of the Run has logged Artifacts, the equivalent Artifacts are shown in a comparative interface.

    Comparing the confusion matrices of three Runs of a Pipeline, which were logged as Artifacts during one of the Pipeline\u2019s steps.

    "},{"location":"pipelines/runs/#cloning-runs","title":"Cloning Runs","text":"

    For full reproducibility purposes, it is possible to select a previous run and clone it. Cloned runs will use exactly the same runtime input parameters and settings. However, any time dependent inputs (like the state of a remote data source at a particular point in time) will not be recreated.

    To clone a Run, click the Clone run button available in a Run\u2019s detail page or in the list of Runs/Experiment (when a single Run is selected). It will be possible to review the settings prior to triggering the execution.

    "},{"location":"pipelines/runs/#archiving-runs","title":"Archiving Runs","text":"

    Archiving a Run will move it to the Archived section of the Runs and Experiments lists. This section can be used to save older executions, to highlight the best runs or to record anomalous executions which require further digging into.

    Archive a Run by clicking the Archive button from the Run\u2019s details page (or from the list of Runs/Experiments when a Run is selected).

    The Archived section, which is in all ways similar to the list of active Runs. The Restore button (highlighted) moves Runs between the two sections.

    When a Run is archived, it can be restored through the Restore button.

    \ud83d\udca1 **Learn by example** To understand how to best apply the full capabilities of Pipelines in real world use cases, check out the [use cases section of YData\u2019s Academy](https://github.com/ydataai/academy/tree/master/5%20-%20use-cases). Most use cases include a pipeline leveraging common and use case specific features of the Pipelines module. These pipelines are offered in `.pipeline` files which can be interactively explored in Jupyter Lab, inside Labs.

    "},{"location":"sdk/","title":"Overview","text":"

    The Fabric SDK is an ecosystem of methods that allows users, through a Python interface, to adopt data development focused on improving the quality of the data. The solution includes a set of integrated components for data ingestion, standardized data quality evaluation and data improvement, such as synthetic data generation, allowing an iterative improvement of the datasets used in high-impact business applications.

    YData Fabric SDK for improved data quality everywhere!

    To start using it, create a Fabric community account at ydata.ai/register

    "},{"location":"sdk/#benefits","title":"Benefits","text":"

    The Fabric SDK interface enables the integration of data quality tooling with other platforms, offering several benefits in the realm of data science development and data management:

    • Interoperability: seamless integration with other data platforms and systems like Databricks, Snowflake, etc. This ensures that all your software will work cohesively with all the elements of your data architecture.
    • Collaboration: ease of integration with a multitude of tools and services, reducing the need to reinvent the wheel and fostering a collaborative environment for all developers (data scientists, data engineers, software developers, etc.)
    • Improved usage experience: Fabric SDK enables a well-integrated software solution, which allows a seamless transition between different tools or platforms without facing compatibility issues.
    "},{"location":"sdk/#current-functionality","title":"Current functionality","text":"

    Fabric SDK is currently composed of the following main modules:

    • Datasources

      • YData\u2019s SDK includes several connectors for easy integration with existing data sources. It supports several storage types, like filesystems and RDBMS. Check the list of connectors.
      • SDK\u2019s Datasources run on top of Dask, which allows it to deal with not only small workloads but also larger volumes of data.
    • Synthetic data generators

      • Simplified interface to train a generative model and learn in a data-driven manner the behavior, the patterns and original data distribution. Optimize your model for privacy or utility use-cases.
      • From a trained synthetic data generator, you can generate synthetic samples as needed and parametrise the number of records needed.
      • Anonymization and privacy-preserving capabilities ensure that synthetic datasets do not contain Personally Identifiable Information (PII) and can safely be shared!
      • Conditional sampling can be used to restrict the domain and values of specific features in the sampled data.
    • Synthetic data quality report Coming soon

      • An extensive synthetic data quality report that measures 3 dimensions: privacy, utility and fidelity of the generated data. The report can be downloaded in PDF format for ease of sharing and compliance purposes or as a JSON to enable the integration in data flows.
    • Profiling Coming soon

      • A set of metrics and algorithms summarizes datasets quality in three main dimensions: warnings, univariate analysis and a multivariate perspective.
    "},{"location":"sdk/#supported-data-formats","title":"Supported data formats","text":"TabularTime-SeriesTransactionalRelational databases

    The RegularSynthesizer is perfect to synthesize high-dimensional, time-independent data with high quality results.

    The TimeSeriesSynthesizer is perfect to synthesize both regularly and irregularly spaced time-series, from smart-sensor readings to stock prices.

    The TimeSeriesSynthesizer supports transactional data, known to have highly irregular time intervals between records and directional relations between entities.

    Coming soon

    The MultiTableSynthesizer is perfect to learn how to replicate the data within a relational database schema.

    "},{"location":"sdk/installation/","title":"Installation","text":"

    YData SDK is generally available through both PyPI and Conda, allowing an easy installation process. This experience allows combining YData SDK with other packages such as Pandas, Numpy or Scikit-Learn.

    YData SDK is available for the public through a token-based authentication system. If you don\u2019t have one yet, you can get your free license key during the installation process. You can check what features are available in the free version here.

    "},{"location":"sdk/installation/#installing-the-package","title":"Installing the package","text":"

    YData SDK supports Python versions higher than 3.8, and can be installed on Windows, Linux or macOS operating systems.

    Prior to the package installation, it is recommended to create a virtual or conda environment:

    pyenv
    pyenv virtualenv 3.10 ydatasdk\n

    And install ydata-sdk

    pypi
    pip install ydata-sdk\n
    "},{"location":"sdk/installation/#authentication","title":"Authentication","text":"

    Once you've installed the ydata-sdk package, you will need a token to run its functionalities. YData SDK uses a token-based authentication system. To get access to your token, you need to create a YData account.

    YData SDK offers a free-trial and an enterprise version. To access your free-trial token, you need to create a YData account.

    The token will be available here, after login:

    With your account token copied, you can set a new environment variable YDATA_TOKEN at the beginning of your development session.

    import os\n\n    # Set the token as an environment variable for the current session\n    os.environ['YDATA_TOKEN'] = '{add-your-token}'\n

    Once you have set your token, you are good to go to start exploring the incredible world of data-centric AI and smart synthetic data generation!

    Check out our quickstart guide!

    "},{"location":"sdk/quickstart/","title":"Quickstart","text":"

    YData SDK allows you, with an easy and familiar interface, to adopt a Data-Centric AI approach for the development of Machine Learning solutions. YData SDK features were designed to support structured data, including tabular data, time-series and transactional data.

    "},{"location":"sdk/quickstart/#read-data","title":"Read data","text":"

    To start leveraging the package features, you should consume your data either through the Connectors or a pandas.DataFrame. The list of available connectors can be found here [add a link].

    From pandas dataframeFrom a connector
    from ydata.sdk.connectors import Connector\n    from ydata.sdk.datasources import DataSource\n\n    # Example for a Google Cloud Storage Connector\n    credentials = \"{insert-credentials-file-path}\"\n\n    # We create a new connector for Google Cloud Storage\n    connector = Connector(connector_type='gcs', credentials=credentials)\n\n    # Create a Datasource from the connector\n    # Note that a connector can be re-used for several datasources\n    X = DataSource(connector=connector, path='gs://<my_bucket>.csv')\n
    import pandas as pd\n\n    from ydata.sdk.synthesizers import RegularSynthesizer\n\n    # Load a small dataset\n    X = pd.read_csv('{insert-file-path.csv}')\n\n    # Init a synthesizer\n    synth = RegularSynthesizer()\n\n    # Train the synthesizer with the pandas Dataframe as input\n    # The data is then sent to the cluster for processing\n    synth.fit(X)\n

    The synthesis process returns a pandas.DataFrame object. Note that if you are using the ydata-sdk free version, all of your data is sent to a remote cluster on YData's infrastructure.

    "},{"location":"sdk/quickstart/#data-synthesis-flow","title":"Data synthesis flow","text":"

    The process of data synthesis can be described into the following steps:

    stateDiagram-v2\n  state read_data\n  read_data --> init_synth\n  init_synth --> train_synth\n  train_synth --> generate_samples\n  generate_samples --> [*]

    The code snippet below shows how easy can be to start generating new synthetic data. The package includes a set of examples datasets for a quickstart.

    from ydata.sdk.dataset import get_dataset\n    from ydata.sdk.synthesizers import RegularSynthesizer\n\n    # Read the example data\n    X = get_dataset('census')\n\n    # Init a synthesizer\n    synth = RegularSynthesizer()\n\n    # Fit the synthesizer to the input data\n    synth.fit(X)\n\n    # Sample new synthetic data. The request below asks for 1000 new synthetic rows\n    synth.sample(n_samples=1000)\n

    Do I need to prepare my data before synthesis?

    The SDK ensures that the original behaviour is replicated. For that reason, there is no need to preprocess outlier observations or missing data.

    By default all the missing data is replicated as NaN.

    "},{"location":"sdk/examples/synthesize_tabular_data/","title":"Synthesize tabular data","text":"

    Use YData's RegularSynthesizer to generate tabular synthetic data

    For a more detailed tutorial please check YData Fabric Academy ydata-sdk notebooks.

    import os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n\n\ndef main():\n    \"\"\"In this example, we demonstrate how to train a synthesizer from a pandas\n    DataFrame.\n\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\n    X = get_dataset('census')\n\n    # We initialize a regular synthesizer\n    # As long as the synthesizer does not call `fit`, it exists only locally\n    synth = RegularSynthesizer()\n\n    # We train the synthesizer on our dataset\n    synth.fit(X)\n\n    # We request a synthetic dataset with 50 rows\n    sample = synth.sample(n_samples=50)\n\n    print(sample.shape)\n\n\nif __name__ == \"__main__\":\n    main()\n
    "},{"location":"sdk/examples/synthesize_timeseries_data/","title":"Synthesize time-series data","text":"

    Use YData's TimeSeriesSynthesizer to generate time-series synthetic data

    Tabular data is the most common type of data we encounter in data problems.

    When thinking about tabular data, we assume independence between different records, but this does not happen in reality. Suppose we check events from our day-to-day life, such as room temperature changes, bank account transactions, stock price fluctuations, and air quality measurements in our neighborhood. In that case, we might end up with datasets where measures and records evolve and are related through time. This type of data is known to be sequential or time-series data.

    Thus, sequential or time-series data refers to any data containing elements ordered into sequences in a structured format. Dissecting any time-series dataset, we see differences in variables' behavior that need to be understood for an effective generation of synthetic data. Typically any time-series dataset is composed of the following:

    • Variables that define the order of time (these can be simple with one variable or composed)
    • Time-variant variables
    • Variables that refer to entities (single or multiple entities)
    • Variables that are attributes (those that don't depend on time but rather on the entity)

    For a more detailed tutorial please check YData Fabric Academy ydata-sdk notebooks.

    Below find an example:

    # -*- coding: utf-8 -*-\n\n# Authentication\nimport os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import TimeSeriesSynthesizer\n\n# Do not forget to add your token as env variable\nos.environ[\"YDATA_TOKEN\"] = '{insert-token}'\n\n\n# Sampling an example dataset for a multi-entity & multivariate time-series dataset\n\n# Generate the dataset\ntime_series_data = get_dataset('timeseries')\n\n# Print the first few rows of the dataset\nprint(time_series_data.head())\n\n# Train a Synthetic data generator\n\n# From a pandas dataframe\n\n# We initialize a time series synthesizer\n# As long as the synthesizer does not call `fit`, it exists only locally\nsynth = TimeSeriesSynthesizer(name='Time-series synth')\n\n# We train the synthesizer on our dataset\n# sortbykey -> variable that defines the time order for the sequence\nsynth.fit(time_series_data, sortbykey='time', entities='entity_id')\n\n# Generate samples from an already trained synthesizer\n# From the synthesizer in context in the notebook\n\n\n# Generate a sample with a given number of entities\n# In this example the objective is to generate a dataset with the same size as the original. For that reason, 5 entities will be generated.\nsample = synth.sample(n_entities=5)\n\nsample.head()\n\n# From a previously trained synthetic data generation model\n# List the trained synthetic data generators to get the synthesizer uid\nTimeSeriesSynthesizer.list()\n\nsynth = TimeSeriesSynthesizer(uid='{insert-synth-id}').get()\n\n# Generate a new synthetic dataset with the sample method\nsample = synth.sample(n_entities=5)\n\nsample.head()\n
    "},{"location":"sdk/examples/synthesize_with_anonymization/","title":"Anonymization","text":"

    YData Synthesizers offers a way to anonymize sensitive information such that the original values are not present in the synthetic data but replaced by fake values.

    Does the model retain the original values?

    No! The anonymization is performed before the model training such that it never sees the original values.

    The anonymization is performed by specifying which columns need to be anonymized and how to perform the anonymization. The anonymization rules are defined as a dictionary with the following format:

    {column_name: anonymization_rule}

    While there are some predefined anonymization rules such as name, email, and company, it is also possible to create a rule using a regular expression. The anonymization rules have to be passed to a synthesizer in its fit method using the parameter anonymize.

    What is the difference between anonymization and privacy?

    Anonymization makes sure sensitive information is hidden from the data. Privacy makes sure it is not possible to infer the original data points from the synthetic data points via statistical attacks.

    Therefore, for data sharing anonymization and privacy controls are complementary.

    The example below demonstrates how to anonymize the column Name by fake names and the column Ticket by a regular expression:

    import os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n\n\ndef main():\n    \"\"\"In this example, we demonstrate how to train a synthesizer from a pandas\n    DataFrame.\n\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\n    X = get_dataset('titanic')\n\n    # We initialize a regular synthesizer\n    # As long as the synthesizer does not call `fit`, it exists only locally\n    synth = RegularSynthesizer(name=\"Titanic\")\n\n    # We define anonymization rules, which is a dictionary with format:\n    # {column_name: anonymization_rule, ...}\n    # while here are some predefined anonymization rules like: name, email, company\n    # it is also possible to create a rule using a regular expression\n    rules = {\n        \"Name\": \"name\",\n        \"Ticket\": \"[A-Z]{2}-[A-Z]{4}\"\n    }\n\n    # or a different option for anonymization configuration\n\n    rules = {\n        'Name': {'type': 'name'},\n        'Ticket': {'type': 'regex',\n                   'regex': '[A-Z]{2}-[A-Z]{4}'}\n    }\n\n    # We train the synthesizer on our dataset\n    synth.fit(\n        X,\n        anonymize=rules\n    )\n\n    # We request a synthetic dataset with 50 rows\n    sample = synth.sample(n_samples=50)\n\n    print(sample[[\"Name\", \"Ticket\"]].head(3))\n\n\nif __name__ == \"__main__\":\n    main()\n

    "},{"location":"sdk/examples/synthesize_with_conditional_sampling/","title":"Conditional sampling","text":"

    YData Synthesizers support conditional sampling. The fit method has an optional parameter named condition_on, which receives a list of features to condition upon. Furthermore, the sample method receives the conditions to be applied through another optional parameter also named condition_on. For now, two types of conditions are supported:

    • Condition upon a categorical (or string) feature. The parameters are the name of the feature and a list of values (i.e., categories) to be considered. Each category also has its percentage of representativeness. For example, if we want to condition upon two categories, we need to define the percentage of rows each of these categories will have on the synthetic dataset. Naturally, the sum of such percentages needs to be 1. The default percentage is also 1 since it is the required value for a single category.
    • Condition upon a numerical feature. The parameters are the name of the feature and the minimum and maximum of the range to be considered. This feature will present a uniform distribution on the synthetic dataset, limited by the specified range.

    The example below demonstrates how to train and sample from a synthesizer using conditional sampling:

    import os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import RegularSynthesizer\n\n# Do not forget to add your token as env variables.\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined.\n\n\ndef main():\n    \"\"\"In this example, we demonstrate how to train and\n    sample from a synthesizer using conditional sampling.\"\"\"\n    X = get_dataset('census')\n\n    # We initialize a regular synthesizer.\n    # As long as the synthesizer does not call `fit`, it exists only locally.\n    synth = RegularSynthesizer()\n\n    # We train the synthesizer on our dataset setting\n    # the features to condition upon.\n    synth.fit(\n        X,\n        name=\"census_synthesizer\",\n        condition_on=[\"sex\", \"native-country\", \"age\"]\n    )\n\n    # We request a synthetic dataset with specific condition rules.\n    sample = synth.sample(\n        n_samples=500,\n        condition_on={\n            \"sex\": {\n                \"categories\": [{\n                    \"category\": 'Female',\n                    \"percentage\": 0.7\n                }]\n            },\n            \"native-country\": {\n                \"categories\": [{\n                    \"category\": 'United-States',\n                    \"percentage\": 0.6\n                }, {\n                    \"category\": 'Mexico',\n                    \"percentage\": 0.4\n                }]\n            },\n            \"age\": {\n                \"minimum\": 55,\n                \"maximum\": 60\n            }\n        }\n    )\n    print(sample)\n\n\nif __name__ == \"__main__\":\n    main()\n
    "},{"location":"sdk/examples/synthesize_with_privacy_control/","title":"Privacy control","text":"

    YData Synthesizers offers 3 different levels of privacy:

    1. high privacy: the model is optimized for privacy purposes,
    2. high fidelity (default): the model is optimized for high fidelity,
    3. balanced: tradeoff between privacy and fidelity.

    The default privacy level is high fidelity. The privacy level can be changed by the user at the moment a synthesizer is trained, using the parameter privacy_level. The parameter expects a PrivacyLevel value.

    What is the difference between anonymization and privacy?

    Anonymization makes sure sensitive information is hidden from the data. Privacy makes sure it is not possible to infer the original data points from the synthetic data points via statistical attacks.

    Therefore, for data sharing anonymization and privacy controls are complementary.

    The example below demonstrates how to train a synthesizer configured for high privacy:

    import os\n\nfrom ydata.sdk.dataset import get_dataset\nfrom ydata.sdk.synthesizers import PrivacyLevel, RegularSynthesizer\n\n# Do not forget to add your token as env variables\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n\n\ndef main():\n    \"\"\"In this example, we demonstrate how to train a synthesizer\n    with a high-privacy setting from a pandas DataFrame.\n    After training a Regular Synthesizer, we request a sample.\n    \"\"\"\n    X = get_dataset('titanic')\n\n    # We initialize a regular synthesizer\n    # As long as the synthesizer does not call `fit`, it exists only locally\n    synth = RegularSynthesizer()\n\n    # We train the synthesizer on our dataset setting the privacy level to high\n    synth.fit(\n        X,\n        name=\"titanic_synthesizer\",\n        privacy_level=PrivacyLevel.HIGH_PRIVACY\n    )\n\n    # We request a synthetic dataset with 50 rows\n    sample = synth.sample(n_samples=50)\n    print(sample)\n\n\nif __name__ == \"__main__\":\n    main()\n
    "},{"location":"sdk/examples/synthesizer_multitable/","title":"Synthesize Relational databases","text":"

    Integrate Fabric's MultiTableSynthesizer in your data flows and generate synthetic relational databases or multi-table datasets

    The capability to generate synthetic data from relational databases is a powerful and innovative approach to streamline the access to data and improve data democratization strategy within the organization. Fabric's SDK makes available an easy-to-use code interface to integrate the process of generating synthetic multi-table databases into your existing data flows.

    How to get your datasource?

    Learn how to create your multi-table data in Fabric here before creating your first multi-table synthetic data generator!

    Get your datasource and connector ID

    Datasource uid: You can find your datasource ID through the Fabric UI. Open your relational dataset and click the \"Explore in Labs\" button. Copy the uid available in the code snippet.

    Connector uid: You can find your connector ID through the Fabric UI. Open the connector tab of your Data Catalog. Under the connector \"Actions\", select \"Explore in Lab\". Copy the uid available in the code snippet.

    Quickstart example:

    import os\n\nfrom ydata.sdk.datasources import DataSource\nfrom ydata.sdk.synthesizers import MultiTableSynthesizer\n\n# Authenticate to Fabric to leverage the SDK - https://docs.sdk.ydata.ai/latest/sdk/installation/\n# Make sure to add your token as env variable.\nos.environ[\"YDATA_TOKEN\"] = '<TOKEN>'  # Remove if already defined\n\n# In this example, we demonstrate how to train a synthesizer from an existing RDBMS Dataset.\n# Make sure to follow the step-by-step guide to create a Dataset in Fabric's catalog: https://docs.sdk.ydata.ai/latest/get-started/create_multitable_dataset/\nX = DataSource.get('<DATASOURCE_UID>')\n\n# Init a multi-table synthesizer. Provide a connector so that the process of data synthesis writes the\n# synthetic data into the destination database\n# Provide a connector ID as the write_connector argument. See in this tutorial how to get a connector ID\nsynth = MultiTableSynthesizer(write_connector='<CONNECTOR_UID>')\n\n# Start the training of your synthetic data generator\nsynth.fit(X)\n\n# As soon as the training process is completed you are able to sample a synthetic database\n# The input expected is a percentage of the original database size\n# In this case, a synthetic database with the same size as the original was requested\n# Your synthetic sample was written to the database provided in the write_connector\nsynth.sample(frac=1.)\n
    "},{"location":"sdk/modules/connectors/","title":"Connectors","text":"

    The YData SDK allows users to consume data assets from remote storage through Connectors. YData Connectors support different types of storage, from filesystems to RDBMSs.

    Below is the list of available connectors:

    | Connector Name | Type | Supported File Types | Useful Links | Notes |
    |---|---|---|---|---|
    | AWS S3 | Remote object storage | CSV, Parquet | https://aws.amazon.com/s3/ | |
    | Google Cloud Storage | Remote object storage | CSV, Parquet | https://cloud.google.com/storage | |
    | Azure Blob Storage | Remote object storage | CSV, Parquet | https://azure.microsoft.com/en-us/services/storage/blobs/ | |
    | File Upload | Local | CSV | - | Maximum file size is 220MB. Bigger files should be uploaded and read from remote object storages |
    | MySQL | RDBMS | Not applicable | https://www.mysql.com/ | Supports reading whole schemas or specifying a query |
    | Azure SQL Server | RDBMS | Not applicable | https://azure.microsoft.com/en-us/services/sql-database/campaign/ | Supports reading whole schemas or specifying a query |
    | PostgreSQL | RDBMS | Not applicable | https://www.postgresql.org/ | Supports reading whole schemas or specifying a query |
    | Snowflake | RDBMS | Not applicable | https://docs.snowflake.com/en/sql-reference-commands | Supports reading whole schemas or specifying a query |
    | Google BigQuery | Data warehouse | Not applicable | https://cloud.google.com/bigquery | |
    | Azure Data Lake | Data lake | CSV, Parquet | https://azure.microsoft.com/en-us/services/storage/data-lake-storage/ | |

    More details can be found in the Connectors API Reference Docs.
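    
    As a quick illustration of creating a connector programmatically, the sketch below registers an AWS S3 connector using the Connector.create call from the API reference. The credentials file path and connector name are placeholder assumptions; per the documented signature, credentials can also be passed as a dict, and their exact contents depend on the chosen connector type.
    
    import os
    
    from ydata.sdk.connectors import Connector
    
    os.environ["YDATA_TOKEN"] = '<TOKEN>'  # Remove if already defined
    
    # Create an AWS S3 connector from a local credentials file (illustrative path)
    connector = Connector.create(
        connector_type='aws-s3',
        credentials='aws_credentials.json',
        name='my-s3-connector'
    )
    print(connector.uid, connector.type)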

    "},{"location":"sdk/modules/synthetic_data/","title":"Synthetic data generation","text":""},{"location":"sdk/modules/synthetic_data/#data-formats","title":"Data formats","text":""},{"location":"sdk/modules/synthetic_data/#tabular-data","title":"Tabular data","text":""},{"location":"sdk/modules/synthetic_data/#time-series-data","title":"Time-series data","text":""},{"location":"sdk/modules/synthetic_data/#transactions-data","title":"Transactions data","text":""},{"location":"sdk/modules/synthetic_data/#best-practices","title":"Best practices","text":""},{"location":"sdk/reference/api/common/client/","title":"Get client","text":"

    Deduce how to initialize or retrieve the client.

    This is meant to be zero-configuration for the user.

    Create and set a client globally
    from ydata.sdk.client import get_client\nget_client(set_as_global=True)\n
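    
    If the YDATA_TOKEN environment variable is set, as in the quickstart examples, the credentials are picked up automatically. A minimal sketch, assuming a valid token:
    
    import os
    
    from ydata.sdk.client import get_client
    
    os.environ["YDATA_TOKEN"] = '<TOKEN>'  # Remove if already defined
    
    # Credentials are read from the environment; the client is registered globally
    client = get_client(set_as_global=True)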

    Parameters:

    Name Type Description Default client_or_creds Optional[Union[Client, dict, str, Path]]

    Client to forward or credentials for initialization

    None set_as_global bool

    If True, set client as global

    False wait_for_auth bool

    If True, wait for the user to authenticate

    True

    Returns:

    Type Description Client

    Client instance

    Source code in ydata/sdk/common/client/utils.py
    def get_client(client_or_creds: Optional[Union[Client, Dict, str, Path]] = None, set_as_global: bool = False, wait_for_auth: bool = True) -> Client:\n    \"\"\"Deduce how to initialize or retrieve the client.\n\n    This is meant to be a zero configuration for the user.\n\n    Example: Create and set a client globally\n            ```py\n            from ydata.sdk.client import get_client\n            get_client(set_as_global=True)\n            ```\n\n    Args:\n        client_or_creds (Optional[Union[Client, dict, str, Path]]): Client to forward or credentials for initialization\n        set_as_global (bool): If `True`, set client as global\n        wait_for_auth (bool): If `True`, wait for the user to authenticate\n\n    Returns:\n        Client instance\n    \"\"\"\n    client = None\n    global WAITING_FOR_CLIENT\n    try:\n\n        # If a client instance is set globally, return it\n        if not set_as_global and Client.GLOBAL_CLIENT is not None:\n            return Client.GLOBAL_CLIENT\n\n        # Client exists, forward it\n        if isinstance(client_or_creds, Client):\n            return client_or_creds\n\n        # Explicit credentials\n        ''' # For the first version, we deactivate explicit credentials via string or file for env var only\n        if isinstance(client_or_creds, (dict, str, Path)):\n            if isinstance(client_or_creds, str):  # noqa: SIM102\n                if Path(client_or_creds).is_file():\n                    client_or_creds = Path(client_or_creds)\n\n            if isinstance(client_or_creds, Path):\n                client_or_creds = json.loads(client_or_creds.open().read())\n\n            return Client(credentials=client_or_creds)\n\n        # Last try with environment variables\n        #if client_or_creds is None:\n        client = _client_from_env(wait_for_auth=wait_for_auth)\n        '''\n        credentials = environ.get(TOKEN_VAR)\n        if credentials is not None:\n            client = Client(credentials=credentials)\n\n    except ClientHandshakeError as e:\n        wait_for_auth = False  # For now deactivate wait_for_auth until the backend is ready\n        if wait_for_auth:\n            WAITING_FOR_CLIENT = True\n            start = time()\n            login_message_printed = False\n            while client is None:\n                if not login_message_printed:\n                    print(\n                        f\"The token needs to be refreshed - please validate your token by browsing at the following URL:\\n\\n\\t{e.auth_link}\")\n                    login_message_printed = True\n                with suppress(ClientCreationError):\n                    sleep(BACKOFF)\n                    client = get_client(wait_for_auth=False)\n                now = time()\n                if now - start > CLIENT_INIT_TIMEOUT:\n                    WAITING_FOR_CLIENT = False\n                    break\n\n    if client is None and not WAITING_FOR_CLIENT:\n        sys.tracebacklimit = None\n        raise ClientCreationError\n    return client\n

    Main Client class used to abstract the connection to the backend.

    A normal user should not have to instantiate a Client themselves. However, in the future it will be useful for power users to manage projects and connections.

    Parameters:

    Name Type Description Default credentials Optional[dict]

    (optional) Credentials to connect

    None project Optional[Project]

    (optional) Project to connect to. If not specified, the client will connect to the default user's project.

    None Source code in ydata/sdk/common/client/client.py
    @typechecked\nclass Client(metaclass=SingletonClient):\n    \"\"\"Main Client class used to abstract the connection to the backend.\n\n    A normal user should not have to instanciate a [`Client`][ydata.sdk.common.client.Client] by itself.\n    However, in the future it will be useful for power-users to manage projects and connections.\n\n    Args:\n        credentials (Optional[dict]): (optional) Credentials to connect\n        project (Optional[Project]): (optional) Project to connect to. If not specified, the client will connect to the default user's project.\n    \"\"\"\n\n    codes = codes\n\n    DEFAULT_PROJECT: Optional[Project] = environ.get(\"DEFAULT_PROJECT\", None)\n\n    def __init__(self, credentials: Optional[Union[str, Dict]] = None, project: Optional[Project] = None, set_as_global: bool = False):\n        self._base_url = environ.get(\"YDATA_BASE_URL\", DEFAULT_URL).removesuffix('/')\n        self._verify_ssl = bool(int(environ.get('YDATA_VERIFY_SSL', 1)))\n        self._headers = {'Authorization': credentials}\n\n        if self._verify_ssl is False:\n            self._http_client = httpClient(\n                headers=self._headers, timeout=Timeout(10, read=None), verify=self._verify_ssl)\n        else:\n            self._http_client = httpClient(\n                headers=self._headers, timeout=Timeout(10, read=None))\n\n        self._handshake()\n\n        self._default_project = project or Client.DEFAULT_PROJECT or self._get_default_project(\n            credentials)\n        if set_as_global:\n            self.__set_global()\n\n    @property\n    def project(self) -> Project:\n        return Client.DEFAULT_PROJECT or self._default_project\n\n    @project.setter\n    def project(self, value: Project):\n        self._default_project = value\n\n    def post(\n        self, endpoint: str, content: Optional[RequestContent] = None, data: Optional[Dict] = None,\n        json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n        raise_for_status: bool = True\n    ) -> Response:\n        \"\"\"POST request to the backend.\n\n        Args:\n            endpoint (str): POST endpoint\n            content (Optional[RequestContent])\n            data (Optional[dict]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            raise_for_status (bool): raise an exception on error\n\n        Returns:\n            Response object\n        \"\"\"\n        url_data = self.__build_url(\n            endpoint, data=data, json=json, files=files, project=project)\n        response = self._http_client.post(**url_data)\n\n        if response.status_code != Client.codes.OK and raise_for_status:\n            self.__raise_for_status(response)\n\n        return response\n\n    def patch(\n        self, endpoint: str, content: Optional[RequestContent] = None, data: Optional[Dict] = None,\n        json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n        raise_for_status: bool = True\n    ) -> Response:\n        \"\"\"PATCH request to the backend.\n\n        Args:\n            endpoint (str): POST endpoint\n            content (Optional[RequestContent])\n            data (Optional[dict]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            raise_for_status (bool): raise an exception on 
error\n\n        Returns:\n            Response object\n        \"\"\"\n        url_data = self.__build_url(\n            endpoint, data=data, json=json, files=files, project=project)\n        response = self._http_client.patch(**url_data, content=content)\n\n        if response.status_code != Client.codes.OK and raise_for_status:\n            self.__raise_for_status(response)\n\n        return response\n\n    def get(\n        self, endpoint: str, params: Optional[Dict] = None, project: Optional[Project] = None,\n        cookies: Optional[Dict] = None, raise_for_status: bool = True\n    ) -> Response:\n        \"\"\"GET request to the backend.\n\n        Args:\n            endpoint (str): GET endpoint\n            cookies (Optional[dict]): (optional) cookies data\n            raise_for_status (bool): raise an exception on error\n\n        Returns:\n            Response object\n        \"\"\"\n        url_data = self.__build_url(endpoint, params=params,\n                                    cookies=cookies, project=project)\n        response = self._http_client.get(**url_data)\n\n        if response.status_code != Client.codes.OK and raise_for_status:\n            self.__raise_for_status(response)\n\n        return response\n\n    def get_static_file(\n        self, endpoint: str, project: Optional[Project] = None, raise_for_status: bool = True\n    ) -> Response:\n        \"\"\"Retrieve a static file from the backend.\n\n        Args:\n            endpoint (str): GET endpoint\n            raise_for_status (bool): raise an exception on error\n\n        Returns:\n            Response object\n        \"\"\"\n        from urllib.parse import urlparse\n        url_data = self.__build_url(endpoint, project=project)\n        url_parse = urlparse(self._base_url)\n        url_data['url'] = f\"\"\"{\n            url_parse.scheme}://{url_parse.netloc}/static-content{endpoint}\"\"\"\n        response = self._http_client.get(**url_data)\n\n        if response.status_code != Client.codes.OK and raise_for_status:\n            self.__raise_for_status(response)\n\n        return response\n\n    def _handshake(self):\n        \"\"\"Client handshake.\n\n        It is used to determine is the client can connect with its\n        current authorization token.\n        \"\"\"\n        response = self.get('/profiles', params={}, raise_for_status=False)\n        if response.status_code == Client.codes.FOUND:\n            parser = LinkExtractor()\n            parser.feed(response.text)\n            raise ClientHandshakeError(auth_link=parser.link)\n\n    def _get_default_project(self, token: str):\n        response = self.get('/profiles/me', params={}, cookies={'access_token': token})\n        data: Dict = response.json()\n        return data['myWorkspace']\n\n    def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,\n                    json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n                    cookies: Optional[Dict] = None) -> Dict:\n        \"\"\"Build a request for the backend.\n\n        Args:\n            endpoint (str): backend endpoint\n            params (Optional[dict]): URL parameters\n            data (Optional[Project]): (optional) multipart form data\n            json (Optional[dict]): (optional) json data\n            files (Optional[dict]): (optional) files to be sent\n            cookies (Optional[dict]): (optional) cookies data\n\n        Returns:\n            dictionary containing the information 
to perform a request\n        \"\"\"\n        _params = params if params is not None else {\n            'ns': project or self._default_project\n        }\n\n        url_data = {\n            'url': f\"\"\"{self._base_url}/{endpoint.removeprefix(\"/\")}\"\"\",\n            'headers': self._headers,\n            'params': _params,\n        }\n\n        if data is not None:\n            url_data['data'] = data\n\n        if json is not None:\n            url_data['json'] = json\n\n        if files is not None:\n            url_data['files'] = files\n\n        if cookies is not None:\n            url_data['cookies'] = cookies\n\n        return url_data\n\n    def __set_global(self) -> None:\n        \"\"\"Sets a client instance as global.\"\"\"\n        # If the client is stateful, close it gracefully!\n        Client.GLOBAL_CLIENT = self\n\n    def __raise_for_status(self, response: Response) -> None:\n        \"\"\"Raise an exception if the response is not OK.\n\n        When an exception is raised, we try to convert it to a ResponseError which is\n        a wrapper around a backend error. This usually gives enough context and provides\n        nice error message.\n\n        If it cannot be converted to ResponseError, it is re-raised.\n\n        Args:\n            response (Response): response to analyze\n        \"\"\"\n        try:\n            response.raise_for_status()\n        except HTTPStatusError as e:\n            with suppress(Exception):\n                e = ResponseError(**response.json())\n            raise e\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__build_url","title":"__build_url(endpoint, params=None, data=None, json=None, project=None, files=None, cookies=None)","text":"

    Build a request for the backend.

    Parameters:

    Name Type Description Default endpoint str

    backend endpoint

    required params Optional[dict]

    URL parameters

    None data Optional[Project]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None cookies Optional[dict]

    (optional) cookies data

    None

    Returns:

    Type Description Dict

    dictionary containing the information to perform a request

    Source code in ydata/sdk/common/client/client.py
    def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,\n                json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n                cookies: Optional[Dict] = None) -> Dict:\n    \"\"\"Build a request for the backend.\n\n    Args:\n        endpoint (str): backend endpoint\n        params (Optional[dict]): URL parameters\n        data (Optional[Project]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        cookies (Optional[dict]): (optional) cookies data\n\n    Returns:\n        dictionary containing the information to perform a request\n    \"\"\"\n    _params = params if params is not None else {\n        'ns': project or self._default_project\n    }\n\n    url_data = {\n        'url': f\"\"\"{self._base_url}/{endpoint.removeprefix(\"/\")}\"\"\",\n        'headers': self._headers,\n        'params': _params,\n    }\n\n    if data is not None:\n        url_data['data'] = data\n\n    if json is not None:\n        url_data['json'] = json\n\n    if files is not None:\n        url_data['files'] = files\n\n    if cookies is not None:\n        url_data['cookies'] = cookies\n\n    return url_data\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__raise_for_status","title":"__raise_for_status(response)","text":"

    Raise an exception if the response is not OK.

    When an exception is raised, we try to convert it to a ResponseError, which is a wrapper around a backend error. This usually gives enough context and provides a nice error message.

    If it cannot be converted to ResponseError, it is re-raised.

    Parameters:

    Name Type Description Default response Response

    response to analyze

    required Source code in ydata/sdk/common/client/client.py
    def __raise_for_status(self, response: Response) -> None:\n    \"\"\"Raise an exception if the response is not OK.\n\n    When an exception is raised, we try to convert it to a ResponseError which is\n    a wrapper around a backend error. This usually gives enough context and provides\n    nice error message.\n\n    If it cannot be converted to ResponseError, it is re-raised.\n\n    Args:\n        response (Response): response to analyze\n    \"\"\"\n    try:\n        response.raise_for_status()\n    except HTTPStatusError as e:\n        with suppress(Exception):\n            e = ResponseError(**response.json())\n        raise e\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.__set_global","title":"__set_global()","text":"

    Sets a client instance as global.

    Source code in ydata/sdk/common/client/client.py
    def __set_global(self) -> None:\n    \"\"\"Sets a client instance as global.\"\"\"\n    # If the client is stateful, close it gracefully!\n    Client.GLOBAL_CLIENT = self\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.get","title":"get(endpoint, params=None, project=None, cookies=None, raise_for_status=True)","text":"

    GET request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    GET endpoint

    required cookies Optional[dict]

    (optional) cookies data

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def get(\n    self, endpoint: str, params: Optional[Dict] = None, project: Optional[Project] = None,\n    cookies: Optional[Dict] = None, raise_for_status: bool = True\n) -> Response:\n    \"\"\"GET request to the backend.\n\n    Args:\n        endpoint (str): GET endpoint\n        cookies (Optional[dict]): (optional) cookies data\n        raise_for_status (bool): raise an exception on error\n\n    Returns:\n        Response object\n    \"\"\"\n    url_data = self.__build_url(endpoint, params=params,\n                                cookies=cookies, project=project)\n    response = self._http_client.get(**url_data)\n\n    if response.status_code != Client.codes.OK and raise_for_status:\n        self.__raise_for_status(response)\n\n    return response\n
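    
    For illustration, the sketch below issues the same GET call that Connector.list uses internally. It assumes a client obtained through get_client and a valid YDATA_TOKEN in the environment; the endpoint value is taken from the connector listing shown later in this reference.
    
    from ydata.sdk.client import get_client
    
    client = get_client()
    
    # Same endpoint used by Connector.list under the hood; returns a Response object
    response = client.get('/connector')
    print(response.json())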
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.get_static_file","title":"get_static_file(endpoint, project=None, raise_for_status=True)","text":"

    Retrieve a static file from the backend.

    Parameters:

    Name Type Description Default endpoint str

    GET endpoint

    required raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def get_static_file(\n    self, endpoint: str, project: Optional[Project] = None, raise_for_status: bool = True\n) -> Response:\n    \"\"\"Retrieve a static file from the backend.\n\n    Args:\n        endpoint (str): GET endpoint\n        raise_for_status (bool): raise an exception on error\n\n    Returns:\n        Response object\n    \"\"\"\n    from urllib.parse import urlparse\n    url_data = self.__build_url(endpoint, project=project)\n    url_parse = urlparse(self._base_url)\n    url_data['url'] = f\"\"\"{\n        url_parse.scheme}://{url_parse.netloc}/static-content{endpoint}\"\"\"\n    response = self._http_client.get(**url_data)\n\n    if response.status_code != Client.codes.OK and raise_for_status:\n        self.__raise_for_status(response)\n\n    return response\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.patch","title":"patch(endpoint, content=None, data=None, json=None, project=None, files=None, raise_for_status=True)","text":"

    PATCH request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    POST endpoint

    required data Optional[dict]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def patch(\n    self, endpoint: str, content: Optional[RequestContent] = None, data: Optional[Dict] = None,\n    json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n    raise_for_status: bool = True\n) -> Response:\n    \"\"\"PATCH request to the backend.\n\n    Args:\n        endpoint (str): POST endpoint\n        content (Optional[RequestContent])\n        data (Optional[dict]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        raise_for_status (bool): raise an exception on error\n\n    Returns:\n        Response object\n    \"\"\"\n    url_data = self.__build_url(\n        endpoint, data=data, json=json, files=files, project=project)\n    response = self._http_client.patch(**url_data, content=content)\n\n    if response.status_code != Client.codes.OK and raise_for_status:\n        self.__raise_for_status(response)\n\n    return response\n
    "},{"location":"sdk/reference/api/common/client/#ydata.sdk.common.client.client.Client.post","title":"post(endpoint, content=None, data=None, json=None, project=None, files=None, raise_for_status=True)","text":"

    POST request to the backend.

    Parameters:

    Name Type Description Default endpoint str

    POST endpoint

    required data Optional[dict]

    (optional) multipart form data

    None json Optional[dict]

    (optional) json data

    None files Optional[dict]

    (optional) files to be sent

    None raise_for_status bool

    raise an exception on error

    True

    Returns:

    Type Description Response

    Response object

    Source code in ydata/sdk/common/client/client.py
    def post(\n    self, endpoint: str, content: Optional[RequestContent] = None, data: Optional[Dict] = None,\n    json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,\n    raise_for_status: bool = True\n) -> Response:\n    \"\"\"POST request to the backend.\n\n    Args:\n        endpoint (str): POST endpoint\n        content (Optional[RequestContent])\n        data (Optional[dict]): (optional) multipart form data\n        json (Optional[dict]): (optional) json data\n        files (Optional[dict]): (optional) files to be sent\n        raise_for_status (bool): raise an exception on error\n\n    Returns:\n        Response object\n    \"\"\"\n    url_data = self.__build_url(\n        endpoint, data=data, json=json, files=files, project=project)\n    response = self._http_client.post(**url_data)\n\n    if response.status_code != Client.codes.OK and raise_for_status:\n        self.__raise_for_status(response)\n\n    return response\n
    "},{"location":"sdk/reference/api/common/types/","title":"Types","text":""},{"location":"sdk/reference/api/connectors/connector/","title":"Connector","text":"

    Bases: ModelFactoryMixin

    A Connector allows you to connect to and access data stored in various places. The list of available connectors can be found here.

    Parameters:

    Name Type Description Default connector_type Union[ConnectorType, str]

    Type of the connector to be created

    None credentials dict

    Connector credentials

    None name Optional[str]

    (optional) Connector name

    None project Optional[Project]

    (optional) Project name for this Connector

    None client Client

    (optional) Client to connect to the backend

    None

    Attributes:

    Name Type Description uid UID

    UID of the connector instance (created internally)

    type ConnectorType

    Type of the connector

    Source code in ydata/sdk/connectors/connector.py
    class Connector(ModelFactoryMixin):\n    \"\"\"A [`Connector`][ydata.sdk.connectors.Connector] allows to connect and\n    access data stored in various places. The list of available connectors can\n    be found [here][ydata.sdk.connectors.ConnectorType].\n\n    Arguments:\n        connector_type (Union[ConnectorType, str]): Type of the connector to be created\n        credentials (dict): Connector credentials\n        name (Optional[str]): (optional) Connector name\n        project (Optional[Project]): (optional) Project name for this Connector\n        client (Client): (optional) Client to connect to the backend\n\n    Attributes:\n        uid (UID): UID fo the connector instance (creating internally)\n        type (ConnectorType): Type of the connector\n    \"\"\"\n\n    _MODEL_CLASS = mConnector\n\n    _model: Optional[mConnector]\n\n    def __init__(\n            self, connector_type: Union[ConnectorType, str, None] = None, credentials: Optional[Dict] = None,\n            name: Optional[str] = None, project: Optional[Project] = None, client: Optional[Client] = None):\n        self._init_common(client=client)\n        self._model = _connector_type_to_model(ConnectorType._init_connector_type(connector_type))._create_model(\n            connector_type, credentials, name, client=client)\n\n        self._project = project\n\n    @init_client\n    def _init_common(self, client: Optional[Client] = None):\n        self._client = client\n        self._logger = create_logger(__name__, level=LOG_LEVEL)\n\n    @property\n    def uid(self) -> UID:\n        return self._model.uid\n\n    @property\n    def name(self) -> str:\n        return self._model.name\n\n    @property\n    def type(self) -> ConnectorType:\n        return ConnectorType(self._model.type)\n\n    @property\n    def project(self) -> Project:\n        return self._project or self._client.project\n\n    @staticmethod\n    @init_client\n    def get(\n        uid: UID, project: Optional[Project] = None, client: Optional[Client] = None\n    ) -> _T:\n        \"\"\"Get an existing connector.\n\n        Arguments:\n            uid (UID): Connector identifier\n            project (Optional[Project]): (optional) Project name from where to get the connector\n            client (Optional[Client]): (optional) Client to connect to the backend\n\n        Returns:\n            Connector\n        \"\"\"\n        response = client.get(f'/connector/{uid}', project=project)\n        data = response.json()\n        data_type = data[\"type\"]\n        connector_class = _connector_type_to_model(\n            ConnectorType._init_connector_type(data_type))\n        connector = connector_class._init_from_model_data(\n            connector_class._MODEL_CLASS(**data))\n        connector._project = project\n\n        return connector\n\n    @staticmethod\n    def _init_credentials(\n        connector_type: ConnectorType, credentials: Union[str, Path, Dict, Credentials]\n    ) -> Credentials:\n        _credentials = None\n\n        if isinstance(credentials, str):\n            credentials = Path(credentials)\n\n        if isinstance(credentials, Path):\n            try:\n                _credentials = json_loads(credentials.open().read())\n            except Exception:\n                raise CredentialTypeError(\n                    'Could not read the credentials. 
Please, check your path or credentials structure.')\n\n        try:\n            from ydata.sdk.connectors._models.connector_map import TYPE_TO_CLASS\n            credential_cls = TYPE_TO_CLASS.get(connector_type.value)\n            _credentials = credential_cls(**_credentials)\n        except Exception:\n            raise CredentialTypeError(\n                \"Could not create the credentials. Verify the path or the structure your credentials.\")\n\n        return _credentials\n\n    @staticmethod\n    def create(\n        connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials],\n        name: Optional[str] = None, project: Optional[Project] = None, client: Optional[Client] = None\n    ) -> _T:\n        \"\"\"Create a new connector.\n\n        Arguments:\n            connector_type (Union[ConnectorType, str]): Type of the connector to be created\n            credentials (dict): Connector credentials\n            name (Optional[str]): (optional) Connector name\n            project (Optional[Project]): (optional) Project where to create the connector\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            New connector\n        \"\"\"\n        connector_type = ConnectorType._init_connector_type(connector_type)\n        connector_class = _connector_type_to_model(connector_type)\n\n        payload = {\n            \"type\": connector_type.value,\n            \"credentials\": credentials.dict(by_alias=True)\n        }\n        model = connector_class._create(payload, name, project, client)\n\n        connector = connector_class._init_from_model_data(model)\n        connector._project = project\n        return connector\n\n    @classmethod\n    @init_client\n    def _create(\n        cls, payload: dict, name: Optional[str] = None, project: Optional[Project] = None,\n        client: Optional[Client] = None\n    ) -> _MODEL_CLASS:\n        _name = name if name is not None else str(uuid4())\n        payload[\"name\"] = _name\n        response = client.post('/connector/', project=project, json=payload)\n        data = response.json()\n\n        return cls._MODEL_CLASS(**data)\n\n    @staticmethod\n    @init_client\n    def list(project: Optional[Project] = None, client: Optional[Client] = None) -> ConnectorsList:\n        \"\"\"List the connectors instances.\n\n        Arguments:\n            project (Optional[Project]): (optional) Project name from where to list the connectors\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            List of connectors\n        \"\"\"\n        response = client.get('/connector', project=project)\n        data: list = response.json()\n        return ConnectorsList(data)\n\n    def __repr__(self):\n        return self._model.__repr__()\n
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.create","title":"create(connector_type, credentials, name=None, project=None, client=None) staticmethod","text":"

    Create a new connector.

    Parameters:

    Name Type Description Default connector_type Union[ConnectorType, str]

    Type of the connector to be created

    required credentials dict

    Connector credentials

    required name Optional[str]

    (optional) Connector name

    None project Optional[Project]

    (optional) Project where to create the connector

    None client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description _T

    New connector

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\ndef create(\n    connector_type: Union[ConnectorType, str], credentials: Union[str, Path, Dict, Credentials],\n    name: Optional[str] = None, project: Optional[Project] = None, client: Optional[Client] = None\n) -> _T:\n    \"\"\"Create a new connector.\n\n    Arguments:\n        connector_type (Union[ConnectorType, str]): Type of the connector to be created\n        credentials (dict): Connector credentials\n        name (Optional[str]): (optional) Connector name\n        project (Optional[Project]): (optional) Project where to create the connector\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        New connector\n    \"\"\"\n    connector_type = ConnectorType._init_connector_type(connector_type)\n    connector_class = _connector_type_to_model(connector_type)\n\n    payload = {\n        \"type\": connector_type.value,\n        \"credentials\": credentials.dict(by_alias=True)\n    }\n    model = connector_class._create(payload, name, project, client)\n\n    connector = connector_class._init_from_model_data(model)\n    connector._project = project\n    return connector\n
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.get","title":"get(uid, project=None, client=None) staticmethod","text":"

    Get an existing connector.

    Parameters:

    Name Type Description Default uid UID

    Connector identifier

    required project Optional[Project]

    (optional) Project name from where to get the connector

    None client Optional[Client]

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description _T

    Connector

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\n@init_client\ndef get(\n    uid: UID, project: Optional[Project] = None, client: Optional[Client] = None\n) -> _T:\n    \"\"\"Get an existing connector.\n\n    Arguments:\n        uid (UID): Connector identifier\n        project (Optional[Project]): (optional) Project name from where to get the connector\n        client (Optional[Client]): (optional) Client to connect to the backend\n\n    Returns:\n        Connector\n    \"\"\"\n    response = client.get(f'/connector/{uid}', project=project)\n    data = response.json()\n    data_type = data[\"type\"]\n    connector_class = _connector_type_to_model(\n        ConnectorType._init_connector_type(data_type))\n    connector = connector_class._init_from_model_data(\n        connector_class._MODEL_CLASS(**data))\n    connector._project = project\n\n    return connector\n
    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.connector.Connector.list","title":"list(project=None, client=None) staticmethod","text":"

    List the connector instances.

    Parameters:

    Name Type Description Default project Optional[Project]

    (optional) Project name from where to list the connectors

    None client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description ConnectorsList

    List of connectors

    Source code in ydata/sdk/connectors/connector.py
    @staticmethod\n@init_client\ndef list(project: Optional[Project] = None, client: Optional[Client] = None) -> ConnectorsList:\n    \"\"\"List the connectors instances.\n\n    Arguments:\n        project (Optional[Project]): (optional) Project name from where to list the connectors\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        List of connectors\n    \"\"\"\n    response = client.get('/connector', project=project)\n    data: list = response.json()\n    return ConnectorsList(data)\n
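    
    A minimal usage sketch, assuming the YDATA_TOKEN environment variable is already set:
    
    from ydata.sdk.connectors import Connector
    
    # List the connector instances registered in the default project
    print(Connector.list())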
    "},{"location":"sdk/reference/api/connectors/connector/#connectortype","title":"ConnectorType","text":"

    Bases: str, Enum

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AWS_S3","title":"AWS_S3 = 'aws-s3' class-attribute instance-attribute","text":"

    AWS S3 connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AZURE_BLOB","title":"AZURE_BLOB = 'azure-blob' class-attribute instance-attribute","text":"

    Azure Blob connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.AZURE_SQL","title":"AZURE_SQL = 'azure-sql' class-attribute instance-attribute","text":"

    AzureSQL connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.BIGQUERY","title":"BIGQUERY = 'google-bigquery' class-attribute instance-attribute","text":"

    BigQuery connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.FILE","title":"FILE = 'file' class-attribute instance-attribute","text":"

    File connector (placeholder)

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.GCS","title":"GCS = 'gcs' class-attribute instance-attribute","text":"

    Google Cloud Storage connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.MYSQL","title":"MYSQL = 'mysql' class-attribute instance-attribute","text":"

    MySQL connector

    "},{"location":"sdk/reference/api/connectors/connector/#ydata.sdk.connectors.ConnectorType.SNOWFLAKE","title":"SNOWFLAKE = 'snowflake' class-attribute instance-attribute","text":"

    Snowflake connector

    "},{"location":"sdk/reference/api/datasources/datasource/","title":"DataSource","text":"

    Bases: ModelFactoryMixin

    A DataSource represents a dataset to be used by a Synthesizer as training data.

    Parameters:

    Name Type Description Default connector Connector

    Connector from which the datasource is created

    required datatype Optional[Union[DataSourceType, str]]

    (optional) DataSource type

    TABULAR name Optional[str]

    (optional) DataSource name

    None project Optional[Project]

    (optional) Project name for this datasource

    None wait_for_metadata bool

    If True, wait until the metadata is fully calculated

    True client Client

    (optional) Client to connect to the backend

    None **config

    Datasource specific configuration

    {}

    Attributes:

    Name Type Description uid UID

    UID of the datasource instance

    datatype DataSourceType

    Data source type

    status Status

    Status of the datasource

    metadata Metadata

    Metadata associated to the datasource

    Source code in ydata/sdk/datasources/datasource.py
    class DataSource(ModelFactoryMixin):\n    \"\"\"A [`DataSource`][ydata.sdk.datasources.DataSource] represents a dataset\n    to be used by a Synthesizer as training data.\n\n    Arguments:\n        connector (Connector): Connector from which the datasource is created\n        datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n        name (Optional[str]): (optional) DataSource name\n        project (Optional[Project]): (optional) Project name for this datasource\n        wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n        client (Client): (optional) Client to connect to the backend\n        **config: Datasource specific configuration\n\n    Attributes:\n        uid (UID): UID fo the datasource instance\n        datatype (DataSourceType): Data source type\n        status (Status): Status of the datasource\n        metadata (Metadata): Metadata associated to the datasource\n    \"\"\"\n\n    def __init__(\n        self, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR,\n        name: Optional[str] = None, project: Optional[Project] = None, wait_for_metadata: bool = True,\n        client: Optional[Client] = None, **config\n    ):\n        datasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\n        self._init_common(client=client)\n        self._model: Optional[mDataSource] = self._create_model(\n            connector=connector, datasource_type=datasource_type, datatype=datatype,\n            config=config, name=name, client=self._client)\n\n        if wait_for_metadata:\n            self._model = DataSource._wait_for_metadata(self)._model\n\n        self._project = project\n\n    @init_client\n    def _init_common(self, client: Optional[Client] = None):\n        self._client = client\n        self._logger = create_logger(__name__, level=LOG_LEVEL)\n\n    @property\n    def uid(self) -> UID:\n        return self._model.uid\n\n    @property\n    def datatype(self) -> DataSourceType:\n        return self._model.datatype\n\n    @property\n    def project(self) -> Project:\n        return self._project or self._client.project\n\n    @property\n    def status(self) -> Status:\n        try:\n            self._model = self.get(uid=self._model.uid,\n                                   project=self.project, client=self._client)._model\n            return self._model.status\n        except Exception:  # noqa: PIE786\n            return Status.unknown()\n\n    @property\n    def metadata(self) -> Optional[Metadata]:\n        return self._model.metadata\n\n    @staticmethod\n    @init_client\n    def list(project: Optional[Project] = None, client: Optional[Client] = None) -> DataSourceList:\n        \"\"\"List the  [`DataSource`][ydata.sdk.datasources.DataSource]\n        instances.\n\n        Arguments:\n            project (Optional[Project]): (optional) Project name from where to list the datasources\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            List of datasources\n        \"\"\"\n        def __process_data(data: list) -> list:\n            to_del = ['metadata']\n            for e in data:\n                for k in to_del:\n                    e.pop(k, None)\n            return data\n\n        response = client.get('/datasource', project=project)\n        data: list = response.json()\n        data = __process_data(data)\n\n        return DataSourceList(data)\n\n    @staticmethod\n    @init_client\n    def get(uid: UID, 
project: Optional[Project] = None, client: Optional[Client] = None) -> \"DataSource\":\n        \"\"\"Get an existing [`DataSource`][ydata.sdk.datasources.DataSource].\n\n        Arguments:\n            uid (UID): DataSource identifier\n            project (Optional[Project]): (optional) Project name from where to get the connector\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            DataSource\n        \"\"\"\n        response = client.get(f'/datasource/{uid}', project=project)\n        data: list = response.json()\n        datasource_type = CONNECTOR_TO_DATASOURCE.get(\n            ConnectorType(data['connector']['type']))\n        model = DataSource._model_from_api(data, datasource_type)\n        datasource = DataSource._init_from_model_data(model)\n        datasource._project = project\n        return datasource\n\n    @classmethod\n    def create(\n        cls, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR,\n        name: Optional[str] = None, project: Optional[Project] = None, wait_for_metadata: bool = True,\n        client: Optional[Client] = None, **config\n    ) -> \"DataSource\":\n        \"\"\"Create a new [`DataSource`][ydata.sdk.datasources.DataSource].\n\n        Arguments:\n            connector (Connector): Connector from which the datasource is created\n            datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n            name (Optional[str]): (optional) DataSource name\n            project (Optional[Project]): (optional) Project name for this datasource\n            wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n            client (Client): (optional) Client to connect to the backend\n            **config: Datasource specific configuration\n\n        Returns:\n            DataSource\n        \"\"\"\n        datasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\n        return cls._create(\n            connector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name,\n            project=project, wait_for_metadata=wait_for_metadata, client=client)\n\n    @classmethod\n    def _create(\n        cls, connector: Connector, datasource_type: Type[mDataSource],\n        datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, config: Optional[Dict] = None,\n        name: Optional[str] = None, project: Optional[Project] = None, wait_for_metadata: bool = True,\n        client: Optional[Client] = None\n    ) -> \"DataSource\":\n        model = DataSource._create_model(\n            connector, datasource_type, datatype, config, name, project, client)\n        datasource = DataSource._init_from_model_data(model)\n\n        if wait_for_metadata:\n            datasource._model = DataSource._wait_for_metadata(datasource)._model\n\n        datasource._project = project\n\n        return datasource\n\n    @classmethod\n    @init_client\n    def _create_model(\n        cls, connector: Connector, datasource_type: Type[mDataSource],\n        datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR, config: Optional[Dict] = None,\n        name: Optional[str] = None, project: Optional[Project] = None, client: Optional[Client] = None\n    ) -> mDataSource:\n        _name = name if name is not None else str(uuid4())\n        _config = config if config is not None else {}\n        payload = {\n            \"name\": _name,\n            \"connector\": {\n         
       \"uid\": connector.uid,\n                \"type\": ConnectorType(connector.type).value\n            },\n            \"dataType\": DataSourceType(datatype).value\n        }\n        if connector.type != ConnectorType.FILE:\n            _config = datasource_type(**config).to_payload()\n        payload.update(_config)\n        response = client.post('/datasource/', project=project, json=payload)\n        data: list = response.json()\n        return DataSource._model_from_api(data, datasource_type)\n\n    @staticmethod\n    def _wait_for_metadata(datasource):\n        logger = create_logger(__name__, level=LOG_LEVEL)\n        while State(datasource.status.state) not in [State.AVAILABLE, State.FAILED, State.UNAVAILABLE]:\n            logger.info(f'Calculating metadata [{datasource.status}]')\n            datasource = DataSource.get(uid=datasource.uid, client=datasource._client)\n            sleep(BACKOFF)\n        return datasource\n\n    @staticmethod\n    def _model_from_api(data: Dict, datasource_type: Type[mDataSource]) -> mDataSource:\n        data['datatype'] = data.pop('dataType', None)\n        data = filter_dict(datasource_type, data)\n        model = datasource_type(**data)\n        return model\n\n    def __repr__(self):\n        return self._model.__repr__()\n
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.create","title":"create(connector, datatype=DataSourceType.TABULAR, name=None, project=None, wait_for_metadata=True, client=None, **config) classmethod","text":"

    Create a new DataSource.

    Parameters:

    Name Type Description Default connector Connector

    Connector from which the datasource is created

    required datatype Optional[Union[DataSourceType, str]]

    (optional) DataSource type

    TABULAR name Optional[str]

    (optional) DataSource name

    None project Optional[Project]

    (optional) Project name for this datasource

    None wait_for_metadata bool

    If True, wait until the metadata is fully calculated

    True client Client

    (optional) Client to connect to the backend

    None **config

    Datasource specific configuration

    {}

    Returns:

    Type Description DataSource

    DataSource

    Source code in ydata/sdk/datasources/datasource.py
    @classmethod\ndef create(\n    cls, connector: Connector, datatype: Optional[Union[DataSourceType, str]] = DataSourceType.TABULAR,\n    name: Optional[str] = None, project: Optional[Project] = None, wait_for_metadata: bool = True,\n    client: Optional[Client] = None, **config\n) -> \"DataSource\":\n    \"\"\"Create a new [`DataSource`][ydata.sdk.datasources.DataSource].\n\n    Arguments:\n        connector (Connector): Connector from which the datasource is created\n        datatype (Optional[Union[DataSourceType, str]]): (optional) DataSource type\n        name (Optional[str]): (optional) DataSource name\n        project (Optional[Project]): (optional) Project name for this datasource\n        wait_for_metadata (bool): If `True`, wait until the metadata is fully calculated\n        client (Client): (optional) Client to connect to the backend\n        **config: Datasource specific configuration\n\n    Returns:\n        DataSource\n    \"\"\"\n    datasource_type = CONNECTOR_TO_DATASOURCE.get(connector.type)\n    return cls._create(\n        connector=connector, datasource_type=datasource_type, datatype=datatype, config=config, name=name,\n        project=project, wait_for_metadata=wait_for_metadata, client=client)\n
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.get","title":"get(uid, project=None, client=None) staticmethod","text":"

    Get an existing DataSource.

    Parameters:

    Name Type Description Default uid UID

    DataSource identifier

    required project Optional[Project]

    (optional) Project name from where to get the connector

    None client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description DataSource

    DataSource

    Source code in ydata/sdk/datasources/datasource.py
    @staticmethod\n@init_client\ndef get(uid: UID, project: Optional[Project] = None, client: Optional[Client] = None) -> \"DataSource\":\n    \"\"\"Get an existing [`DataSource`][ydata.sdk.datasources.DataSource].\n\n    Arguments:\n        uid (UID): DataSource identifier\n        project (Optional[Project]): (optional) Project name from where to get the connector\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        DataSource\n    \"\"\"\n    response = client.get(f'/datasource/{uid}', project=project)\n    data: list = response.json()\n    datasource_type = CONNECTOR_TO_DATASOURCE.get(\n        ConnectorType(data['connector']['type']))\n    model = DataSource._model_from_api(data, datasource_type)\n    datasource = DataSource._init_from_model_data(model)\n    datasource._project = project\n    return datasource\n
    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.datasource.DataSource.list","title":"list(project=None, client=None) staticmethod","text":"

    List the DataSource instances.

    Parameters:

    Name Type Description Default project Optional[Project]

    (optional) Project name from where to list the datasources

    None client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description DataSourceList

    List of datasources

    Source code in ydata/sdk/datasources/datasource.py
    @staticmethod\n@init_client\ndef list(project: Optional[Project] = None, client: Optional[Client] = None) -> DataSourceList:\n    \"\"\"List the  [`DataSource`][ydata.sdk.datasources.DataSource]\n    instances.\n\n    Arguments:\n        project (Optional[Project]): (optional) Project name from where to list the datasources\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        List of datasources\n    \"\"\"\n    def __process_data(data: list) -> list:\n        to_del = ['metadata']\n        for e in data:\n            for k in to_del:\n                e.pop(k, None)\n        return data\n\n    response = client.get('/datasource', project=project)\n    data: list = response.json()\n    data = __process_data(data)\n\n    return DataSourceList(data)\n
    "},{"location":"sdk/reference/api/datasources/datasource/#status","title":"Status","text":"

    Bases: BaseModel

    "},{"location":"sdk/reference/api/datasources/datasource/#datasourcetype","title":"DataSourceType","text":"

    Bases: StringEnum

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.MULTITABLE","title":"MULTITABLE = 'multiTable' class-attribute instance-attribute","text":"

    The DataSource is a multi-table RDBMS.

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.TABULAR","title":"TABULAR = 'tabular' class-attribute instance-attribute","text":"

    The DataSource is tabular (i.e. it does not have a temporal dimension).

    "},{"location":"sdk/reference/api/datasources/datasource/#ydata.sdk.datasources.DataSourceType.TIMESERIES","title":"TIMESERIES = 'timeseries' class-attribute instance-attribute","text":"

    The DataSource has a temporal dimension.

    "},{"location":"sdk/reference/api/datasources/metadata/","title":"Metadata","text":"

    Bases: BaseModel

    The Metadata object contains descriptive information about a DataSource.

    Attributes:

    Name Type Description columns List[Column]

    columns information

    "},{"location":"sdk/reference/api/synthesizers/base/","title":"Synthesizer","text":"

    Bases: ABC, ModelFactoryMixin

    Main synthesizer class.

    This class cannot be instantiated directly because of the differences between the RegularSynthesizer, TimeSeriesSynthesizer, and MultiTableSynthesizer sample methods.

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer--methods","title":"Methods","text":"
    • fit: train a synthesizer instance.
    • sample: request synthetic data.
    • status: current status of the synthesizer instance.
    Note

    The synthesizer instance is created in the backend only when the fit method is called.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None Source code in ydata/sdk/synthesizers/synthesizer.py
    @typechecked\nclass BaseSynthesizer(ABC, ModelFactoryMixin):\n    \"\"\"Main synthesizer class.\n\n    This class cannot be directly instanciated because of the specificities between [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer], [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] or [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer] `sample` methods.\n\n    Methods\n    -------\n    - `fit`: train a synthesizer instance.\n    - `sample`: request synthetic data.\n    - `status`: current status of the synthesizer instance.\n\n    Note:\n            The synthesizer instance is created in the backend only when the `fit` method is called.\n\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n    \"\"\"\n\n    def __init__(\n            self, uid: Optional[UID] = None, name: Optional[str] = None,\n            project: Optional[Project] = None, client: Optional[Client] = None):\n        self._init_common(client=client)\n        self._model = mSynthesizer(uid=uid, name=name or str(uuid4()))\n        self._project = project\n\n    @init_client\n    def _init_common(self, client: Optional[Client] = None):\n        self._client = client\n        self._logger = create_logger(__name__, level=LOG_LEVEL)\n\n    @property\n    def project(self) -> Project:\n        return self._project or self._client.project\n\n    def fit(self, X: Union[DataSource, pdDataFrame],\n            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n            datatype: Optional[Union[DataSourceType, str]] = None,\n            sortbykey: Optional[Union[str, List[str]]] = None,\n            entities: Optional[Union[str, List[str]]] = None,\n            generate_cols: Optional[List[str]] = None,\n            exclude_cols: Optional[List[str]] = None,\n            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n            target: Optional[str] = None,\n            anonymize: Optional[dict] = None,\n            condition_on: Optional[List[str]] = None) -> None:\n        \"\"\"Fit the synthesizer.\n\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.\n\n        The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].\n\n        By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.\n        The argument `exclude_cols` has precedence over `generate_cols`, i.e. 
a column `col` will not be generated if it is in both list.\n\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]\n            sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Target for the dataset\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\n        if self._already_fitted():\n            raise AlreadyFittedError()\n\n        datatype = DataSourceType(datatype)\n\n        dataset_attrs = self._init_datasource_attributes(\n            sortbykey, entities, generate_cols, exclude_cols, dtypes)\n        self._validate_datasource_attributes(X, dataset_attrs, datatype, target)\n\n        # If the training data is a pandas dataframe, we first need to create a data source and then the instance\n        if isinstance(X, pdDataFrame):\n            if X.empty:\n                raise EmptyDataError(\"The DataFrame is empty\")\n            self._logger.info('creating local connector with pandas dataframe')\n            connector = LocalConnector.create(\n                source=X, project=self._project, client=self._client)\n            self._logger.info(\n                f'created local connector. 
creating datasource with {connector}')\n            _X = LocalDataSource(connector=connector, project=self._project,\n                                 datatype=datatype, client=self._client)\n            self._logger.info(f'created datasource {_X}')\n        else:\n            _X = X\n\n        if dsState(_X.status.state) != dsState.AVAILABLE:\n            raise DataSourceNotAvailableError(\n                f\"The datasource '{_X.uid}' is not available (status = {_X.status})\")\n\n        if isinstance(dataset_attrs, dict):\n            dataset_attrs = DataSourceAttrs(**dataset_attrs)\n\n        self._fit_from_datasource(\n            X=_X, datatype=datatype, dataset_attrs=dataset_attrs, target=target,\n            anonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)\n\n    @staticmethod\n    def _init_datasource_attributes(\n            sortbykey: Optional[Union[str, List[str]]],\n            entities: Optional[Union[str, List[str]]],\n            generate_cols: Optional[List[str]],\n            exclude_cols: Optional[List[str]],\n            dtypes: Optional[Dict[str, Union[str, DataType]]]) -> DataSourceAttrs:\n        dataset_attrs = {\n            'sortbykey': sortbykey if sortbykey is not None else [],\n            'entities': entities if entities is not None else [],\n            'generate_cols': generate_cols if generate_cols is not None else [],\n            'exclude_cols': exclude_cols if exclude_cols is not None else [],\n            'dtypes': {k: DataType(v) for k, v in dtypes.items()} if dtypes is not None else {}\n        }\n        return DataSourceAttrs(**dataset_attrs)\n\n    @staticmethod\n    def _validate_datasource_attributes(X: Union[DataSource, pdDataFrame], dataset_attrs: DataSourceAttrs, datatype: DataSourceType, target: Optional[str]):\n        columns = []\n        if isinstance(X, pdDataFrame):\n            columns = X.columns\n            if datatype is None:\n                raise DataTypeMissingError(\n                    \"Argument `datatype` is mandatory for pandas.DataFrame training data\")\n        else:\n            columns = [c.name for c in X.metadata.columns]\n\n        if target is not None and target not in columns:\n            raise DataSourceAttrsError(\n                \"Invalid target: column '{target}' does not exist\")\n\n        if datatype == DataSourceType.TIMESERIES:\n            if not dataset_attrs.sortbykey:\n                raise DataSourceAttrsError(\n                    \"The argument `sortbykey` is mandatory for timeseries datasource.\")\n\n        invalid_fields = {}\n        for field, v in dataset_attrs.dict().items():\n            field_columns = v if field != 'dtypes' else v.keys()\n            not_in_cols = [c for c in field_columns if c not in columns]\n            if len(not_in_cols) > 0:\n                invalid_fields[field] = not_in_cols\n\n        if len(invalid_fields) > 0:\n            error_msgs = [\"\\t- Field '{}': columns {} do not exist\".format(\n                f, ', '.join(v)) for f, v in invalid_fields.items()]\n            raise DataSourceAttrsError(\n                \"The dataset attributes are invalid:\\n {}\".format('\\n'.join(error_msgs)))\n\n    @staticmethod\n    def _metadata_to_payload(\n        datatype: DataSourceType, ds_metadata: Metadata,\n        dataset_attrs: Optional[DataSourceAttrs] = None, target: Optional[str] = None\n    ) -> dict:\n        \"\"\"Transform a the metadata and dataset attributes into a valid\n        payload.\n\n        Arguments:\n            
datatype (DataSourceType): datasource type\n            ds_metadata (Metadata): datasource metadata object\n            dataset_attrs ( Optional[DataSourceAttrs] ): (optional) Dataset attributes\n            target (Optional[str]): (optional) target column name\n\n        Returns:\n            metadata payload dictionary\n        \"\"\"\n\n        columns = [\n            {\n                'name': c.name,\n                'generation': True and c.name not in dataset_attrs.exclude_cols,\n                'dataType': DataType(dataset_attrs.dtypes[c.name]).value if c.name in dataset_attrs.dtypes else c.datatype,\n                'varType': c.vartype,\n            }\n            for c in ds_metadata.columns]\n\n        metadata = {\n            'columns': columns,\n            'target': target\n        }\n\n        if dataset_attrs is not None:\n            if datatype == DataSourceType.TIMESERIES:\n                metadata['sortBy'] = [c for c in dataset_attrs.sortbykey]\n                metadata['entity'] = [c for c in dataset_attrs.entities]\n\n        return metadata\n\n    def _fit_from_datasource(\n        self,\n        X: DataSource,\n        datatype: DataSourceType,\n        privacy_level: Optional[PrivacyLevel] = None,\n        dataset_attrs: Optional[DataSourceAttrs] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None\n    ) -> None:\n        payload = self._create_payload()\n\n        payload['dataSourceUID'] = X.uid\n\n        if privacy_level:\n            payload['privacyLevel'] = privacy_level.value\n\n        if X.metadata is not None:\n            payload['metadata'] = self._metadata_to_payload(\n                datatype, X.metadata, dataset_attrs, target)\n\n        payload['type'] = str(datatype.value)\n\n        if anonymize is not None:\n            # process and validated the anonymization config shared by the end user\n            anonymize = build_and_validate_anonimization(\n                anonimyze=anonymize, cols=[col.name for col in X.metadata.columns])\n            payload[\"extraData\"][\"anonymize\"] = anonymize\n        if condition_on is not None:\n            payload[\"extraData\"][\"condition_on\"] = condition_on\n\n        response = self._client.post(\n            '/synthesizer/', json=payload, project=self._project)\n        data = response.json()\n        self._model = mSynthesizer(**data)\n        while self._check_fitting_not_finished(self.status):\n            self._logger.info('Training the synthesizer...')\n            sleep(BACKOFF)\n\n    def _create_payload(self) -> dict:\n        payload = {\n            'extraData': {}\n        }\n\n        if self._model and self._model.name:\n            payload['name'] = self._model.name\n\n        return payload\n\n    def _check_fitting_not_finished(self, status: Status) -> bool:\n        self._logger.debug(f'checking status {status}')\n\n        if Status.State(status.state) in [Status.State.READY, Status.State.REPORT]:\n            return False\n\n        self._logger.debug(f'status not ready yet {status.state}')\n\n        if status.prepare and PrepareState(status.prepare.state) == PrepareState.FAILED:\n            raise FittingError('Could not train the synthesizer')\n\n        if status.training and TrainingState(status.training.state) == TrainingState.FAILED:\n            raise FittingError('Could not train the synthesizer')\n\n        return True\n\n    @abstractmethod\n    def sample(self) -> pdDataFrame:\n    
    \"\"\"Abstract method to sample from a synthesizer.\"\"\"\n\n    def _sample(self, payload: Dict) -> pdDataFrame:\n        \"\"\"Sample from a synthesizer.\n\n        Arguments:\n            payload (dict): payload configuring the sample request\n\n        Returns:\n            pandas `DataFrame`\n        \"\"\"\n        response = self._client.post(\n            f\"/synthesizer/{self.uid}/sample\", json=payload, project=self._project)\n\n        data: Dict = response.json()\n        sample_uid = data.get('uid')\n        sample_status = None\n        while sample_status not in ['finished', 'failed']:\n            self._logger.info('Sampling from the synthesizer...')\n            response = self._client.get(\n                f'/synthesizer/{self.uid}/history', project=self._project)\n            history: Dict = response.json()\n            sample_data = next((s for s in history if s.get('uid') == sample_uid), None)\n            sample_status = sample_data.get('status', {}).get('state')\n            sleep(BACKOFF)\n\n        response = self._client.get_static_file(\n            f'/synthesizer/{self.uid}/sample/{sample_uid}/sample.csv', project=self._project)\n        data = StringIO(response.content.decode())\n        return read_csv(data)\n\n    @property\n    def uid(self) -> UID:\n        \"\"\"Get the status of a synthesizer instance.\n\n        Returns:\n            Synthesizer status\n        \"\"\"\n        if not self._is_initialized():\n            return Status.State.NOT_INITIALIZED\n\n        return self._model.uid\n\n    @property\n    def status(self) -> Status:\n        \"\"\"Get the status of a synthesizer instance.\n\n        Returns:\n            Synthesizer status\n        \"\"\"\n        if not self._is_initialized():\n            return Status.not_initialized()\n\n        try:\n            self = self.get()\n            return self._model.status\n        except Exception:  # noqa: PIE786\n            return Status.unknown()\n\n    def get(self):\n        assert self._is_initialized() and self._model.uid, InputError(\n            \"Please provide the synthesizer `uid`\")\n\n        response = self._client.get(f'/synthesizer/{self.uid}', project=self._project)\n        data = response.json()\n        self._model = mSynthesizer(**data)\n\n        return self\n\n    @staticmethod\n    @init_client\n    def list(client: Optional[Client] = None) -> SynthesizersList:\n        \"\"\"List the synthesizer instances.\n\n        Arguments:\n            client (Client): (optional) Client to connect to the backend\n\n        Returns:\n            List of synthesizers\n        \"\"\"\n        def __process_data(data: list) -> list:\n            to_del = ['metadata', 'report', 'mode']\n            for e in data:\n                for k in to_del:\n                    e.pop(k, None)\n            return data\n\n        response = client.get('/synthesizer')\n        data: list = response.json()\n        data = __process_data(data)\n\n        return SynthesizersList(data)\n\n    def _is_initialized(self) -> bool:\n        \"\"\"Determine if a synthesizer is instanciated or not.\n\n        Returns:\n            True if the synthesizer is instanciated\n        \"\"\"\n        return self._model is not None\n\n    def _already_fitted(self) -> bool:\n        \"\"\"Determine if a synthesizer is already fitted.\n\n        Returns:\n            True if the synthesizer is instanciated\n        \"\"\"\n\n        return self._is_initialized() and \\\n            (self._model.status is not None\n    
         and self._model.status.training is not None\n             and self._model.status.training.state is not [TrainingState.PREPARING])\n\n    @staticmethod\n    def _resolve_api_status(api_status: Dict) -> Status:\n        \"\"\"Determine the status of the Synthesizer.\n\n        The status of the synthesizer instance is determined by the state of\n        its different components.\n\n        Arguments:\n            api_status (dict): json from the endpoint GET /synthesizer\n\n        Returns:\n            Synthesizer Status\n        \"\"\"\n        status = Status(api_status.get('state', Status.UNKNOWN.name))\n        if status == Status.PREPARE:\n            if PrepareState(api_status.get('prepare', {}).get(\n                    'state', PrepareState.UNKNOWN.name)) == PrepareState.FAILED:\n                return Status.FAILED\n        elif status == Status.TRAIN:\n            if TrainingState(api_status.get('training', {}).get(\n                    'state', TrainingState.UNKNOWN.name)) == TrainingState.FAILED:\n                return Status.FAILED\n        elif status == Status.REPORT:\n            return Status.READY\n        return status\n
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.status","title":"status: Status property","text":"

    Get the status of a synthesizer instance.

    Returns:

    Type Description Status

    Synthesizer status

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.uid","title":"uid: UID property","text":"

    Get the uid of a synthesizer instance.

    Returns:

    Type Description UID

    Synthesizer uid

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, datatype=None, sortbykey=None, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource. When the training dataset is a pandas DataFrame, the argument datatype is required as it cannot be deduced.

    The argument sortbykey is mandatory for TimeSeries.

    By default, if generate_cols or exclude_cols are not specified, all columns are generated by the synthesizer. The argument exclude_cols has precedence over generate_cols, i.e. a column col will not be generated if it is in both lists.

    Parameters:

    Name Type Description Default X Union[DataSource, DataFrame]

    Training dataset

    required privacy_level PrivacyLevel

    Synthesizer privacy level (defaults to high fidelity)

    HIGH_FIDELITY datatype Optional[Union[DataSourceType, str]]

    (optional) Dataset datatype - required if X is a pandas.DataFrame

    None sortbykey Union[str, List[str]]

    (optional) column(s) to use to sort timeseries datasets

    None entities Union[str, List[str]]

    (optional) columns representing entity IDs

    None generate_cols List[str]

    (optional) columns that should be synthesized

    None exclude_cols List[str]

    (optional) columns that should not be synthesized

    None dtypes Dict[str, Union[str, DataType]]

    (optional) datatype mapping that will overwrite the datasource metadata column datatypes

    None target Optional[str]

    (optional) Target for the dataset

    None name Optional[str]

    (optional) Synthesizer instance name

    required anonymize Optional[str]

    (optional) fields to anonymize and the anonymization strategy

    None condition_on Optional[List[str]]

    (optional) list of features to condition upon

    None Source code in ydata/sdk/synthesizers/synthesizer.py
    def fit(self, X: Union[DataSource, pdDataFrame],\n        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n        datatype: Optional[Union[DataSourceType, str]] = None,\n        sortbykey: Optional[Union[str, List[str]]] = None,\n        entities: Optional[Union[str, List[str]]] = None,\n        generate_cols: Optional[List[str]] = None,\n        exclude_cols: Optional[List[str]] = None,\n        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None) -> None:\n    \"\"\"Fit the synthesizer.\n\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    When the training dataset is a pandas [`DataFrame`][pandas.DataFrame], the argument `datatype` is required as it cannot be deduced.\n\n    The argument`sortbykey` is mandatory for [`TimeSeries`][ydata.sdk.datasources.DataSourceType.TIMESERIES].\n\n    By default, if `generate_cols` or `exclude_cols` are not specified, all columns are generated by the synthesizer.\n    The argument `exclude_cols` has precedence over `generate_cols`, i.e. a column `col` will not be generated if it is in both list.\n\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        datatype (Optional[Union[DataSourceType, str]]): (optional) Dataset datatype - required if `X` is a [`pandas.DataFrame`][pandas.DataFrame]\n        sortbykey (Union[str, List[str]]): (optional) column(s) to use to sort timeseries datasets\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Target for the dataset\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\n    if self._already_fitted():\n        raise AlreadyFittedError()\n\n    datatype = DataSourceType(datatype)\n\n    dataset_attrs = self._init_datasource_attributes(\n        sortbykey, entities, generate_cols, exclude_cols, dtypes)\n    self._validate_datasource_attributes(X, dataset_attrs, datatype, target)\n\n    # If the training data is a pandas dataframe, we first need to create a data source and then the instance\n    if isinstance(X, pdDataFrame):\n        if X.empty:\n            raise EmptyDataError(\"The DataFrame is empty\")\n        self._logger.info('creating local connector with pandas dataframe')\n        connector = LocalConnector.create(\n            source=X, project=self._project, client=self._client)\n        self._logger.info(\n            f'created local connector. 
creating datasource with {connector}')\n        _X = LocalDataSource(connector=connector, project=self._project,\n                             datatype=datatype, client=self._client)\n        self._logger.info(f'created datasource {_X}')\n    else:\n        _X = X\n\n    if dsState(_X.status.state) != dsState.AVAILABLE:\n        raise DataSourceNotAvailableError(\n            f\"The datasource '{_X.uid}' is not available (status = {_X.status})\")\n\n    if isinstance(dataset_attrs, dict):\n        dataset_attrs = DataSourceAttrs(**dataset_attrs)\n\n    self._fit_from_datasource(\n        X=_X, datatype=datatype, dataset_attrs=dataset_attrs, target=target,\n        anonymize=anonymize, privacy_level=privacy_level, condition_on=condition_on)\n
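    The following is a minimal usage sketch (not part of the generated reference) of the fit workflow described above, using the RegularSynthesizer subclass. It assumes authentication is provided through a YDATA_TOKEN environment variable and uses a toy in-memory pandas DataFrame.

    import os

    import pandas as pd
    from ydata.sdk.synthesizers import RegularSynthesizer

    # Assumed authentication convention: the SDK client reads a token from the environment.
    os.environ.setdefault("YDATA_TOKEN", "<your-token>")

    # Toy training data: with a pandas DataFrame, the SDK creates a local
    # connector and a datasource behind the scenes before training starts.
    train = pd.DataFrame({
        "age": [23, 45, 31, 52],
        "plan": ["basic", "premium", "basic", "premium"],
    })

    synth = RegularSynthesizer(name="docs-example")
    synth.fit(train)      # blocks until the backend finishes (or fails) training
    print(synth.status)   # READY once the model has been trained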
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.list","title":"list(client=None) staticmethod","text":"

    List the synthesizer instances.

    Parameters:

    Name Type Description Default client Client

    (optional) Client to connect to the backend

    None

    Returns:

    Type Description SynthesizersList

    List of synthesizers

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @staticmethod\n@init_client\ndef list(client: Optional[Client] = None) -> SynthesizersList:\n    \"\"\"List the synthesizer instances.\n\n    Arguments:\n        client (Client): (optional) Client to connect to the backend\n\n    Returns:\n        List of synthesizers\n    \"\"\"\n    def __process_data(data: list) -> list:\n        to_del = ['metadata', 'report', 'mode']\n        for e in data:\n            for k in to_del:\n                e.pop(k, None)\n        return data\n\n    response = client.get('/synthesizer')\n    data: list = response.json()\n    data = __process_data(data)\n\n    return SynthesizersList(data)\n
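    A short sketch of listing synthesizers, assuming the same YDATA_TOKEN-based authentication as in the earlier example; list is a static method inherited by every synthesizer class.

    from ydata.sdk.synthesizers import RegularSynthesizer

    # Returns a SynthesizersList with one entry per synthesizer in the project.
    synthesizers = RegularSynthesizer.list()
    print(synthesizers)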
    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.synthesizer.BaseSynthesizer.sample","title":"sample() abstractmethod","text":"

    Abstract method to sample from a synthesizer.

    Source code in ydata/sdk/synthesizers/synthesizer.py
    @abstractmethod\ndef sample(self) -> pdDataFrame:\n    \"\"\"Abstract method to sample from a synthesizer.\"\"\"\n
    "},{"location":"sdk/reference/api/synthesizers/base/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/base/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy
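    As an illustrative sketch of how a privacy level is selected (the column names and data below are made up), the enum value is simply passed to fit:

    import pandas as pd
    from ydata.sdk.synthesizers import PrivacyLevel, RegularSynthesizer

    train = pd.DataFrame({"age": [23, 45, 31, 52], "churn": [0, 1, 0, 1]})

    synth = RegularSynthesizer()
    # Trade some fidelity for stronger privacy guarantees on sensitive data.
    synth.fit(train, privacy_level=PrivacyLevel.HIGH_PRIVACY)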

    "},{"location":"sdk/reference/api/synthesizers/multitable/","title":"MultiTable","text":"

    Bases: BaseSynthesizer

    MultiTable synthesizer class.

    "},{"location":"sdk/reference/api/synthesizers/multitable/#ydata.sdk.synthesizers.multitable.MultiTableSynthesizer--methods","title":"Methods","text":"
    • fit: train a synthesizer instance.
    • sample: request synthetic data.
    • status: current status of the synthesizer instance.
    Note

    The synthesizer instance is created in the backend only when the fit method is called.

    Parameters:

    Name Type Description Default write_connector UID | Connector

    Connector of type RDBMS to be used to write the samples

    required uid UID

    (optional) UID to identify this synthesizer

    None name str

    (optional) Name to be used when creating the synthesizer. Calculated internally if not provided

    None client Client

    (optional) Client to connect to the backend

    None Source code in ydata/sdk/synthesizers/multitable.py
    class MultiTableSynthesizer(BaseSynthesizer):\n    \"\"\"MultiTable synthesizer class.\n\n    Methods\n    -------\n    - `fit`: train a synthesizer instance.\n    - `sample`: request synthetic data.\n    - `status`: current status of the synthesizer instance.\n\n    Note:\n            The synthesizer instance is created in the backend only when the `fit` method is called.\n\n    Arguments:\n        write_connector (UID | Connector): Connector of type RDBMS to be used to write the samples\n        uid (UID): (optional) UID to identify this synthesizer\n        name (str): (optional) Name to be used when creating the synthesizer. Calculated internally if not provided\n        client (Client): (optional) Client to connect to the backend\n    \"\"\"\n\n    def __init__(\n            self, write_connector: Union[Connector, UID], uid: Optional[UID] = None, name: Optional[str] = None,\n            project: Optional[Project] = None, client: Optional[Client] = None):\n\n        super().__init__(uid, name, project, client)\n\n        connector = self._check_or_fetch_connector(write_connector)\n        self.__write_connector = connector.uid\n\n    def fit(self, X: DataSource,\n            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n            datatype: Optional[Union[DataSourceType, str]] = None,\n            sortbykey: Optional[Union[str, List[str]]] = None,\n            entities: Optional[Union[str, List[str]]] = None,\n            generate_cols: Optional[List[str]] = None,\n            exclude_cols: Optional[List[str]] = None,\n            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n            target: Optional[str] = None,\n            anonymize: Optional[dict] = None,\n            condition_on: Optional[List[str]] = None) -> None:\n        \"\"\"Fit the synthesizer.\n\n        The synthesizer accepts as training dataset a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n        Except X, all the other arguments are for now ignored until they are supported.\n\n        Arguments:\n            X (DataSource): DataSource to Train\n        \"\"\"\n\n        self._fit_from_datasource(X, datatype=DataSourceType.MULTITABLE)\n\n    def sample(self, frac: Union[int, float] = 1, write_connector: Optional[Union[Connector, UID]] = None) -> None:\n        \"\"\"Sample from a [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer]\n        instance.\n        The sample is saved in the connector that was provided in the synthesizer initialization\n        or in the\n\n        Arguments:\n            frac (int | float): fraction of the sample to be returned\n        \"\"\"\n\n        assert frac >= 0.1, InputError(\n            \"It is not possible to generate an empty synthetic data schema. Please validate the input provided. \")\n        assert frac <= 5, InputError(\n            \"It is not possible to generate a database that is 5x bigger than the original dataset. 
Please validate the input provided.\")\n\n        payload = {\n            'fraction': frac,\n        }\n\n        if write_connector is not None:\n            connector = self._check_or_fetch_connector(write_connector)\n            payload['writeConnector'] = connector.uid\n\n        response = self._client.post(\n            f\"/synthesizer/{self.uid}/sample\", json=payload, project=self._project)\n\n        data = response.json()\n        sample_uid = data.get('uid')\n        sample_status = None\n        while sample_status not in ['finished', 'failed']:\n            self._logger.info('Sampling from the synthesizer...')\n            response = self._client.get(\n                f'/synthesizer/{self.uid}/history', project=self._project)\n            history = response.json()\n            sample_data = next((s for s in history if s.get('uid') == sample_uid), None)\n            sample_status = sample_data.get('status', {}).get('state')\n            sleep(BACKOFF)\n\n        print(\n            f\"Sample created and saved into connector with ID {self.__write_connector or write_connector}\")\n\n    def _create_payload(self) -> dict:\n        payload = super()._create_payload()\n        payload['writeConnector'] = self.__write_connector\n\n        return payload\n\n    def _check_or_fetch_connector(self, write_connector: Union[Connector, UID]) -> Connector:\n        self._logger.debug(f'Write connector is {write_connector}')\n        if isinstance(write_connector, str):\n            self._logger.debug(f'Write connector is of type `UID` {write_connector}')\n            write_connector = Connector.get(write_connector)\n            self._logger.debug(f'Using fetched connector {write_connector}')\n\n        if write_connector.uid is None:\n            raise InputError(\"Invalid connector provided as input for write\")\n\n        if write_connector.type not in [ConnectorType.AZURE_SQL, ConnectorType.MYSQL, ConnectorType.SNOWFLAKE]:\n            raise ConnectorError(\n                f\"Invalid type `{write_connector.type}` for the provided connector\")\n\n        return write_connector\n
    "},{"location":"sdk/reference/api/synthesizers/multitable/#ydata.sdk.synthesizers.multitable.MultiTableSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, datatype=None, sortbykey=None, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset a YData DataSource. Except for X, all other arguments are currently ignored until they are supported.

    Parameters:

    Name Type Description Default X DataSource

    DataSource to Train

    required Source code in ydata/sdk/synthesizers/multitable.py
    def fit(self, X: DataSource,\n        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n        datatype: Optional[Union[DataSourceType, str]] = None,\n        sortbykey: Optional[Union[str, List[str]]] = None,\n        entities: Optional[Union[str, List[str]]] = None,\n        generate_cols: Optional[List[str]] = None,\n        exclude_cols: Optional[List[str]] = None,\n        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None) -> None:\n    \"\"\"Fit the synthesizer.\n\n    The synthesizer accepts as training dataset a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n    Except X, all the other arguments are for now ignored until they are supported.\n\n    Arguments:\n        X (DataSource): DataSource to Train\n    \"\"\"\n\n    self._fit_from_datasource(X, datatype=DataSourceType.MULTITABLE)\n
    "},{"location":"sdk/reference/api/synthesizers/multitable/#ydata.sdk.synthesizers.multitable.MultiTableSynthesizer.sample","title":"sample(frac=1, write_connector=None)","text":"

    Sample from a MultiTableSynthesizer instance. The sample is saved in the connector that was provided in the synthesizer initialization or in the connector passed through the write_connector argument of this method.

    Parameters:

    Name Type Description Default frac int | float

    fraction of the sample to be returned

    1 Source code in ydata/sdk/synthesizers/multitable.py
    def sample(self, frac: Union[int, float] = 1, write_connector: Optional[Union[Connector, UID]] = None) -> None:\n    \"\"\"Sample from a [`MultiTableSynthesizer`][ydata.sdk.synthesizers.MultiTableSynthesizer]\n    instance.\n    The sample is saved in the connector that was provided in the synthesizer initialization\n    or in the\n\n    Arguments:\n        frac (int | float): fraction of the sample to be returned\n    \"\"\"\n\n    assert frac >= 0.1, InputError(\n        \"It is not possible to generate an empty synthetic data schema. Please validate the input provided. \")\n    assert frac <= 5, InputError(\n        \"It is not possible to generate a database that is 5x bigger than the original dataset. Please validate the input provided.\")\n\n    payload = {\n        'fraction': frac,\n    }\n\n    if write_connector is not None:\n        connector = self._check_or_fetch_connector(write_connector)\n        payload['writeConnector'] = connector.uid\n\n    response = self._client.post(\n        f\"/synthesizer/{self.uid}/sample\", json=payload, project=self._project)\n\n    data = response.json()\n    sample_uid = data.get('uid')\n    sample_status = None\n    while sample_status not in ['finished', 'failed']:\n        self._logger.info('Sampling from the synthesizer...')\n        response = self._client.get(\n            f'/synthesizer/{self.uid}/history', project=self._project)\n        history = response.json()\n        sample_data = next((s for s in history if s.get('uid') == sample_uid), None)\n        sample_status = sample_data.get('status', {}).get('state')\n        sleep(BACKOFF)\n\n    print(\n        f\"Sample created and saved into connector with ID {self.__write_connector or write_connector}\")\n
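    A hedged end-to-end sketch for the multi-table flow: it assumes an RDBMS connector and a multi-table datasource already exist in Fabric and can be fetched by UID with Connector.get and DataSource.get (treat the exact retrieval calls and import paths as assumptions taken from the wider SDK reference); the UIDs below are placeholders.

    from ydata.sdk.connectors import Connector
    from ydata.sdk.datasources import DataSource
    from ydata.sdk.synthesizers import MultiTableSynthesizer

    # Placeholder UIDs for an existing MySQL/Azure SQL/Snowflake connector and
    # a multi-table datasource previously created in Fabric.
    write_connector = Connector.get("<write-connector-uid>")
    database = DataSource.get("<multitable-datasource-uid>")

    synth = MultiTableSynthesizer(write_connector=write_connector)
    synth.fit(database)

    # Generate a database roughly the same size as the original and write it to
    # the connector supplied at initialization (frac must be between 0.1 and 5).
    synth.sample(frac=1)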
    "},{"location":"sdk/reference/api/synthesizers/regular/","title":"Regular","text":"

    Bases: BaseSynthesizer

    Source code in ydata/sdk/synthesizers/regular.py
    class RegularSynthesizer(BaseSynthesizer):\n\n    def sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:\n        \"\"\"Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]\n        instance.\n\n        Arguments:\n            n_samples (int): number of rows in the sample\n            condition_on: (Optional[dict]): (optional) conditional sampling parameters\n\n        Returns:\n            synthetic data\n        \"\"\"\n        if n_samples < 1:\n            raise InputError(\"Parameter 'n_samples' must be greater than 0\")\n\n        payload = {\"numberOfRecords\": n_samples}\n        if condition_on is not None:\n            payload[\"extraData\"] = {\n                \"condition_on\": condition_on\n            }\n        return self._sample(payload=payload)\n\n    def fit(self, X: Union[DataSource, pdDataFrame],\n            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n            entities: Optional[Union[str, List[str]]] = None,\n            generate_cols: Optional[List[str]] = None,\n            exclude_cols: Optional[List[str]] = None,\n            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n            target: Optional[str] = None,\n            anonymize: Optional[dict] = None,\n            condition_on: Optional[List[str]] = None) -> None:\n        \"\"\"Fit the synthesizer.\n\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Target column\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\n        BaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TABULAR, entities=entities,\n                            generate_cols=generate_cols, exclude_cols=exclude_cols, dtypes=dtypes,\n                            target=target, anonymize=anonymize, privacy_level=privacy_level,\n                            condition_on=condition_on)\n\n    def __repr__(self):\n        if self._model is not None:\n            return self._model.__repr__()\n        else:\n            return \"RegularSynthesizer(Not Initialized)\"\n
    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.regular.RegularSynthesizer.fit","title":"fit(X, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource.

    Parameters:

    Name Type Description Default X Union[DataSource, DataFrame]

    Training dataset

    required privacy_level PrivacyLevel

    Synthesizer privacy level (defaults to high fidelity)

    HIGH_FIDELITY entities Union[str, List[str]]

    (optional) columns representing entity IDs

    None generate_cols List[str]

    (optional) columns that should be synthesized

    None exclude_cols List[str]

    (optional) columns that should not be synthesized

    None dtypes Dict[str, Union[str, DataType]]

    (optional) datatype mapping that will overwrite the datasource metadata column datatypes

    None target Optional[str]

    (optional) Target column

    None name Optional[str]

    (optional) Synthesizer instance name

    required anonymize Optional[str]

    (optional) fields to anonymize and the anonymization strategy

    None condition_on Optional[List[str]]

    (optional) list of features to condition upon

    None Source code in ydata/sdk/synthesizers/regular.py
    def fit(self, X: Union[DataSource, pdDataFrame],\n        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n        entities: Optional[Union[str, List[str]]] = None,\n        generate_cols: Optional[List[str]] = None,\n        exclude_cols: Optional[List[str]] = None,\n        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None) -> None:\n    \"\"\"Fit the synthesizer.\n\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Target column\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\n    BaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TABULAR, entities=entities,\n                        generate_cols=generate_cols, exclude_cols=exclude_cols, dtypes=dtypes,\n                        target=target, anonymize=anonymize, privacy_level=privacy_level,\n                        condition_on=condition_on)\n
    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.regular.RegularSynthesizer.sample","title":"sample(n_samples=1, condition_on=None)","text":"

    Sample from a RegularSynthesizer instance.

    Parameters:

    Name Type Description Default n_samples int

    number of rows in the sample

    1 condition_on Optional[dict]

    (optional) conditional sampling parameters

    None

    Returns:

    Type Description DataFrame

    synthetic data

    Source code in ydata/sdk/synthesizers/regular.py
    def sample(self, n_samples: int = 1, condition_on: Optional[dict] = None) -> pdDataFrame:\n    \"\"\"Sample from a [`RegularSynthesizer`][ydata.sdk.synthesizers.RegularSynthesizer]\n    instance.\n\n    Arguments:\n        n_samples (int): number of rows in the sample\n        condition_on: (Optional[dict]): (optional) conditional sampling parameters\n\n    Returns:\n        synthetic data\n    \"\"\"\n    if n_samples < 1:\n        raise InputError(\"Parameter 'n_samples' must be greater than 0\")\n\n    payload = {\"numberOfRecords\": n_samples}\n    if condition_on is not None:\n        payload[\"extraData\"] = {\n            \"condition_on\": condition_on\n        }\n    return self._sample(payload=payload)\n
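    For example, a fitted RegularSynthesizer (see the fit sketch earlier in this page) can be asked for more rows than the training set contains, a common data-augmentation pattern; synth below refers to that previously fitted instance.

    # `synth` is a previously fitted RegularSynthesizer.
    augmented = synth.sample(n_samples=5_000)
    print(augmented.shape)  # a pandas DataFrame with 5000 synthetic rows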
    "},{"location":"sdk/reference/api/synthesizers/regular/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/regular/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy

    "},{"location":"sdk/reference/api/synthesizers/timeseries/","title":"TimeSeries","text":"

    Bases: BaseSynthesizer

    Source code in ydata/sdk/synthesizers/timeseries.py
    class TimeSeriesSynthesizer(BaseSynthesizer):\n\n    def sample(self, n_entities: int, condition_on: Optional[dict] = None) -> pdDataFrame:\n        \"\"\"Sample from a [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] instance.\n\n        If a training dataset was not using any `entity` column, the Synthesizer assumes a single entity.\n        A [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] always sample the full trajectory of its entities.\n\n        Arguments:\n            n_entities (int): number of entities to sample\n            condition_on: (Optional[dict]): (optional) conditional sampling parameters\n\n        Returns:\n            synthetic data\n        \"\"\"\n        if n_entities is not None and n_entities < 1:\n            raise InputError(\"Parameter 'n_entities' must be greater than 0\")\n\n        payload = {\"numberOfRecords\": n_entities}\n        if condition_on is not None:\n            payload[\"extraData\"] = {\n                \"condition_on\": condition_on\n            }\n        return self._sample(payload=payload)\n\n    def fit(self, X: Union[DataSource, pdDataFrame],\n            sortbykey: Optional[Union[str, List[str]]],\n            privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n            entities: Optional[Union[str, List[str]]] = None,\n            generate_cols: Optional[List[str]] = None,\n            exclude_cols: Optional[List[str]] = None,\n            dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n            target: Optional[str] = None,\n            anonymize: Optional[dict] = None,\n            condition_on: Optional[List[str]] = None) -> None:\n        \"\"\"Fit the synthesizer.\n\n        The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n\n        Arguments:\n            X (Union[DataSource, pandas.DataFrame]): Training dataset\n            sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets\n            privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n            entities (Union[str, List[str]]): (optional) columns representing entities ID\n            generate_cols (List[str]): (optional) columns that should be synthesized\n            exclude_cols (List[str]): (optional) columns that should not be synthesized\n            dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n            target (Optional[str]): (optional) Metadata associated to the datasource\n            name (Optional[str]): (optional) Synthesizer instance name\n            anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n            condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n        \"\"\"\n        BaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TIMESERIES, sortbykey=sortbykey,\n                            entities=entities, generate_cols=generate_cols, exclude_cols=exclude_cols,\n                            dtypes=dtypes, target=target, anonymize=anonymize, privacy_level=privacy_level,\n                            condition_on=condition_on)\n\n    def __repr__(self):\n        if self._model is not None:\n            return self._model.__repr__()\n        else:\n            return \"TimeSeriesSynthesizer(Not Initialized)\"\n
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.timeseries.TimeSeriesSynthesizer.fit","title":"fit(X, sortbykey, privacy_level=PrivacyLevel.HIGH_FIDELITY, entities=None, generate_cols=None, exclude_cols=None, dtypes=None, target=None, anonymize=None, condition_on=None)","text":"

    Fit the synthesizer.

    The synthesizer accepts as training dataset either a pandas DataFrame directly or a YData DataSource.

    Parameters:

    Name Type Description Default X Union[DataSource, DataFrame]

    Training dataset

    required sortbykey Union[str, List[str]]

    column(s) to use to sort timeseries datasets

    required privacy_level PrivacyLevel

    Synthesizer privacy level (defaults to high fidelity)

    HIGH_FIDELITY entities Union[str, List[str]]

    (optional) columns representing entity IDs

    None generate_cols List[str]

    (optional) columns that should be synthesized

    None exclude_cols List[str]

    (optional) columns that should not be synthesized

    None dtypes Dict[str, Union[str, DataType]]

    (optional) datatype mapping that will overwrite the datasource metadata column datatypes

    None target Optional[str]

    (optional) Target for the dataset

    None name Optional[str]

    (optional) Synthesizer instance name

    required anonymize Optional[str]

    (optional) fields to anonymize and the anonymization strategy

    None condition_on Optional[List[str]]

    (optional) list of features to condition upon

    None Source code in ydata/sdk/synthesizers/timeseries.py
    def fit(self, X: Union[DataSource, pdDataFrame],\n        sortbykey: Optional[Union[str, List[str]]],\n        privacy_level: PrivacyLevel = PrivacyLevel.HIGH_FIDELITY,\n        entities: Optional[Union[str, List[str]]] = None,\n        generate_cols: Optional[List[str]] = None,\n        exclude_cols: Optional[List[str]] = None,\n        dtypes: Optional[Dict[str, Union[str, DataType]]] = None,\n        target: Optional[str] = None,\n        anonymize: Optional[dict] = None,\n        condition_on: Optional[List[str]] = None) -> None:\n    \"\"\"Fit the synthesizer.\n\n    The synthesizer accepts as training dataset either a pandas [`DataFrame`][pandas.DataFrame] directly or a YData [`DataSource`][ydata.sdk.datasources.DataSource].\n\n    Arguments:\n        X (Union[DataSource, pandas.DataFrame]): Training dataset\n        sortbykey (Union[str, List[str]]): column(s) to use to sort timeseries datasets\n        privacy_level (PrivacyLevel): Synthesizer privacy level (defaults to high fidelity)\n        entities (Union[str, List[str]]): (optional) columns representing entities ID\n        generate_cols (List[str]): (optional) columns that should be synthesized\n        exclude_cols (List[str]): (optional) columns that should not be synthesized\n        dtypes (Dict[str, Union[str, DataType]]): (optional) datatype mapping that will overwrite the datasource metadata column datatypes\n        target (Optional[str]): (optional) Metadata associated to the datasource\n        name (Optional[str]): (optional) Synthesizer instance name\n        anonymize (Optional[str]): (optional) fields to anonymize and the anonymization strategy\n        condition_on: (Optional[List[str]]): (optional) list of features to condition upon\n    \"\"\"\n    BaseSynthesizer.fit(self, X=X, datatype=DataSourceType.TIMESERIES, sortbykey=sortbykey,\n                        entities=entities, generate_cols=generate_cols, exclude_cols=exclude_cols,\n                        dtypes=dtypes, target=target, anonymize=anonymize, privacy_level=privacy_level,\n                        condition_on=condition_on)\n
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.timeseries.TimeSeriesSynthesizer.sample","title":"sample(n_entities, condition_on=None)","text":"

    Sample from a TimeSeriesSynthesizer instance.

    If the training dataset did not use any entity column, the Synthesizer assumes a single entity. A TimeSeriesSynthesizer always samples the full trajectory of its entities.

    Parameters:

    Name Type Description Default n_entities int

    number of entities to sample

    required condition_on Optional[dict]

    (optional) conditional sampling parameters

    None

    Returns:

    Type Description DataFrame

    synthetic data

    Source code in ydata/sdk/synthesizers/timeseries.py
    def sample(self, n_entities: int, condition_on: Optional[dict] = None) -> pdDataFrame:\n    \"\"\"Sample from a [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] instance.\n\n    If a training dataset was not using any `entity` column, the Synthesizer assumes a single entity.\n    A [`TimeSeriesSynthesizer`][ydata.sdk.synthesizers.TimeSeriesSynthesizer] always sample the full trajectory of its entities.\n\n    Arguments:\n        n_entities (int): number of entities to sample\n        condition_on: (Optional[dict]): (optional) conditional sampling parameters\n\n    Returns:\n        synthetic data\n    \"\"\"\n    if n_entities is not None and n_entities < 1:\n        raise InputError(\"Parameter 'n_entities' must be greater than 0\")\n\n    payload = {\"numberOfRecords\": n_entities}\n    if condition_on is not None:\n        payload[\"extraData\"] = {\n            \"condition_on\": condition_on\n        }\n    return self._sample(payload=payload)\n
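    A minimal sketch of the time-series flow (toy data, hypothetical column names): sortbykey orders measurements within each entity, entities identifies the independent trajectories, and sampling returns complete trajectories for the requested number of entities.

    import pandas as pd
    from ydata.sdk.synthesizers import TimeSeriesSynthesizer

    # Toy multi-entity time series.
    train = pd.DataFrame({
        "sensor_id": ["a", "a", "b", "b"],
        "timestamp": pd.date_range("2024-01-01", periods=4, freq="D"),
        "value": [1.0, 1.2, 0.8, 0.9],
    })

    synth = TimeSeriesSynthesizer()
    synth.fit(train, sortbykey="timestamp", entities="sensor_id")

    # Each sampled entity comes back with its full trajectory.
    sample = synth.sample(n_entities=10)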
    "},{"location":"sdk/reference/api/synthesizers/timeseries/#privacylevel","title":"PrivacyLevel","text":"

    Bases: StringEnum

    Privacy level exposed to the end-user.

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.BALANCED_PRIVACY_FIDELITY","title":"BALANCED_PRIVACY_FIDELITY = 'BALANCED_PRIVACY_FIDELITY' class-attribute instance-attribute","text":"

    Balanced privacy/fidelity

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_FIDELITY","title":"HIGH_FIDELITY = 'HIGH_FIDELITY' class-attribute instance-attribute","text":"

    High fidelity

    "},{"location":"sdk/reference/api/synthesizers/timeseries/#ydata.sdk.synthesizers.PrivacyLevel.HIGH_PRIVACY","title":"HIGH_PRIVACY = 'HIGH_PRIVACY' class-attribute instance-attribute","text":"

    High privacy

    "},{"location":"support/help-troubleshooting/","title":"Help & Troubleshooting","text":""},{"location":"synthetic_data/","title":"Synthetic Data generation","text":"

    YData Fabric's Synthetic Data Generation capabilities leverage the latest generative models to create high-quality artificial data that replicates real-world data properties. Whether it is a table, a database, or a text corpus, this powerful capability ensures privacy, enhances data availability, and boosts model performance across various industries. In this section, discover how YData Fabric's synthetic data solutions can transform your data-driven initiatives.

    "},{"location":"synthetic_data/#what-is-synthetic-data","title":"What is Synthetic Data?","text":"

    Synthetic data is artificially generated data that mimics the statistical properties and structure of real-world data without directly copying it. It is created using algorithms and models designed to replicate the characteristics of actual data sets. This process ensures that synthetic data retains the essential patterns and relationships present in the original data, making it a valuable asset for various applications, particularly in situations where using real data might pose privacy, security, or availability concerns. It can be used for:

    • Guaranteeing privacy and compliance when sharing datasets (for quality assurance, product development and other analytics teams)
    • Removing bias by upsampling rare events
    • Balancing datasets
    • Augmenting existing datasets to improve the performance of machine learning models or for use in stress testing
    • Smartly filling in missing values based on context
    • Simulating new scenarios and hypotheses
    "},{"location":"synthetic_data/#the-benefits-of-synthetic-data","title":"The benefits of Synthetic Data","text":"

    Leveraging synthetic data offers numerous benefits:

    • Privacy and Security: Synthetic data eliminates the risk of exposing sensitive information, making it an ideal solution for industries handling sensitive data, such as healthcare, finance, and telecommunications.
    • Data Augmentation: It enables organizations to augment existing data sets, enhancing model training by providing diverse and representative samples, thereby improving model accuracy and robustness.
    • Cost Efficiency: Generating synthetic data can be more cost-effective than collecting and labeling large volumes of real data, particularly for rare events or scenarios that are difficult to capture.
    • Testing and Development: Synthetic data provides a safe environment for testing and developing algorithms, ensuring that models are robust before deployment in real-world scenarios.
    "},{"location":"synthetic_data/#synthetic-data-in-fabric","title":"Synthetic Data in Fabric","text":"

    YData Fabric offers robust support for creating high-quality synthetic data using generative models and/or through bootstrapping. The platform is designed to address the diverse needs of data scientists, engineers, and analysts by providing a comprehensive set of tools and features.

    "},{"location":"synthetic_data/#data-types-supported","title":"Data Types Supported:","text":"

    YData Fabric supports the generation of various data types, including:

    • Tabular Data: Generate synthetic versions of structured data typically found in spreadsheets and databases, with support for categorical, numerical, and mixed data types.
    • Time Series Data: Create synthetic time series data that preserves the temporal dependencies and trends, useful for applications like financial forecasting and sensor data analysis.
    • Multi-Table or Database Synthesis: Synthesize complex databases with multiple interrelated tables, maintaining the relational integrity and dependencies, which is crucial for comprehensive data analysis and testing applications.
    • Text Data: Produce synthetic text data for natural language processing (NLP) tasks, ensuring the generated text maintains the linguistic properties and context of the original data.
    "},{"location":"synthetic_data/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 The 5 Benefits of Synthetic data generation for modern AI
    • \ud83d\udcd6 The role of Synthetic data in Healthcare
    • \ud83d\udcd6 The role of Synthetic data to overcome Bias
    "},{"location":"synthetic_data/best_practices/","title":"Best practices for optimal synthetic data generation","text":""},{"location":"synthetic_data/best_practices/#overview","title":"Overview","text":"

    This document outlines the best practices for generating structured synthetic data, focusing on ensuring data quality, privacy, and utility. Synthetic data generation is a sophisticated process involving the training of generative models to produce artificial datasets that mimic real-world data. This documentation is intended to guide data scientists, engineers, and analysts in configuring and refining the synthetic data generation process, with a focus on avoiding common pitfalls.

    "},{"location":"synthetic_data/best_practices/#1-understanding-the-use-case","title":"1. Understanding the Use Case","text":"

    Before beginning the synthetic data generation process, it is essential to clearly define the use case. The purpose of the synthetic data\u2014whether for training machine learning models, testing algorithms, or validating data pipelines\u2014will influence the structure, scale, and fidelity required.

    Key Considerations:

    Understand and know your data: Deeply understanding the characteristics and behaviors of the original dataset is crucial for configuring the synthetic data generation process to optimize outcomes. This understanding is also essential for validating and assessing the quality of the synthetic data. If your synthetic data fails to represent all classes from the original dataset, it could indicate that the original data lacks sufficient records for those particular behaviors.

    • Data Characteristics: Identify the necessary size, format, and distribution of the data.

    • Privacy Concerns: Determine if there are specific regulations or privacy requirements to be met.

    • Critical Variables: Identify the key variables and relationships that must be preserved in the synthetic data.
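    To make the class-coverage point above concrete, the sketch below compares the categories present in the original and synthetic datasets. It is a minimal, pandas-based illustration; the file names and the 'segment' column are hypothetical placeholders for your own data.

    ```python
    import pandas as pd

    # Hypothetical file names and column; replace with your own dataset.
    real = pd.read_csv('original.csv')
    synthetic = pd.read_csv('synthetic.csv')

    # Compare class coverage for a categorical variable the use case depends on.
    real_classes = set(real['segment'].dropna().unique())
    synth_classes = set(synthetic['segment'].dropna().unique())

    missing = real_classes - synth_classes
    if missing:
        print(f'Classes absent from the synthetic data: {missing}')
        print('The original data may lack sufficient records for these behaviors.')
    else:
        print('All classes from the original data are represented.')
    ```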

    "},{"location":"synthetic_data/best_practices/#2-configuring-the-data-schema-relations","title":"2. Configuring the Data Schema & Relations","text":"

    Setting and configuring a concise, business-aligned dataset schema is crucial for generating high-quality synthetic data. The schema should mirror the structure of the real-world data you aim to emulate, while ensuring the selected PII Types and Data Types are aligned with the use case and applications.

    Key Considerations:

    • Data Types: Make sure to always verify the configured data types. After all, learning a \"Category\" is different from learning the distribution of a Numerical variable.

    • Unique Identifiers: Exclude unique identifiers (e.g., user IDs, transaction IDs) from the data generation process. These identifiers are typically arbitrary and do not carry meaningful information for the generative model to learn. Instead, generate them separately or replace them with randomized values.

    • Documentation: Thoroughly document the schema, including all constraints and relationships, for future reference and reproducibility.

    • Data Constraints: Include constraints such as primary keys, foreign keys, and data types to maintain data integrity. Also, make sure to configure known relationships within and between tables (e.g., x = a + b), as this ensures the model treats the outcome of variable x as a deterministic process; see the sketch below for a minimal illustration.
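    As a rough illustration of these considerations, the sketch below prepares a dataset before synthesis: identifiers are excluded, data types are made explicit, and a deterministic relationship is verified as a rule rather than left for the model to learn. It is a pandas-only sketch with hypothetical column names, not the Fabric configuration itself (which is done through the platform's metadata, PII type, and data type settings).

    ```python
    import pandas as pd

    df = pd.read_csv('transactions.csv')  # hypothetical input

    # Exclude arbitrary unique identifiers from what the model learns;
    # they can be regenerated or randomized after synthesis.
    train_df = df.drop(columns=['transaction_id', 'user_id'])

    # Make the intended data types explicit: a category is learned differently
    # from the distribution of a numerical variable.
    train_df = train_df.astype({'payment_method': 'category', 'amount': 'float64'})
    train_df['transaction_date'] = pd.to_datetime(train_df['transaction_date'])

    # Deterministic relationships (e.g., total = amount + fee) are better enforced
    # as rules than learned by the generative model.
    assert (df['total'] - (df['amount'] + df['fee'])).abs().max() < 1e-6
    ```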

    "},{"location":"synthetic_data/best_practices/#3-avoiding-overfitting-to-the-original-data","title":"3. Avoiding Overfitting to the Original Data","text":"

    To ensure that the synthetic data is useful and generalizable, it is important to avoid overfitting the generative model to the original dataset. YData Fabric's synthetic data generation process leverages the concept of a holdout to avoid overfitting, but the effectiveness of the holdout may vary depending on the dataset's behavior and size. A simple distance-based check for overfitting is sketched at the end of this section.

    Key Considerations:

    • Excessive Fine-Tuning: Avoid overly fine-tuning the generative model on your whole dataset, as this can lead to synthetic data that is too similar to the original, reducing its utility.

    • Ignoring Variability: Ensure that the synthetic data introduces enough variability to cover edge cases and rare events, rather than merely replicating common patterns from the training data.
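    A common way to check for this kind of overfitting, assuming you keep a holdout of real records aside, is to compare distance-to-closest-record (DCR) statistics: if synthetic rows sit much closer to the training data than unseen real rows do, the generator may be memorizing. The sketch below is an assumption-laden illustration using scikit-learn on numeric-only data with hypothetical file names.

    ```python
    import pandas as pd
    from sklearn.neighbors import NearestNeighbors

    # Hypothetical numeric-only feature matrices with identical columns.
    train = pd.read_csv('train.csv')
    holdout = pd.read_csv('holdout.csv')
    synthetic = pd.read_csv('synthetic.csv')

    def dcr(reference: pd.DataFrame, query: pd.DataFrame) -> pd.Series:
        # Distance from each query row to its closest record in the reference set.
        nn = NearestNeighbors(n_neighbors=1).fit(reference.values)
        distances, _ = nn.kneighbors(query.values)
        return pd.Series(distances.ravel())

    synth_to_train = dcr(train, synthetic)
    holdout_to_train = dcr(train, holdout)

    # If synthetic records are systematically closer to the training data than
    # unseen real records are, the generator may be memorizing rather than generalizing.
    print(f'Median DCR synthetic->train: {synth_to_train.median():.4f}')
    print(f'Median DCR holdout->train:   {holdout_to_train.median():.4f}')
    ```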

    "},{"location":"synthetic_data/best_practices/#4-ensuring-data-privacy","title":"4. Ensuring Data Privacy","text":"

    One of the key benefits of synthetic data is the ability to mitigate privacy risks. However, careful attention must be paid to ensure that the synthetic data does not inadvertently reveal sensitive information from the original dataset.

    Key Considerations:

    • Reusing Identifiable Information: Do not include direct identifiers (such as names, addresses, etc.) in the synthetic data.

    Including a real identifier in the synthetic data can not only hinder its quality but also compromise its capacity to remain anonymous.
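    One quick, necessary-but-not-sufficient check is to count synthetic rows that are verbatim copies of original records. The sketch below uses hypothetical file and column names and should complement, not replace, a proper privacy assessment.

    ```python
    import pandas as pd

    real = pd.read_csv('original.csv')
    synthetic = pd.read_csv('synthetic.csv')

    # Compare on quasi-identifying attributes only; direct identifiers should not
    # be present in the synthetic data in the first place.
    columns = [c for c in synthetic.columns if c not in ('user_id', 'email')]

    # Synthetic rows that exactly match an original record on these attributes.
    exact_matches = synthetic.merge(real[columns].drop_duplicates(), on=columns, how='inner')
    rate = len(exact_matches) / len(synthetic)
    print(f'{len(exact_matches)} synthetic rows ({rate:.2%}) exactly match an original record')
    ```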

    "},{"location":"synthetic_data/best_practices/#5-validating-the-synthetic-data","title":"5. Validating the Synthetic Data","text":"

    Validation is a critical step in the synthetic data generation process. The synthetic data must be rigorously tested to ensure that it meets the necessary criteria for its intended use.

    Key Considerations:

    • Skipping Statistical Validation: Do not skip the step of comparing the statistical properties of the synthetic data against the real data. This is essential to ensure that the synthetic data is both realistic and useful.

    • Using a Single Metric: Avoid relying on a single validation metric. Validate the synthetic data across multiple dimensions, such as distribution, correlation, and predictive performance, to get a comprehensive view of its quality.

    YData Fabric's synthetic data generation process offers an extensive, automated synthetic data quality report and a profiling comparison to help with data quality validation.
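    Alongside Fabric's report, a lightweight sketch of this multi-dimensional validation could look like the following, comparing per-column marginal distributions and the overall correlation structure (hypothetical file names, numeric columns only).

    ```python
    import pandas as pd
    from scipy.stats import ks_2samp

    real = pd.read_csv('original.csv')
    synthetic = pd.read_csv('synthetic.csv')

    # 1. Marginal distributions: two-sample KS statistic per numeric column.
    numeric_cols = real.select_dtypes('number').columns
    for col in numeric_cols:
        stat, p_value = ks_2samp(real[col].dropna(), synthetic[col].dropna())
        print(f'{col}: KS statistic={stat:.3f}, p-value={p_value:.3f}')

    # 2. Correlation structure: how far apart are the correlation matrices?
    corr_gap = (real[numeric_cols].corr() - synthetic[numeric_cols].corr()).abs()
    print(f'Max absolute correlation difference: {corr_gap.values.max():.3f}')
    ```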

    "},{"location":"synthetic_data/best_practices/#6-iterating-and-refining-the-process","title":"6. Iterating and Refining the Process","text":"

    Synthetic data generation is inherently iterative. The initial datasets may require refinement to improve their accuracy, utility, or realism.

    Key Considerations:

    • Treating the First Version as Final: The first generated dataset is rarely perfect. Continuous iteration and refinement are key to achieving high-quality synthetic data.

    • Ignoring Feedback: Feedback from domain experts and end-users is invaluable. Do not disregard this input, as it can significantly improve the relevance and utility of the synthetic data.

    "},{"location":"synthetic_data/best_practices/#7-documenting-and-sharing-the-process","title":"7. Documenting and Sharing the Process","text":"

    Thorough documentation is essential for transparency, reproducibility, and collaboration in synthetic data generation.

    Key Considerations:

    • Skipping Documentation: Failing to document the synthetic data generation process can make it difficult to reproduce results or understand the rationale behind certain decisions.

    • Keeping the Process Opaque: Transparency is crucial, especially when synthetic data is used in critical applications. Ensure that all relevant details, including methodologies, parameters, and assumptions, are clearly documented and accessible to stakeholders.

    Before diving into complex applications, ensure you're thoroughly familiar with synthetic data by starting small and gradually increasing complexity. Build your understanding step by step, and only proceed to more advanced use cases once you're confident in the quality and reliability of the synthetic data. Know your data and ensure that your synthetic data matches your expectations fully before leveraging it for downstream applications.

    "},{"location":"synthetic_data/relational_database/","title":"Multi-Table Synthetic data generation","text":"

    Multi-table or database synthetic data generation is a powerful method to create high-quality artificial datasets that mirror the statistical properties and relational structures of original multi-table databases. A multi-table database consists of multiple interrelated tables, often with various data types (dates, categorical, numerical, etc.) and complex relationships between records. Key use cases include privacy-preserving access to full production databases and the creation of realistic test environments. Synthetic data allows organizations to share and analyze full production databases without exposing sensitive information, ensuring compliance with data privacy regulations. It is also invaluable for creating realistic test environments, enabling developers and testers to simulate real-world scenarios, identify potential issues, and validate database applications without risking data breaches. By leveraging synthetic multi-table data, organizations can simulate complex relational data environments, enhance the robustness of database applications, and ensure data privacy, making it a valuable tool for industries that rely on intricate data structures and interdependencies.
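    For a programmatic starting point, the sketch below follows the general ydata-sdk pattern for multi-table synthesis. Treat it as an assumption-laden outline rather than a definitive API reference: the datasource and connector UIDs are placeholders, and parameter names such as write_connector and frac should be confirmed against the SDK reference pages.

    ```python
    import os
    from ydata.sdk.datasources import DataSource
    from ydata.sdk.synthesizers import MultiTableSynthesizer

    os.environ['YDATA_TOKEN'] = '<YOUR_FABRIC_TOKEN>'  # authentication token from Fabric

    # A datasource previously created in Fabric on top of an RDBMS connector.
    datasource = DataSource.get('<DATASOURCE_UID>')

    # The generated database is written back through a connector.
    synth = MultiTableSynthesizer(write_connector='<CONNECTOR_UID>')
    synth.fit(datasource)

    # frac controls the size of the generated database relative to the original.
    synth.sample(frac=1.0)
    ```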

    "},{"location":"synthetic_data/relational_database/#tutorials-recipes","title":"Tutorials & Recipes","text":"

    To get started with Synthetic Data Generation, you can follow our quickstart guide.

    For more tutorials and recipes, follow the link to YData's Academy.

    "},{"location":"synthetic_data/relational_database/#related-materials","title":"Related Materials","text":"
    • How to generate Synthetic Data from a Database
    • How to generate Multi-Table step-by-step
    • How to generate Multi-Table synthetic data in Google Colab
    "},{"location":"synthetic_data/single_table/","title":"Tabular synthetic data generation","text":"

    Tabular synthetic data generation is a powerful method to create high-quality artificial datasets that mirror the statistical properties of original tabular data. A tabular dataset is usually composed of several columns of structured data with mixed data types (dates, categorical, numerical, etc.) and no time dependence between records. The ability to generate synthetic data from this type of dataset is essential for a wide range of applications, from data augmentation to privacy preservation, and is particularly useful in scenarios where obtaining or using real data is challenging.
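    As a programmatic starting point, the sketch below follows the ydata-sdk quickstart pattern for a single-table synthesizer. The token, dataset file, and synthesizer name are placeholders, and method signatures should be confirmed against the SDK reference.

    ```python
    import os
    import pandas as pd
    from ydata.sdk.synthesizers import RegularSynthesizer

    os.environ['YDATA_TOKEN'] = '<YOUR_FABRIC_TOKEN>'  # authentication token from Fabric

    data = pd.read_csv('census.csv')  # hypothetical tabular dataset

    synth = RegularSynthesizer(name='census-synth')
    synth.fit(data)

    # Generate 1000 synthetic records with the same schema as the original.
    sample = synth.sample(n_samples=1000)
    print(sample.head())
    ```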

    "},{"location":"synthetic_data/single_table/#tutorials-recipes","title":"Tutorials & Recipes","text":"

    To get started with Synthetic Data Generation, you can follow our quickstart guide.

    For more tutorials and recipes, follow the link to YData's Academy.

    "},{"location":"synthetic_data/single_table/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 Generating Synthetic data from a Tabular dataset with a large number of columns
    • \ud83d\udcd6 Synthetic data to improve Credit Scoring models
    • Generate Synthetic data with Python code
    • Synthetic data generation with API
    "},{"location":"synthetic_data/text/","title":"Text Synthetic Data generation","text":"

    Synthetic data generation for text creates high-quality artificial text datasets that mimic the properties and patterns of original text data, playing a crucial role in Generative AI applications. This technique enhances the performance of large language models (LLMs) by providing extensive training datasets, which improve model accuracy and robustness. It addresses data scarcity by generating text for specialized domains or languages where data is limited. Additionally, synthetic text generation ensures privacy preservation, allowing organizations to create useful datasets without compromising sensitive information, thereby complying with data privacy regulations while enabling comprehensive data analysis and model training.

    Feature in Preview

    This feature is in preview and not available for all users. Contact us if you are interested in giving it a try!

    "},{"location":"synthetic_data/text/#related-materials","title":"Related Materials","text":"
    • How to generate Synthetic Text Data?
    "},{"location":"synthetic_data/timeseries/","title":"Time-series synthetic data generation","text":"

    Time-series synthetic data generation is a powerful method to create high-quality artificial datasets that mirror the statistical properties of original time-series data. A time-series dataset is composed of sequential data points recorded at specific time intervals, capturing trends, patterns, and temporal dependencies. This ability to generate synthetic data from time-series datasets is essential for a wide range of applications, from data augmentation to privacy preservation, and is particularly useful in scenarios where obtaining or using real data is challenging. By leveraging synthetic time-series data, organizations can simulate various conditions and events, enhance model robustness, and ensure data privacy, making it a valuable tool for industries reliant on temporal data analysis. This type of data is prevalent in various fields, including finance, healthcare, energy, and IoT (Internet of Things).
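    As a programmatic starting point, the sketch below follows the ydata-sdk pattern for time-series synthesis. The dataset and column names are placeholders, and parameters such as sortbykey and n_entities should be confirmed against the SDK reference for your version.

    ```python
    import os
    import pandas as pd
    from ydata.sdk.synthesizers import TimeSeriesSynthesizer

    os.environ['YDATA_TOKEN'] = '<YOUR_FABRIC_TOKEN>'  # authentication token from Fabric

    data = pd.read_csv('sensor_readings.csv', parse_dates=['timestamp'])  # hypothetical dataset

    # sortbykey identifies the column that defines the temporal order.
    synth = TimeSeriesSynthesizer(name='sensor-synth')
    synth.fit(data, sortbykey='timestamp')

    # Time-series generation works per entity/sequence rather than per row.
    sample = synth.sample(n_entities=1)
    print(sample.head())
    ```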

    "},{"location":"synthetic_data/timeseries/#tutorials-recipes","title":"Tutorials & Recipes","text":"

    To get started with Synthetic Data Generation, you can follow our quickstart guide.

    For more tutorials and recipes, follow the link to YData's Academy.

    "},{"location":"synthetic_data/timeseries/#related-materials","title":"Related Materials","text":"
    • \ud83d\udcd6 Understanding the structure of a time-series dataset
    • \ud83d\udcd6 Time-series synthetic data generation
    • \ud83d\udcd6 Synthetic multivariate time-series data
    • How to generate time-series synthetic data?
    "}]} \ No newline at end of file diff --git a/1.0/sitemap.xml.gz b/1.0/sitemap.xml.gz index 23098322d9c75b74c8d329f5699ccaba32d89afe..fe6dfa30698fd9fe084f694239ed984c13da81be 100644 GIT binary patch delta 13 Ucmb=gXP58h;Am*Jn#f)O02~4Y4FCWD delta 13 Ucmb=gXP58h;ApUsn8;oM02&tqqyPW_ diff --git a/1.0/support/help-troubleshooting/index.html b/1.0/support/help-troubleshooting/index.html index 0462fc30..7275b0b6 100644 --- a/1.0/support/help-troubleshooting/index.html +++ b/1.0/support/help-troubleshooting/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/synthetic_data/best_practices/index.html b/1.0/synthetic_data/best_practices/index.html index a3d45b05..10e344fb 100644 --- a/1.0/synthetic_data/best_practices/index.html +++ b/1.0/synthetic_data/best_practices/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/synthetic_data/index.html b/1.0/synthetic_data/index.html index f1df899f..8a9a940f 100644 --- a/1.0/synthetic_data/index.html +++ b/1.0/synthetic_data/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/synthetic_data/relational_database/index.html b/1.0/synthetic_data/relational_database/index.html index ade79240..cc1e98b4 100644 --- a/1.0/synthetic_data/relational_database/index.html +++ b/1.0/synthetic_data/relational_database/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/synthetic_data/relational_database/use_in_labs/index.html b/1.0/synthetic_data/relational_database/use_in_labs/index.html index 777c1d11..5a7c8584 100644 --- a/1.0/synthetic_data/relational_database/use_in_labs/index.html +++ b/1.0/synthetic_data/relational_database/use_in_labs/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/synthetic_data/single_table/index.html b/1.0/synthetic_data/single_table/index.html index 3ee05d41..d5f2b70e 100644 --- a/1.0/synthetic_data/single_table/index.html +++ b/1.0/synthetic_data/single_table/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/synthetic_data/synthetic_data_quality/compare_profiling/index.html b/1.0/synthetic_data/synthetic_data_quality/compare_profiling/index.html index 10521e4e..501731f8 100644 --- a/1.0/synthetic_data/synthetic_data_quality/compare_profiling/index.html +++ b/1.0/synthetic_data/synthetic_data_quality/compare_profiling/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/synthetic_data/synthetic_data_quality/report_pdf/index.html b/1.0/synthetic_data/synthetic_data_quality/report_pdf/index.html index 697983c7..7b153f0f 100644 --- a/1.0/synthetic_data/synthetic_data_quality/report_pdf/index.html +++ b/1.0/synthetic_data/synthetic_data_quality/report_pdf/index.html @@ -12,7 +12,7 @@ - + diff --git a/1.0/synthetic_data/text/index.html b/1.0/synthetic_data/text/index.html index d1706da3..ee605068 100644 --- a/1.0/synthetic_data/text/index.html +++ b/1.0/synthetic_data/text/index.html @@ -16,7 +16,7 @@ - + diff --git a/1.0/synthetic_data/timeseries/index.html b/1.0/synthetic_data/timeseries/index.html index 6c65416e..e4413d1f 100644 --- a/1.0/synthetic_data/timeseries/index.html +++ b/1.0/synthetic_data/timeseries/index.html @@ -16,7 +16,7 @@ - +