Skip to content

Commit

Permalink
Huggingface datasets v2 (#150)
Browse files (browse the repository at this point in the history)
* huggingface datasets

* more fixes for hf datasets
  • Loading branch information
dllllb authored Feb 25, 2024
1 parent 36a3580 commit 74c1bd3
Show file tree
Hide file tree
Showing 9 changed files with 31 additions and 187 deletions.
71 changes: 3 additions & 68 deletions demo/CoLES-multimodal.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions demo/coles-embedder-finetune.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,8 @@
],
"source": [
"joined = pd.merge(\n",
" pd.read_csv(data_path / 'transactions_train.csv'), # , nrows=10000\n",
" pd.read_csv(data_path / 'small_group_description.csv').rename(columns={'small_group': 'mcc_description'}),\n",
" pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true', compression='gzip'),\n",
" pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/small_group_description.csv?download=true').rename(columns={'small_group': 'mcc_description'}),\n",
" left_on='small_group', right_on='small_group_code',\n",
").drop(columns=['small_group', 'small_group_code'])\n",
"\n",
Expand Down
26 changes: 3 additions & 23 deletions demo/coles-pretrained-embeddings.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -75,26 +75,6 @@
"os.environ['TOKENIZERS_PARALLELISM'] = 'false'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"if not data_path.joinpath('transactions_train.csv').exists():\n",
" ! mkdir -p data\n",
" ! curl -OL https://storage.yandexcloud.net/ptls-datasets/age-prediction-nti-sbebank-2019.zip\n",
" ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data\n",
" ! mv age-prediction-nti-sbebank-2019.zip data/"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -226,8 +206,8 @@
],
"source": [
"joined = pd.merge(\n",
" pd.read_csv(data_path / 'transactions_train.csv'), # , nrows=10000\n",
" pd.read_csv(data_path / 'small_group_description.csv').rename(columns={'small_group': 'mcc_description'}),\n",
" pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true',compression='gzip'),\n",
" pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/small_group_description.csv?download=true').rename(columns={'small_group': 'mcc_description'}),\n",
" left_on='small_group', right_on='small_group_code',\n",
").drop(columns=['small_group', 'small_group_code'])\n",
"\n",
Expand Down Expand Up @@ -843,7 +823,7 @@
},
"outputs": [],
"source": [
"target_df = pd.read_csv(data_path / 'train_target.csv')"
"target_df = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')"
]
},
{
Expand Down
7 changes: 6 additions & 1 deletion demo/extended_inference.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,12 @@
}
],
"source": [
"df_real_trx = spark.read.csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true', header=True) \\\n",
"from urllib.request import urlretrieve\n",
"data, _ = urlretrieve(\n",
" 'https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true',\n",
" 'transactions_train.csv.gz')\n",
"\n",
"df_real_trx = spark.read.csv(data, header=True) \\\n",
".select(\n",
" F.col('trans_date').cast('float'), \n",
" 'small_group',\n",
Expand Down
18 changes: 0 additions & 18 deletions demo/mlm-emb.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,6 @@
"This demo shows pretrain `TrxEncoder` with MaskedLanguageModel task."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e2d26483",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/alge/.local/share/virtualenvs/ptls-experiments-0GRH_plu/src/pytorch-lifestream/demo\r\n"
]
}
],
"source": [
"!pwd"
]
},
{
"cell_type": "markdown",
"id": "9e91a902",
Expand Down
7 changes: 6 additions & 1 deletion demo/multilabel-classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,12 @@
},
"outputs": [],
"source": [
"source_data = spark.read.options(header=True, inferSchema=True).csv('https://huggingface.co/datasets/dllllb/transactions-gender/resolve/main/transactions.csv.gz?download=true')"
"from urllib.request import urlretrieve\n",
"data, _ = urlretrieve(\n",
" 'https://huggingface.co/datasets/dllllb/transactions-gender/resolve/main/transactions.csv.gz?download=true',\n",
" 'transactions.csv.gz')\n",
"\n",
"source_data = spark.read.options(header=True, inferSchema=True).csv(data)"
]
},
{
Expand Down
34 changes: 3 additions & 31 deletions demo/nsp-sop-emb.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,32 +24,6 @@
" ! {sys.executable} -m pip install -U 'torchaudio<2' # downgrade for ptls==0.5.x"
]
},
{
"cell_type": "markdown",
"id": "a7431993",
"metadata": {},
"source": [
"## Data load"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a798aaae",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"\n",
"if not os.path.exists('data/transactions_train.csv'):\n",
" ! mkdir -p data\n",
" ! curl -OL https://storage.yandexcloud.net/ptls-datasets/age-prediction-nti-sbebank-2019.zip\n",
" ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data\n",
" ! mv age-prediction-nti-sbebank-2019.zip data/"
]
},
{
"cell_type": "markdown",
"id": "9e91a902",
Expand Down Expand Up @@ -166,9 +140,7 @@
"import os\n",
"import pandas as pd\n",
"\n",
"data_path = 'data/'\n",
"\n",
"source_data = pd.read_csv(os.path.join(data_path, 'transactions_train.csv'))\n",
"source_data = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true', compression='gzip')\n",
"source_data.head(2)"
]
},
Expand Down Expand Up @@ -643,7 +615,7 @@
"source": [
"# join target and embeddings\n",
"\n",
"df_target = pd.read_csv(os.path.join(data_path, 'train_target.csv'))\n",
"df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')\n",
"df_target = df_target.set_index('client_id')\n",
"df_target.rename(columns={\"bins\": \"target\"}, inplace=True)\n",
"\n",
Expand Down Expand Up @@ -999,7 +971,7 @@
"source": [
"# join target and embeddings\n",
"\n",
"df_target = pd.read_csv(os.path.join(data_path, 'train_target.csv'))\n",
"df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')\n",
"df_target = df_target.set_index('client_id')\n",
"df_target.rename(columns={\"bins\": \"target\"}, inplace=True)\n",
"\n",
Expand Down
7 changes: 6 additions & 1 deletion demo/pyspark-parquet.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,12 @@
}
],
"source": [
"source_data = spark.read.options(header=True, inferSchema=True).csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true')\n",
"from urllib.request import urlretrieve\n",
"data, _ = urlretrieve(\n",
" 'https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true',\n",
" 'transactions_train.csv.gz')\n",
"\n",
"source_data = spark.read.options(header=True, inferSchema=True).csv(data)\n",
"source_data.show(2)"
]
},
Expand Down
44 changes: 2 additions & 42 deletions demo/supervised-sequence-to-target-transformer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -8,46 +8,6 @@
"# Supervised task with transformer sequence encoder"
]
},
{
"cell_type": "markdown",
"id": "d64030fe",
"metadata": {},
"source": [
"## Data load"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a258bf97",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 239M 100 239M 0 0 52.2M 0 0:00:04 0:00:04 --:--:-- 54.1M\n",
"Archive: age-prediction-nti-sbebank-2019.zip\n",
" inflating: data/test.csv \n",
" inflating: data/small_group_description.csv \n",
" inflating: data/train_target.csv \n",
" inflating: data/transactions_train.csv \n",
" inflating: data/transactions_test.csv \n"
]
}
],
"source": [
"import os\n",
"\n",
"if not os.path.exists('data/transactions_train.csv'):\n",
" ! mkdir -p data\n",
" ! curl -OL https://storage.yandexcloud.net/ptls-datasets/age-prediction-nti-sbebank-2019.zip\n",
" ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data\n",
" ! mv age-prediction-nti-sbebank-2019.zip data/"
]
},
{
"cell_type": "markdown",
"id": "862a9986",
Expand Down Expand Up @@ -165,7 +125,7 @@
}
],
"source": [
"df_target = pd.read_csv('data/train_target.csv')\n",
"df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')\n",
"df_target.head()"
]
},
Expand Down Expand Up @@ -296,7 +256,7 @@
}
],
"source": [
"df_trx = pd.read_csv('data/transactions_train.csv')\n",
"df_trx = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true', compression='gzip')\n",
"df_trx.head()"
]
},
Expand Down

0 comments on commit 74c1bd3

Please sign in to comment.