Skip to content
This repository has been archived by the owner on Jul 18, 2024. It is now read-only.

Commit

Permalink
Merge pull request #491 from xuechendi/customer_op_fix
Browse files Browse the repository at this point in the history
[v1.2][ISSUE-490]Fix bug for using customer ip in jupyter
  • Loading branch information
xuechendi authored Dec 15, 2023
2 parents f557a7f + 3f43b0d commit 23fe0da
Show file tree
Hide file tree
Showing 12 changed files with 262 additions and 130 deletions.
200 changes: 100 additions & 100 deletions RecDP/examples/notebooks/llmutils/custom_map.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"tiny_c4_sample.jsonl\n"
]
Expand All @@ -108,13 +108,13 @@
},
{
"cell_type": "markdown",
"source": [
"### 3.1 PIPELINE based API"
],
"id": "HZIPEzB477Bm",
"metadata": {
"id": "HZIPEzB477Bm"
},
"id": "HZIPEzB477Bm"
"source": [
"### 3.1 PIPELINE based API"
]
},
{
"cell_type": "code",
Expand All @@ -132,50 +132,33 @@
},
{
"cell_type": "code",
"source": [
"# plugin into pipeline\n",
"from pyrecdp.LLM import TextPipeline, ResumableTextPipeline\n",
"from pyrecdp.primitives.operations import *\n",
"\n",
"pipeline = ResumableTextPipeline()\n",
"ops = [\n",
" JsonlReader(\"/content/test_data\"),\n",
" TextQualityScorer(),\n",
" TextCustomerMap(classify, text_key='doc_score'),\n",
" PerfileParquetWriter(\"ResumableTextPipeline_output\")\n",
"]\n",
"pipeline.add_operations(ops)\n",
"pipeline.execute()\n",
"del pipeline"
],
"execution_count": 5,
"id": "KeVkLyrRBJ0d",
"metadata": {
"id": "KeVkLyrRBJ0d",
"outputId": "64e52513-5729-45c4-adda-4daff6b5e59f",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 191
}
},
"id": "KeVkLyrRBJ0d",
"outputId": "64e52513-5729-45c4-adda-4daff6b5e59f"
},
"id": "KeVkLyrRBJ0d",
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n"
]
},
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
Expand Down Expand Up @@ -227,142 +210,112 @@
"\t\t"
]
},
"metadata": {}
"metadata": {},
"output_type": "display_data"
},
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Will assign 1 cores and 10386 M memory for spark\n",
"per core memory size is 10.143 GB and shuffle_disk maximum capacity is 8589934592.000 GB\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"output_type": "stream",
"text": [
"ResumableTextPipeline, current on tiny_c4_sample.jsonl: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"model_name is gpt3\n",
"\u001b[32m2023-10-12 22:33:46.431\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.primitives.operations.text_qualityscorer\u001b[0m:\u001b[36mprepare_model\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1mPreparing scorer model in [/root/.cache/recdp/models/gpt3_quality_model]...\u001b[0m\n",
"\u001b[32m2023-10-12 22:34:03.479\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.primitives.operations.text_qualityscorer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mStart scoring dataset...\u001b[0m\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"output_type": "stream",
"text": [
"ResumableTextPipeline, current on tiny_c4_sample.jsonl: 100%|██████████| 1/1 [00:24<00:00, 24.62s/it]"
]
},
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2023-10-12 22:34:11.050\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.LLM.TextPipeline\u001b[0m:\u001b[36mexecute\u001b[0m:\u001b[36m325\u001b[0m - \u001b[1mCompleted! ResumableTextPipeline will not return dataset, please check ResumableTextPipeline_output for verification.\u001b[0m\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# plugin into pipeline\n",
"from pyrecdp.LLM import TextPipeline, ResumableTextPipeline\n",
"from pyrecdp.primitives.operations import *\n",
"\n",
"pipeline = ResumableTextPipeline()\n",
"ops = [\n",
" JsonlReader(\"/content/test_data\"),\n",
" TextQualityScorer(),\n",
" TextCustomerMap(classify, text_key='doc_score'),\n",
" PerfileParquetWriter(\"ResumableTextPipeline_output\")\n",
"]\n",
"pipeline.add_operations(ops)\n",
"pipeline.execute()\n",
"del pipeline"
]
},
{
"cell_type": "code",
"source": [
"# View output\n",
"! ls ResumableTextPipeline_output"
],
"execution_count": 7,
"id": "BA5uiJPlBdzt",
"metadata": {
"id": "BA5uiJPlBdzt",
"outputId": "c59bcacd-5994-4f13-b47a-d9a04017b9b3",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"id": "BA5uiJPlBdzt",
"outputId": "c59bcacd-5994-4f13-b47a-d9a04017b9b3"
},
"id": "BA5uiJPlBdzt",
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"pipeline.json pipeline.log status.log tiny_c4_sample.jsonl\n"
]
}
],
"source": [
"# View output\n",
"! ls ResumableTextPipeline_output"
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"pd.read_parquet(\"ResumableTextPipeline_output/tiny_c4_sample.jsonl\")"
],
"execution_count": 9,
"id": "8LKIxfkIBaIs",
"metadata": {
"id": "8LKIxfkIBaIs",
"outputId": "8a834079-7baa-4523-d9aa-1b986e761fc5",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
}
},
"id": "8LKIxfkIBaIs",
"outputId": "8a834079-7baa-4523-d9aa-1b986e761fc5"
},
"id": "8LKIxfkIBaIs",
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" text \\\n",
"0 lorazepam nombre comercial mexico From an inte... \n",
"1 It is possible to love someone who does not lo... \n",
"2 Canon PIXMA TS9520 All-in-One Print / Scan / C... \n",
"3 For those who plan on buying an iPad this Satu... \n",
"4 After tipping 25 tokens in a day, you'll be ab... \n",
".. ... \n",
"444 Sunrise is an equal opportunity employer. Vete... \n",
"445 Home / Business / #Exploitation: Coca Cola is ... \n",
"446 I got really surprised when I saw that I recei... \n",
"447 Here's a brief schedule for 2016 as requested ... \n",
"448 It spreads like a wild fire. “It can never hap... \n",
"\n",
" meta source_id \\\n",
"0 {\"timestamp\":\"2019-04-24T02:17:53Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"1 {\"timestamp\":\"2019-04-23T06:32:35Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"2 {\"timestamp\":\"2019-04-25T17:03:36Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"3 {\"timestamp\":\"2019-04-22T22:39:52Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"4 {\"timestamp\":\"2019-04-20T00:25:13Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
".. ... ... \n",
"444 {\"timestamp\":\"2019-04-22T10:28:15Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"445 {\"timestamp\":\"2019-04-24T18:04:45Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"446 {\"timestamp\":\"2019-04-26T08:57:28Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"447 {\"timestamp\":\"2019-04-18T10:15:11Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"448 {\"timestamp\":\"2019-04-22T18:51:33Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"\n",
" doc_score should_keep classify_text \n",
"0 0.139534 0 0 \n",
"1 0.999997 1 1 \n",
"2 0.941116 1 1 \n",
"3 0.999765 1 1 \n",
"4 0.939119 1 1 \n",
".. ... ... ... \n",
"444 0.834727 0 1 \n",
"445 0.998307 1 1 \n",
"446 0.864012 1 1 \n",
"447 0.999769 1 1 \n",
"448 0.999995 1 1 \n",
"\n",
"[449 rows x 6 columns]"
],
"text/html": [
"\n",
" <div id=\"df-aa834e1b-d49c-47a6-92a7-6abe7080147a\" class=\"colab-df-container\">\n",
Expand Down Expand Up @@ -704,11 +657,58 @@
"</div>\n",
" </div>\n",
" </div>\n"
],
"text/plain": [
" text \\\n",
"0 lorazepam nombre comercial mexico From an inte... \n",
"1 It is possible to love someone who does not lo... \n",
"2 Canon PIXMA TS9520 All-in-One Print / Scan / C... \n",
"3 For those who plan on buying an iPad this Satu... \n",
"4 After tipping 25 tokens in a day, you'll be ab... \n",
".. ... \n",
"444 Sunrise is an equal opportunity employer. Vete... \n",
"445 Home / Business / #Exploitation: Coca Cola is ... \n",
"446 I got really surprised when I saw that I recei... \n",
"447 Here's a brief schedule for 2016 as requested ... \n",
"448 It spreads like a wild fire. “It can never hap... \n",
"\n",
" meta source_id \\\n",
"0 {\"timestamp\":\"2019-04-24T02:17:53Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"1 {\"timestamp\":\"2019-04-23T06:32:35Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"2 {\"timestamp\":\"2019-04-25T17:03:36Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"3 {\"timestamp\":\"2019-04-22T22:39:52Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"4 {\"timestamp\":\"2019-04-20T00:25:13Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
".. ... ... \n",
"444 {\"timestamp\":\"2019-04-22T10:28:15Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"445 {\"timestamp\":\"2019-04-24T18:04:45Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"446 {\"timestamp\":\"2019-04-26T08:57:28Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"447 {\"timestamp\":\"2019-04-18T10:15:11Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"448 {\"timestamp\":\"2019-04-22T18:51:33Z\",\"url\":\"htt... tiny_c4_sample.jsonl \n",
"\n",
" doc_score should_keep classify_text \n",
"0 0.139534 0 0 \n",
"1 0.999997 1 1 \n",
"2 0.941116 1 1 \n",
"3 0.999765 1 1 \n",
"4 0.939119 1 1 \n",
".. ... ... ... \n",
"444 0.834727 0 1 \n",
"445 0.998307 1 1 \n",
"446 0.864012 1 1 \n",
"447 0.999769 1 1 \n",
"448 0.999995 1 1 \n",
"\n",
"[449 rows x 6 columns]"
]
},
"execution_count": 9,
"metadata": {},
"execution_count": 9
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"pd.read_parquet(\"ResumableTextPipeline_output/output/tiny_c4_sample.jsonl\")"
]
}
],
Expand Down Expand Up @@ -736,4 +736,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
4 changes: 2 additions & 2 deletions RecDP/pyrecdp/core/di_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ def convert_to_node_chain(self):
ret = list(self.keys())
return ret

def json_dump(self):
def json_dump(self, base_dir = ""):
import json
to_dump = dict((node_id, op.dump()) for node_id, op in self.items())
to_dump = dict((node_id, op.dump(base_dir)) for node_id, op in self.items())
return json.dumps(to_dump, indent=4)

def copy(self):
Expand Down
4 changes: 3 additions & 1 deletion RecDP/pyrecdp/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ def __repr__(self):

def export(self, file_path = None):
import copy
import os
base_dir = os.path.dirname(file_path) if file_path else ""
pipeline_deepcopy = copy.deepcopy(self.pipeline)
json_object = pipeline_deepcopy.json_dump()
json_object = pipeline_deepcopy.json_dump(base_dir)
if file_path:
# Writing to sample.json
with open(file_path, "w") as outfile:
Expand Down
Loading

0 comments on commit 23fe0da

Please sign in to comment.