Merge pull request #491 from xuechendi/customer_op_fix

[v1.2][ISSUE-490]Fix bug for using customer ip in jupyter
intel · Dec 15, 2023 · 23fe0da · 23fe0da
2 parents f557a7f + 3f43b0d
commit 23fe0da
Show file tree

Hide file tree

Showing 12 changed files with 262 additions and 130 deletions.
diff --git a/RecDP/examples/notebooks/llmutils/custom_map.ipynb b/RecDP/examples/notebooks/llmutils/custom_map.ipynb
@@ -95,8 +95,8 @@
       },
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "tiny_c4_sample.jsonl\n"
           ]
@@ -108,13 +108,13 @@
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "### 3.1 PIPELINE based API"
-      ],
+      "id": "HZIPEzB477Bm",
       "metadata": {
         "id": "HZIPEzB477Bm"
       },
-      "id": "HZIPEzB477Bm"
+      "source": [
+        "### 3.1 PIPELINE based API"
+      ]
     },
     {
       "cell_type": "code",
@@ -132,50 +132,33 @@
     },
     {
       "cell_type": "code",
-      "source": [
-        "# plugin into pipeline\n",
-        "from pyrecdp.LLM import TextPipeline, ResumableTextPipeline\n",
-        "from pyrecdp.primitives.operations import *\n",
-        "\n",
-        "pipeline = ResumableTextPipeline()\n",
-        "ops = [\n",
-        "    JsonlReader(\"/content/test_data\"),\n",
-        "    TextQualityScorer(),\n",
-        "    TextCustomerMap(classify, text_key='doc_score'),\n",
-        "    PerfileParquetWriter(\"ResumableTextPipeline_output\")\n",
-        "]\n",
-        "pipeline.add_operations(ops)\n",
-        "pipeline.execute()\n",
-        "del pipeline"
-      ],
+      "execution_count": 5,
+      "id": "KeVkLyrRBJ0d",
       "metadata": {
-        "id": "KeVkLyrRBJ0d",
-        "outputId": "64e52513-5729-45c4-adda-4daff6b5e59f",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 191
-        }
+        },
+        "id": "KeVkLyrRBJ0d",
+        "outputId": "64e52513-5729-45c4-adda-4daff6b5e59f"
       },
-      "id": "KeVkLyrRBJ0d",
-      "execution_count": 5,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
             "  warnings.warn(\n"
           ]
         },
         {
-          "output_type": "display_data",
           "data": {
             "text/html": [
               "\n",
@@ -227,142 +210,112 @@
               "\t\t"
             ]
           },
-          "metadata": {}
+          "metadata": {},
+          "output_type": "display_data"
         },
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Will assign 1 cores and 10386 M memory for spark\n",
             "per core memory size is 10.143 GB and shuffle_disk maximum capacity is 8589934592.000 GB\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "ResumableTextPipeline, current on tiny_c4_sample.jsonl:   0%|          | 0/1 [00:00<?, ?it/s]"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "model_name is gpt3\n",
             "\u001b[32m2023-10-12 22:33:46.431\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpyrecdp.primitives.operations.text_qualityscorer\u001b[0m:\u001b[36mprepare_model\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1mPreparing scorer model in [/root/.cache/recdp/models/gpt3_quality_model]...\u001b[0m\n",
             "\u001b[32m2023-10-12 22:34:03.479\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpyrecdp.primitives.operations.text_qualityscorer\u001b[0m:\u001b[36mpredict\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mStart scoring dataset...\u001b[0m\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "ResumableTextPipeline, current on tiny_c4_sample.jsonl: 100%|██████████| 1/1 [00:24<00:00, 24.62s/it]"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "\u001b[32m2023-10-12 22:34:11.050\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpyrecdp.LLM.TextPipeline\u001b[0m:\u001b[36mexecute\u001b[0m:\u001b[36m325\u001b[0m - \u001b[1mCompleted! ResumableTextPipeline will not return dataset, please check ResumableTextPipeline_output for verification.\u001b[0m\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "\n"
           ]
         }
+      ],
+      "source": [
+        "# plugin into pipeline\n",
+        "from pyrecdp.LLM import TextPipeline, ResumableTextPipeline\n",
+        "from pyrecdp.primitives.operations import *\n",
+        "\n",
+        "pipeline = ResumableTextPipeline()\n",
+        "ops = [\n",
+        "    JsonlReader(\"/content/test_data\"),\n",
+        "    TextQualityScorer(),\n",
+        "    TextCustomerMap(classify, text_key='doc_score'),\n",
+        "    PerfileParquetWriter(\"ResumableTextPipeline_output\")\n",
+        "]\n",
+        "pipeline.add_operations(ops)\n",
+        "pipeline.execute()\n",
+        "del pipeline"
       ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "# View output\n",
-        "! ls ResumableTextPipeline_output"
-      ],
+      "execution_count": 7,
+      "id": "BA5uiJPlBdzt",
       "metadata": {
-        "id": "BA5uiJPlBdzt",
-        "outputId": "c59bcacd-5994-4f13-b47a-d9a04017b9b3",
         "colab": {
           "base_uri": "https://localhost:8080/"
-        }
+        },
+        "id": "BA5uiJPlBdzt",
+        "outputId": "c59bcacd-5994-4f13-b47a-d9a04017b9b3"
       },
-      "id": "BA5uiJPlBdzt",
-      "execution_count": 7,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "pipeline.json  pipeline.log  status.log  tiny_c4_sample.jsonl\n"
           ]
         }
+      ],
+      "source": [
+        "# View output\n",
+        "! ls ResumableTextPipeline_output"
       ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "import pandas as pd\n",
-        "pd.read_parquet(\"ResumableTextPipeline_output/tiny_c4_sample.jsonl\")"
-      ],
+      "execution_count": 9,
+      "id": "8LKIxfkIBaIs",
       "metadata": {
-        "id": "8LKIxfkIBaIs",
-        "outputId": "8a834079-7baa-4523-d9aa-1b986e761fc5",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 424
-        }
+        },
+        "id": "8LKIxfkIBaIs",
+        "outputId": "8a834079-7baa-4523-d9aa-1b986e761fc5"
       },
-      "id": "8LKIxfkIBaIs",
-      "execution_count": 9,
       "outputs": [
         {
-          "output_type": "execute_result",
           "data": {
-            "text/plain": [
-              "                                                  text  \\\n",
-              "0    lorazepam nombre comercial mexico From an inte...   \n",
-              "1    It is possible to love someone who does not lo...   \n",
-              "2    Canon PIXMA TS9520 All-in-One Print / Scan / C...   \n",
-              "3    For those who plan on buying an iPad this Satu...   \n",
-              "4    After tipping 25 tokens in a day, you'll be ab...   \n",
-              "..                                                 ...   \n",
-              "444  Sunrise is an equal opportunity employer. Vete...   \n",
-              "445  Home / Business / #Exploitation: Coca Cola is ...   \n",
-              "446  I got really surprised when I saw that I recei...   \n",
-              "447  Here's a brief schedule for 2016 as requested ...   \n",
-              "448  It spreads like a wild fire. “It can never hap...   \n",
-              "\n",
-              "                                                  meta             source_id  \\\n",
-              "0    {\"timestamp\":\"2019-04-24T02:17:53Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "1    {\"timestamp\":\"2019-04-23T06:32:35Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "2    {\"timestamp\":\"2019-04-25T17:03:36Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "3    {\"timestamp\":\"2019-04-22T22:39:52Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "4    {\"timestamp\":\"2019-04-20T00:25:13Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "..                                                 ...                   ...   \n",
-              "444  {\"timestamp\":\"2019-04-22T10:28:15Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "445  {\"timestamp\":\"2019-04-24T18:04:45Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "446  {\"timestamp\":\"2019-04-26T08:57:28Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "447  {\"timestamp\":\"2019-04-18T10:15:11Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "448  {\"timestamp\":\"2019-04-22T18:51:33Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "\n",
-              "     doc_score  should_keep classify_text  \n",
-              "0     0.139534            0             0  \n",
-              "1     0.999997            1             1  \n",
-              "2     0.941116            1             1  \n",
-              "3     0.999765            1             1  \n",
-              "4     0.939119            1             1  \n",
-              "..         ...          ...           ...  \n",
-              "444   0.834727            0             1  \n",
-              "445   0.998307            1             1  \n",
-              "446   0.864012            1             1  \n",
-              "447   0.999769            1             1  \n",
-              "448   0.999995            1             1  \n",
-              "\n",
-              "[449 rows x 6 columns]"
-            ],
             "text/html": [
               "\n",
               "  <div id=\"df-aa834e1b-d49c-47a6-92a7-6abe7080147a\" class=\"colab-df-container\">\n",
@@ -704,11 +657,58 @@
               "</div>\n",
               "    </div>\n",
               "  </div>\n"
+            ],
+            "text/plain": [
+              "                                                  text  \\\n",
+              "0    lorazepam nombre comercial mexico From an inte...   \n",
+              "1    It is possible to love someone who does not lo...   \n",
+              "2    Canon PIXMA TS9520 All-in-One Print / Scan / C...   \n",
+              "3    For those who plan on buying an iPad this Satu...   \n",
+              "4    After tipping 25 tokens in a day, you'll be ab...   \n",
+              "..                                                 ...   \n",
+              "444  Sunrise is an equal opportunity employer. Vete...   \n",
+              "445  Home / Business / #Exploitation: Coca Cola is ...   \n",
+              "446  I got really surprised when I saw that I recei...   \n",
+              "447  Here's a brief schedule for 2016 as requested ...   \n",
+              "448  It spreads like a wild fire. “It can never hap...   \n",
+              "\n",
+              "                                                  meta             source_id  \\\n",
+              "0    {\"timestamp\":\"2019-04-24T02:17:53Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "1    {\"timestamp\":\"2019-04-23T06:32:35Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "2    {\"timestamp\":\"2019-04-25T17:03:36Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "3    {\"timestamp\":\"2019-04-22T22:39:52Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "4    {\"timestamp\":\"2019-04-20T00:25:13Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "..                                                 ...                   ...   \n",
+              "444  {\"timestamp\":\"2019-04-22T10:28:15Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "445  {\"timestamp\":\"2019-04-24T18:04:45Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "446  {\"timestamp\":\"2019-04-26T08:57:28Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "447  {\"timestamp\":\"2019-04-18T10:15:11Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "448  {\"timestamp\":\"2019-04-22T18:51:33Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "\n",
+              "     doc_score  should_keep classify_text  \n",
+              "0     0.139534            0             0  \n",
+              "1     0.999997            1             1  \n",
+              "2     0.941116            1             1  \n",
+              "3     0.999765            1             1  \n",
+              "4     0.939119            1             1  \n",
+              "..         ...          ...           ...  \n",
+              "444   0.834727            0             1  \n",
+              "445   0.998307            1             1  \n",
+              "446   0.864012            1             1  \n",
+              "447   0.999769            1             1  \n",
+              "448   0.999995            1             1  \n",
+              "\n",
+              "[449 rows x 6 columns]"
             ]
           },
+          "execution_count": 9,
           "metadata": {},
-          "execution_count": 9
+          "output_type": "execute_result"
         }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "pd.read_parquet(\"ResumableTextPipeline_output/output/tiny_c4_sample.jsonl\")"
       ]
     }
   ],
@@ -736,4 +736,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 5
-}
+}
diff --git a/RecDP/pyrecdp/core/di_graph.py b/RecDP/pyrecdp/core/di_graph.py
@@ -92,9 +92,9 @@ def convert_to_node_chain(self):
             ret = list(self.keys())
         return ret
 
-    def json_dump(self):
+    def json_dump(self, base_dir = ""):
         import json
-        to_dump = dict((node_id, op.dump()) for node_id, op in self.items())
+        to_dump = dict((node_id, op.dump(base_dir)) for node_id, op in self.items())
         return json.dumps(to_dump, indent=4)
 
     def copy(self):

diff --git a/RecDP/pyrecdp/core/pipeline.py b/RecDP/pyrecdp/core/pipeline.py
@@ -20,8 +20,10 @@ def __repr__(self):
 
     def export(self, file_path = None):
         import copy
+        import os
+        base_dir = os.path.dirname(file_path) if file_path else ""
         pipeline_deepcopy = copy.deepcopy(self.pipeline)
-        json_object = pipeline_deepcopy.json_dump()
+        json_object = pipeline_deepcopy.json_dump(base_dir)
         if file_path:
             # Writing to sample.json
             with open(file_path, "w") as outfile: