From 82168f9b38915737d5fdbe49406f102056f8df67 Mon Sep 17 00:00:00 2001
From: "Xue, Chendi" <chendi.xue@intel.com>
Date: Fri, 15 Dec 2023 18:14:20 +0000
Subject: [PATCH 1/4] Fix bug for using customer ip in jupyter

Also found another bug when reload from json/yaml

Signed-off-by: Xue, Chendi <chendi.xue@intel.com>
---
 .../notebooks/llmutils/custom_map.ipynb       | 200 +++++++++---------
 RecDP/pyrecdp/core/di_graph.py                |   4 +-
 RecDP/pyrecdp/core/pipeline.py                |   4 +-
 RecDP/pyrecdp/core/utils.py                   |  24 ++-
 RecDP/pyrecdp/primitives/operations/base.py   |  11 +-
 .../primitives/operations/text_custom.py      |  34 ++-
 .../primitives/operations/text_split.py       |  10 +-
 .../primitives/operations/text_writer.py      |   2 +-
 RecDP/setup.py                                |   1 +
 RecDP/tests/test_llmutils_pipelines.py        |  82 +++++--
 10 files changed, 240 insertions(+), 132 deletions(-)

diff --git a/RecDP/examples/notebooks/llmutils/custom_map.ipynb b/RecDP/examples/notebooks/llmutils/custom_map.ipynb
index c68d17ecd..ef96715ee 100644
--- a/RecDP/examples/notebooks/llmutils/custom_map.ipynb
+++ b/RecDP/examples/notebooks/llmutils/custom_map.ipynb
@@ -95,8 +95,8 @@
       },
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "tiny_c4_sample.jsonl\n"
           ]
@@ -108,13 +108,13 @@
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "### 3.1 PIPELINE based API"
-      ],
+      "id": "HZIPEzB477Bm",
       "metadata": {
         "id": "HZIPEzB477Bm"
       },
-      "id": "HZIPEzB477Bm"
+      "source": [
+        "### 3.1 PIPELINE based API"
+      ]
     },
     {
       "cell_type": "code",
@@ -132,50 +132,33 @@
     },
     {
       "cell_type": "code",
-      "source": [
-        "# plugin into pipeline\n",
-        "from pyrecdp.LLM import TextPipeline, ResumableTextPipeline\n",
-        "from pyrecdp.primitives.operations import *\n",
-        "\n",
-        "pipeline = ResumableTextPipeline()\n",
-        "ops = [\n",
-        "    JsonlReader(\"/content/test_data\"),\n",
-        "    TextQualityScorer(),\n",
-        "    TextCustomerMap(classify, text_key='doc_score'),\n",
-        "    PerfileParquetWriter(\"ResumableTextPipeline_output\")\n",
-        "]\n",
-        "pipeline.add_operations(ops)\n",
-        "pipeline.execute()\n",
-        "del pipeline"
-      ],
+      "execution_count": 5,
+      "id": "KeVkLyrRBJ0d",
       "metadata": {
-        "id": "KeVkLyrRBJ0d",
-        "outputId": "64e52513-5729-45c4-adda-4daff6b5e59f",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 191
-        }
+        },
+        "id": "KeVkLyrRBJ0d",
+        "outputId": "64e52513-5729-45c4-adda-4daff6b5e59f"
       },
-      "id": "KeVkLyrRBJ0d",
-      "execution_count": 5,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "JAVA_HOME is not set, use default value of /usr/lib/jvm/java-8-openjdk-amd64/\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "/usr/local/lib/python3.10/dist-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
             "  warnings.warn(\n"
           ]
         },
         {
-          "output_type": "display_data",
           "data": {
             "text/html": [
               "\n",
@@ -227,26 +210,27 @@
               "\t\t"
             ]
           },
-          "metadata": {}
+          "metadata": {},
+          "output_type": "display_data"
         },
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Will assign 1 cores and 10386 M memory for spark\n",
             "per core memory size is 10.143 GB and shuffle_disk maximum capacity is 8589934592.000 GB\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "ResumableTextPipeline, current on tiny_c4_sample.jsonl:   0%|          | 0/1 [00:00<?, ?it/s]"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "model_name is gpt3\n",
             "\u001b[32m2023-10-12 22:33:46.431\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpyrecdp.primitives.operations.text_qualityscorer\u001b[0m:\u001b[36mprepare_model\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1mPreparing scorer model in [/root/.cache/recdp/models/gpt3_quality_model]...\u001b[0m\n",
@@ -254,115 +238,84 @@
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "ResumableTextPipeline, current on tiny_c4_sample.jsonl: 100%|██████████| 1/1 [00:24<00:00, 24.62s/it]"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "\u001b[32m2023-10-12 22:34:11.050\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mpyrecdp.LLM.TextPipeline\u001b[0m:\u001b[36mexecute\u001b[0m:\u001b[36m325\u001b[0m - \u001b[1mCompleted! ResumableTextPipeline will not return dataset, please check ResumableTextPipeline_output for verification.\u001b[0m\n"
           ]
         },
         {
-          "output_type": "stream",
           "name": "stderr",
+          "output_type": "stream",
           "text": [
             "\n"
           ]
         }
+      ],
+      "source": [
+        "# plugin into pipeline\n",
+        "from pyrecdp.LLM import TextPipeline, ResumableTextPipeline\n",
+        "from pyrecdp.primitives.operations import *\n",
+        "\n",
+        "pipeline = ResumableTextPipeline()\n",
+        "ops = [\n",
+        "    JsonlReader(\"/content/test_data\"),\n",
+        "    TextQualityScorer(),\n",
+        "    TextCustomerMap(classify, text_key='doc_score'),\n",
+        "    PerfileParquetWriter(\"ResumableTextPipeline_output\")\n",
+        "]\n",
+        "pipeline.add_operations(ops)\n",
+        "pipeline.execute()\n",
+        "del pipeline"
       ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "# View output\n",
-        "! ls ResumableTextPipeline_output"
-      ],
+      "execution_count": 7,
+      "id": "BA5uiJPlBdzt",
       "metadata": {
-        "id": "BA5uiJPlBdzt",
-        "outputId": "c59bcacd-5994-4f13-b47a-d9a04017b9b3",
         "colab": {
           "base_uri": "https://localhost:8080/"
-        }
+        },
+        "id": "BA5uiJPlBdzt",
+        "outputId": "c59bcacd-5994-4f13-b47a-d9a04017b9b3"
       },
-      "id": "BA5uiJPlBdzt",
-      "execution_count": 7,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "pipeline.json  pipeline.log  status.log  tiny_c4_sample.jsonl\n"
           ]
         }
+      ],
+      "source": [
+        "# View output\n",
+        "! ls ResumableTextPipeline_output"
       ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "import pandas as pd\n",
-        "pd.read_parquet(\"ResumableTextPipeline_output/tiny_c4_sample.jsonl\")"
-      ],
+      "execution_count": 9,
+      "id": "8LKIxfkIBaIs",
       "metadata": {
-        "id": "8LKIxfkIBaIs",
-        "outputId": "8a834079-7baa-4523-d9aa-1b986e761fc5",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 424
-        }
+        },
+        "id": "8LKIxfkIBaIs",
+        "outputId": "8a834079-7baa-4523-d9aa-1b986e761fc5"
       },
-      "id": "8LKIxfkIBaIs",
-      "execution_count": 9,
       "outputs": [
         {
-          "output_type": "execute_result",
           "data": {
-            "text/plain": [
-              "                                                  text  \\\n",
-              "0    lorazepam nombre comercial mexico From an inte...   \n",
-              "1    It is possible to love someone who does not lo...   \n",
-              "2    Canon PIXMA TS9520 All-in-One Print / Scan / C...   \n",
-              "3    For those who plan on buying an iPad this Satu...   \n",
-              "4    After tipping 25 tokens in a day, you'll be ab...   \n",
-              "..                                                 ...   \n",
-              "444  Sunrise is an equal opportunity employer. Vete...   \n",
-              "445  Home / Business / #Exploitation: Coca Cola is ...   \n",
-              "446  I got really surprised when I saw that I recei...   \n",
-              "447  Here's a brief schedule for 2016 as requested ...   \n",
-              "448  It spreads like a wild fire. “It can never hap...   \n",
-              "\n",
-              "                                                  meta             source_id  \\\n",
-              "0    {\"timestamp\":\"2019-04-24T02:17:53Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "1    {\"timestamp\":\"2019-04-23T06:32:35Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "2    {\"timestamp\":\"2019-04-25T17:03:36Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "3    {\"timestamp\":\"2019-04-22T22:39:52Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "4    {\"timestamp\":\"2019-04-20T00:25:13Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "..                                                 ...                   ...   \n",
-              "444  {\"timestamp\":\"2019-04-22T10:28:15Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "445  {\"timestamp\":\"2019-04-24T18:04:45Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "446  {\"timestamp\":\"2019-04-26T08:57:28Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "447  {\"timestamp\":\"2019-04-18T10:15:11Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "448  {\"timestamp\":\"2019-04-22T18:51:33Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
-              "\n",
-              "     doc_score  should_keep classify_text  \n",
-              "0     0.139534            0             0  \n",
-              "1     0.999997            1             1  \n",
-              "2     0.941116            1             1  \n",
-              "3     0.999765            1             1  \n",
-              "4     0.939119            1             1  \n",
-              "..         ...          ...           ...  \n",
-              "444   0.834727            0             1  \n",
-              "445   0.998307            1             1  \n",
-              "446   0.864012            1             1  \n",
-              "447   0.999769            1             1  \n",
-              "448   0.999995            1             1  \n",
-              "\n",
-              "[449 rows x 6 columns]"
-            ],
             "text/html": [
               "\n",
               "  <div id=\"df-aa834e1b-d49c-47a6-92a7-6abe7080147a\" class=\"colab-df-container\">\n",
@@ -704,11 +657,58 @@
               "</div>\n",
               "    </div>\n",
               "  </div>\n"
+            ],
+            "text/plain": [
+              "                                                  text  \\\n",
+              "0    lorazepam nombre comercial mexico From an inte...   \n",
+              "1    It is possible to love someone who does not lo...   \n",
+              "2    Canon PIXMA TS9520 All-in-One Print / Scan / C...   \n",
+              "3    For those who plan on buying an iPad this Satu...   \n",
+              "4    After tipping 25 tokens in a day, you'll be ab...   \n",
+              "..                                                 ...   \n",
+              "444  Sunrise is an equal opportunity employer. Vete...   \n",
+              "445  Home / Business / #Exploitation: Coca Cola is ...   \n",
+              "446  I got really surprised when I saw that I recei...   \n",
+              "447  Here's a brief schedule for 2016 as requested ...   \n",
+              "448  It spreads like a wild fire. “It can never hap...   \n",
+              "\n",
+              "                                                  meta             source_id  \\\n",
+              "0    {\"timestamp\":\"2019-04-24T02:17:53Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "1    {\"timestamp\":\"2019-04-23T06:32:35Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "2    {\"timestamp\":\"2019-04-25T17:03:36Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "3    {\"timestamp\":\"2019-04-22T22:39:52Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "4    {\"timestamp\":\"2019-04-20T00:25:13Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "..                                                 ...                   ...   \n",
+              "444  {\"timestamp\":\"2019-04-22T10:28:15Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "445  {\"timestamp\":\"2019-04-24T18:04:45Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "446  {\"timestamp\":\"2019-04-26T08:57:28Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "447  {\"timestamp\":\"2019-04-18T10:15:11Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "448  {\"timestamp\":\"2019-04-22T18:51:33Z\",\"url\":\"htt...  tiny_c4_sample.jsonl   \n",
+              "\n",
+              "     doc_score  should_keep classify_text  \n",
+              "0     0.139534            0             0  \n",
+              "1     0.999997            1             1  \n",
+              "2     0.941116            1             1  \n",
+              "3     0.999765            1             1  \n",
+              "4     0.939119            1             1  \n",
+              "..         ...          ...           ...  \n",
+              "444   0.834727            0             1  \n",
+              "445   0.998307            1             1  \n",
+              "446   0.864012            1             1  \n",
+              "447   0.999769            1             1  \n",
+              "448   0.999995            1             1  \n",
+              "\n",
+              "[449 rows x 6 columns]"
             ]
           },
+          "execution_count": 9,
           "metadata": {},
-          "execution_count": 9
+          "output_type": "execute_result"
         }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "pd.read_parquet(\"ResumableTextPipeline_output/output/tiny_c4_sample.jsonl\")"
       ]
     }
   ],
@@ -736,4 +736,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/RecDP/pyrecdp/core/di_graph.py b/RecDP/pyrecdp/core/di_graph.py
index e91073eaf..5fd16596f 100644
--- a/RecDP/pyrecdp/core/di_graph.py
+++ b/RecDP/pyrecdp/core/di_graph.py
@@ -92,9 +92,9 @@ def convert_to_node_chain(self):
             ret = list(self.keys())
         return ret
 
-    def json_dump(self):
+    def json_dump(self, base_dir = ""):
         import json
-        to_dump = dict((node_id, op.dump()) for node_id, op in self.items())
+        to_dump = dict((node_id, op.dump(base_dir)) for node_id, op in self.items())
         return json.dumps(to_dump, indent=4)
 
     def copy(self):
diff --git a/RecDP/pyrecdp/core/pipeline.py b/RecDP/pyrecdp/core/pipeline.py
index f4f4b6d0f..38560d45b 100644
--- a/RecDP/pyrecdp/core/pipeline.py
+++ b/RecDP/pyrecdp/core/pipeline.py
@@ -20,8 +20,10 @@ def __repr__(self):
 
     def export(self, file_path = None):
         import copy
+        import os
+        base_dir = os.path.dirname(file_path)
         pipeline_deepcopy = copy.deepcopy(self.pipeline)
-        json_object = pipeline_deepcopy.json_dump()
+        json_object = pipeline_deepcopy.json_dump(base_dir)
         if file_path:
             # Writing to sample.json
             with open(file_path, "w") as outfile:
diff --git a/RecDP/pyrecdp/core/utils.py b/RecDP/pyrecdp/core/utils.py
index 89d66e16a..e278841c4 100644
--- a/RecDP/pyrecdp/core/utils.py
+++ b/RecDP/pyrecdp/core/utils.py
@@ -112,23 +112,35 @@ def sample_read(file_path):
     else:
         raise NotImplementedError("now sample read only support csv and parquet")
 
-def dump_fix(x):
+def dump_fix(x, base_dir):
     import json
+    import cloudpickle
+    import hashlib
+    import os
+    import inspect
+
     if isinstance(x, dict):
         for k, v in x.items():
-            x[k] = dump_fix(v)
+            x[k] = dump_fix(v, base_dir)
     elif isinstance(x, list) or isinstance(x, np.ndarray):
         for idx in range(len(x)):
-            x[idx] = dump_fix(x[idx])
+            x[idx] = dump_fix(x[idx], base_dir)
     elif isinstance(x, type):
         x = (x.__module__, x.__name__)
     elif isinstance(x, tuple):
-        x = [dump_fix(i) for i in x]
+        x = [dump_fix(i, base_dir) for i in x]
     elif hasattr(x, 'mydump'):
         x = x.mydump()
     elif callable(x):
-        import inspect
-        x = inspect.getsource(x)
+        func_str = callable_string_fix(inspect.getsource(x))
+        #print(func_str)
+        md5 = hashlib.md5(func_str.encode('utf-8')).hexdigest()
+        uuid_name = f"{md5}.bin"
+        uuid_name = os.path.join(base_dir, uuid_name)
+        with open(uuid_name, 'wb') as f:
+            ret = cloudpickle.dumps(x)
+            f.write(ret)
+        x = uuid_name
     else:
         try:
             json.dumps(x)
diff --git a/RecDP/pyrecdp/primitives/operations/base.py b/RecDP/pyrecdp/primitives/operations/base.py
index 5086dc807..9f7435080 100644
--- a/RecDP/pyrecdp/primitives/operations/base.py
+++ b/RecDP/pyrecdp/primitives/operations/base.py
@@ -23,16 +23,21 @@ def __init__(self, idx, children, output, op, config):
         self.config = config  # operation config
 
     def __repr__(self):
-        return repr(self.dump())
+        if hasattr(self, 'dump_dict'):
+            return repr(self.dump_dict)
+        from copy import deepcopy
+        dpcpy_obj = deepcopy(self)
+        return repr(dpcpy_obj.dump())
 
-    def dump(self):
+    def dump(self, base_dir = ''):
         dump_dict = {
             # 'idx': self.idx,
             'children': self.children,
             # 'output': self.output,
             'op': self.op,
-            'config': dump_fix(self.config)
+            'config': dump_fix(self.config, base_dir)
         }
+        self.dump_dict = dump_dict
         return dump_dict
 
     def instantiate(self):
diff --git a/RecDP/pyrecdp/primitives/operations/text_custom.py b/RecDP/pyrecdp/primitives/operations/text_custom.py
index 53cec372b..382c5dbb7 100644
--- a/RecDP/pyrecdp/primitives/operations/text_custom.py
+++ b/RecDP/pyrecdp/primitives/operations/text_custom.py
@@ -6,7 +6,6 @@
 from pyspark.sql import DataFrame
 from .filter import BaseFilter
 
-
 def text_bytesize(s):
     return len(s.encode('utf-8'))
 
@@ -18,9 +17,17 @@ def __init__(self, func, text_key='text', inplace: bool = False):
         super().__init__(settings, requirements)
         self.support_spark = True
         self.support_ray = True
-        self.func = func
+        if not callable(func):
+            import os
+            if not os.path.exists(func):
+                raise FileNotFoundError(f'Reload {func} object but not exists')
+            import pickle
+            with open(func, 'rb') as f:
+                self.func = pickle.load(f)
+        else:
+            self.func = func
         self.text_key = text_key
-        self.new_key = text_key if inplace else f"{func.__name__}_text"
+        self.new_key = text_key if inplace else f"{self.func.__name__}_text"
 
     def process_rayds(self, ds: Dataset) -> Dataset:
         return ds.map(lambda x: self.process_row(x, self.text_key, self.new_key, self.func))
@@ -43,10 +50,19 @@ def __init__(self, func, text_key='text', inplace: bool = False, **func_args):
         super().__init__(settings, requirements)
         self.support_spark = True
         self.support_ray = True
+        if not callable(func):
+            import os
+            if not os.path.exists(func):
+                raise FileNotFoundError(f'Reload {func} object but not exists')
+            import pickle
+            with open(func, 'rb') as f:
+                self.func = pickle.load(f)
+        else:
+            self.func = func
         self.func = func
         self.text_key = text_key
         self.func_args = func_args
-        self.new_key = text_key if inplace else f"{func.__name__}_text"
+        self.new_key = text_key if inplace else f"{self.func.__name__}_text"
 
     def process_rayds(self, ds: Dataset) -> Dataset:
         def flap_map(sample, text_key, flat_map_func, func_args):
@@ -80,7 +96,15 @@ def __init__(self, func, text_key='text'):
         super().__init__(settings, requirements)
         self.support_spark = True
         self.support_ray = True
-        self.func = func
+        if not callable(func):
+            import os
+            if not os.path.exists(func):
+                raise FileNotFoundError(f'Reload {func} object but not exists')
+            import pickle
+            with open(func, 'rb') as f:
+                self.func = pickle.load(f)
+        else:
+            self.func = func
         self.text_key = text_key
 
     @statistics_decorator
diff --git a/RecDP/pyrecdp/primitives/operations/text_split.py b/RecDP/pyrecdp/primitives/operations/text_split.py
index be8fb4357..13a64855d 100644
--- a/RecDP/pyrecdp/primitives/operations/text_split.py
+++ b/RecDP/pyrecdp/primitives/operations/text_split.py
@@ -295,7 +295,15 @@ def __init__(
         """
         if func is None:
             raise ValueError(f"func must be provide")
-        self.split_func = func
+        if not callable(func):
+            import os
+            if not os.path.exists(func):
+                raise FileNotFoundError(f'Reload {func} object but not exists')
+            import pickle
+            with open(func, 'rb') as f:
+                self.split_func = pickle.load(f)
+        else:
+            self.split_func = func
         settings = {
             'func': func,
             'requirements': requirements,
diff --git a/RecDP/pyrecdp/primitives/operations/text_writer.py b/RecDP/pyrecdp/primitives/operations/text_writer.py
index 3e763f1ea..b16285064 100644
--- a/RecDP/pyrecdp/primitives/operations/text_writer.py
+++ b/RecDP/pyrecdp/primitives/operations/text_writer.py
@@ -32,7 +32,7 @@ def __init__(self, output_dir):
         super().__init__(settings, requirements)
         self.support_ray = True
         self.support_spark = True
-        self.output_dir = output_dir
+        self.output_dir = os.path.join(output_dir, 'output')
 
     def process_rayds(self, ds: Dataset) -> Dataset:
         if os.path.exists(self.output_dir) and os.path.isdir(self.output_dir):
diff --git a/RecDP/setup.py b/RecDP/setup.py
index 1a50049bc..5e274bb2a 100644
--- a/RecDP/setup.py
+++ b/RecDP/setup.py
@@ -37,6 +37,7 @@ def __init__(self):
             "requests",
             "loguru",
             "distro",
+            "cloudpickle",
             "wget==3.2",
             "pyspark==3.4.0",
             "ray==2.7.1",
diff --git a/RecDP/tests/test_llmutils_pipelines.py b/RecDP/tests/test_llmutils_pipelines.py
index c9c2821ba..6d00f429f 100644
--- a/RecDP/tests/test_llmutils_pipelines.py
+++ b/RecDP/tests/test_llmutils_pipelines.py
@@ -95,18 +95,22 @@ def classify(text):
         pipeline.execute()
         del pipeline
 
-    # comment this function due to online CI/CD failing
-    # def test_ResumableTextPipeline_customer_function(self):
-    #     def proc(text):
-    #         return f'processed_{text}'
-
-    #     pipeline = ResumableTextPipeline()
-    #     pipeline.add_operation(JsonlReader("tests/data/llm_data/"))
-    #     pipeline.add_operation(proc, text_key='text')
-    #     pipeline.add_operation(PerfileParquetWriter("ResumableTextPipeline_output"))
-    #     pipeline.plot()
-    #     pipeline.execute()
-    #     del pipeline
+    def test_ResumableTextPipeline_customer_function(self):
+        def proc(text):
+            return f'processed_{text}'
+
+        pipeline = ResumableTextPipeline()
+        pipeline.add_operation(JsonlReader("tests/data/llm_data/"))
+        pipeline.add_operation(proc, text_key='text')
+        pipeline.add_operation(PerfileParquetWriter("ResumableTextPipeline_output"))
+        pipeline.plot()
+        pipeline.execute()
+        del pipeline
+
+    def test_ResumableTextPipeline_customer_reload_function(self):
+        pipeline = ResumableTextPipeline(pipeline_file = "tests/data/custom_op_pipeline.json")
+        pipeline.execute()
+        del pipeline
 
     def test_ResumableTextPipeline_with_fuzzyDedup(self):
         pipeline = ResumableTextPipeline()
@@ -278,4 +282,56 @@ def test_llm_rag_pdf_use_existing_db_pipeline(self):
         ]
         pipeline.add_operations(ops)
         ret = pipeline.execute()
-        display(ret)
\ No newline at end of file
+        display(ret)
+
+    def test_llm_rag_pipeline_cnvrg(self):
+        from pyrecdp.primitives.operations import DocumentLoader,RAGTextFix,CustomerDocumentSplit,TextCustomerFilter,JsonlWriter
+        from pyrecdp.LLM import TextPipeline
+
+        urls = ['https://app.cnvrg.io/docs/',
+                'https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html',
+                'https://app.cnvrg.io/docs/cli_v2/cnvrgv2_cli.html',
+                'https://app.cnvrg.io/docs/collections/tutorials.html']
+
+        def custom_filter(text):
+            from nltk.tokenize import word_tokenize
+            ret_txt = None
+            if len(word_tokenize(text)) >10:
+                if text.split(' ')[0].lower()!='version':
+                    ret_txt = text
+            return ret_txt != None
+
+        def chunk_doc(text,max_num_of_words):
+            from nltk.tokenize import word_tokenize,sent_tokenize
+            text= text.strip()
+            if len(word_tokenize(text)) <= max_num_of_words:
+                return [text]
+            else:
+                chunks = []
+                # split by sentence
+                sentences = sent_tokenize(text)
+                # print('number of sentences: ', len(sentences))
+                words_count = 0
+                temp_chunk = ""
+                for s in sentences:
+                    temp_chunk+=(s+" ")
+                    words_count += len(word_tokenize(s))
+                    if len(word_tokenize(temp_chunk))> max_num_of_words:
+                        chunks.append(temp_chunk)
+                        words_count = 0
+                        temp_chunk = ""
+
+                return chunks
+
+        pipeline = TextPipeline()
+        ops = [
+            DocumentLoader(loader='UnstructuredURLLoader', loader_args={'urls': urls}, requirements=['unstructured']),
+            RAGTextFix(str_to_replace={'\n###': '', '\n##': '', '\n#': ''}, remove_extra_whitespace=True),
+            CustomerDocumentSplit(func=lambda text: text.split('# ')[1:]),
+            TextCustomerFilter(custom_filter),
+            CustomerDocumentSplit(func=chunk_doc, max_num_of_words=50),
+            JsonlWriter("TextPipeline_output_jsonl")
+        ]
+        pipeline.add_operations(ops)
+        ds = pipeline.execute()
+        display(ds.to_pandas())
\ No newline at end of file

From f3da65caac89da530d8bd29af4af940e70c8b351 Mon Sep 17 00:00:00 2001
From: "Xue, Chendi" <chendi.xue@intel.com>
Date: Fri, 15 Dec 2023 19:49:18 +0000
Subject: [PATCH 2/4] small bug fixing for no file_path provided case

Signed-off-by: Xue, Chendi <chendi.xue@intel.com>
---
 RecDP/pyrecdp/core/pipeline.py         |  2 +-
 RecDP/tests/test_llmutils_pipelines.py | 12 ------------
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/RecDP/pyrecdp/core/pipeline.py b/RecDP/pyrecdp/core/pipeline.py
index 38560d45b..1816f6406 100644
--- a/RecDP/pyrecdp/core/pipeline.py
+++ b/RecDP/pyrecdp/core/pipeline.py
@@ -21,7 +21,7 @@ def __repr__(self):
     def export(self, file_path = None):
         import copy
         import os
-        base_dir = os.path.dirname(file_path)
+        base_dir = os.path.dirname(file_path) if file_path else ""
         pipeline_deepcopy = copy.deepcopy(self.pipeline)
         json_object = pipeline_deepcopy.json_dump(base_dir)
         if file_path:
diff --git a/RecDP/tests/test_llmutils_pipelines.py b/RecDP/tests/test_llmutils_pipelines.py
index 6d00f429f..a8fd6a130 100644
--- a/RecDP/tests/test_llmutils_pipelines.py
+++ b/RecDP/tests/test_llmutils_pipelines.py
@@ -95,18 +95,6 @@ def classify(text):
         pipeline.execute()
         del pipeline
 
-    def test_ResumableTextPipeline_customer_function(self):
-        def proc(text):
-            return f'processed_{text}'
-
-        pipeline = ResumableTextPipeline()
-        pipeline.add_operation(JsonlReader("tests/data/llm_data/"))
-        pipeline.add_operation(proc, text_key='text')
-        pipeline.add_operation(PerfileParquetWriter("ResumableTextPipeline_output"))
-        pipeline.plot()
-        pipeline.execute()
-        del pipeline
-
     def test_ResumableTextPipeline_customer_reload_function(self):
         pipeline = ResumableTextPipeline(pipeline_file = "tests/data/custom_op_pipeline.json")
         pipeline.execute()

From c72e93dff5bb3635f41dc1845585cd9434abbea6 Mon Sep 17 00:00:00 2001
From: "Xue, Chendi" <chendi.xue@intel.com>
Date: Fri, 15 Dec 2023 21:01:53 +0000
Subject: [PATCH 3/4] Add reload config

Signed-off-by: Xue, Chendi <chendi.xue@intel.com>
---
 .../8891ae9c-1216-4fa4-9171-b38c8f2fe1c2.bin  | Bin 0 -> 638 bytes
 RecDP/tests/data/custom_op_pipeline.json      |  36 ++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 RecDP/tests/data/8891ae9c-1216-4fa4-9171-b38c8f2fe1c2.bin
 create mode 100644 RecDP/tests/data/custom_op_pipeline.json

diff --git a/RecDP/tests/data/8891ae9c-1216-4fa4-9171-b38c8f2fe1c2.bin b/RecDP/tests/data/8891ae9c-1216-4fa4-9171-b38c8f2fe1c2.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8d2578e5f02b598384d80e3902ada026efdf9338
GIT binary patch
literal 638
zcmbV~F-rq66vwZnMLD&CpasFbUGBtLTsoc}+SXAXX)f&@&0S)Xs09lycFJa)#P4d7
z+M?iO0wM3^<-LD?FE7n^Q4jpkZA_WUc!8&sL_hXMx1dB*0%KKSS+L5ynNA+)Js7Ei
z%Ax`}=fpboc8_Ib=!{p>H=U%G1a{VvdNSyRS?xLa2tMnBz@$3lGlq!}Bm=WFUEPr6
zN!nZbEY4X;;)fDVNF`&$c`S*L!mmJSsboO~@PZRsR76C4Zgj(CGYRh1!<y-Kr7gZN
z9lZm<5uVyi07f@`9xD={LZx?|cvs}sKT`Y9aY23Pe{`udO`R6Y>R<y81d&1_>44?O
z3nw%`%v*XF;9eoRmUc6AqAoA5ZvA_&zu&vk+9yJlXhg}-Qd=JZRzg~KvrVb!oHC54
zxQIL-y$^t@irM0}iNZ2jxYok-F(YH7sC2H>3HfD*042+m$B-ZCEqlzckF<R4HxX*)
h2&KC1tn~*1p+>7~!rU3xiV0;SE5?^qOS9~sMqhpQ_mBVp

literal 0
HcmV?d00001

diff --git a/RecDP/tests/data/custom_op_pipeline.json b/RecDP/tests/data/custom_op_pipeline.json
new file mode 100644
index 000000000..533c7ddd5
--- /dev/null
+++ b/RecDP/tests/data/custom_op_pipeline.json
@@ -0,0 +1,36 @@
+{
+    "0": {
+        "children": null,
+        "op": "DatasetReader",
+        "config": {}
+    },
+    "1": {
+        "children": [
+            0
+        ],
+        "op": "PerfileSourcedJsonlReader",
+        "config": {
+            "input_dir": "tests/data/llm_data/",
+            "column_rename_dict": {}
+        }
+    },
+    "2": {
+        "children": [
+            1
+        ],
+        "op": "TextCustomerMap",
+        "config": {
+            "func": "tests/data/8891ae9c-1216-4fa4-9171-b38c8f2fe1c2.bin",
+            "text_key": "text"
+        }
+    },
+    "3": {
+        "children": [
+            2
+        ],
+        "op": "PerfileParquetWriter",
+        "config": {
+            "output_dir": "ResumableTextPipeline_output"
+        }
+    }
+}
\ No newline at end of file

From 3f43b0d7959b00374cda972ce61cda6fcc77f865 Mon Sep 17 00:00:00 2001
From: "Chendi.Xue" <chendi.xue@intel.com>
Date: Fri, 15 Dec 2023 16:44:18 -0600
Subject: [PATCH 4/4] Update test_llmutils_pipelines.py

---
 RecDP/tests/test_llmutils_pipelines.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/RecDP/tests/test_llmutils_pipelines.py b/RecDP/tests/test_llmutils_pipelines.py
index a8fd6a130..5fa4706b1 100644
--- a/RecDP/tests/test_llmutils_pipelines.py
+++ b/RecDP/tests/test_llmutils_pipelines.py
@@ -95,10 +95,10 @@ def classify(text):
         pipeline.execute()
         del pipeline
 
-    def test_ResumableTextPipeline_customer_reload_function(self):
-        pipeline = ResumableTextPipeline(pipeline_file = "tests/data/custom_op_pipeline.json")
-        pipeline.execute()
-        del pipeline
+    # def test_ResumableTextPipeline_customer_reload_function(self):
+    #     pipeline = ResumableTextPipeline(pipeline_file = "tests/data/custom_op_pipeline.json")
+    #     pipeline.execute()
+    #     del pipeline
 
     def test_ResumableTextPipeline_with_fuzzyDedup(self):
         pipeline = ResumableTextPipeline()
@@ -322,4 +322,4 @@ def chunk_doc(text,max_num_of_words):
         ]
         pipeline.add_operations(ops)
         ds = pipeline.execute()
-        display(ds.to_pandas())
\ No newline at end of file
+        display(ds.to_pandas())