From 77609b2922d3a25d6d1797aef7669de743cefd5c Mon Sep 17 00:00:00 2001 From: "Zhang, Chaojun" Date: Wed, 20 Dec 2023 20:49:05 +0000 Subject: [PATCH] add notebook for document loader, document split and document ingestion --- .../notebooks/llmutils/document_extract.ipynb | 971 ++++++++++++-- .../llmutils/document_ingestion.ipynb | 944 ++++++++++++++ .../notebooks/llmutils/document_split.ipynb | 1130 ++++++++++++++++- RecDP/pyrecdp/LLM/README.md | 7 +- .../primitives/llmutils/document/reader.py | 15 +- .../primitives/operations/text_ingestion.py | 13 +- RecDP/tests/test_llmutils_operations.py | 7 +- 7 files changed, 2983 insertions(+), 104 deletions(-) create mode 100644 RecDP/examples/notebooks/llmutils/document_ingestion.ipynb diff --git a/RecDP/examples/notebooks/llmutils/document_extract.ipynb b/RecDP/examples/notebooks/llmutils/document_extract.ipynb index eb7eb6277..d11cb1128 100644 --- a/RecDP/examples/notebooks/llmutils/document_extract.ipynb +++ b/RecDP/examples/notebooks/llmutils/document_extract.ipynb @@ -95,7 +95,7 @@ "id": "dhrC92TbIriM" }, "source": [ - "## 3. convert data" + "## 3. 
load documents" ] }, { @@ -105,12 +105,12 @@ "id": "MLj8YsmXgKFa" }, "source": [ - "#### 3.1 convert pdf" + "#### 3.1 load pdf documents" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 1, "id": "bKDlmC0CdlGY", "metadata": { "colab": { @@ -124,28 +124,63 @@ "name": "stdout", "output_type": "stream", "text": [ - "Document extract for '/content/test_data/document/layout-parser-paper.pdf' with [glob=**/*.pdf, required_exts=None, recursive=False, multithread=False] started ...\n" + "_get_loader\n", + "\u001b[32m2023-12-20 14:52:16.382\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pypdf\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_get_loader\n", + "\u001b[32m2023-12-20 14:52:16.469\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pypdf\u001b[0m\n", + "init ray\n", + "init ray with total mem of 324413575987, total core of 48\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 3.16it/s]" + "2023-12-20 14:52:21,156\tINFO worker.py:1642 -- Started a local Ray instance.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Document extract for '/content/test_data/document/layout-parser-paper.pdf' with [glob=**/*.pdf, required_exts=None, recursive=False, multithread=False] took 0.32126119174063206 sec\n" + "execute with ray started ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\n" + "100%|██████████| 1/1 [00:00<00:00, 2.85it/s]\n", + "2023-12-20 14:52:22,927\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Write]\n", + "2023-12-20 14:52:22,932\tINFO streaming_executor.py:94 -- Execution 
config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 14:52:22,933\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1defe0fb210a4084b4b8967891df4251", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00\n", " \n", " 0\n", - " LayoutParser : A Unified Toolkit for Deep\\nLear...\n", + " \\n\\n LayoutParser : A Unified Toolkit for Deep\\...\n", " {'source': '/content/test_data/document/layout...\n", " \n", " \n", @@ -184,27 +219,37 @@ "" ], "text/plain": [ - " text \\\n", - "0 LayoutParser : A Unified Toolkit for Deep\\nLear... \n", - "\n", - " metadata \n", - "0 {'source': '/content/test_data/document/layout... " + " text metadata\n", + "0 \\n\\n LayoutParser : A Unified Toolkit for Deep\\... {'source': '/content/test_data/document/layout..." ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 14:52:30,040 E 2010245 2010266] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_14-52-18_287180_2002748 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 14:52:40,054 E 2010245 2010266] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_14-52-18_287180_2002748 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 14:52:50,065 E 2010245 2010266] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_14-52-18_287180_2002748 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 14:53:00,079 E 2010245 2010266] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_14-52-18_287180_2002748 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 14:53:10,093 E 2010245 2010266] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_14-52-18_287180_2002748 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n" + ] } ], "source": [ - "from pyrecdp.primitives.llmutils.document_extractor import pdf_to_text\n", - "import pandas as pd\n", - "\n", - "file_name = \"layout-parser-paper.pdf\"\n", - "in_file = \"/content/test_data/document/\" + file_name\n", - "out_file = \"/content/doc_jsonl/\" + file_name + \".jsonl\"\n", - "pdf_to_text(in_file, out_file)\n", - "display(pd.read_json(out_file, lines=True))\n", + "from pyrecdp.primitives.operations import DirectoryLoader\n", "\n", + "from pyrecdp.LLM import TextPipeline\n", + " \n", + "pipeline = TextPipeline()\n", + "ops = [\n", + " DirectoryLoader(input_dir=\"/content/test_data/document\", glob=\"**/*.pdf\")\n", + "]\n", + "pipeline.add_operations(ops)\n", + "ds = pipeline.execute()\n", + "display(ds.to_pandas())\n", " " ] }, @@ -215,12 +260,12 @@ "id": "UAWi-jXFgXuy" }, "source": [ - "#### 3.2 convert docx" + "#### 3.2 load word documents" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "id": "pWl-ym01ga9D", "metadata": { "colab": { @@ -234,28 +279,57 @@ "name": "stdout", "output_type": 
"stream", "text": [ - "Document extract for '/content/test_data/document/handbook-872p.docx' with [glob=**/*.docx, required_exts=None, recursive=False, multithread=False] started ...\n" + "_get_loader\n", + "\u001b[32m2023-12-20 14:53:12.711\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install python-docx\u001b[0m\n", + "_get_loader\n", + "\u001b[32m2023-12-20 14:53:12.803\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install python-docx\u001b[0m\n", + "init ray\n", + "init ray with total mem of 324413575987, total core of 48\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.94it/s]" + "2023-12-20 14:53:17,517\tINFO worker.py:1642 -- Started a local Ray instance.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Document extract for '/content/test_data/document/handbook-872p.docx' with [glob=**/*.docx, required_exts=None, recursive=False, multithread=False] took 0.520426164381206 sec\n" + "execute with ray started ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\n" + "100%|██████████| 1/1 [00:00<00:00, 1.64it/s]\n", + "2023-12-20 14:53:19,445\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Write]\n", + "2023-12-20 14:53:19,448\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 14:53:19,450\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress 
= True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "34082bbc3a274e59961eea59b5059b3a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00\n", " \n", " 0\n", - " U.S. Department of Justice\\nExecutive Office f...\n", + " \\n\\n U.S. Department of Justice\\n\\n Executive ...\n", " {'source': '/content/test_data/document/handbo...\n", " \n", " \n", @@ -294,27 +368,33 @@ "" ], "text/plain": [ - " text \\\n", - "0 U.S. Department of Justice\\nExecutive Office f... \n", - "\n", - " metadata \n", - "0 {'source': '/content/test_data/document/handbo... " + " text metadata\n", + "0 \\n\\n U.S. Department of Justice\\n\\n Executive ... {'source': '/content/test_data/document/handbo..." ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 14:53:26,399 E 2015878 2015899] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_14-53-14_626786_2002748 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n" + ] } ], "source": [ - "from pyrecdp.primitives.llmutils.document_extractor import docx_to_text\n", - "import pandas as pd\n", - "\n", - "file_name = \"handbook-872p.docx\"\n", - "in_file = \"/content/test_data/document/\" + file_name\n", - "out_file = \"/content/doc_jsonl/\" + file_name + \".jsonl\"\n", - "docx_to_text(in_file, out_file)\n", + "from pyrecdp.primitives.operations import DirectoryLoader\n", "\n", - "display(pd.read_json(out_file, lines=True))" + "from pyrecdp.LLM import TextPipeline\n", + " \n", + "pipeline = TextPipeline()\n", + "ops = [\n", + " DirectoryLoader(input_dir=\"/content/test_data/document\", glob=\"**/*.docx\")\n", + "]\n", + "pipeline.add_operations(ops)\n", + "ds = pipeline.execute()\n", + "display(ds.to_pandas())" ] }, { @@ -324,12 +404,12 @@ "id": "fUrG8v1Og7Tb" }, "source": [ - "#### 3.3 convert images" + "#### 3.3 load images" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, "id": "0bowqv7Ag7hI", "metadata": { "colab": { @@ -343,28 +423,130 @@ "name": "stdout", "output_type": "stream", "text": [ - "Document extract for '/content/test_data/document/layout-parser-paper-10p.jpg' with [glob=**/*.*, required_exts=['.jpeg', '.jpg', '.png'], recursive=False, multithread=False] started ...\n" + "_get_loader\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "tesseract-ocr is already the newest version (4.1.1-2.1build1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 134 not upgraded.\n", + "\u001b[32m2023-12-20 14:53:36.861\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pillow\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: pip is being invoked by an old script wrapper. 
This will fail in a future version of pip.\n", + "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n", + "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n" + ] + }, + { + "data": { + "text/html": [ + "
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2023-12-20 14:53:46.379\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pytesseract\u001b[0m\n", + "_get_loader\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "tesseract-ocr is already the newest version (4.1.1-2.1build1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 134 not upgraded.\n", + "\u001b[32m2023-12-20 14:53:46.933\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pillow\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:31<00:00, 31.67s/it]" + "WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.\n", + "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n", + "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n" ] }, + { + "data": { + "text/html": [ + "
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "Document extract for '/content/test_data/document/layout-parser-paper-10p.jpg' with [glob=**/*.*, required_exts=['.jpeg', '.jpg', '.png'], recursive=False, multithread=False] took 31.676521027460694 sec\n" + "\u001b[32m2023-12-20 14:53:54.602\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pytesseract\u001b[0m\n", + "init ray\n", + "init ray with total mem of 324413575987, total core of 48\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\n" + "2023-12-20 14:53:59,250\tINFO worker.py:1642 -- Started a local Ray instance.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "execute with ray started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00 TaskPoolMapOperator[Write]\n", + "2023-12-20 14:54:35,492\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 14:54:35,493\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "456480681a1143d7940f174694d8ddf1", + "version_major": 2, + "version_minor": 0 + }, + 
"text/plain": [ + "Running 0: 0%| | 0/1 [00:00\n", " \n", " 0\n", - " 2103.15348v2 [cs.CV] 21 Jun 2021\\n\\narXiv\\n\\nL...\n", + " 2103.15348v2 [cs.CV] 21 Jun 2021\\n\\narXiv\\n\\n...\n", " {'source': '/content/test_data/document/layout...\n", " \n", " \n", @@ -403,27 +585,33 @@ "" ], "text/plain": [ - " text \\\n", - "0 2103.15348v2 [cs.CV] 21 Jun 2021\\n\\narXiv\\n\\nL... \n", - "\n", - " metadata \n", - "0 {'source': '/content/test_data/document/layout... " + " text metadata\n", + "0 2103.15348v2 [cs.CV] 21 Jun 2021\\n\\narXiv\\n\\n... {'source': '/content/test_data/document/layout..." ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 14:54:38,262 E 2021288 2021308] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_14-53-56_430977_2002748 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n" + ] } ], "source": [ - "from pyrecdp.primitives.llmutils.document_extractor import image_to_text\n", - "import pandas as pd\n", - "\n", - "file_name = \"layout-parser-paper-10p.jpg\"\n", - "in_file = \"/content/test_data/document/\" + file_name\n", - "out_file = \"/content/doc_jsonl/\" + file_name + \".jsonl\"\n", - "image_to_text(in_file, out_file)\n", + "from pyrecdp.primitives.operations import DirectoryLoader\n", "\n", - "display(pd.read_json(out_file, lines=True))" + "from pyrecdp.LLM import TextPipeline\n", + " \n", + "pipeline = TextPipeline()\n", + "ops = [\n", + " DirectoryLoader(input_dir=\"/content/test_data/document\", glob=\"**/*.jpg\")\n", + "]\n", + "pipeline.add_operations(ops)\n", + "ds = pipeline.execute()\n", + "display(ds.to_pandas())" ] }, { @@ -433,12 +621,12 @@ "id": "yzrS5hVsl8QZ" }, "source": [ - "#### 3.4 convert entire directory" + "#### 3.4 load entire directory" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 1, "id": 
"wXR8_zDUdy2w", "metadata": { "colab": { @@ -448,32 +636,221 @@ "outputId": "77c2a54d-509b-4c54-e3de-8b51fefff080" }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", + " warnings.warn(\"Setuptools is replacing distutils.\")\n", + "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "\u001b[32m2023-12-20 15:20:13.169\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install emoji==2.2.0\u001b[0m\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Document extract for '/content/test_data/document/' with [glob=**/*.*, required_exts=None, recursive=False, multithread=True] started ...\n" + "_get_loader\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "tesseract-ocr is already the newest version (4.1.1-2.1build1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 134 not upgraded.\n", + "\u001b[32m2023-12-20 15:20:13.692\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pillow\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 40/40 [00:32<00:00, 1.22it/s]" + "WARNING: pip is being invoked by an old script wrapper. 
This will fail in a future version of pip.\n", + "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n", + "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n" ] }, + { + "data": { + "text/html": [ + "
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "Document extract for '/content/test_data/document/' with [glob=**/*.*, required_exts=None, recursive=False, multithread=True] took 32.91086395457387 sec\n" + "\u001b[32m2023-12-20 15:20:23.167\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pytesseract\u001b[0m\n", + "\u001b[32m2023-12-20 15:20:23.172\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pypdf\u001b[0m\n", + "\u001b[32m2023-12-20 15:20:23.382\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install python-docx\u001b[0m\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "tesseract-ocr is already the newest version (4.1.1-2.1build1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 134 not upgraded.\n", + "\u001b[32m2023-12-20 15:20:23.902\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pillow\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\n" + "WARNING: pip is being invoked by an old script wrapper. 
This will fail in a future version of pip.\n", + "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n", + "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n" + ] + }, + { + "data": { + "text/html": [ + "
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2023-12-20 15:20:31.290\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pytesseract\u001b[0m\n", + "_get_loader\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "tesseract-ocr is already the newest version (4.1.1-2.1build1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 134 not upgraded.\n", + "\u001b[32m2023-12-20 15:20:31.823\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pillow\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.\n", + "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n", + "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n" + ] + }, + { + "data": { + "text/html": [ + "
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2023-12-20 15:20:38.180\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pytesseract\u001b[0m\n", + "\u001b[32m2023-12-20 15:20:38.183\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pypdf\u001b[0m\n", + "\u001b[32m2023-12-20 15:20:38.185\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install python-docx\u001b[0m\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "tesseract-ocr is already the newest version (4.1.1-2.1build1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 134 not upgraded.\n", + "\u001b[32m2023-12-20 15:20:38.670\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pillow\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: pip is being invoked by an old script wrapper. 
This will fail in a future version of pip.\n", + "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n", + "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n" + ] + }, + { + "data": { + "text/html": [ + "
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2023-12-20 15:20:52.243\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install pytesseract\u001b[0m\n", + "init ray\n", + "init ray with total mem of 324413575987, total core of 48\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-12-20 15:20:56,945\tINFO worker.py:1642 -- Started a local Ray instance.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "execute with ray started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 92%|█████████▎| 37/40 [00:00<00:00, 91.06it/s]\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:21:05,827 E 2140220 2140243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-20-54_049751_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:21:15,838 E 2140220 2140243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-20-54_049751_2138313 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + " 98%|█████████▊| 39/40 [00:19<00:00, 91.06it/s]\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:21:25,850 E 2140220 2140243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-20-54_049751_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "100%|██████████| 40/40 [00:35<00:00, 1.13it/s]\n", + "2023-12-20 15:21:33,755\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Write]\n", + "2023-12-20 15:21:33,757\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 15:21:33,758\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f83689225b41469a8d7c1b59f24f81bb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/4 [00:00\n", " \n", " 0\n", - " RULES AND INSTRUCTIONS\\n\\n1. Template for day ...\n", + " \\n\\n RULES AND INSTRUCTIONS\\n\\n1. Template for...\n", " {'source': '/content/test_data/document/englis...\n", " \n", " \n", " 1\n", - " U.S. Department of Justice\\nExecutive Office f...\n", + " \\n\\n U.S. 
Department of Justice\\n\\n Executive ...\n", " {'source': '/content/test_data/document/handbo...\n", " \n", " \n", " 2\n", - " 2103.15348v2 [cs.CV] 21 Jun 2021\\n\\narXiv\\n\\nL...\n", + " 2103.15348v2 [cs.CV] 21 Jun 2021\\n\\narXiv\\n\\n...\n", " {'source': '/content/test_data/document/layout...\n", " \n", " \n", " 3\n", - " LayoutParser : A Unified Toolkit for Deep\\nLear...\n", + " \\n\\n LayoutParser : A Unified Toolkit for Deep\\...\n", " {'source': '/content/test_data/document/layout...\n", " \n", " \n", @@ -528,10 +905,10 @@ ], "text/plain": [ " text \\\n", - "0 RULES AND INSTRUCTIONS\\n\\n1. Template for day ... \n", - "1 U.S. Department of Justice\\nExecutive Office f... \n", - "2 2103.15348v2 [cs.CV] 21 Jun 2021\\n\\narXiv\\n\\nL... \n", - "3 LayoutParser : A Unified Toolkit for Deep\\nLear... \n", + "0 \\n\\n RULES AND INSTRUCTIONS\\n\\n1. Template for... \n", + "1 \\n\\n U.S. Department of Justice\\n\\n Executive ... \n", + "2 2103.15348v2 [cs.CV] 21 Jun 2021\\n\\narXiv\\n\\n... \n", + "3 \\n\\n LayoutParser : A Unified Toolkit for Deep\\... \n", "\n", " metadata \n", "0 {'source': '/content/test_data/document/englis... \n", @@ -542,15 +919,447 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:21:35,862 E 2140220 2140243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-20-54_049751_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:21:45,876 E 2140220 2140243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-20-54_049751_2138313 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n" + ] + } + ], + "source": [ + "from pyrecdp.primitives.operations import DirectoryLoader\n", + "\n", + "from pyrecdp.LLM import TextPipeline\n", + " \n", + "pipeline = TextPipeline()\n", + "ops = [\n", + " DirectoryLoader(input_dir=\"/content/test_data/document\")\n", + "]\n", + "pipeline.add_operations(ops)\n", + "ds = pipeline.execute()\n", + "display(ds.to_pandas())" + ] + }, + { + "cell_type": "markdown", + "id": "182fe042", + "metadata": {}, + "source": [ + "#### 3.5 Url Loaders with langchain RecursiveUrlLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "71f1ead9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "init ray\n", + "init ray with total mem of 324413575987, total core of 48\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-12-20 15:28:21,062\tINFO worker.py:1642 -- Started a local Ray instance.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "execute with ray started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:28:30,032 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:28:40,047 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "2023-12-20 15:28:41,766\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Write]\n", + "2023-12-20 15:28:41,767\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 15:28:41,768\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f30151903fdf42978b4f2166f24e5ce9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/17 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textmetadata
0<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
1<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
2<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
3<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
4<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
5<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
6<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
7<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
8<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
9<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
10<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
11<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
12<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
13<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
14<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
15<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
16<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
\n", + "" + ], + "text/plain": [ + " text \\\n", + "0 \\n\\n ... \n", + "1 \\n\\n ... \n", + "2 \\n\\n ... \n", + "3 \\n\\n ... \n", + "4 \\n\\n ... \n", + "5 \\n\\n ... \n", + "6 \\n\\n ... \n", + "7 \\n\\n ... \n", + "8 \\n\\n ... \n", + "9 \\n\\n ... \n", + "10 \\n\\n ... \n", + "11 \\n\\n ... \n", + "12 \\n\\n ... \n", + "13 \\n\\n ... \n", + "14 \\n\\n ... \n", + "15 \\n\\n ... \n", + "16 \\n\\n ... \n", + "\n", + " metadata \n", + "0 {'description': 'Documentation website for cnv... \n", + "1 {'description': 'Documentation website for cnv... \n", + "2 {'description': 'Documentation website for cnv... \n", + "3 {'description': 'Documentation website for cnv... \n", + "4 {'description': 'Documentation website for cnv... \n", + "5 {'description': 'Documentation website for cnv... \n", + "6 {'description': 'Documentation website for cnv... \n", + "7 {'description': 'Documentation website for cnv... \n", + "8 {'description': 'Documentation website for cnv... \n", + "9 {'description': 'Documentation website for cnv... \n", + "10 {'description': 'Documentation website for cnv... \n", + "11 {'description': 'Documentation website for cnv... \n", + "12 {'description': 'Documentation website for cnv... \n", + "13 {'description': 'Documentation website for cnv... \n", + "14 {'description': 'Documentation website for cnv... \n", + "15 {'description': 'Documentation website for cnv... \n", + "16 {'description': 'Documentation website for cnv... " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:28:50,060 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:29:00,074 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:29:10,089 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:29:20,105 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:29:30,116 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:29:40,128 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:29:50,142 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:30:00,153 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:30:10,166 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:30:20,178 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:30:30,192 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:30:40,206 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:30:50,219 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:31:00,233 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:31:10,245 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:31:20,257 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:31:30,271 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:31:40,285 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:31:50,298 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:32:00,312 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:32:10,323 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:32:20,336 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:32:30,349 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:32:40,362 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:32:50,375 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:33:00,386 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:33:10,401 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:33:20,414 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:33:30,428 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:33:40,443 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:33:50,458 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:34:00,471 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:34:10,485 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:34:20,499 E 2159771 2159790] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-28-18_227590_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n" + ] } ], "source": [ - "from pyrecdp.primitives.llmutils.document_extractor import document_to_text\n", - "import pandas as pd\n", - "in_file = \"/content/test_data/document/\"\n", - "out_file = \"/content/doc_jsonl/\" + \"document.json\"\n", - "document_to_text(in_file, out_file, use_multithreading=True)\n", - "display(pd.read_json(out_file, lines=True))" + "url = 'https://app.cnvrg.io/docs/'\n", + "\n", + "from pyrecdp.LLM import TextPipeline\n", + "from pyrecdp.primitives.operations import DocumentLoader\n", + "\n", + "pipeline = TextPipeline()\n", + "ops = [\n", + " DocumentLoader(loader='RecursiveUrlLoader', loader_args={\"url\": url}),\n", + "]\n", + "pipeline.add_operations(ops)\n", + "ds = pipeline.execute()\n", + "display(ds.to_pandas())\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "198739a6", + "metadata": {}, + "source": [ + "#### 3.6 Online PDF loader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "514d9f03", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "init 
ray\n", + "init ray with total mem of 324413575987, total core of 48\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-12-20 15:37:49,289\tINFO worker.py:1642 -- Started a local Ray instance.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "execute with ray started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:37:58,263 E 2179392 2179417] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-37-46_470728_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "2023-12-20 15:37:58,360\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Write]\n", + "2023-12-20 15:37:58,362\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 15:37:58,363\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6d4d0dd6aa644a6fa584cb901dbad3e3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textmetadata
03 2 0 2\\n\\ng u A 2\\n\\n] L C . s c [\\n\\n7 v 2 6...{'source': '/tmp/tmp7lpqus0e/tmp.pdf'}
\n", + "" + ], + "text/plain": [ + " text \\\n", + "0 3 2 0 2\\n\\ng u A 2\\n\\n] L C . s c [\\n\\n7 v 2 6... \n", + "\n", + " metadata \n", + "0 {'source': '/tmp/tmp7lpqus0e/tmp.pdf'} " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:38:08,277 E 2179392 2179417] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-37-46_470728_2138313 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n" + ] + } + ], + "source": [ + "attention_is_all_you_need_pdf = 'https://arxiv.org/pdf/1706.03762.pdf'\n", + "\n", + "from pyrecdp.LLM import TextPipeline\n", + "from pyrecdp.primitives.operations import DocumentLoader\n", + "\n", + "pipeline = TextPipeline()\n", + "ops = [\n", + " DocumentLoader(loader='OnlinePDFLoader', loader_args={\"file_path\": attention_is_all_you_need_pdf}),\n", + "]\n", + "pipeline.add_operations(ops)\n", + "ds = pipeline.execute()\n", + "display(ds.to_pandas())" ] } ], diff --git a/RecDP/examples/notebooks/llmutils/document_ingestion.ipynb b/RecDP/examples/notebooks/llmutils/document_ingestion.ipynb new file mode 100644 index 000000000..69f67b2d8 --- /dev/null +++ b/RecDP/examples/notebooks/llmutils/document_ingestion.ipynb @@ -0,0 +1,944 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "736fb211-dbe6-4ca9-a1b1-db2cff2d287a", + "metadata": {}, + "source": [ + "# RecDP LLM - Document Ingestion" + ] + }, + { + "cell_type": "markdown", + "id": "5046b222", + "metadata": {}, + "source": [ + "# Get started" + ] + }, + { + "cell_type": "markdown", + "id": "54ebdd0e", + "metadata": {}, + "source": [ + "## 1. 
Install pyrecdp and dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2411d13e", + "metadata": { + "ExecuteTime": { + "end_time": "2023-10-09T16:00:24.332535Z", + "start_time": "2023-10-09T16:00:19.320447Z" + }, + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "! DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre\n", + "! pip install pyrecdp --pre\n", + "# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'" + ] + }, + { + "cell_type": "markdown", + "id": "efb80b18", + "metadata": {}, + "source": [ + "## 2. prepare your own data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16bdbeb3", + "metadata": {}, + "outputs": [], + "source": [ + "%mkdir -p /content/test_data\n", + "%cd /content/test_data\n", + "%mkdir -p /content/doc_jsonl\n", + "file_names = ['english-and-korean.png', 'handbook-872p.docx', 'layout-parser-paper-10p.jpg', 'layout-parser-paper.pdf']\n", + "file_list = [f\"https://raw.githubusercontent.com/intel/e2eAIOK/main/RecDP/tests/data/llm_data/document/{i}\" for i in file_names]\n", + "!wget -P /content/test_data/document/ {\" \".join(file_list)}" + ] + }, + { + "cell_type": "markdown", + "id": "641b4705", + "metadata": {}, + "source": [ + "## 3. 
DocumentIngestion" + ] + }, + { + "cell_type": "markdown", + "id": "bee64090", + "metadata": {}, + "source": [ + "### 3.1 Load document" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0435bcf8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", + " warnings.warn(\"Setuptools is replacing distutils.\")\n", + "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", + " warnings.warn(\n", + "\u001b[32m2023-12-20 16:05:50.470\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mcheck_availability_and_install emoji==2.2.0\u001b[0m\n", + "2023-12-20 16:05:55,014\tINFO worker.py:1642 -- Started a local Ray instance.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textmetadata
0<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 \\n\\n ... \n", + "\n", + " metadata \n", + "0 {'description': 'Documentation website for cnv... " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:04,911 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:14,923 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:24,935 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:34,946 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:44,959 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:54,973 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:04,984 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:14,997 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:25,010 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:35,023 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:45,036 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:55,049 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:05,061 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:15,074 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:25,087 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:35,098 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:45,110 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:55,122 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:05,134 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:15,147 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:25,160 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:35,172 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:45,186 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:55,198 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:05,211 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:15,224 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:25,237 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:35,250 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:45,260 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:55,272 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:05,284 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:15,298 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:25,311 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:35,323 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:45,337 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:55,350 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:12:05,362 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:12:15,373 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:12:25,384 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n" + ] + } + ], + "source": [ + "from pyrecdp.primitives.operations import DocumentSplit,DocumentLoader\n", + "\n", + "loader = DocumentLoader(loader=\"RecursiveUrlLoader\", loader_args={\"url\": \"https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html\"})\n", + "\n", + "ds = loader.process_rayds()\n", + "display(ds.to_pandas())" + ] + }, + { + "cell_type": "markdown", + "id": "dfdd9850", + "metadata": {}, + "source": [ + "### 3.2 Embedding document\n", + "\n", + "we support ingest document into following document store:\n", + "- [faiss](https://github.com/facebookresearch/faiss) vector store\n", + "- [chroma](https://github.com/chroma-core/chroma) vector store\n", + "- [elasticsearch](https://github.com/elastic/elasticsearch)\n", + "\n", + "we provide the [DocumentIngestion](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_ingestion.py#L239) operator for ingest documents into document store" + ] + }, + { + "cell_type": "markdown", + "id": "4a608977", + "metadata": {}, + "source": [ + "#### 3.2.1 Ingest into FAISS vector 
store" + ] + }, + { + "cell_type": "markdown", + "id": "833d1ec3", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "77af99e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2023-12-20 16:17:21.188\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m47\u001b[0m - \u001b[1mcheck_availability_and_install ['langchain']\u001b[0m\n", + "\u001b[32m2023-12-20 16:17:25.209\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m47\u001b[0m - \u001b[1mcheck_availability_and_install ['langchain']\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:25,757 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "init ray\n", + "execute with ray started ...\n", + "\u001b[32m2023-12-20 16:17:31.848\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpyrecdp.core.import_utils\u001b[0m:\u001b[36mcheck_availability_and_install\u001b[0m:\u001b[36m47\u001b[0m - \u001b[1mcheck_availability_and_install ['faiss-cpu', 'faiss-gpu', 'langchain']\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.\n", + "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n", + "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n" + ] + }, + { + "data": { + "text/html": [ + "
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:35,769 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.\n", + "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n", + "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:45,782 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:55,795 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n" + ] + }, + { + "data": { + "text/html": [ + "
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:18:05,807 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "2023-12-20 16:18:07,819\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[FlatMap()->MapBatches(TextEmbedding)]\n", + "2023-12-20 16:18:07,820\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 16:18:07,822\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "2023-12-20 16:18:09,643\tINFO actor_pool_map_operator.py:106 -- FlatMap()->MapBatches(TextEmbedding): Waiting for 1 pool actors to start...\n", + "\u001b[2m\u001b[36m(_MapWorker pid=2276648)\u001b[0m /root/miniforge3/envs/recdp/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", + "\u001b[2m\u001b[36m(_MapWorker pid=2276648)\u001b[0m warnings.warn(\"Setuptools is replacing distutils.\")\n", + "\u001b[2m\u001b[36m(_MapWorker pid=2276648)\u001b[0m 2023-12-20 16:18:11.695 | INFO | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install 
emoji==2.2.0\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b021583f4b0e450b84c9971001a5efd9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00 ActorPoolMapOperator[FlatMap()->MapBatches(TextEmbedding)]\n", + "2023-12-20 16:20:21,538\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 16:20:21,539\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "2023-12-20 16:20:23,482\tINFO actor_pool_map_operator.py:106 -- FlatMap()->MapBatches(TextEmbedding): Waiting for 1 pool actors to start...\n", + "\u001b[2m\u001b[36m(_MapWorker pid=2279978)\u001b[0m /root/miniforge3/envs/recdp/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", + "\u001b[2m\u001b[36m(_MapWorker pid=2279978)\u001b[0m warnings.warn(\"Setuptools is replacing distutils.\")\n", + "\u001b[2m\u001b[36m(_MapWorker pid=2279978)\u001b[0m 2023-12-20 16:20:25.523 | INFO | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:20:25,971 E 2251227 2251243] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-05-53_056733_2250766 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8c9f83114b9b4920a6907c8a77bc3c39", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textembedding
0<!DOCTYPE html>\\r\\n<html lang=\"en\">\\r\\n <he...[0.02279656007885933, -0.004381305538117886, -...
1<script type=\"text/javascript\">\\r\\n //c...[-0.013796613551676273, -0.04453909769654274, ...
2<a href=\"/news-events/press-releases\" >Press R...[0.009947280399501324, 0.010400775820016861, -...
3</li>\\r\\n <...[0.02152082696557045, 0.03438470885157585, -0....
4<div class=\"text\">\\r\\n ...[0.0012505522463470697, -0.03194170445203781, ...
.........
113</td>\\r\\n<td width=\"5\" valign=\"bottom\" style=\"...[-0.009348627179861069, -0.031560588628053665,...
114</svg>\\r\\n <span>Tear Sheet</sp...[-0.018031703308224678, -0.08937390893697739, ...
115</svg>\\r\\n <span>RSS News Feed<...[0.029551001265645027, -0.027967212721705437, ...
116</svg>\\r\\n </a>\\r\\n ...[0.031037108972668648, -0.05818837508559227, -...
117</div>\\r\\n <div id=\"footer-disc...[-0.014034147374331951, 0.020656155422329903, ...
\n", + "

118 rows × 2 columns

\n", + "" + ], + "text/plain": [ + " text \\\n", + "0 \\r\\n\\r\\n \\r\\n //c... \n", + "2 Press R... \n", + "3 \\r\\n <... \n", + "4 \\r\\n
TaskPoolMapOperator[FlatMap()]\n", + "2023-12-20 16:46:09,016\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 16:46:09,017\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ef3e706862394e318584750651e720be", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00) pid=2326233)\u001b[0m /root/miniforge3/envs/recdp/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", + "\u001b[2m\u001b[36m(FlatMap() pid=2326233)\u001b[0m warnings.warn(\"Setuptools is replacing distutils.\")\n", + "\u001b[2m\u001b[36m(FlatMap() pid=2326233)\u001b[0m 2023-12-20 16:46:10.667 | INFO | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:46:11,993 E 2326022 2326053] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-45-19_818714_2325066 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:46:22,005 E 2326022 2326053] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_16-45-19_818714_2325066 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "2023-12-20 16:46:25,523\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap()]\n", + "2023-12-20 16:46:25,524\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 16:46:25,525\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c56994e5058e426da5bd33dc3520167e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textmetadata
0<!DOCTYPE html>\\r\\n<html lang=\"en\">\\r\\n <he...{'language': 'en', 'source': 'https://www.intc...
1<script type=\"text/javascript\">\\r\\n //c...{'language': 'en', 'source': 'https://www.intc...
2<a href=\"/news-events/press-releases\" >Press R...{'language': 'en', 'source': 'https://www.intc...
3</li>\\r\\n <...{'language': 'en', 'source': 'https://www.intc...
4<div class=\"text\">\\r\\n ...{'language': 'en', 'source': 'https://www.intc...
.........
113</td>\\r\\n<td width=\"5\" valign=\"bottom\" style=\"...{'language': 'en', 'source': 'https://www.intc...
114</svg>\\r\\n <span>Tear Sheet</sp...{'language': 'en', 'source': 'https://www.intc...
115</svg>\\r\\n <span>RSS News Feed<...{'language': 'en', 'source': 'https://www.intc...
116</svg>\\r\\n </a>\\r\\n ...{'language': 'en', 'source': 'https://www.intc...
117</div>\\r\\n <div id=\"footer-disc...{'language': 'en', 'source': 'https://www.intc...
\n", + "

118 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 \\r\\n\\r\\n \\r\\n //c... \n", + "2 Press R... \n", + "3 \\r\\n <... \n", + "4 \\r\\n
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textmetadata
0<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 \\n\\n ... \n", + "\n", + " metadata \n", + "0 {'description': 'Documentation website for cnv... " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:51:12,627 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:51:22,638 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:51:32,651 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n" + ] + } + ], + "source": [ + "from pyrecdp.primitives.operations import DocumentSplit,DocumentLoader\n", + "\n", + "loader = DocumentLoader(loader=\"RecursiveUrlLoader\", loader_args={\"url\": \"https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html\"})\n", + "\n", + "ds = loader.process_rayds()\n", + "display(ds.to_pandas())\n" + ] + }, + { + "cell_type": "markdown", + "id": "18c82062", + "metadata": {}, + "source": [ + "### 4.2 split document with DocumentSplit operator" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "52c00663", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-12-20 15:52:00,384\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap()]\n", + "2023-12-20 15:52:00,385\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 15:52:00,387\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "948ce35a60ef4a9a8c3e171de1ff79dc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textmetadata
0<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
1<link rel=\"preload\" href=\"/docs/assets/css/0.s...{'description': 'Documentation website for cnv...
2rel=\"prefetch\" href=\"/docs/assets/js/18.698a17...{'description': 'Documentation website for cnv...
3rel=\"prefetch\" href=\"/docs/assets/js/78.75a8bd...{'description': 'Documentation website for cnv...
4<link rel=\"stylesheet\" href=\"/docs/assets/css/...{'description': 'Documentation website for cnv...
.........
101</code></pre></div><h3 id=\"update-an-existing-...{'description': 'Documentation website for cnv...
102</code></pre></div><p><strong>Available Parame...{'description': 'Documentation website for cnv...
103</code></pre></div><p><strong>Available Parame...{'description': 'Documentation website for cnv...
104</code></pre></div><p><strong>Available Parame...{'description': 'Documentation website for cnv...
105image<span class=\"token punctuation\">.</span>d...{'description': 'Documentation website for cnv...
\n", + "

106 rows × 2 columns

\n", + "" + ], + "text/plain": [ + " text \\\n", + "0 \\n\\n ... \n", + "1

Available Parame... \n", + "103

Available Parame... \n", + "104

Available Parame... \n", + "105 image.d... \n", + "\n", + " metadata \n", + "0 {'description': 'Documentation website for cnv... \n", + "1 {'description': 'Documentation website for cnv... \n", + "2 {'description': 'Documentation website for cnv... \n", + "3 {'description': 'Documentation website for cnv... \n", + "4 {'description': 'Documentation website for cnv... \n", + ".. ... \n", + "101 {'description': 'Documentation website for cnv... \n", + "102 {'description': 'Documentation website for cnv... \n", + "103 {'description': 'Documentation website for cnv... \n", + "104 {'description': 'Documentation website for cnv... \n", + "105 {'description': 'Documentation website for cnv... \n", + "\n", + "[106 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:52:02,690 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:52:12,702 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:52:22,715 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:52:32,729 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:52:42,741 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n" + ] + } + ], + "source": [ + "spliter = DocumentSplit(text_splitter='RecursiveCharacterTextSplitter')\n", + "ds = spliter.process_rayds(ds)\n", + "display(ds.to_pandas())" + ] + }, + { + "cell_type": "markdown", + "id": "194089b0", + "metadata": {}, + "source": [ + "### 4.3 put DocumentSplit operator in a pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "52ced260", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:53:02,767 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "init ray\n", + "init ray with total mem of 324413575987, total core of 48\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-12-20 15:53:09,801\tINFO worker.py:1642 -- Started a local Ray instance.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "execute with ray started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-12-20 15:53:13,140\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap()->Write]\n", + "2023-12-20 15:53:13,142\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 15:53:13,143\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6ec6c228e366434bb7df4357e56c0de7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00)->Write pid=2224997)\u001b[0m /root/miniforge3/envs/recdp/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", + "\u001b[2m\u001b[36m(FlatMap()->Write pid=2224997)\u001b[0m warnings.warn(\"Setuptools is replacing distutils.\")\n", + "\u001b[2m\u001b[36m(FlatMap()->Write pid=2224997)\u001b[0m 2023-12-20 15:53:14.774 | INFO | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0\n", + "2023-12-20 15:53:15,081\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> 
TaskPoolMapOperator[FlatMap()]\n", + "2023-12-20 15:53:15,082\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 15:53:15,084\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6a50999c658c47e685b7c5955ad56351", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textmetadata
0<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
1<link rel=\"preload\" href=\"/docs/assets/css/0.s...{'description': 'Documentation website for cnv...
2rel=\"prefetch\" href=\"/docs/assets/js/18.698a17...{'description': 'Documentation website for cnv...
3rel=\"prefetch\" href=\"/docs/assets/js/78.75a8bd...{'description': 'Documentation website for cnv...
4<link rel=\"stylesheet\" href=\"/docs/assets/css/...{'description': 'Documentation website for cnv...
.........
101</code></pre></div><h3 id=\"update-an-existing-...{'description': 'Documentation website for cnv...
102</code></pre></div><p><strong>Available Parame...{'description': 'Documentation website for cnv...
103</code></pre></div><p><strong>Available Parame...{'description': 'Documentation website for cnv...
104</code></pre></div><p><strong>Available Parame...{'description': 'Documentation website for cnv...
105image<span class=\"token punctuation\">.</span>d...{'description': 'Documentation website for cnv...
\n", + "

106 rows × 2 columns

\n", + "" + ], + "text/plain": [ + " text \\\n", + "0 \\n\\n ... \n", + "1

Available Parame... \n", + "103

Available Parame... \n", + "104

Available Parame... \n", + "105 image.d... \n", + "\n", + " metadata \n", + "0 {'description': 'Documentation website for cnv... \n", + "1 {'description': 'Documentation website for cnv... \n", + "2 {'description': 'Documentation website for cnv... \n", + "3 {'description': 'Documentation website for cnv... \n", + "4 {'description': 'Documentation website for cnv... \n", + ".. ... \n", + "101 {'description': 'Documentation website for cnv... \n", + "102 {'description': 'Documentation website for cnv... \n", + "103 {'description': 'Documentation website for cnv... \n", + "104 {'description': 'Documentation website for cnv... \n", + "105 {'description': 'Documentation website for cnv... \n", + "\n", + "[106 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:53:18,680 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:53:28,693 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:53:38,706 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:53:48,720 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:53:58,733 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:54:08,745 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:54:18,758 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:54:28,770 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:54:38,781 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n" + ] + } + ], + "source": [ + "from pyrecdp.LLM import TextPipeline\n", + " \n", + "pipeline = TextPipeline()\n", + "ops = [\n", + " loader,\n", + " spliter\n", + "]\n", + "pipeline.add_operations(ops)\n", + "ds = pipeline.execute()\n", + "display(ds.to_pandas())" + ] + }, + { + "cell_type": "markdown", + "id": "c6f6b7da", + "metadata": {}, + "source": [ + "### 4.4 Split document with CustomerDocumentSplit operator in a pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ae9f9516", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "init ray\n", + "init ray with total mem of 324413575987, total core of 48\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-12-20 15:56:00,827\tINFO worker.py:1642 -- Started a local Ray instance.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "execute with ray started ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-12-20 15:56:03,800\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap()->Write]\n", + "2023-12-20 15:56:03,802\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 15:56:03,803\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ecb828b7ef5c490fa55421dd1a51c672", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00)->Write pid=2233388)\u001b[0m 
/root/miniforge3/envs/recdp/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", + "\u001b[2m\u001b[36m(FlatMap()->Write pid=2233388)\u001b[0m warnings.warn(\"Setuptools is replacing distutils.\")\n", + "\u001b[2m\u001b[36m(FlatMap()->Write pid=2233388)\u001b[0m 2023-12-20 15:56:05.399 | INFO | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0\n", + "2023-12-20 15:56:06,464\tINFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap()]\n", + "2023-12-20 15:56:06,465\tINFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-12-20 15:56:06,466\tINFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9b94044012184cffbed45277dacf088e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textmetadata
0<!DOCTYPE html>\\n<html lang=\"en-US\">\\n <head>...{'description': 'Documentation website for cnv...
1Through the SDK, you can create experiments, m...{'description': 'Documentation website for cnv...
2First, install WSL with following commands:</p...{'description': 'Documentation website for cnv...
3https://app.cnvrgdomain.com/</td> <td>Yes</td>...{'description': 'Documentation website for cnv...
4the token can be retrieved from the <strong>us...{'description': 'Documentation website for cnv...
.........
61For example, <code>value1=NoSchedule</code>.To...{'description': 'Documentation website for cnv...
62For example, <code>gputype=v100</code>.To spec...{'description': 'Documentation website for cnv...
63For example, <code>value1=NoSchedule</code>.To...{'description': 'Documentation website for cnv...
64options are: cnvrg, dockerhub, gcr, acr, ecr, ...{'description': 'Documentation website for cnv...
65options are: cnvrg, dockerhub, gcr, acr, ecr, ...{'description': 'Documentation website for cnv...
\n", + "

66 rows × 2 columns

\n", + "" + ], + "text/plain": [ + " text \\\n", + "0 \\n\\n ... \n", + "1 Through the SDK, you can create experiments, m... \n", + "2 First, install WSL with following commands: Yes... \n", + "4 the token can be retrieved from the us... \n", + ".. ... \n", + "61 For example, value1=NoSchedule.To... \n", + "62 For example, gputype=v100.To spec... \n", + "63 For example, value1=NoSchedule.To... \n", + "64 options are: cnvrg, dockerhub, gcr, acr, ecr, ... \n", + "65 options are: cnvrg, dockerhub, gcr, acr, ecr, ... \n", + "\n", + " metadata \n", + "0 {'description': 'Documentation website for cnv... \n", + "1 {'description': 'Documentation website for cnv... \n", + "2 {'description': 'Documentation website for cnv... \n", + "3 {'description': 'Documentation website for cnv... \n", + "4 {'description': 'Documentation website for cnv... \n", + ".. ... \n", + "61 {'description': 'Documentation website for cnv... \n", + "62 {'description': 'Documentation website for cnv... \n", + "63 {'description': 'Documentation website for cnv... \n", + "64 {'description': 'Documentation website for cnv... \n", + "65 {'description': 'Documentation website for cnv... \n", + "\n", + "[66 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:56:09,706 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:56:19,719 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:56:29,731 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:56:39,741 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:56:49,752 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:56:59,764 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:57:09,778 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:57:19,789 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:57:29,799 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:57:39,812 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:57:49,826 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:57:59,838 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:58:09,851 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:58:19,864 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:58:29,877 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:58:39,889 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:58:49,902 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:58:59,915 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:59:09,929 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:59:19,942 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:59:29,955 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:59:39,968 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:59:49,982 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 15:59:59,994 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:00:10,007 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:00:20,021 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:00:30,035 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:00:40,049 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:00:50,060 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:01:00,073 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:01:10,087 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:01:20,100 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:01:30,114 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:01:40,128 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:01:50,140 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:02:00,153 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:02:10,168 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:02:20,182 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:02:30,196 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:02:40,210 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:02:50,225 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:03:00,239 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:03:10,252 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:03:20,263 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:03:30,276 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:03:40,290 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:03:50,304 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:04:00,318 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:04:10,330 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:04:20,343 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:04:30,355 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:04:40,370 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:04:50,384 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:05:00,398 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:05:10,410 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:05:20,424 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:05:30,439 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:05:40,453 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:05:50,464 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:00,475 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:10,487 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:20,501 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:30,512 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:40,525 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:06:50,537 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:00,551 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:10,564 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:20,578 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:30,591 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:40,604 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:07:50,617 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:00,630 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:10,640 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:20,653 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:30,666 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:40,680 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:08:50,694 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:00,708 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:10,722 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:20,736 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:30,749 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:40,763 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:09:50,775 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:00,788 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:10,801 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:20,814 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:30,828 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:40,841 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:10:50,853 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:00,866 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:10,880 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:20,894 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:30,908 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:40,922 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:11:50,935 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:12:00,947 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:12:10,961 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:12:20,974 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:12:30,987 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:12:41,002 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:12:51,015 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:13:01,027 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:13:11,041 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:13:21,054 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:13:31,067 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:13:41,079 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:13:51,093 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:14:01,107 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:14:11,120 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:14:21,133 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:14:31,145 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:14:41,157 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:14:51,168 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:15:01,178 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:15:11,192 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:15:21,204 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:15:31,218 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:15:41,230 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:15:51,242 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:16:01,255 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:16:11,269 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:16:21,281 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:16:31,293 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:16:41,306 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:16:51,318 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:01,332 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:11,343 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:21,355 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:31,365 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:41,376 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:17:51,388 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:18:01,401 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:18:11,414 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:18:21,424 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:18:31,435 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:18:41,446 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:18:51,457 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:19:01,469 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:19:11,481 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:19:21,493 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:19:31,505 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:19:41,516 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:19:51,527 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:20:01,537 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:20:11,547 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:20:21,560 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:20:31,571 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:20:41,583 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:20:51,596 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:21:01,608 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:21:11,620 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:21:21,632 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:21:31,644 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:21:41,657 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:21:51,669 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:22:01,682 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:22:11,694 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:22:21,706 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:22:31,717 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:22:41,730 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:22:51,743 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:23:01,755 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:23:11,768 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:23:21,781 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:23:31,795 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:23:41,808 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:23:51,821 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:24:01,834 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:24:11,847 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:24:21,860 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:24:31,874 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:24:41,886 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:24:51,897 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:25:01,910 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:25:11,923 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:25:21,937 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:25:31,950 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:25:41,962 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:25:51,975 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:26:01,986 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:26:11,997 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:26:22,008 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:26:32,020 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:26:42,033 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:26:52,046 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:27:02,058 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:27:12,071 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:27:22,083 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:27:32,097 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:27:42,108 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:27:52,121 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:28:02,133 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:28:12,148 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:28:22,162 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:28:32,175 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:28:42,187 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:28:52,200 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:29:02,212 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:29:12,225 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:29:22,238 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:29:32,249 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:29:42,264 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:29:52,277 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:30:02,290 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:30:12,302 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:30:22,315 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:30:32,328 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:30:42,343 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:30:52,357 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:31:02,371 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:31:12,384 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:31:22,397 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:31:32,410 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:31:42,423 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:31:52,436 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:32:02,447 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:32:12,461 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:32:22,475 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:32:32,488 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:32:42,502 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:32:52,516 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:33:02,529 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:33:12,541 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:33:22,553 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:33:32,566 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:33:42,578 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:33:52,591 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:34:02,605 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:34:12,618 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:34:22,632 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:34:32,645 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:34:42,658 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:34:52,671 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:35:02,686 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:35:12,700 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:35:22,713 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:35:32,726 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:35:42,740 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:35:52,753 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:36:02,766 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:36:12,780 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:36:22,793 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:36:32,807 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:36:42,819 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:36:52,831 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:37:02,843 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:37:12,855 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:37:22,868 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:37:32,882 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:37:42,893 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:37:52,906 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:38:02,918 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:38:12,930 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:38:22,945 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:38:32,956 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:38:42,968 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:38:52,980 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:39:02,992 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:39:13,003 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:39:23,016 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:39:33,028 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:39:43,042 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:39:53,055 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:40:03,069 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:40:13,081 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:40:23,095 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:40:33,106 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:40:43,120 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:40:53,134 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:41:03,146 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:41:13,159 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:41:23,173 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:41:33,183 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:41:43,196 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:41:53,206 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:42:03,217 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:42:13,231 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:42:23,244 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:42:33,256 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:42:43,268 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:42:53,280 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:43:03,291 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:43:13,303 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:43:23,315 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:43:33,329 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:43:43,341 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:43:53,353 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:44:03,367 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:44:13,379 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:44:23,390 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:44:33,402 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:44:43,413 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:44:53,424 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:45:03,435 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:45:13,447 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:45:23,460 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:45:33,472 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:45:43,485 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:45:53,499 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:46:03,513 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:46:13,525 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:46:23,537 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:46:33,549 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:46:43,560 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:46:53,573 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:47:03,586 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:47:13,599 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:47:23,613 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:47:33,626 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:47:43,637 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:47:53,647 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:48:03,661 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:48:13,673 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:48:23,685 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:48:33,697 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:48:43,710 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:48:53,721 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:49:03,734 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:49:13,746 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:49:23,760 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:49:33,773 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:49:43,785 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:49:53,795 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:50:03,808 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:50:13,820 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:50:23,832 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:50:33,844 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:50:43,857 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:50:53,869 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:51:03,882 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:51:13,893 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:51:23,903 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. 
Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:51:33,913 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n", + "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-20 16:51:43,923 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.\n" + ] + } + ], + "source": [ + "from pyrecdp.LLM import TextPipeline\n", + "from pyrecdp.primitives.operations import CustomerDocumentSplit\n", + " \n", + "def chunk_doc(text,max_num_of_words):\n", + " from nltk.tokenize import word_tokenize,sent_tokenize\n", + " text= text.strip()\n", + " if len(word_tokenize(text)) <= max_num_of_words:\n", + " return [text]\n", + " else:\n", + " chunks = []\n", + " # split by sentence\n", + " sentences = sent_tokenize(text)\n", + " # grow the current chunk sentence by sentence and emit it once it exceeds max_num_of_words\n", + " words_count = 0\n", + " temp_chunk = \"\"\n", + " for s in sentences:\n", + " temp_chunk+=(s+\" \")\n", + " words_count += len(word_tokenize(s))\n", + " if len(word_tokenize(temp_chunk))> max_num_of_words:\n", + " chunks.append(temp_chunk)\n", + " words_count = 0\n", + " temp_chunk = \"\"\n", + " # keep the trailing sentences that never pushed the last chunk over the limit\n", + " return chunks + [temp_chunk] if temp_chunk else chunks\n", + " \n", + "pipeline = TextPipeline()\n", + "ops = [\n", + " loader,\n", + " CustomerDocumentSplit(func=chunk_doc, max_num_of_words=50),\n", + "]\n", + "pipeline.add_operations(ops)\n", + "ds = pipeline.execute()\n", + "display(ds.to_pandas())" + ] } ], "metadata": { diff --git a/RecDP/pyrecdp/LLM/README.md b/RecDP/pyrecdp/LLM/README.md index 3e6c03fe5..6198da04c 100644 --- a/RecDP/pyrecdp/LLM/README.md +++ b/RecDP/pyrecdp/LLM/README.md @@ -9,8 +9,11 @@ RecDP LLM is a set of python components that enables 
quick and easy establish of | Type | notebook | Description | supports | Verified dataset & size | | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------- | ---------------------------------------------------- | ------------------------------------- | -| [ DocumentExtract ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/llmutils/document_extractor.py) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_extract.ipynb) | extract text from unstructured format | jpg, png, pdf, docx, | RefinedWeb - 1.7 TB | -| [ Reader ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_reader.py#L16) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/reader.ipynb) | Read data from directory | jsonl, parquet, | RefinedWeb - 1.7 TB | +| [ Directory Loader ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/doc_loader.py#L77) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_extract.ipynb) | extract text from a directory | jpg, png, pdf, docx, | RefinedWeb - 1.7 TB | +| [ Document Loader ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/doc_loader.py#L15) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_extract.ipynb) | extract text from unstructured format | all [document loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/) provided in [langchain](https://python.langchain.com/) | RefinedWeb - 1.7 TB | +| [ Text Reader ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_reader.py#L16) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/reader.ipynb) | Read data from directory | jsonl, parquet, | RefinedWeb - 1.7 TB | +| [ Document Split ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_split.py) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_split.ipynb) | split documents | [text splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/#text-splitters) provided in [langchain](https://python.langchain.com/) and a [custom document split](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_split.py#L278) | RefinedWeb - 1.7 TB | +| [ Document Ingestion ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_ingestion.py) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_ingestion.ipynb) | embed documents and store them in a vector database | chroma, faiss, elasticsearch | RefinedWeb - 1.7 TB | | [ Converter ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_converter.py) | 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/convert.ipynb) | Read and convert unstructed data to unified format | html, document, image, pdf, ... | RefinedWeb - 1.7 TB | | [ Filter ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/filter.py) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/filter.ipynb) | Filter out document based on condition | profanity-based, black-list, url_based, length_based | RedPajama - 2 TB | | [ Text Bytesize ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_bytesize.py) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/bytesize.ipynb) | Get text bytes size | | RedPajama - 2 TB | diff --git a/RecDP/pyrecdp/primitives/llmutils/document/reader.py b/RecDP/pyrecdp/primitives/llmutils/document/reader.py index 25fca9df4..056937885 100644 --- a/RecDP/pyrecdp/primitives/llmutils/document/reader.py +++ b/RecDP/pyrecdp/primitives/llmutils/document/reader.py @@ -312,17 +312,7 @@ def _load_file(self, input_file: Path, pbar): loader = self.file_extractor[file_suffix] return loader.load() else: - from pyrecdp.core.import_utils import import_langchain - import_langchain() - from langchain.document_loaders import UnstructuredFileLoader - loader = UnstructuredFileLoader(str(input_file)) - docs = [Document(text=doc.text, metadata=doc.metadata) for doc in loader.load()] - docs = list(filter(lambda d: (d.pa.strip() != ""), docs)) - if self.single_text_per_document: - text = self.page_separator.join([doc.text for doc in docs]) - return [Document(text=text, metadata={"source": str(input_file)})] - else: - return 
docs + return [] finally: if pbar: pbar.update(1) @@ -336,7 +326,8 @@ def load(self) -> List[Document]: from concurrent.futures import ThreadPoolExecutor with ThreadPoolExecutor(self.max_concurrency) as executor: for docs in executor.map(lambda i: self._load_file(i, pbar), self.input_files): - docs_result.extend(docs) + if len(docs)>0: + docs_result.extend(docs) else: for file in self.input_files: docs = self._load_file(file, pbar) diff --git a/RecDP/pyrecdp/primitives/operations/text_ingestion.py b/RecDP/pyrecdp/primitives/operations/text_ingestion.py index 6e9b996da..1ce026a02 100644 --- a/RecDP/pyrecdp/primitives/operations/text_ingestion.py +++ b/RecDP/pyrecdp/primitives/operations/text_ingestion.py @@ -157,6 +157,7 @@ def do_persist(self, ds, **kwargs): check_availability_and_install(["chromadb==0.4.15", "langchain"]) chroma = self.vector_store_args["db_handler"] + collection_name = self.vector_store_args.get("collection_name", 'langchain') rows = ds.iter_rows() if isinstance(ds, Dataset) else ds.collect() texts = [row[self.text_column] for row in rows] @@ -165,9 +166,15 @@ def do_persist(self, ds, **kwargs): if chroma is not None: chroma.add_texts(texts) return chroma - if "output_dir" not in self.vector_store_args: - raise ValueError(f"You must have `output_dir` option specify for Chroma vector store") - persist_directory = self.vector_store_args["output_dir"] + if "output_dir" not in self.vector_store_args and 'persist_directory' not in self.vector_store_args: + raise ValueError( + f"You must have `output_dir` or `persist_directory` option specify for Chroma vector store") + + if 'output_dir' in self.vector_store_args: + persist_directory = self.vector_store_args["output_dir"] + else: + persist_directory = self.vector_store_args["persist_directory"] + if not self.override and os.path.exists(persist_directory): chroma = Chroma(collection_name=collection_name, persist_directory=persist_directory, diff --git a/RecDP/tests/test_llmutils_operations.py 
b/RecDP/tests/test_llmutils_operations.py index 3418d0046..516a248f9 100644 --- a/RecDP/tests/test_llmutils_operations.py +++ b/RecDP/tests/test_llmutils_operations.py @@ -231,11 +231,16 @@ def test_gopherqualityfilter_ray(self): with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ctx: ctx.show(op.process_rayds(ctx.ds)) - def test_document_load_ray(self): + def test_document_load_pdf_ray(self): op = DirectoryLoader("tests/data/llm_data/document", glob="**/*.pdf") with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ctx: ctx.show(op.process_rayds()) + def test_document_load_ray(self): + op = DirectoryLoader("tests/data/llm_data/document") + with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ctx: + ctx.show(op.process_rayds()) + def test_url_load_ray(self): op = UrlLoader(["https://www.intc.com/news-events/press-releases?year=2023&category=all"], max_depth=1) with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ctx: