diff --git a/build.sbt b/build.sbt index 0b6f9f6f79..fe1ce62416 100644 --- a/build.sbt +++ b/build.sbt @@ -34,7 +34,8 @@ val extraDependencies = Seq( "com.jcraft" % "jsch" % "0.1.54", "org.apache.httpcomponents.client5" % "httpclient5" % "5.1.3", "org.apache.httpcomponents" % "httpmime" % "4.5.13", - "com.linkedin.isolation-forest" %% "isolation-forest_3.4.2" % "3.0.4" + "com.linkedin.isolation-forest" %% "isolation-forest_3.4.2" % "3.0.4", + "org.apache.hadoop" % "hadoop-client-api" % "3.3.4" exclude("com.google.protobuf", "protobuf-java") exclude("org.apache.spark", "spark-mllib_2.12") exclude("org.apache.spark", "spark-core_2.12") exclude("org.apache.spark", "spark-avro_2.12") exclude("org.apache.spark", "spark-sql_2.12"), diff --git a/core/src/main/python/synapse/ml/llm/HuggingFaceCausallmTransform.py b/core/src/main/python/synapse/ml/llm/HuggingFaceCausallmTransform.py new file mode 100644 index 0000000000..4a941fa566 --- /dev/null +++ b/core/src/main/python/synapse/ml/llm/HuggingFaceCausallmTransform.py @@ -0,0 +1,300 @@ +from pyspark.ml import Transformer +from pyspark.ml.param.shared import ( + HasInputCol, + HasOutputCol, + Param, + Params, + TypeConverters, +) +from pyspark.sql import Row +from pyspark.sql.functions import udf +from pyspark.sql.types import StringType, StructType, StructField +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from transformers import AutoTokenizer, AutoModelForCausalLM +from pyspark import keyword_only +import re +import os + + +class _PeekableIterator: + def __init__(self, iterable): + self._iterator = iter(iterable) + self._cache = [] + + def __iter__(self): + return self + + def __next__(self): + if self._cache: + return self._cache.pop(0) + else: + return next(self._iterator) + + def peek(self, n=1): + """Peek at the next n elements without consuming them.""" + while len(self._cache) < n: + try: + self._cache.append(next(self._iterator)) + except StopIteration: + break + if n == 1: + return 
self._cache[0] if self._cache else None + else: + return self._cache[:n] + + +class _ModelParam: + def __init__(self, **kwargs): + self.param = {} + self.param.update(kwargs) + + def get_param(self): + return self.param + + +class _ModelConfig: + def __init__(self, **kwargs): + self.config = {} + self.config.update(kwargs) + + def get_config(self): + return self.config + + def set_config(self, **kwargs): + self.config.update(kwargs) + + +def camel_to_snake(text): + return re.sub(r"(? Map("coordinates" -> PackageMavenCoordinate, "repo" -> PackageRepository)), Map("pypi" -> Map("package" -> "pytorch-lightning==1.5.0")), Map("pypi" -> Map("package" -> "torchvision==0.14.1")), - Map("pypi" -> Map("package" -> "transformers==4.32.1")), + Map("pypi" -> Map("package" -> "transformers==4.48.0")), + Map("pypi" -> Map("package" -> "jinja2==3.1.0")), Map("pypi" -> Map("package" -> "petastorm==0.12.0")), Map("pypi" -> Map("package" -> "protobuf==3.20.3")) ).toJson.compactPrint @@ -105,12 +106,15 @@ object DatabricksUtilities { val CPUNotebooks: Seq[File] = ParallelizableNotebooks .filterNot(_.getAbsolutePath.contains("Fine-tune")) .filterNot(_.getAbsolutePath.contains("GPU")) + .filterNot(_.getAbsolutePath.contains("Language Model")) .filterNot(_.getAbsolutePath.contains("Multivariate Anomaly Detection")) // Deprecated .filterNot(_.getAbsolutePath.contains("Audiobooks")) // TODO Remove this by fixing auth .filterNot(_.getAbsolutePath.contains("Art")) // TODO Remove this by fixing performance .filterNot(_.getAbsolutePath.contains("Explanation Dashboard")) // TODO Remove this exclusion - val GPUNotebooks: Seq[File] = ParallelizableNotebooks.filter(_.getAbsolutePath.contains("Fine-tune")) + val GPUNotebooks: Seq[File] = ParallelizableNotebooks.filter { file => + file.getAbsolutePath.contains("Fine-tune") || file.getAbsolutePath.contains("HuggingFace") + } val RapidsNotebooks: Seq[File] = ParallelizableNotebooks.filter(_.getAbsolutePath.contains("GPU")) diff --git 
a/deep-learning/src/main/python/horovod_installation.sh b/deep-learning/src/main/python/horovod_installation.sh index 22124422ff..a556c6b9ac 100644 --- a/deep-learning/src/main/python/horovod_installation.sh +++ b/deep-learning/src/main/python/horovod_installation.sh @@ -8,7 +8,7 @@ set -eu # Install prerequisite libraries that horovod depends on pip install pytorch-lightning==1.5.0 pip install torchvision==0.14.1 -pip install transformers==4.32.1 +pip install transformers==4.48.0 pip install petastorm>=0.12.0 pip install protobuf==3.20.3 diff --git a/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py b/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py index 8e001f3be6..d40d5aef81 100644 --- a/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py +++ b/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py @@ -11,12 +11,12 @@ if _TRANSFORMERS_AVAILABLE: import transformers - _TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1" - if _TRANSFORMERS_EQUAL_4_32_1: + _TRANSFORMERS_EQUAL_4_48_0 = transformers.__version__ == "4.48.0" + if _TRANSFORMERS_EQUAL_4_48_0: from transformers import AutoTokenizer else: raise RuntimeError( - "transformers should be == 4.32.1, found: {}".format( + "transformers should be == 4.48.0, found: {}".format( transformers.__version__ ) ) diff --git a/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py b/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py index 134bc5f135..29d62803fd 100644 --- a/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py +++ b/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py @@ -13,12 +13,12 @@ if _TRANSFORMERS_AVAILABLE: import transformers - _TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1" - if _TRANSFORMERS_EQUAL_4_32_1: + _TRANSFORMERS_EQUAL_4_48_0 = transformers.__version__ == "4.48.0" + if _TRANSFORMERS_EQUAL_4_48_0: from transformers import AutoModelForSequenceClassification 
else: raise RuntimeError( - "transformers should be == 4.32.1, found: {}".format( + "transformers should be == 4.48.0, found: {}".format( transformers.__version__ ) ) diff --git a/docs/Explore Algorithms/Language Model/Quickstart - Apply Phi 3 Model with HuggingFace CausalLM.ipynb b/docs/Explore Algorithms/Language Model/Quickstart - Apply Phi 3 Model with HuggingFace CausalLM.ipynb new file mode 100644 index 0000000000..904ddc78ef --- /dev/null +++ b/docs/Explore Algorithms/Language Model/Quickstart - Apply Phi 3 Model with HuggingFace CausalLM.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","source":["# Apply Phi3 model with HuggingFace Causal LM"],"metadata":{"nteract":{"transient":{"deleting":false}}},"id":"7a355394-5b22-4c09-8d4f-9467a2fcfce4"},{"cell_type":"markdown","source":["![HuggingFace Logo](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)\n","\n","**HuggingFace** is a popular open-source platform that develops computation tools for building applications using machine learning. It is widely known for its Transformers library which contains open-source implementations of transformer models for text, image, and audio tasks.\n","\n","[**Phi 3**](https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/) is a family of AI models developed by Microsoft, designed to redefine what is possible with small language models (SLMs). Phi-3 models are the most capable and cost-effective SLMs, [outperforming models of the same size and even larger ones in language](https://news.microsoft.com/source/features/ai/the-phi-3-small-language-models-with-big-potential/?msockid=26355e446adb6dfa06484f956b686c27), reasoning, coding, and math benchmarks. \n","\n","\"Phi\n","\n","To make it easier to scale up causal language model prediction on a large dataset, we have integrated [HuggingFace Causal LM](https://huggingface.co/docs/transformers/tasks/language_modeling) with SynapseML. 
This integration makes it easy to use the Apache Spark distributed computing framework to process large data on text generation tasks.\n","\n","This tutorial shows how to apply [the Phi-3 model](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3) at scale with no extra settings.\n"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"aa35ae52-6a9e-458d-91ee-ae3962ab5b68"},{"cell_type":"code","source":["# %pip install --upgrade transformers==4.48.0 -q"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"fe974ccc-3243-4158-95f4-88764297807a"},{"cell_type":"code","source":["chats = [\n"," (1, \"fix grammar: helol mi friend\"),\n"," (2, \"What is HuggingFace\"),\n"," (3, \"translate to Spanish: hello\"),\n","]\n","\n","chat_df = spark.createDataFrame(chats, [\"row_index\", \"content\"])\n","chat_df.show()"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":9,"statement_ids":[9],"state":"finished","livy_statement_state":"available","session_id":"75f9bda6-1df7-4afc-b72e-f23922c4333a","normalized_state":"finished","queued_time":"2025-01-23T10:56:03.3890961Z","session_start_time":null,"execution_start_time":"2025-01-23T11:00:48.4985467Z","execution_finish_time":"2025-01-23T11:00:55.9012736Z","parent_msg_id":"c32712d4-f241-41b8-8107-1d58d718b47a"},"text/plain":"StatementMeta(, 75f9bda6-1df7-4afc-b72e-f23922c4333a, 9, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["+---------+--------------------+\n|row_index| content|\n+---------+--------------------+\n| 1|fix grammar: helo...|\n| 2| What is HuggingFace|\n| 3|translate to 
Span...|\n+---------+--------------------+\n\n"]}],"execution_count":3,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"7e76b540-466f-4ab3-9aa9-da8de5517fc1"},{"cell_type":"markdown","source":["## Define and Apply Phi3 model"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"ac0687e7-6609-4af4-a1a4-c098cb404374"},{"cell_type":"markdown","source":["The following example demonstrates how to load the remote Phi 3 model from HuggingFace and apply it to chats."],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"6e9ac535-e0e8-4947-8e18-2e57ecaef096"},{"cell_type":"code","source":["from synapse.ml.llm.HuggingFaceCausallmTransform import HuggingFaceCausalLM\n","\n","phi3_transformer = (\n"," HuggingFaceCausalLM()\n"," .setModelName(\"microsoft/Phi-3-mini-4k-instruct\")\n"," .setInputCol(\"content\")\n"," .setOutputCol(\"result\")\n"," .setModelParam(max_new_tokens=1000)\n"," .setModelConfig(local_files_only=False, trust_remote_code=True)\n",")\n","result_df = phi3_transformer.transform(chat_df).collect()\n","display(result_df)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":10,"statement_ids":[10],"state":"finished","livy_statement_state":"available","session_id":"75f9bda6-1df7-4afc-b72e-f23922c4333a","normalized_state":"finished","queued_time":"2025-01-23T10:56:03.3897955Z","session_start_time":null,"execution_start_time":"2025-01-23T11:00:56.0674571Z","execution_finish_time":"2025-01-23T11:17:57.9045853Z","parent_msg_id":"79822ed7-be13-4a02-bbef-7382288afe29"},"text/plain":"StatementMeta(, 75f9bda6-1df7-4afc-b72e-f23922c4333a, 10, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stderr","text":["2025-01-23 11:01:37.576700: I tensorflow/core/util/port.cc:110] 
oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n2025-01-23 11:01:44.378814: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\nTo enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"]},{"output_type":"display_data","data":{"application/vnd.synapse.widget-view+json":{"widget_id":"427b3314-b88a-4524-a9bf-35bc83b2678e","widget_type":"Synapse.DataFrame"},"text/plain":"SynapseWidget(Synapse.DataFrame, 427b3314-b88a-4524-a9bf-35bc83b2678e)"},"metadata":{}}],"execution_count":4,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"collapsed":false,"jupyter":{"outputs_hidden":false},"editable":true,"run_control":{"frozen":false}},"id":"f8db55d9-b89d-420f-80e9-618041def698"},{"cell_type":"markdown","source":["## Use local cache\n","\n","By caching the model, you can reduce initialization time. 
On Fabric, store the model in a Lakehouse and use setCachePath to load it."],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4c839ac6-f92e-4615-a0c3-977a96231cc6"},{"cell_type":"code","source":["# %%sh\n","# azcopy copy \"https://mmlspark.blob.core.windows.net/huggingface/microsoft/Phi-3-mini-4k-instruct\" \"/lakehouse/default/Files/microsoft/\" --recursive=true"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":11,"statement_ids":[11],"state":"finished","livy_statement_state":"available","session_id":"75f9bda6-1df7-4afc-b72e-f23922c4333a","normalized_state":"finished","queued_time":"2025-01-23T10:56:03.3904561Z","session_start_time":null,"execution_start_time":"2025-01-23T11:17:58.0975761Z","execution_finish_time":"2025-01-23T11:17:58.5016497Z","parent_msg_id":"632eceab-dd6c-4632-a038-169d5e6dff5d"},"text/plain":"StatementMeta(, 75f9bda6-1df7-4afc-b72e-f23922c4333a, 11, Finished, Available, Finished)"},"metadata":{}}],"execution_count":5,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"9bc5edf1-35cb-45d6-b1dc-49a22a01484b"},{"cell_type":"code","source":["# phi3_transformer = (\n","# HuggingFaceCausalLM()\n","# .setCachePath(\"/lakehouse/default/Files/microsoft/Phi-3-mini-4k-instruct\")\n","# .setInputCol(\"content\")\n","# .setOutputCol(\"result\")\n","# .setModelParam(max_new_tokens=1000)\n","# )\n","# result_df = phi3_transformer.transform(chat_df).collect()\n","# 
display(result_df)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":12,"statement_ids":[12],"state":"finished","livy_statement_state":"available","session_id":"75f9bda6-1df7-4afc-b72e-f23922c4333a","normalized_state":"finished","queued_time":"2025-01-23T10:56:03.39102Z","session_start_time":null,"execution_start_time":"2025-01-23T11:17:58.6971527Z","execution_finish_time":"2025-01-23T11:17:59.1593083Z","parent_msg_id":"a37c3790-4e89-441b-aed1-1807693800e4"},"text/plain":"StatementMeta(, 75f9bda6-1df7-4afc-b72e-f23922c4333a, 12, Finished, Available, Finished)"},"metadata":{}}],"execution_count":6,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"collapsed":false},"id":"ee52c891-3be2-48fe-87b3-648e299a794e"},{"cell_type":"markdown","source":["## Utilize GPU"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"69d2ff34-63d4-4ae4-a944-e568badbdb44"},{"cell_type":"markdown","source":["To utilize a GPU, pass device_map=\"cuda\" and torch_dtype=\"auto\" to setModelConfig."],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"cbc581bb-8af1-4a31-98b5-ce6127656572"},{"cell_type":"code","source":["# phi3_transformer = (\n","# HuggingFaceCausalLM()\n","# .setModelName(\"microsoft/Phi-3-mini-4k-instruct\")\n","# .setInputCol(\"content\")\n","# .setOutputCol(\"result\")\n","# .setModelParam(max_new_tokens=1000)\n","# .setModelConfig(device_map=\"cuda\", torch_dtype=\"auto\", local_files_only=False, trust_remote_code=True)\n","# )\n","# result_df = phi3_transformer.transform(chat_df).collect()\n","# 
display(result_df)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":13,"statement_ids":[13],"state":"finished","livy_statement_state":"available","session_id":"75f9bda6-1df7-4afc-b72e-f23922c4333a","normalized_state":"finished","queued_time":"2025-01-23T10:56:03.4017444Z","session_start_time":null,"execution_start_time":"2025-01-23T11:18:00.094146Z","execution_finish_time":"2025-01-23T11:18:00.4696191Z","parent_msg_id":"2e66568f-d86e-47bc-83e7-1221ebd65668"},"text/plain":"StatementMeta(, 75f9bda6-1df7-4afc-b72e-f23922c4333a, 13, Finished, Available, Finished)"},"metadata":{}}],"execution_count":7,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"b17a033b-45f0-4ee4-a3c2-26ab8511539e"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"synapse_widget":{"version":"0.1","state":{"427b3314-b88a-4524-a9bf-35bc83b2678e":{"type":"Synapse.DataFrame","sync_state":{"table":{"rows":[{"0":"1","1":"fix grammar: helol mi friend","2":"Hello, my friend."},{"0":"2","1":"What is HuggingFace","2":"HuggingFace is an open-source community and library for building and training NLP models. It provides a platform for sharing and collaborating on NLP models and tools, and it offers a wide range of pre-trained models and transformer architectures that can be used for various NLP tasks. 
The library is built on PyTorch and provides a simple API for training and deploying NLP models."},{"0":"3","1":"translate to Spanish: hello","2":"Hola"}],"schema":[{"key":"0","name":"row_index","type":"bigint"},{"key":"1","name":"content","type":"string"},{"key":"2","name":"result","type":"string"}],"truncated":false},"isSummary":false,"language":"scala","wranglerEntryContext":null},"persist_state":{"view":{"type":"details","tableOptions":{},"chartOptions":{"chartType":"bar","categoryFieldKeys":["1"],"seriesFieldKeys":["1"],"aggregationType":"count","isStacked":false,"binsNumber":10,"wordFrequency":"-1","evaluatesOverAllRecords":false},"viewOptionsGroup":[{"tabItems":[{"type":"table","name":"Table","key":"0","options":{}}]}]}}}}},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{"lakehouse":{"default_lakehouse":"cf3f397e-6a87-43ab-b8e0-bb9342e11c7a","default_lakehouse_name":"jessiwang_phi3","default_lakehouse_workspace_id":"4751a5bb-6a44-4164-8b31-c3b6a4cf1f8d"},"environment":{}}},"nbformat":4,"nbformat_minor":5} \ No newline at end of file diff --git a/environment.yml b/environment.yml index e9361ad5de..44de549934 100644 --- a/environment.yml +++ b/environment.yml @@ -10,7 +10,7 @@ dependencies: - r-sparklyr=1.8.1 - r-devtools=2.4.2 - pip: - - pyarrow>=0.15.0 + - pyarrow==10.0.1 - pyspark==3.4.1 - pandas==1.4.0 - wheel @@ -40,7 +40,7 @@ dependencies: - onnxmltools==1.7.0 - matplotlib - Pillow - - transformers==4.32.1 + - transformers==4.48.0 - huggingface-hub>=0.8.1 - langchain==0.0.152 - openai==0.27.5