From 2e38a40229fab67c1c728af1922adddbe9eee3ca Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Fri, 31 May 2024 16:27:38 +0800 Subject: [PATCH 01/16] enable gpu --- .../langchain_community/llms/ipex_llm.py | 20 ++++++++++++++++--- .../integration_tests/llms/test_ipex_llm.py | 20 +++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/llms/ipex_llm.py b/libs/community/langchain_community/llms/ipex_llm.py index 0e41c305bb7a8..b0217baf1f22c 100644 --- a/libs/community/langchain_community/llms/ipex_llm.py +++ b/libs/community/langchain_community/llms/ipex_llm.py @@ -1,5 +1,5 @@ import logging -from typing import Any, List, Mapping, Optional +from typing import Any, List, Mapping, Optional, Literal from langchain_core.callbacks import CallbackManagerForLLMRun from langchain_core.language_models.llms import LLM @@ -46,6 +46,7 @@ def from_model_id( tokenizer_id: Optional[str] = None, load_in_4bit: bool = True, load_in_low_bit: Optional[str] = None, + device_map: Literal['cpu','xpu'] = 'cpu', **kwargs: Any, ) -> LLM: """ @@ -75,6 +76,7 @@ def from_model_id( low_bit_model=False, load_in_4bit=load_in_4bit, load_in_low_bit=load_in_low_bit, + device_map=device_map, model_kwargs=model_kwargs, kwargs=kwargs, ) @@ -86,6 +88,7 @@ def from_model_id_low_bit( model_kwargs: Optional[dict] = None, *, tokenizer_id: Optional[str] = None, + device_map: Literal['cpu','xpu'] = 'cpu', **kwargs: Any, ) -> LLM: """ @@ -109,6 +112,7 @@ def from_model_id_low_bit( low_bit_model=True, load_in_4bit=False, # not used for low-bit model load_in_low_bit=None, # not used for low-bit model + device_map=device_map, model_kwargs=model_kwargs, kwargs=kwargs, ) @@ -121,6 +125,7 @@ def _load_model( load_in_4bit: bool = False, load_in_low_bit: Optional[str] = None, low_bit_model: bool = False, + device_map: Literal['cpu','xpu'] = "cpu", model_kwargs: Optional[dict] = None, kwargs: Optional[dict] = None, ) -> Any: @@ -189,6 +194,15 @@ def _load_model( model_kwargs=_model_kwargs, ) + # Set "cpu" as default device + + if device_map not in ["cpu", "xpu"]: + raise ValueError( + "IpexLLM currently only supports device to be " + f"'cpu' or 'xpu', but you have: {device_map}." 
+ ) + model.to(device_map) + return cls( model_id=model_id, model=model, @@ -237,7 +251,7 @@ def _call( if self.streaming: from transformers import TextStreamer - input_ids = self.tokenizer.encode(prompt, return_tensors="pt") + input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.model.device) streamer = TextStreamer( self.tokenizer, skip_prompt=True, skip_special_tokens=True ) @@ -263,7 +277,7 @@ def _call( text = self.tokenizer.decode(output[0], skip_special_tokens=True) return text else: - input_ids = self.tokenizer.encode(prompt, return_tensors="pt") + input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.model.device) if stop is not None: from transformers.generation.stopping_criteria import ( StoppingCriteriaList, diff --git a/libs/community/tests/integration_tests/llms/test_ipex_llm.py b/libs/community/tests/integration_tests/llms/test_ipex_llm.py index 163458029c5d5..b153cde2b5868 100644 --- a/libs/community/tests/integration_tests/llms/test_ipex_llm.py +++ b/libs/community/tests/integration_tests/llms/test_ipex_llm.py @@ -86,3 +86,23 @@ def test_save_load_lowbit(model_id: str) -> None: ) output = loaded_llm.invoke("Hello!") assert isinstance(output, str) + +@skip_if_no_model_ids +@pytest.mark.parametrize( + "model_id", + model_ids_to_test, +) +def test_load_generate_gpu(model_id: str) -> None: + """Test valid call.""" + llm = IpexLLM.from_model_id( + model_id=model_id, + model_kwargs={ + "temperature": 0, + "max_length": 16, + "trust_remote_code": True, + }, + device_map="xpu", + ) + output = llm.generate(["Hello!"]) + assert isinstance(output, LLMResult) + assert isinstance(output.generations, list) \ No newline at end of file From 93aa09d5ddc7f130ae60d25c0f8a2e77e17fb0fa Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Fri, 31 May 2024 16:47:27 +0800 Subject: [PATCH 02/16] add notebook --- .../docs/integrations/llms/ipex_llm_gpu.ipynb | 250 ++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 docs/docs/integrations/llms/ipex_llm_gpu.ipynb diff --git a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb new file mode 100644 index 0000000000000..4f3f6788f85ba --- /dev/null +++ b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb @@ -0,0 +1,250 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IPEX-LLM on Intel GPU\n", + "\n", + "> [IPEX-LLM](https://github.com/intel-analytics/ipex-llm) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency.\n", + "\n", + "This example goes over how to use LangChain to interact with `ipex-llm` for text generation. \n", + "\n", + "> **Note**\n", + ">\n", + "> It is recommended that only Windows users with Intel Arc A-Series GPU (except for Intel Arc A300-Series or Pro A60) run this Jupyter notebook directly. For other cases (e.g. 
Linux users, Intel iGPU, etc.), it is recommended to run the code with Python scripts in a terminal for the best experience.\n",
+    "\n",
+    "## Install Prerequisites\n",
+    "To benefit from IPEX-LLM on Intel GPUs, there are several prerequisite steps for tools installation and environment preparation.\n",
+    "\n",
+    "If you are a Windows user, visit the [Install IPEX-LLM on Windows with Intel GPU Guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_windows_gpu.html), and follow [Install Prerequisites](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_windows_gpu.html#install-prerequisites) to update GPU driver (optional) and install Conda.\n",
+    "\n",
+    "If you are a Linux user, visit the [Install IPEX-LLM on Linux with Intel GPU](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_linux_gpu.html), and follow [**Install Prerequisites**](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_linux_gpu.html#install-prerequisites) to install GPU driver, Intel® oneAPI Base Toolkit 2024.0, and Conda.\n",
+    "\n",
+    "## Setup\n",
+    "\n",
+    "After the prerequisites installation, you should have created a conda environment with all prerequisites installed. **Start the jupyter service in this conda environment**:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%pip install -qU langchain langchain-community"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install IPEX-LLM for running LLMs locally on Intel GPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> **Note**\n",
+    ">\n",
+    "> You can also use `https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/` as the extra-index-url.\n",
+    "\n",
+    "## Runtime Configuration\n",
+    "\n",
+    "For optimal performance, it is recommended to set several environment variables based on your device:\n",
+    "\n",
+    "### For Windows Users with Intel Core Ultra integrated GPU"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"SYCL_CACHE_PERSISTENT\"] = \"1\"\n",
+    "os.environ[\"BIGDL_LLM_XMX_DISABLED\"] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### For Windows Users with Intel Arc A-Series GPU"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"SYCL_CACHE_PERSISTENT\"] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> **Note**\n",
+    ">\n",
+    "> For the first time that each model runs on Intel iGPU/Intel Arc A300-Series or Pro A60, it may take several minutes to compile.\n",
+    ">\n",
+    "> For other GPU type, please refer to [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html#runtime-configuration) for Windows users, and [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html#id5) for Linux users.\n",
+    
"\n", + "## Basic Usage\n", + "\n", + "Setting `device_map` to `\"xpu\"` when initializing `IpexLLM` will put the LLM model on Intel GPU and benefit from IPEX-LLM optimizations:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "from langchain.chains import LLMChain\n", + "from langchain_community.llms import IpexLLM\n", + "from langchain_core.prompts import PromptTemplate\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning, message=\".*padding_mask.*\")\n", + "template = \"USER: {question}\\nASSISTANT:\"\n", + "prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n", + "\n", + "llm = IpexLLM.from_model_id(\n", + " model_id=\"lmsys/vicuna-7b-v1.5\",\n", + " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True},\n", + " device_map=\"xpu\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use it in Chains" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm_chain = prompt | llm\n", + "\n", + "question = \"What is AI?\"\n", + "output = llm_chain.invoke(question)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save/Load Low-bit Model\n", + "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step. You can similarly set `device_map` to `xpu` in order to load the LLM model to Intel GPU. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save the low-bit model, use `save_low_bit` as follows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "saved_lowbit_model_path = \"./vicuna-7b-1.5-low-bit\" # path to save low-bit model\n", + "llm.model.save_low_bit(saved_lowbit_model_path)\n", + "del llm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the model from saved lowbit model path as follows. \n", + "> Note that the saved path for the low-bit model only includes the model itself but not the tokenizers. If you wish to have everything in one place, you will need to manually download or copy the tokenizer files from the original model's directory to the location where the low-bit model is saved." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm_lowbit = IpexLLM.from_model_id_low_bit(\n", + " model_id=saved_lowbit_model_path,\n", + " tokenizer_id=\"lmsys/vicuna-7b-v1.5\",\n", + " # tokenizer_name=saved_lowbit_model_path, # copy the tokenizers to saved path if you want to use it this way\n", + " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True},\n", + " device_map=\"xpu\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the loaded model in Chains:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm_chain = prompt | llm_lowbit\n", + "\n", + "\n", + "question = \"What is AI?\"\n", + "output = llm_chain.invoke(question)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From cf3f17fc552e4a19364c492cd76582015c608966 Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Fri, 31 May 2024 16:57:43 +0800 Subject: [PATCH 03/16] modify --- libs/community/langchain_community/llms/ipex_llm.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/llms/ipex_llm.py b/libs/community/langchain_community/llms/ipex_llm.py index b0217baf1f22c..fb76267eadb35 100644 --- a/libs/community/langchain_community/llms/ipex_llm.py +++ b/libs/community/langchain_community/llms/ipex_llm.py @@ -1,5 +1,5 @@ import logging -from typing import Any, List, Mapping, Optional, Literal +from typing import Any, List, Literal, Mapping, Optional from langchain_core.callbacks import CallbackManagerForLLMRun from langchain_core.language_models.llms import LLM @@ -251,7 +251,8 @@ def _call( if self.streaming: from transformers import TextStreamer - input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.model.device) + input_ids = self.tokenizer.encode(prompt, return_tensors="pt") + input_ids.to(self.model.device) streamer = TextStreamer( self.tokenizer, skip_prompt=True, skip_special_tokens=True ) @@ -277,7 +278,8 @@ def _call( text = self.tokenizer.decode(output[0], skip_special_tokens=True) return text else: - input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.model.device) + input_ids = self.tokenizer.encode(prompt, return_tensors="pt") + input_ids.to(self.model.device) if stop is not None: from transformers.generation.stopping_criteria import ( StoppingCriteriaList, From df023d0fd923025acd095d44380d91ed861e414b Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Fri, 31 May 2024 17:04:21 +0800 Subject: [PATCH 04/16] modify --- libs/community/langchain_community/llms/ipex_llm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/community/langchain_community/llms/ipex_llm.py b/libs/community/langchain_community/llms/ipex_llm.py index fb76267eadb35..6e1bcc1693dee 100644 --- a/libs/community/langchain_community/llms/ipex_llm.py +++ b/libs/community/langchain_community/llms/ipex_llm.py @@ -46,7 +46,7 @@ def from_model_id( tokenizer_id: Optional[str] = None, load_in_4bit: bool = True, load_in_low_bit: Optional[str] = None, - device_map: Literal['cpu','xpu'] = 'cpu', + device_map: Literal["cpu", "xpu"] = "cpu", **kwargs: Any, ) -> LLM: """ @@ -88,7 +88,7 @@ def from_model_id_low_bit( model_kwargs: Optional[dict] = None, *, tokenizer_id: Optional[str] = None, - device_map: Literal['cpu','xpu'] = 'cpu', + device_map: Literal["cpu", "xpu"] = "cpu", 
**kwargs: Any, ) -> LLM: """ @@ -125,7 +125,7 @@ def _load_model( load_in_4bit: bool = False, load_in_low_bit: Optional[str] = None, low_bit_model: bool = False, - device_map: Literal['cpu','xpu'] = "cpu", + device_map: Literal["cpu", "xpu"] = "cpu", model_kwargs: Optional[dict] = None, kwargs: Optional[dict] = None, ) -> Any: @@ -202,7 +202,7 @@ def _load_model( f"'cpu' or 'xpu', but you have: {device_map}." ) model.to(device_map) - + return cls( model_id=model_id, model=model, From 9a8b69386f08230854a2501292f53e8b373fe129 Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Fri, 31 May 2024 17:11:20 +0800 Subject: [PATCH 05/16] modify --- libs/community/tests/integration_tests/llms/test_ipex_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/community/tests/integration_tests/llms/test_ipex_llm.py b/libs/community/tests/integration_tests/llms/test_ipex_llm.py index b153cde2b5868..daadf7aea2078 100644 --- a/libs/community/tests/integration_tests/llms/test_ipex_llm.py +++ b/libs/community/tests/integration_tests/llms/test_ipex_llm.py @@ -105,4 +105,4 @@ def test_load_generate_gpu(model_id: str) -> None: ) output = llm.generate(["Hello!"]) assert isinstance(output, LLMResult) - assert isinstance(output.generations, list) \ No newline at end of file + assert isinstance(output.generations, list) From bf84237d1df209fb28157c2785c4aa1d0d6f08ab Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Mon, 3 Jun 2024 14:16:51 +0800 Subject: [PATCH 06/16] fix lint --- libs/community/tests/integration_tests/llms/test_ipex_llm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/community/tests/integration_tests/llms/test_ipex_llm.py b/libs/community/tests/integration_tests/llms/test_ipex_llm.py index daadf7aea2078..b6cf5ce7d1478 100644 --- a/libs/community/tests/integration_tests/llms/test_ipex_llm.py +++ b/libs/community/tests/integration_tests/llms/test_ipex_llm.py @@ -1,4 +1,5 @@ """Test IPEX LLM""" + import os from typing import Any @@ -87,6 +88,7 @@ def test_save_load_lowbit(model_id: str) -> None: output = loaded_llm.invoke("Hello!") assert isinstance(output, str) + @skip_if_no_model_ids @pytest.mark.parametrize( "model_id", From dd11295de968130a81ed64d4883f65d9e247e1ae Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Mon, 3 Jun 2024 14:24:26 +0800 Subject: [PATCH 07/16] modify --- docs/docs/integrations/llms/ipex_llm.ipynb | 4 ++-- docs/docs/integrations/llms/ipex_llm_gpu.ipynb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/docs/integrations/llms/ipex_llm.ipynb b/docs/docs/integrations/llms/ipex_llm.ipynb index ba456b7608b47..b4059b8cc9c41 100644 --- a/docs/docs/integrations/llms/ipex_llm.ipynb +++ b/docs/docs/integrations/llms/ipex_llm.ipynb @@ -4,11 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# IPEX-LLM\n", + "# IPEX-LLM on Intel CPU\n", "\n", "> [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency. \n", "\n", - "This example goes over how to use LangChain to interact with `ipex-llm` for text generation. 
\n" + "This example goes over how to use LangChain to interact with `ipex-llm` for text generation on Intel CPU.\n" ] }, { diff --git a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb index 4f3f6788f85ba..e2e6077e3492a 100644 --- a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb +++ b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb @@ -8,7 +8,7 @@ "\n", "> [IPEX-LLM](https://github.com/intel-analytics/ipex-llm) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency.\n", "\n", - "This example goes over how to use LangChain to interact with `ipex-llm` for text generation. \n", + "This example goes over how to use LangChain to interact with `ipex-llm` for text generation on Intel GPU. \n", "\n", "> **Note**\n", ">\n", From 64449ade73ba5c3d2f41b93ee85e2fb1cda27424 Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Mon, 3 Jun 2024 16:04:45 +0800 Subject: [PATCH 08/16] unify --- .../langchain_community/llms/ipex_llm.py | 30 +++++++++---------- .../integration_tests/llms/test_ipex_llm.py | 29 +++++------------- 2 files changed, 21 insertions(+), 38 deletions(-) diff --git a/libs/community/langchain_community/llms/ipex_llm.py b/libs/community/langchain_community/llms/ipex_llm.py index 6e1bcc1693dee..d6173dff1f260 100644 --- a/libs/community/langchain_community/llms/ipex_llm.py +++ b/libs/community/langchain_community/llms/ipex_llm.py @@ -1,5 +1,5 @@ import logging -from typing import Any, List, Literal, Mapping, Optional +from typing import Any, List, Mapping, Optional from langchain_core.callbacks import CallbackManagerForLLMRun from langchain_core.language_models.llms import LLM @@ -46,7 +46,6 @@ def from_model_id( tokenizer_id: Optional[str] = None, load_in_4bit: bool = True, load_in_low_bit: Optional[str] = None, - device_map: Literal["cpu", "xpu"] = "cpu", **kwargs: Any, ) -> LLM: """ @@ -76,7 +75,6 @@ def from_model_id( low_bit_model=False, load_in_4bit=load_in_4bit, load_in_low_bit=load_in_low_bit, - device_map=device_map, model_kwargs=model_kwargs, kwargs=kwargs, ) @@ -88,7 +86,6 @@ def from_model_id_low_bit( model_kwargs: Optional[dict] = None, *, tokenizer_id: Optional[str] = None, - device_map: Literal["cpu", "xpu"] = "cpu", **kwargs: Any, ) -> LLM: """ @@ -112,7 +109,6 @@ def from_model_id_low_bit( low_bit_model=True, load_in_4bit=False, # not used for low-bit model load_in_low_bit=None, # not used for low-bit model - device_map=device_map, model_kwargs=model_kwargs, kwargs=kwargs, ) @@ -125,7 +121,6 @@ def _load_model( load_in_4bit: bool = False, load_in_low_bit: Optional[str] = None, low_bit_model: bool = False, - device_map: Literal["cpu", "xpu"] = "cpu", model_kwargs: Optional[dict] = None, kwargs: Optional[dict] = None, ) -> Any: @@ -147,6 +142,16 @@ def _load_model( kwargs = kwargs or {} _tokenizer_id = tokenizer_id or model_id + # Set "cpu" as default device + if "device" not in model_kwargs: + model_kwargs["device"] = "cpu" + + if model_kwargs["device"] not in ["cpu", "xpu"]: + raise ValueError( + "IpexLLMBgeEmbeddings currently only supports device to be " + f"'cpu' or 'xpu', but you have: {model_kwargs['device']}." 
+ ) + device = model_kwargs.pop("device") try: tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs) @@ -194,14 +199,7 @@ def _load_model( model_kwargs=_model_kwargs, ) - # Set "cpu" as default device - - if device_map not in ["cpu", "xpu"]: - raise ValueError( - "IpexLLM currently only supports device to be " - f"'cpu' or 'xpu', but you have: {device_map}." - ) - model.to(device_map) + model.to(device) return cls( model_id=model_id, @@ -252,7 +250,7 @@ def _call( from transformers import TextStreamer input_ids = self.tokenizer.encode(prompt, return_tensors="pt") - input_ids.to(self.model.device) + input_ids = input_ids.to(self.model.device) streamer = TextStreamer( self.tokenizer, skip_prompt=True, skip_special_tokens=True ) @@ -279,7 +277,7 @@ def _call( return text else: input_ids = self.tokenizer.encode(prompt, return_tensors="pt") - input_ids.to(self.model.device) + input_ids = input_ids.to(self.model.device) if stop is not None: from transformers.generation.stopping_criteria import ( StoppingCriteriaList, diff --git a/libs/community/tests/integration_tests/llms/test_ipex_llm.py b/libs/community/tests/integration_tests/llms/test_ipex_llm.py index b6cf5ce7d1478..9ec9095e7c949 100644 --- a/libs/community/tests/integration_tests/llms/test_ipex_llm.py +++ b/libs/community/tests/integration_tests/llms/test_ipex_llm.py @@ -13,12 +13,18 @@ not model_ids_to_test, reason="TEST_IPEXLLM_MODEL_IDS environment variable not set." ) model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")] # type: ignore +device = os.getenv("TEST_IPEXLLM_BGE_EMBEDDING_MODEL_DEVICE") or "cpu" def load_model(model_id: str) -> Any: llm = IpexLLM.from_model_id( model_id=model_id, - model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True}, + model_kwargs={ + "temperature": 0, + "max_length": 16, + "trust_remote_code": True, + "device": device, + }, ) return llm @@ -87,24 +93,3 @@ def test_save_load_lowbit(model_id: str) -> None: ) output = loaded_llm.invoke("Hello!") assert isinstance(output, str) - - -@skip_if_no_model_ids -@pytest.mark.parametrize( - "model_id", - model_ids_to_test, -) -def test_load_generate_gpu(model_id: str) -> None: - """Test valid call.""" - llm = IpexLLM.from_model_id( - model_id=model_id, - model_kwargs={ - "temperature": 0, - "max_length": 16, - "trust_remote_code": True, - }, - device_map="xpu", - ) - output = llm.generate(["Hello!"]) - assert isinstance(output, LLMResult) - assert isinstance(output.generations, list) From 3e1dd3c9ea69781edc3970403f5103a133cc4c19 Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Mon, 3 Jun 2024 16:10:53 +0800 Subject: [PATCH 09/16] modify --- libs/community/langchain_community/llms/ipex_llm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libs/community/langchain_community/llms/ipex_llm.py b/libs/community/langchain_community/llms/ipex_llm.py index d6173dff1f260..eca0171a4fcff 100644 --- a/libs/community/langchain_community/llms/ipex_llm.py +++ b/libs/community/langchain_community/llms/ipex_llm.py @@ -143,15 +143,15 @@ def _load_model( _tokenizer_id = tokenizer_id or model_id # Set "cpu" as default device - if "device" not in model_kwargs: - model_kwargs["device"] = "cpu" + if "device" not in _model_kwargs: + _model_kwargs["device"] = "cpu" - if model_kwargs["device"] not in ["cpu", "xpu"]: + if _model_kwargs["device"] not in ["cpu", "xpu"]: raise ValueError( "IpexLLMBgeEmbeddings currently only supports device to be " - f"'cpu' or 'xpu', but you have: 
{model_kwargs['device']}." + f"'cpu' or 'xpu', but you have: {_model_kwargs['device']}." ) - device = model_kwargs.pop("device") + device = _model_kwargs.pop("device") try: tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs) From 6243f608d0d5b5470e6c5839c04ae32ec1af1d1f Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Tue, 4 Jun 2024 11:00:39 +0800 Subject: [PATCH 10/16] update doc --- docs/docs/integrations/llms/ipex_llm_gpu.ipynb | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb index e2e6077e3492a..470f5022992ff 100644 --- a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb +++ b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb @@ -125,7 +125,7 @@ "\n", "## Basic Usage\n", "\n", - "Setting `device_map` to `\"xpu\"` when initializing `IpexLLM` will put the LLM model on Intel GPU and benefit from IPEX-LLM optimizations:" + "Setting `device` to `\"xpu\"` in `model_kwargs` when initializing `IpexLLM` will put the LLM model on Intel GPU and benefit from IPEX-LLM optimizations:" ] }, { @@ -146,8 +146,7 @@ "\n", "llm = IpexLLM.from_model_id(\n", " model_id=\"lmsys/vicuna-7b-v1.5\",\n", - " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True},\n", - " device_map=\"xpu\",\n", + " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True, \"device\":\"xpu\"},\n", ")" ] }, @@ -175,7 +174,7 @@ "metadata": {}, "source": [ "## Save/Load Low-bit Model\n", - "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step. You can similarly set `device_map` to `xpu` in order to load the LLM model to Intel GPU. " + "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step. You can similarly set `device` to `xpu` in `model_kwargs` in order to load the LLM model to Intel GPU. 
" ] }, { @@ -214,8 +213,7 @@ " model_id=saved_lowbit_model_path,\n", " tokenizer_id=\"lmsys/vicuna-7b-v1.5\",\n", " # tokenizer_name=saved_lowbit_model_path, # copy the tokenizers to saved path if you want to use it this way\n", - " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True},\n", - " device_map=\"xpu\",\n", + " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True, \"device\":\"xpu\"},\n", ")" ] }, From 7e1250f0740842b49000e43deb73a0773b7921ef Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Tue, 4 Jun 2024 11:15:08 +0800 Subject: [PATCH 11/16] modify --- docs/docs/integrations/llms/ipex_llm_gpu.ipynb | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb index 470f5022992ff..040882b8ea922 100644 --- a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb +++ b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb @@ -146,7 +146,12 @@ "\n", "llm = IpexLLM.from_model_id(\n", " model_id=\"lmsys/vicuna-7b-v1.5\",\n", - " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True, \"device\":\"xpu\"},\n", + " model_kwargs={\n", + " \"temperature\": 0,\n", + " \"max_length\": 64,\n", + " \"trust_remote_code\": True,\n", + " \"device\": \"xpu\",\n", + " },\n", ")" ] }, @@ -213,7 +218,12 @@ " model_id=saved_lowbit_model_path,\n", " tokenizer_id=\"lmsys/vicuna-7b-v1.5\",\n", " # tokenizer_name=saved_lowbit_model_path, # copy the tokenizers to saved path if you want to use it this way\n", - " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True, \"device\":\"xpu\"},\n", + " model_kwargs={\n", + " \"temperature\": 0,\n", + " \"max_length\": 64,\n", + " \"trust_remote_code\": True,\n", + " \"device\": \"xpu\",\n", + " },\n", ")" ] }, From 9149b023c8126ce681e76f2f575bee02a8eacb03 Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Tue, 4 Jun 2024 14:22:49 +0800 Subject: [PATCH 12/16] modify --- docs/docs/integrations/llms/ipex_llm_gpu.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb index 040882b8ea922..228811ab28067 100644 --- a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb +++ b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb @@ -179,7 +179,7 @@ "metadata": {}, "source": [ "## Save/Load Low-bit Model\n", - "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step. You can similarly set `device` to `xpu` in `model_kwargs` in order to load the LLM model to Intel GPU. " + "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step. 
You can similarly set `device` to `\"xpu\"` in `model_kwargs` in order to load the LLM model to Intel GPU. " ] }, { From d412ad7069f439d94769950c32d5b456add8e096 Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Tue, 4 Jun 2024 14:23:33 +0800 Subject: [PATCH 13/16] modify --- docs/docs/integrations/llms/ipex_llm_gpu.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb index 228811ab28067..25b34c1aa156a 100644 --- a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb +++ b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb @@ -125,7 +125,7 @@ "\n", "## Basic Usage\n", "\n", - "Setting `device` to `\"xpu\"` in `model_kwargs` when initializing `IpexLLM` will put the LLM model on Intel GPU and benefit from IPEX-LLM optimizations:" + "Setting `device` to `\"xpu\"` in `model_kwargs` when initializing `IpexLLM` will put the LLM model on Intel GPU and benefit from IPEX-LLM optimizations. Specify the prompt template for your model. In this example, we use the [vicuna-1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) model. If you're working with a different model, choose a proper template accordingly." ] }, { From 7acc8623b991dc3389d6b8c98f1a48c254c170ed Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Tue, 4 Jun 2024 14:34:55 +0800 Subject: [PATCH 14/16] modify --- .../docs/integrations/llms/ipex_llm_gpu.ipynb | 39 ++++++++++++++++--- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb index 25b34c1aa156a..826387006ef59 100644 --- a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb +++ b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb @@ -123,9 +123,7 @@ "> For other GPU type, please refer to [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html#runtime-configuration) for Windows users, and [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html#id5) for Linux users.\n", "\n", "\n", - "## Basic Usage\n", - "\n", - "Setting `device` to `\"xpu\"` in `model_kwargs` when initializing `IpexLLM` will put the LLM model on Intel GPU and benefit from IPEX-LLM optimizations. Specify the prompt template for your model. In this example, we use the [vicuna-1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) model. If you're working with a different model, choose a proper template accordingly." + "## Basic Usage\n" ] }, { @@ -140,10 +138,39 @@ "from langchain_community.llms import IpexLLM\n", "from langchain_core.prompts import PromptTemplate\n", "\n", - "warnings.filterwarnings(\"ignore\", category=UserWarning, message=\".*padding_mask.*\")\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning, message=\".*padding_mask.*\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the prompt template for your model. In this example, we use the [vicuna-1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) model. If you're working with a different model, choose a proper template accordingly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "template = \"USER: {question}\\nASSISTANT:\"\n", - "prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n", - "\n", + "prompt = PromptTemplate(template=template, input_variables=[\"question\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the model locally using IpexLLM using `IpexLLM.from_model_id`. It will load the model directly in its Huggingface format and convert it automatically to low-bit format for inference. Set `device` to `\"xpu\"` in `model_kwargs` when initializing IpexLLM in order to load the LLM model to Intel GPU." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "llm = IpexLLM.from_model_id(\n", " model_id=\"lmsys/vicuna-7b-v1.5\",\n", " model_kwargs={\n", From 17de7d5bf4bd1ada446eb7064a0ac6554f5253ad Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Tue, 4 Jun 2024 14:38:38 +0800 Subject: [PATCH 15/16] modify --- .../docs/integrations/llms/ipex_llm_gpu.ipynb | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb index 826387006ef59..cd4c7ea2b05ea 100644 --- a/docs/docs/integrations/llms/ipex_llm_gpu.ipynb +++ b/docs/docs/integrations/llms/ipex_llm_gpu.ipynb @@ -29,11 +29,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "%pip install -qU langchain langchain-community" @@ -49,11 +45,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "%pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" @@ -77,11 +69,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -100,11 +88,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", From 214be6b758f1af06d5e7f49384a9a7466baa8917 Mon Sep 17 00:00:00 2001 From: ivy-lv11 Date: Tue, 4 Jun 2024 14:44:11 +0800 Subject: [PATCH 16/16] modify --- libs/community/tests/integration_tests/llms/test_ipex_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/community/tests/integration_tests/llms/test_ipex_llm.py b/libs/community/tests/integration_tests/llms/test_ipex_llm.py index 9ec9095e7c949..0fc2b5caa5331 100644 --- a/libs/community/tests/integration_tests/llms/test_ipex_llm.py +++ b/libs/community/tests/integration_tests/llms/test_ipex_llm.py @@ -13,7 +13,7 @@ not model_ids_to_test, reason="TEST_IPEXLLM_MODEL_IDS environment variable not set." ) model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")] # type: ignore -device = os.getenv("TEST_IPEXLLM_BGE_EMBEDDING_MODEL_DEVICE") or "cpu" +device = os.getenv("TEST_IPEXLLM_MODEL_DEVICE") or "cpu" def load_model(model_id: str) -> Any: