
Commit

new main Merge branch 'main' of https://github.com/intel-analytics/ipex-llm into test_transformers_41
songhappy committed Sep 4, 2024
2 parents 428e62b + 75b19f8 commit 7d8f3a0
Showing 19 changed files with 624 additions and 265 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/llm-c-evaluation.yml
@@ -12,10 +12,10 @@ permissions:
on:
# schedule:
# - cron: "00 15 * * *" # GMT time, 15:00 GMT == 23:00 Beijing Time
pull_request:
branches: [main]
paths:
- ".github/workflows/llm-c-evaluation.yml"
# pull_request:
# branches: [main]
# paths:
# - ".github/workflows/llm-c-evaluation.yml"
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
inputs:
@@ -204,7 +204,7 @@ jobs:
pip install pandas==1.5.3
- name: Download ceval results
uses: actions/download-artifact@v3
uses: actions/download-artifact@4.1.7
with:
name: ceval_results
path: results
@@ -259,7 +259,7 @@ jobs:
fi
- name: Download ceval results
uses: actions/download-artifact@v3
uses: actions/download-artifact@4.1.7
with:
name: results_${{ needs.set-matrix.outputs.date }}
path: ${{ env.ACC_FOLDER }}
12 changes: 6 additions & 6 deletions .github/workflows/llm-harness-evaluation.yml
@@ -12,10 +12,10 @@ permissions:
on:
# schedule:
# - cron: "30 12 * * *" # GMT time, 12:30 GMT == 20:30 China
pull_request:
branches: [main]
paths:
- ".github/workflows/llm-harness-evaluation.yml"
# pull_request:
# branches: [main]
# paths:
# - ".github/workflows/llm-harness-evaluation.yml"
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
inputs:
@@ -220,7 +220,7 @@ jobs:
pip install --upgrade pip
pip install jsonlines pytablewriter regex
- name: Download all results
uses: actions/download-artifact@v3
uses: actions/download-artifact@4.1.7
with:
name: harness_results
path: results
@@ -260,7 +260,7 @@ jobs:
fi
- name: Download harness results
uses: actions/download-artifact@v3
uses: actions/download-artifact@4.1.7
with:
name: harness_results
path: ${{ env.ACC_FOLDER}}/${{ env.DATE }}
12 changes: 6 additions & 6 deletions .github/workflows/llm-ppl-evaluation.yml
@@ -12,10 +12,10 @@ permissions:
on:
# schedule:
# - cron: "00 12 * * *" # GMT time, 12:00 GMT == 20:00 China
pull_request:
branches: [main]
paths:
- ".github/workflows/llm-ppl-evaluation.yml"
# pull_request:
# branches: [main]
# paths:
# - ".github/workflows/llm-ppl-evaluation.yml"
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
inputs:
@@ -206,7 +206,7 @@ jobs:
pip install --upgrade pip
pip install jsonlines pytablewriter regex
- name: Download all results
uses: actions/download-artifact@v3
uses: actions/download-artifact@4.1.7
with:
name: ppl_results
path: results
@@ -245,7 +245,7 @@ jobs:
fi
- name: Download ppl results
uses: actions/download-artifact@v3
uses: actions/download-artifact@4.1.7
with:
name: ppl_results
path: ${{ env.ACC_FOLDER}}/${{ env.DATE }}
12 changes: 6 additions & 6 deletions .github/workflows/llm-whisper-evaluation.yml
@@ -12,10 +12,10 @@ permissions:
on:
# schedule:
# - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
pull_request:
branches: [main]
paths:
- ".github/workflows/llm-whisper-evaluation.yml"
# pull_request:
# branches: [main]
# paths:
# - ".github/workflows/llm-whisper-evaluation.yml"
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
inputs:
@@ -176,14 +176,14 @@ jobs:
- name: Download all results for nightly run
if: github.event_name == 'schedule'
uses: actions/download-artifact@v3
uses: actions/download-artifact@4.1.7
with:
name: whisper_results
path: ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }}

- name: Download all results for pr run
if: github.event_name == 'pull_request'
uses: actions/download-artifact@v3
uses: actions/download-artifact@4.1.7
with:
name: whisper_results
path: ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }}
2 changes: 1 addition & 1 deletion README.md
@@ -319,7 +319,7 @@ Over 50 models have been optimized/verified on `ipex-llm`, including *LLaMA/LLaM
| MiniCPM-V | | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V) |
| MiniCPM-V-2 | | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2) |
| MiniCPM-Llama3-V-2_5 | | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5) |
| MiniCPM-V-2_6 | | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6) |
| MiniCPM-V-2_6 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6) | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6) |

## Get Support
- Please report a bug or raise a feature request by opening a [Github Issue](https://github.com/intel-analytics/ipex-llm/issues)
99 changes: 80 additions & 19 deletions docs/mddocs/Quickstart/graphrag_quickstart.md
@@ -9,12 +9,16 @@ The [GraphRAG project](https://github.com/microsoft/graphrag) is designed to lev
- [Setup Python Environment for GraphRAG](#3-setup-python-environment-for-graphrag)
- [Index GraphRAG](#4-index-graphrag)
- [Query GraphRAG](#5-query-graphrag)
- [Troubleshooting](#troubleshooting)

## Quickstart

### 1. Install and Start `Ollama` Service on Intel GPU

Follow the steps in [Run Ollama with IPEX-LLM on Intel GPU Guide](./ollama_quickstart.md) to install and run Ollama on Intel GPU. Ensure that `ollama serve` is running correctly and can be accessed through a local URL (e.g., `https://127.0.0.1:11434`).
Follow the steps in [Run Ollama with IPEX-LLM on Intel GPU Guide](./ollama_quickstart.md) to install `ipex-llm[cpp]==2.1.0` and run Ollama on Intel GPU. Ensure that `ollama serve` is running correctly and can be accessed through a local URL (e.g., `http://127.0.0.1:11434`).

**Please note that for GraphRAG, we highly recommend using the stable version of ipex-llm through `pip install ipex-llm[cpp]==2.1.0`**.
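
For reference, a minimal sketch of this step on Linux might look as follows; the `init-ollama` helper and the environment variables are taken from the linked Ollama quickstart and may differ between ipex-llm versions:

```bash
# create a dedicated environment for the Ollama server (separate from the GraphRAG one below)
conda create -n llm-cpp python=3.11
conda activate llm-cpp
pip install ipex-llm[cpp]==2.1.0      # stable version recommended above

# initialize the ipex-llm provided Ollama binary in the current directory
init-ollama

# start the service and keep this terminal open
export OLLAMA_NUM_GPU=999             # offload all model layers to the Intel GPU
export ZES_ENABLE_SYSMAN=1
source /opt/intel/oneapi/setvars.sh   # oneAPI runtime, if installed system-wide
./ollama serve
```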

### 2. Prepare LLM and Embedding Model
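
The LLM and embedding model used throughout this guide are `mistral` and `nomic-embed-text` (see `settings.yml` below); a minimal sketch of pulling them through the running Ollama service, assuming the `ollama` binary initialized in step 1:

```bash
# pull the LLM and the embedding model referenced in settings.yml
./ollama pull mistral
./ollama pull nomic-embed-text

# confirm that both models are now available
./ollama list
```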

@@ -57,13 +61,17 @@ conda create -n graphrag-local-ollama python=3.10
conda activate graphrag-local-ollama

pip install -e .
pip install future

pip install ollama
pip install plotly
```

where `pip install ollama` enables RESTful API access to the Ollama service from Python, and `pip install plotly` is used for visualizing the knowledge graph.
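
To confirm that this environment can reach the Ollama service (assuming the default `http://localhost:11434`), a quick sanity check might be:

```bash
# native REST API: list the models currently served by Ollama
curl http://localhost:11434/api/tags

# the ollama Python package installed above talks to the same endpoint
python -c "import ollama; print(ollama.list())"
```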

> [!NOTE]
> Please note that the Python environment set up here for GraphRAG is separate from the one used for the Ollama server on Intel GPUs.

### 4. Index GraphRAG

The environment is now ready for GraphRAG with local LLMs and embedding models running on Intel GPUs. Before querying GraphRAG, it is necessary to first index GraphRAG, which could be a resource-intensive operation.
@@ -114,24 +114,25 @@ Prepare the input corpus, and then initialize the workspace:
#### Update `settings.yml`

In the `settings.yml` file inside the `ragtest` folder, add the configuration `request_timeout: 1800.0` for the `llm` section. In addition, if you would like to use LLMs or embedding models other than `mistral` or `nomic-embed-text`, update the `settings.yml` in the `ragtest` folder accordingly:
>
> ```yml
> llm:
> api_key: ${GRAPHRAG_API_KEY}
> type: openai_chat
> model: mistral # change it accordingly if using another LLM
> model_supports_json: true
> request_timeout: 1800.0 # add this configuration; you could also increase the request_timeout
> api_base: http://localhost:11434/v1
>
> embeddings:
> async_mode: threaded
> llm:
> api_key: ${GRAPHRAG_API_KEY}
> type: openai_embedding
> model: nomic_embed_text # change it accordingly if using another embedding model
> api_base: http://localhost:11434/api
> ```


```yml
llm:
api_key: ${GRAPHRAG_API_KEY}
type: openai_chat
model: mistral # change it accordingly if using another LLM
model_supports_json: true
request_timeout: 1800.0 # add this configuration; you could also increase the request_timeout
api_base: http://localhost:11434/v1

embeddings:
async_mode: threaded
llm:
api_key: ${GRAPHRAG_API_KEY}
type: openai_embedding
model: nomic_embed_text # change it accordingly if using another embedding model
api_base: http://localhost:11434/api
```
#### Conduct GraphRAG indexing
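
As a rough sketch (the flag follows the `graphrag` CLI of this generation and may differ with the version installed by `graphrag-local-ollama`), indexing is typically launched as follows:

```bash
# build the GraphRAG index over the corpus prepared under ./ragtest
# (this step can be resource-intensive and may take a while)
python -m graphrag.index --root ./ragtest
```
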
@@ -197,3 +206,55 @@ The Transformer model has been very successful in various natural language proce
Since its initial introduction, the Transformer model has been further developed and improved upon. Variants of the Transformer architecture, such as BERT (Bidirectional Encoder Representations from Transformers) and RoBERTa (Robustly Optimized BERT Pretraining Approach), have achieved state-of-the-art performance on a wide range of natural language processing tasks [Data: Reports (1, 2, 34, 46, 64, +more)].
```

### Troubleshooting

#### `failed to find free space in the KV cache, retrying with smaller n_batch` when conducting GraphRAG Indexing, and `JSONDecodeError` when querying GraphRAG

If the Ollama server log shows `failed to find free space in the KV cache, retrying with smaller n_batch` during GraphRAG indexing, and you receive a `JSONDecodeError` when querying GraphRAG, try increasing the context length of the LLM model and then index/query GraphRAG again.

To make the LLM model support a larger context, first create a file named `Modelfile`:

```
FROM mistral:latest
PARAMETER num_ctx 4096
```

> [!TIP]
> Here we increase `num_ctx` to 4096 as an example. You could adjust it accordingly.

Then use the following commands to create a new model in Ollama named `mistral:latest-nctx4096`:

- For **Linux users**:

```bash
./ollama create mistral:latest-nctx4096 -f Modelfile
```

- For **Windows users**:

Please run the following command in Miniforge or Anaconda Prompt.

```cmd
ollama create mistral:latest-nctx4096 -f Modelfile
```

Finally, update `settings.yml` inside the `ragtest` folder to use `llm` model `mistral:latest-nctx4096`:

```yml
llm:
api_key: ${GRAPHRAG_API_KEY}
type: openai_chat
model: mistral:latest-nctx4096 # change it accordingly if using another LLM, or LLM model with larger num_ctx
model_supports_json: true
request_timeout: 1800.0 # add this configuration; you could also increase the request_timeout
api_base: http://localhost:11434/v1

embeddings:
async_mode: threaded
llm:
api_key: ${GRAPHRAG_API_KEY}
type: openai_embedding
model: nomic_embed_text # change it accordingly if using another embedding model
api_base: http://localhost:11434/api
```
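
Before re-indexing, it may be worth confirming that Ollama registered the new model with the larger context window (prefix the commands with `./` as above if the binary is not on your `PATH`):

```bash
# the new model should now appear in the model list
ollama list

# inspect its Modelfile to confirm the num_ctx parameter was applied
ollama show --modelfile mistral:latest-nctx4096
```
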
@@ -0,0 +1,101 @@
# MiniCPM-V-2_6
In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on MiniCPM-V-2_6 models. For illustration purposes, we utilize the [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) as a reference MiniCPM-V-2_6 model.

## 0. Requirements
To run these examples with IPEX-LLM, we have some recommended requirements for your machine; please refer to [here](../README.md#recommended-requirements) for more information.

## Example: Predict Tokens using `chat()` API
In the example [chat.py](./chat.py), we show a basic use case for a MiniCPM-V-2_6 model to predict the next N tokens using `chat()` API, with IPEX-LLM INT4 optimizations.
### 1. Install
We suggest using conda to manage the environment:

On Linux:

```bash
conda create -n llm python=3.11
conda activate llm

# install ipex-llm with 'all' option
pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu
pip install torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cpu
pip install transformers==4.40.0 trl
```
On Windows:

```cmd
conda create -n llm python=3.11
conda activate llm
pip install --pre --upgrade ipex-llm[all]
pip install torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cpu
pip install transformers==4.40.0 trl
```

### 2. Run

- chat without streaming mode:
```
python ./chat.py --prompt 'What is in the image?'
```
- chat in streaming mode:
```
python ./chat.py --prompt 'What is in the image?' --stream
```

> [!TIP]
> For chatting in streaming mode, it is recommended to set the environment variable `PYTHONUNBUFFERED=1`.

Arguments info (a combined example follows this list):
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM-V-2_6 model (e.g. `openbmb/MiniCPM-V-2_6`) to be downloaded, or the path to the huggingface checkpoint folder. It defaults to `'openbmb/MiniCPM-V-2_6'`.
- `--image-url-or-path IMAGE_URL_OR_PATH`: argument defining the image to be used for inference. It defaults to `'http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg'`.
- `--prompt PROMPT`: argument defining the prompt to be used for inference (with the integrated prompt format for chat). It defaults to `'What is in the image?'`.
- `--stream`: flag to chat in streaming mode
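
For example, a combined invocation that overrides all of the arguments above might look like this (the local image path is only a placeholder):

```bash
python ./chat.py --repo-id-or-model-path openbmb/MiniCPM-V-2_6 \
                 --image-url-or-path ./test_image.jpg \
                 --prompt "Describe the image in detail." \
                 --stream
```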

> **Note**: When loading the model in 4-bit, IPEX-LLM converts linear layers in the model into INT4 format. In theory, an *X*B model saved in 16-bit will require approximately 2*X* GB of memory for loading, and ~0.5*X* GB of memory for further inference. For example, MiniCPM-V-2_6 (roughly 8B parameters) would need about 16 GB to load in 16-bit and on the order of 4 GB for INT4 inference.
>
> Please select the appropriate size of the MiniCPM model based on the capabilities of your machine.
#### 2.1 Client
On a client Windows machine, it is recommended to run directly with full utilization of all cores:
```cmd
python ./chat.py
```

#### 2.2 Server
For optimal performance on a server, it is recommended to set several environment variables (refer to [here](../README.md#best-known-configuration-on-linux) for more information) and to run the example with all the physical cores of a single socket.

E.g. on Linux,
```bash
# set IPEX-LLM env variables
source ipex-llm-init

# e.g. for a server with 48 cores per socket
export OMP_NUM_THREADS=48
numactl -C 0-47 -m 0 python ./chat.py
```

#### 2.3 Sample Output
#### [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)
```log
Inference time: xxxx s
-------------------- Input Image --------------------
http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg
-------------------- Input Prompt --------------------
What is in the image?
-------------------- Chat Output --------------------
The image features a young child holding a white teddy bear dressed in pink. The background includes some red flowers and what appears to be a stone wall.
```

```log
-------------------- Input Image --------------------
http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg
-------------------- Input Prompt --------------------
图片里有什么?
-------------------- Stream Chat Output --------------------
图片中有一个小女孩,她手里拿着一个穿着粉色裙子的白色小熊玩偶。背景中有红色花朵和石头结构,可能是一个花园或庭院。
```

The sample input image (fetched from the [COCO dataset](https://cocodataset.org/#explore?id=264959)) is:

<a href="http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"><img width=400px src="http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg" ></a>