transformers v3 upgrade
jparkerweb committed Nov 12, 2024
1 parent cb8695b commit f27e840
Showing 13 changed files with 123 additions and 146 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,12 @@

All notable changes to this project will be documented in this file.

+ ## [2.3.0] - 2024-11-11
+ ### Updated
+ - Updated `transformers.js` from v2 to v3
+ - Migrated quantization option from `onnxEmbeddingModelQuantized` (boolean) to `dtype` ('fp32', 'fp16', 'q8', 'q4')
+ - Updated Web UI to use new `dtype` option

## [2.2.5] - 2024-11-08
### Updated
- Updated Web UI styles for smaller screens
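For callers upgrading across this change, a minimal before/after sketch (assuming the `chunkit` entry point shown later in this commit; per the shim in `chunkit.js`, the legacy boolean is still accepted and mapped to `q8`):

```js
import { chunkit } from './chunkit.js'; // entry point shown later in this commit

const documents = [/* array of document objects */];

// v2.2.x — boolean toggle; `true` selected the quantized model
const before = await chunkit(documents, { onnxEmbeddingModelQuantized: true });

// v2.3.0 — explicit precision; 'q8' is the equivalent of the old `true`
const after = await chunkit(documents, { dtype: 'q8' });
```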
26 changes: 12 additions & 14 deletions README.md
@@ -71,7 +71,7 @@ const myChunks = await chunkit(documents, chunkitOptions);
- `combineChunks`: Boolean (optional, default `true`) - Determines whether to rebalance and combine chunks into larger ones up to the max token limit.
- `combineChunksSimilarityThreshold`: Float (optional, default `0.5`) - Threshold for combining chunks based on similarity during the rebalance and combining phase.
- `onnxEmbeddingModel`: String (optional, default `Xenova/all-MiniLM-L6-v2`) - ONNX model used for creating embeddings.
- - `onnxEmbeddingModelQuantized`: Boolean (optional, default `true`) - Indicates whether to use a quantized version of the embedding model.
+ - `dtype`: String (optional, default `fp32`) - Precision of the embedding model (options: `fp32`, `fp16`, `q8`, `q4`).
- `localModelPath`: String (optional, default `null`) - Local path to save and load models (example: `./models`).
- `modelCacheDir`: String (optional, default `null`) - Directory to cache downloaded models (example: `./models`).
- `returnEmbedding`: Boolean (optional, default `false`) - If set to `true`, each chunk will include an embedding vector. This is useful for applications that require semantic understanding of the chunks. The embedding model will be the same as the one specified in `onnxEmbeddingModel`.
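Taken together, a hedged usage sketch of these options with the new `dtype` field (the `document_name`/`document_text` shape is assumed, as the full example file is not shown in this diff; values are illustrative):

```js
import { chunkit } from './chunkit.js';

const documents = [
    { document_name: 'sample.txt', document_text: 'Long text to split into semantic chunks...' },
];

const myChunks = await chunkit(documents, {
    onnxEmbeddingModel: 'Xenova/all-MiniLM-L6-v2',
    dtype: 'q8',          // replaces the removed onnxEmbeddingModelQuantized boolean
    combineChunks: true,
    returnEmbedding: false,
});
```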
@@ -88,7 +88,7 @@ The output is an array of chunks, each containing the following properties:
- `number_of_chunks`: Integer - The total number of final chunks returned from the input text.
- `chunk_number`: Integer - The number of the current chunk.
- `model_name`: String - The name of the embedding model used.
- - `is_model_quantized`: Boolean - Indicates whether the embedding model is quantized.
+ - `dtype`: String - The precision of the embedding model used (options: `fp32`, `fp16`, `q8`, `q4`).
- `text`: String - The chunked text.
- `embedding`: Array - The embedding vector (if `returnEmbedding` is `true`).
- `token_length`: Integer - The token length (if `returnTokenLength` is `true`).
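A sketch of one element of the returned array, restricted to the properties listed above (all values are illustrative only):

```js
const exampleChunk = {
    number_of_chunks: 4,
    chunk_number: 1,
    model_name: 'Xenova/all-MiniLM-L6-v2',
    dtype: 'fp32',
    text: 'The chunked text...',
    // embedding: [0.0123, -0.0456, ...], // present only if returnEmbedding is true
    // token_length: 128,                 // present only if returnTokenLength is true
};
```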
@@ -213,18 +213,16 @@ The behavior of the `chunkit` function can be finely tuned using several optiona

#### Curated ONNX Embedding Models

- | Model                                        | Quantized | Link                                                                                                                                        | Size    |
- | -------------------------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------- |
- | nomic-ai/nomic-embed-text-v1.5               | true      | [https://huggingface.co/nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)                              | 138 MB  |
- | nomic-ai/nomic-embed-text-v1.5               | false     | [https://huggingface.co/nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)                              | 548 MB  |
- | Xenova/all-MiniLM-L6-v2                      | true      | [https://huggingface.co/Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2)                                            | 23 MB   |
- | Xenova/all-MiniLM-L6-v2                      | false     | [https://huggingface.co/Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2)                                            | 90.4 MB |
- | Xenova/paraphrase-multilingual-MiniLM-L12-v2 | true      | [https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2)  | 118 MB  |
- | thenlper/gte-base                            | false     | [https://huggingface.co/thenlper/gte-base](https://huggingface.co/thenlper/gte-base)                                                        | 436 MB  |
- | Xenova/all-distilroberta-v1                  | true      | [https://huggingface.co/Xenova/all-distilroberta-v1](https://huggingface.co/Xenova/all-distilroberta-v1)                                    | 82.1 MB |
- | Xenova/all-distilroberta-v1                  | false     | [https://huggingface.co/Xenova/all-distilroberta-v1](https://huggingface.co/Xenova/all-distilroberta-v1)                                    | 326 MB  |
- | BAAI/bge-base-en-v1.5                        | false     | [https://huggingface.co/BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)                                                | 436 MB  |
- | BAAI/bge-small-en-v1.5                       | false     | [https://huggingface.co/BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5)                                              | 133 MB  |
+ | Model                                        | Precision      | Link                                                                                                                                        | Size                   |
+ | -------------------------------------------- | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------- |
+ | nomic-ai/nomic-embed-text-v1.5               | fp32, q8       | [https://huggingface.co/nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)                              | 548 MB, 138 MB         |
+ | thenlper/gte-base                            | fp32           | [https://huggingface.co/thenlper/gte-base](https://huggingface.co/thenlper/gte-base)                                                        | 436 MB                 |
+ | Xenova/all-MiniLM-L6-v2                      | fp32, fp16, q8 | [https://huggingface.co/Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2)                                            | 90 MB, 45 MB, 23 MB    |
+ | Xenova/paraphrase-multilingual-MiniLM-L12-v2 | fp32, fp16, q8 | [https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2)  | 470 MB, 235 MB, 118 MB |
+ | Xenova/all-distilroberta-v1                  | fp32, fp16, q8 | [https://huggingface.co/Xenova/all-distilroberta-v1](https://huggingface.co/Xenova/all-distilroberta-v1)                                    | 326 MB, 163 MB, 82 MB  |
+ | BAAI/bge-base-en-v1.5                        | fp32           | [https://huggingface.co/BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)                                                | 436 MB                 |
+ | BAAI/bge-small-en-v1.5                       | fp32           | [https://huggingface.co/BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5)                                              | 133 MB                 |
+ | yashvardhan7/snowflake-arctic-embed-m-onnx   | fp32           | [https://huggingface.co/yashvardhan7/snowflake-arctic-embed-m-onnx](https://huggingface.co/yashvardhan7/snowflake-arctic-embed-m-onnx)      | 436 MB                 |

Each of these parameters allows you to customize the `chunkit` function to better fit the text size, content complexity, and performance requirements of your application.
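As a sketch of the size/fidelity trade-off the table implies (same model, different `dtype`; sizes from the table above, `documents` as in the earlier examples):

```js
// Full-precision embeddings (~90 MB download for this model)
const precise = await chunkit(documents, {
    onnxEmbeddingModel: 'Xenova/all-MiniLM-L6-v2',
    dtype: 'fp32',
});

// 8-bit quantized embeddings (~23 MB) — smaller and faster, at some cost in accuracy
const compact = await chunkit(documents, {
    onnxEmbeddingModel: 'Xenova/all-MiniLM-L6-v2',
    dtype: 'q8',
});
```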

26 changes: 17 additions & 9 deletions chunkit.js
@@ -29,7 +29,8 @@ export async function chunkit(
combineChunks = DEFAULT_CONFIG.COMBINE_CHUNKS,
combineChunksSimilarityThreshold = DEFAULT_CONFIG.COMBINE_CHUNKS_SIMILARITY_THRESHOLD,
onnxEmbeddingModel = DEFAULT_CONFIG.ONNX_EMBEDDING_MODEL,
- onnxEmbeddingModelQuantized = DEFAULT_CONFIG.ONNX_EMBEDDING_MODEL_QUANTIZED,
+ dtype = DEFAULT_CONFIG.DTYPE,
+ onnxEmbeddingModelQuantized,
localModelPath = DEFAULT_CONFIG.LOCAL_MODEL_PATH,
modelCacheDir = DEFAULT_CONFIG.MODEL_CACHE_DIR,
returnEmbedding = DEFAULT_CONFIG.RETURN_EMBEDDING,
@@ -43,10 +44,13 @@ export async function chunkit(
throw new Error('Input must be an array of document objects');
}

- // Initialize embedding utilities with paths
- const { modelName, isQuantized } = await initializeEmbeddingUtils(
+ // if legacy boolean is used (onnxEmbeddingModelQuantized), set dtype (model precision) to 'q8'
+ if (onnxEmbeddingModelQuantized === true) { dtype = 'q8'; }
+
+ // Initialize embedding utilities and set optional paths
+ const { modelName, dtype: usedDtype } = await initializeEmbeddingUtils(
onnxEmbeddingModel,
- onnxEmbeddingModelQuantized,
+ dtype,
localModelPath,
modelCacheDir
);
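In effect, the shim above keeps v2 call sites working; both of these calls should resolve to `q8` precision (a sketch, assuming `docs` is a valid document array):

```js
await chunkit(docs, { onnxEmbeddingModelQuantized: true }); // legacy boolean, mapped to dtype 'q8'
await chunkit(docs, { dtype: 'q8' });                       // explicit equivalent
```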
@@ -96,7 +100,7 @@
console.log(`--------------`);
console.log(`-- Chunk ${(index + 1)} --`);
console.log(`--------------`);
- console.log(chunk);
+ console.log(chunk.substring(0, 50) + '...');
});
}

@@ -112,7 +116,7 @@
console.log("--------------------");
console.log("Chunk " + (index + 1));
console.log("--------------------");
- console.log(chunk);
+ console.log(chunk.substring(0, 50) + '...');
});
}
} else {
@@ -131,7 +135,7 @@
number_of_chunks: numberOfChunks,
chunk_number: index + 1,
model_name: modelName,
- is_model_quantized: isQuantized,
+ dtype: usedDtype,
text: prefixedChunk
};

@@ -177,7 +181,8 @@ export async function cramit(
logging = DEFAULT_CONFIG.LOGGING,
maxTokenSize = DEFAULT_CONFIG.MAX_TOKEN_SIZE,
onnxEmbeddingModel = DEFAULT_CONFIG.ONNX_EMBEDDING_MODEL,
- onnxEmbeddingModelQuantized = DEFAULT_CONFIG.ONNX_EMBEDDING_MODEL_QUANTIZED,
+ onnxEmbeddingModelQuantized,
+ dtype = DEFAULT_CONFIG.DTYPE,
localModelPath = DEFAULT_CONFIG.LOCAL_MODEL_PATH,
modelCacheDir = DEFAULT_CONFIG.MODEL_CACHE_DIR,
returnEmbedding = DEFAULT_CONFIG.RETURN_EMBEDDING,
@@ -191,6 +196,9 @@ export async function cramit(
throw new Error('Input must be an array of document objects');
}

+ // if legacy boolean is used (onnxEmbeddingModelQuantized), set dtype (model precision) to 'q8'
+ if (onnxEmbeddingModelQuantized === true) { dtype = 'q8'; }

// Initialize embedding utilities with paths
const { modelName, isQuantized } = await initializeEmbeddingUtils(
onnxEmbeddingModel,
@@ -222,7 +230,7 @@
console.log(`--------------`);
console.log(`-- Chunk ${(index + 1)} --`);
console.log(`--------------`);
- console.log(chunk);
+ console.log(chunk.substring(0, 50) + '...');
});
}

3 changes: 1 addition & 2 deletions config.js
@@ -8,8 +8,7 @@ export const DEFAULT_CONFIG = {
COMBINE_CHUNKS: true,
COMBINE_CHUNKS_SIMILARITY_THRESHOLD: 0.6,
ONNX_EMBEDDING_MODEL: "Xenova/all-MiniLM-L6-v2",
- ONNX_EMBEDDING_MODEL_QUANTIZED: true,
- DTYPE: 'q4',
+ DTYPE: 'fp32',
LOCAL_MODEL_PATH: null,
MODEL_CACHE_DIR: null,
RETURN_EMBEDDING: false,
7 changes: 3 additions & 4 deletions embeddingUtils.js
@@ -9,7 +9,7 @@ const embeddingCache = new Map();
// --------------------------------------------
export async function initializeEmbeddingUtils(
onnxEmbeddingModel,
- onnxEmbeddingModelQuantized,
+ dtype = 'fp32',
localModelPath = null,
modelCacheDir = null
) {
@@ -20,15 +20,14 @@ export async function initializeEmbeddingUtils(

tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel);
generateEmbedding = await pipeline('feature-extraction', onnxEmbeddingModel, {
- dtype: onnxEmbeddingModelQuantized ? 'q8' : 'fp32',
+ dtype: dtype,
});

// Clear the embedding cache when initializing with a new model
embeddingCache.clear();

return {
modelName: onnxEmbeddingModel,
- isQuantized: onnxEmbeddingModelQuantized
+ dtype: dtype
};
}
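A hedged sketch of calling the exported initializer directly with its new signature (argument values are illustrative; only the parameters visible in this diff are used):

```js
import { initializeEmbeddingUtils } from './embeddingUtils.js';

const { modelName, dtype } = await initializeEmbeddingUtils(
    'Xenova/all-MiniLM-L6-v2', // onnxEmbeddingModel
    'fp16',                    // dtype (model precision)
    null,                      // localModelPath
    null                       // modelCacheDir
);
console.log(modelName, dtype); // -> "Xenova/all-MiniLM-L6-v2" "fp16"
```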

2 changes: 1 addition & 1 deletion example/example-chunkit.js
@@ -38,7 +38,7 @@ let myTestChunks = await chunkit(
combineChunks: true, // enable rebalancing
combineChunksSimilarityThreshold: 0.6,
onnxEmbeddingModel: "nomic-ai/nomic-embed-text-v1.5",
- onnxEmbeddingModelQuantized: true,
+ dtype: "q8",
localModelPath: "../models",
modelCacheDir: "../models",
returnEmbedding: false,
