add phi-3-mini runner (pytorch#3951)
Summary:
This PR adds a basic runner for the phi-3-mini model. It uses sentencepiece to create the tokenizer.

Commands for running the model:
```
# setup executorch per instructions in https://pytorch.org/executorch/stable/getting-started-setup.html
# install latest transformers
pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers
# export the model; this will take a few minutes
cd examples/models/phi-3-mini
python export_model.py
# download the tokenizer.model
wget -O tokenizer.model https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true
# build the runner
mkdir cmake-out
cd cmake-out
cmake ..
cd ..
cmake --build cmake-out -j10
./cmake-out/phi_3_mini_runner
```

Pull Request resolved: pytorch#3951

Reviewed By: larryliu0820

Differential Revision: D58477481

Pulled By: helunwencser

fbshipit-source-id: c5a7e6781338d4347a1b9d06b22e23613633df6b
helunwencser authored and facebook-github-bot committed Jun 13, 2024
1 parent 4ed5bc7 commit 70743bb
Showing 6 changed files with 204 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -64,3 +64,6 @@
[submodule "third-party/ios-cmake"]
path = third-party/ios-cmake
url = https://github.com/leetal/ios-cmake
[submodule "examples/models/phi-3-mini/third-party/sentencepiece"]
path = examples/models/phi-3-mini/third-party/sentencepiece
url = https://github.com/google/sentencepiece.git
39 changes: 39 additions & 0 deletions examples/models/phi-3-mini/CMakeLists.txt
@@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)
project(phi_3_mini_runner)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)
set(CMAKE_BUILD_TYPE Release)

# Set options for executorch build.
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
option(EXECUTORCH_BUILD_XNNPACK "" ON)

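# Build ExecuTorch (the repository root, three directories up) and the
# sentencepiece submodule as part of this project.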
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../..
${CMAKE_BINARY_DIR}/../../..)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
${CMAKE_BINARY_DIR}/third-party/sentencepiece)

add_executable(phi_3_mini_runner main.cpp)
target_include_directories(
phi_3_mini_runner
PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src)
target_link_libraries(
phi_3_mini_runner
PRIVATE
executorch
extension_module_static
optimized_native_cpu_ops_lib
xnnpack_backend
sentencepiece)
28 changes: 28 additions & 0 deletions examples/models/phi-3-mini/README.md
@@ -0,0 +1,28 @@
# Summary
This example demonstrates how to run a [Phi-3-mini](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) 3.8B model via ExecuTorch. We use the XNNPACK backend to accelerate inference, together with XNNPACK symmetric per-channel quantization.

# Instructions
## Step 1: Setup
1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation, run `./install_requirements.sh --pybind xnnpack`.
2. Phi-3 Mini-128K-Instruct has been integrated into the development version (4.41.0.dev0) of transformers. Make sure you install transformers at version 4.41.0 or later: `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers`


## Step 2: Prepare and run the model
1. Download the `tokenizer.model` from Hugging Face.
```
cd examples/models/phi-3-mini
wget -O tokenizer.model https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true
```
2. Export the model. This step will take a few minutes to finish.
```
python export_model.py
```
3. Build and run the runner. It reads a prompt from stdin, prints the generated token ids as they are produced, and finally prints the decoded text.
```
mkdir cmake-out
cd cmake-out
cmake ..
cd ..
cmake --build cmake-out -j10
./cmake-out/phi_3_mini_runner
```
90 changes: 90 additions & 0 deletions examples/models/phi-3-mini/main.cpp
@@ -0,0 +1,90 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

// main.cpp

#include <iostream>

#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

#include "sentence_piece_tokenizer.h"

using namespace torch::executor;

// The value of the phi-3-mini `<|endoftext|>` token.
#define ENDOFTEXT_TOKEN 32000
#define VOCABULARY_SIZE 32064

// TODO(lunwenh): refactor and share with llama
void generate(
Module& llm_model,
std::string& prompt,
SentencePieceTokenizer& tokenizer,
size_t max_output_length) {
// Convert the input text into a list of integers (tokens) that represents
// it, using the string-to-token mapping that the model was trained on.
// Each token is an integer that represents a word or part of a word.
std::vector<int64_t> input_tokens = tokenizer.encode(prompt);

std::cout << "Generating tokens ..." << std::endl;

std::vector<int64_t> output_tokens;

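  // Note: this runner keeps no KV cache; each iteration feeds the entire
  // token sequence back through the model and recomputes all positions.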
for (size_t i = 0; i < max_output_length; i++) {
ManagedTensor tensor_tokens(
input_tokens.data(),
{1, static_cast<int>(input_tokens.size())},
ScalarType::Long);
std::vector<EValue> inputs = {tensor_tokens.get_aliasing_tensor()};

Result<std::vector<EValue>> result_evalue = llm_model.forward(inputs);

    const auto error = result_evalue.error();
    if (error != Error::Ok) {
      std::cerr << "Model forward failed." << std::endl;
      break;
    }
    Tensor logits_tensor = result_evalue.get()[0].toTensor();
const auto sentence_length = logits_tensor.size(1);
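    // Keep only the logits for the last position in the sequence; they score
    // every candidate next token.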
std::vector<float> logits(
logits_tensor.data_ptr<float>() +
(sentence_length - 1) * VOCABULARY_SIZE,
logits_tensor.data_ptr<float>() + sentence_length * VOCABULARY_SIZE);

    // Greedily pick the token with the highest logit (argmax decoding).
int64_t next_token =
std::max_element(logits.begin(), logits.end()) - logits.begin();

std::cout << next_token << "\t";
std::cout.flush();

// Break if we reached the end of the text.
if (next_token == ENDOFTEXT_TOKEN) {
break;
}

output_tokens.push_back(next_token);

// Update next input.
input_tokens.push_back(next_token);
}

std::cout << std::endl;
std::cout << tokenizer.decode(output_tokens) << std::endl;
}

int main() {
// Set up the prompt. This provides the seed text for the model to elaborate.
std::cout << "Enter model prompt: ";
std::string prompt;
std::getline(std::cin, prompt);

SentencePieceTokenizer tokenizer("tokenizer.model");

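  // Load the program exported by export_model.py; mlock the weights and
  // ignore failures if the pages cannot be locked.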
Module model("phi-3-mini.pte", Module::MlockConfig::UseMlockIgnoreErrors);

const auto max_output_tokens = 128;
generate(model, prompt, tokenizer, max_output_tokens);
}
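
A note on decoding: the loop above always picks the highest-scoring token (greedy argmax), which keeps the runner deterministic and dependency-free. For comparison, below is a minimal sketch of temperature sampling that could stand in for the `std::max_element` line; the helper name `sample_with_temperature` and its temperature handling are illustrative assumptions, not part of this PR.

```cpp
#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Hypothetical helper, not part of this PR: softmax the logits at a given
// temperature (must be > 0) and draw one token id from the distribution.
int64_t sample_with_temperature(const std::vector<float>& logits, float temperature) {
  // Subtract the max logit before exponentiating for numerical stability.
  const float max_logit = *std::max_element(logits.begin(), logits.end());
  std::vector<float> weights(logits.size());
  for (size_t i = 0; i < logits.size(); ++i) {
    weights[i] = std::exp((logits[i] - max_logit) / temperature);
  }
  static std::mt19937 rng{std::random_device{}()};
  // discrete_distribution normalizes the weights internally.
  std::discrete_distribution<int64_t> dist(weights.begin(), weights.end());
  return dist(rng);
}
```

As `temperature` approaches zero, the distribution concentrates on the argmax token, recovering the greedy behavior above.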
43 changes: 43 additions & 0 deletions examples/models/phi-3-mini/sentence_piece_tokenizer.h
@@ -0,0 +1,43 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <sstream>

#include <sentencepiece_processor.h>

// TODO(lunwenh): Add unit tests
class SentencePieceTokenizer {
public:
SentencePieceTokenizer(const std::string& filePath) {
const auto status = processor_.Load(filePath);
if (!status.ok()) {
std::ostringstream errorMessageStream;
errorMessageStream << "Failed to load SentencePiece model from "
<< filePath << " with error " << status.ToString();
throw std::runtime_error(errorMessageStream.str());
}
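    // Prepend the BOS token to every encoded sequence.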
processor_.SetEncodeExtraOptions("bos");
}

std::vector<int64_t> encode(const std::string& piece) {
std::vector<int> ids;
processor_.Encode(piece, &ids);
std::vector<int64_t> idsLong(ids.begin(), ids.end());
return idsLong;
}

std::string decode(const std::vector<int64_t>& ids) {
std::vector<int> idsInt(ids.begin(), ids.end());
std::string piece;
processor_.Decode(idsInt, &piece);
return piece;
}

private:
sentencepiece::SentencePieceProcessor processor_;
};
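
For reference, a minimal round-trip through this tokenizer might look like the sketch below. It assumes `tokenizer.model` has been downloaded into the working directory as described in the README; the printed ids depend on the prompt.

```cpp
#include <iostream>
#include <vector>

#include "sentence_piece_tokenizer.h"

int main() {
  SentencePieceTokenizer tokenizer("tokenizer.model");

  // encode() prepends BOS (see SetEncodeExtraOptions above) and returns the
  // token ids that the runner feeds to the model.
  std::vector<int64_t> ids = tokenizer.encode("Hello, world!");
  for (int64_t id : ids) {
    std::cout << id << " ";
  }
  std::cout << std::endl;

  // decode() maps the ids back to text; sentencepiece does not render
  // control tokens such as BOS.
  std::cout << tokenizer.decode(ids) << std::endl;
  return 0;
}
```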
1 change: 1 addition & 0 deletions examples/models/phi-3-mini/third-party/sentencepiece
Submodule sentencepiece added at 6225e0
