Initial NPU C++ Example (#12417)
* temp save
* meet review, update
* update
* meet review, add license
* typo
parent d2a37b6ab2
commit 7288c759ce
4 changed files with 339 additions and 0 deletions
CMakeLists.txt
@@ -0,0 +1,39 @@

cmake_minimum_required(VERSION 3.10)

project(LLM_NPU_EXAMPLE VERSION 1.0.0 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

if(DEFINED ENV{CONDA_ENV_DIR})
    set(ENV_DIR $ENV{CONDA_ENV_DIR})
    set(LIBRARY_DIR ${ENV_DIR}/bigdl-core-npu)
    include_directories(${LIBRARY_DIR}/include)
    set(DLL_DIR ${ENV_DIR}/intel_npu_acceleration_library/lib/Release)
else()
    set(LIBRARY_DIR ${CMAKE_CURRENT_SOURCE_DIR})
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
endif()

add_library(npu_llm STATIC IMPORTED)
set_target_properties(npu_llm PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/npu_llm.lib)

set(TARGET llm-npu-cli)
add_executable(${TARGET} llm-npu-cli.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE npu_llm)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

add_custom_command(TARGET llm-npu-cli POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${LIBRARY_DIR}/npu_llm.dll
        ${CMAKE_BINARY_DIR}/Release/
    COMMENT "Copying npu_llm.dll to build/Release\n"
)

add_custom_command(TARGET llm-npu-cli POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_directory
        ${DLL_DIR}/
        ${CMAKE_BINARY_DIR}/Release/
    COMMENT "Copying dependency to build/Release\n"
)
README.md
@@ -0,0 +1,92 @@

# C++ Example of running LLM on Intel NPU using IPEX-LLM (Experimental)
In this directory, you will find a C++ example of how to run LLM models on Intel NPUs using IPEX-LLM (leveraging *Intel NPU Acceleration Library*). See the table below for verified models.

## Verified Models

| Model | Model Link |
|------------|----------------------------------------------------------------|
| Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |

## 0. Requirements
To run this C++ example with IPEX-LLM on Intel NPUs, make sure you have installed the latest Intel NPU driver.
Go to https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html to download and unzip the driver.
Then go to **Device Manager** and find **Neural Processors** -> **Intel(R) AI Boost**.
Right-click and select **Update Driver** -> **Browse my computer for drivers**, then manually select the unzipped driver folder to install.

## 1. Install
### 1.1 Installation on Windows
We suggest using conda to manage the environment:
```cmd
conda create -n llm python=3.10
conda activate llm

:: install ipex-llm with 'npu' option
pip install --pre --upgrade ipex-llm[npu]

:: [optional] for Llama-3.2-1B-Instruct & Llama-3.2-3B-Instruct
pip install transformers==4.45.0 accelerate==0.33.0
```

## 2. Convert Model
We provide a [convert script](convert_model.py) in the current directory. Running it produces the weights and configuration files required by the C++ example.

```cmd
:: to convert Qwen2.5-7B-Instruct
python convert_model.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory <converted_model_path>
```

Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `Qwen/Qwen2.5-7B-Instruct`) to be downloaded, or the path to the huggingface checkpoint folder.
- `--save-directory SAVE_DIRECTORY`: argument defining the path to save the converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted model will be saved into `SAVE_DIRECTORY`.
- `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. Defaults to `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. Defaults to `960`.
- `--disable-transpose-value-cache`: argument to disable the optimization of transposing the value cache.

## 3. Build C++ Example `llm-npu-cli`

You can run the CMake commands below in a cmd prompt to build `llm-npu-cli`; don't forget to replace the conda environment directory with your own path.

```cmd
:: under current directory
:: please replace below conda env dir with your own path
set CONDA_ENV_DIR=C:\Users\arda\miniforge3\envs\llm\Lib\site-packages
mkdir build
cd build
cmake ..
cmake --build . --config Release -j
cd Release
```

## 4. Run `llm-npu-cli`

With the built `llm-npu-cli`, you can run the example with the specified parameters. For example:

```cmd
llm-npu-cli.exe -m <converted_model_path> -n 64 "AI是什么?"
```

Arguments info:
- `-m` : argument defining the path of the saved converted model.
- `-n` : argument defining how many tokens will be generated.
- The last argument is your input prompt.
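What the CLI does with these arguments is, in essence: load the converted model and tokenizer, apply the chat template, prefill the prompt, then decode token by token. The sketch below is condensed from `llm-npu-cli.cpp` in this directory (argument parsing, timing, and error handling omitted), so the helper names come from the example itself rather than from a general API:

```cpp
// Condensed from llm-npu-cli.cpp: load, tokenize, prefill, then decode one token at a time.
#include <iostream>
#include <string>
#include <vector>

#include "common.h"
#include "npu_llm.h"

int run_example(std::string model_dir, std::string prompt, int n_predict) {
    npu_model_params model_params;
    NPUModel* model = load_model_from_file(model_params, model_dir);   // converted model from step 2

    tokenizer_params tok_params;
    load_tokenizer(tok_params, model_dir);                             // tokenizer saved alongside it

    std::string full_prompt = add_chat_template(model_params, prompt); // wrap prompt in the chat template
    std::vector<int32_t> input_ids = llm_tokenize(full_prompt, false);

    std::vector<int32_t> output_ids;
    float* logits = run_prefill(model, input_ids);                     // prefill the whole prompt at once
    output_ids.push_back(llm_sample_token(logits, true, model_params));

    for (int i = 1; i < n_predict; i++) {                              // autoregressive decode loop
        auto step_logits = run_decode(model, output_ids[i - 1]);
        int32_t token = llm_sample_token(step_logits, true, model_params);
        if (token == tok_params.eos_token_id) break;
        output_ids.push_back(token);
    }

    std::cout << llm_decode(output_ids) << std::endl;                  // detokenize and print
    return 0;
}
```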
## 5. Sample Output
### [`Qwen/Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)
```cmd
Input:
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
AI是什么?<|im_end|>
<|im_start|>assistant


Prefill 22 tokens cost xxxx ms.
Output:
AI是"人工智能"的缩写,是英文"Artificial Intelligence"的翻译。它是研究如何使计算机也具有智能的一种技术和理论。简而言之,人工智能就是让计算机能够模仿人智能行为的一项技术。

Decode 46 tokens cost xxxx ms (avg xx.xx ms each token).
```
convert_model.py
@@ -0,0 +1,74 @@

#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


import torch
import argparse
from ipex_llm.transformers.npu_model import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers.utils import logging

logger = logging.get_logger(__name__)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert LLM for C++ NPU inference and save"
    )
    parser.add_argument(
        "--repo-id-or-model-path",
        type=str,
        default="Qwen/Qwen2.5-7B-Instruct",  # or Qwen2-7B-Instruct, Qwen2-1.5B-Instruct
        help="The huggingface repo id for the Qwen model to be downloaded"
        ", or the path to the huggingface checkpoint folder",
    )
    parser.add_argument("--save-directory", type=str,
                        required=True,
                        help="The folder path to save the converted model. "
                             "If the path does not exist, the low-bit model will be saved there; "
                             "otherwise, the program will raise an error.",
                        )
    parser.add_argument("--max-context-len", type=int, default=1024)
    parser.add_argument("--max-prompt-len", type=int, default=960)
    parser.add_argument("--quantization_group_size", type=int, default=0)
    parser.add_argument("--load_in_low_bit", type=str, default="sym_int4",
                        help="Low-bit format to load the model in")
    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

    args = parser.parse_args()
    model_path = args.repo_id_or_model_path
    save_dir = args.save_directory

    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 load_in_low_bit=args.load_in_low_bit,
                                                 max_context_len=args.max_context_len,
                                                 max_prompt_len=args.max_prompt_len,
                                                 quantization_group_size=args.quantization_group_size,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager",
                                                 transpose_value_cache=not args.disable_transpose_value_cache,
                                                 mixed_precision=True,
                                                 trust_remote_code=True,
                                                 compile_full_model=True,
                                                 save_directory=save_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.save_pretrained(save_dir)

    print("-" * 80)
    print(f"Finished saving the converted model to {save_dir}")
    print("Success, shutting down.")
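The folder written to `--save-directory` (the compiled low-bit model plus the tokenizer files saved above) is the same folder later passed to the C++ example via `-m`. As a small bridge sketch, reusing the `npu_llm.h` helpers that `llm-npu-cli.cpp` below already calls (`converted_model_path` is a placeholder):

```cpp
// Sketch: the C++ side loads exactly what convert_model.py saved into --save-directory.
#include <string>
#include "npu_llm.h"

void load_converted(std::string converted_model_path) {
    npu_model_params model_params;
    NPUModel* model = load_model_from_file(model_params, converted_model_path);  // compiled NPU model

    tokenizer_params tok_params;
    load_tokenizer(tok_params, converted_model_path);  // tokenizer written by tokenizer.save_pretrained(...)

    (void)model;  // ready for run_prefill / run_decode, as in llm-npu-cli.cpp
}
```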
llm-npu-cli.cpp
@@ -0,0 +1,134 @@

//
// Copyright 2016 The BigDL Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cstring>
#include <chrono>

#include "common.h"
#include "npu_llm.h"


static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n %s -m npu_model_dir [-n n_predict] [prompt]\n", argv[0]);
    printf("\n");
}


int main(int argc, char ** argv) {
    common_params params;

    // path to the npu model directory
    std::string model_dir;
    // prompt to generate text from
    std::string prompt = "AI是什么?";
    // number of tokens to predict
    int n_predict = 32;

    // parse command line arguments
    {
        int i = 1;
        for (; i < argc; i++) {
            if (strcmp(argv[i], "-m") == 0) {
                if (i + 1 < argc) {
                    model_dir = argv[++i];
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else if (strcmp(argv[i], "-n") == 0) {
                if (i + 1 < argc) {
                    try {
                        n_predict = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else {
                // prompt starts here
                break;
            }
        }
        if (model_dir.empty()) {
            print_usage(argc, argv);
            return 1;
        }
        if (i < argc) {
            prompt = argv[i++];
            for (; i < argc; i++) {
                prompt += " ";
                prompt += argv[i];
            }
        }
    }

    params.n_predict = n_predict;
    params.model = model_dir;
    params.prompt = prompt;

    npu_model_params model_params;
    NPUModel* model = load_model_from_file(model_params, params.model);

    tokenizer_params tok_params;
    load_tokenizer(tok_params, params.model);

    std::string full_prompt = add_chat_template(model_params, params.prompt);
    std::cout << "Input: " << std::endl;
    std::cout << full_prompt << std::endl;

    // tokenize input
    std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);

    std::vector<int32_t> embd;  // output ids
    auto start = std::chrono::high_resolution_clock::now();
    float* logits = run_prefill(model, embd_inp);
    int32_t token = llm_sample_token(logits, true, model_params);
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    printf("\nPrefill %zu tokens cost %lld ms.\n", embd_inp.size(), static_cast<long long>(duration.count()));
    embd.push_back(token);

    int token_nums = 0;
    start = std::chrono::high_resolution_clock::now();
    for (int i = 1; i < params.n_predict; i++) {
        auto logits = run_decode(model, embd[i-1]);
        int32_t token = llm_sample_token(logits, true, model_params);
        if (token != tok_params.eos_token_id) {
            embd.push_back(token);
            token_nums++;
        } else {
            break;
        }
    }
    end = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

    std::string output = llm_decode(embd);

    std::cout << "Output: " << std::endl;
    std::cout << output << std::endl;

    printf("\nDecode %d tokens cost %lld ms (avg %.2f ms each token).\n", token_nums, static_cast<long long>(duration.count()), (float)duration.count() / token_nums);

    return 0;
}
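A possible extension, shown only as a sketch and not part of the original example: streaming the generated text instead of decoding everything at the end. It reuses only helpers already used above (`run_decode`, `llm_sample_token`, `llm_decode`) and re-decodes the accumulated ids at each step to recover the newly added text; that incremental re-decoding is an assumption about how `llm_decode` behaves, not something the header guarantees.

```cpp
// Sketch: streaming variant of the decode loop above (not part of the original example).
// Assumes the same includes and helpers as llm-npu-cli.cpp; re-decodes the accumulated
// ids each step and prints only the newly appended suffix.
static void generate_streaming(NPUModel* model,
                               npu_model_params& model_params,
                               tokenizer_params& tok_params,
                               std::vector<int32_t>& embd,   // already holds the first prefilled token
                               int n_predict) {
    std::string printed;                                     // text emitted so far
    for (int i = 1; i < n_predict; i++) {
        auto logits = run_decode(model, embd[i - 1]);
        int32_t token = llm_sample_token(logits, true, model_params);
        if (token == tok_params.eos_token_id) {
            break;
        }
        embd.push_back(token);
        std::string so_far = llm_decode(embd);               // decode everything generated so far
        std::cout << so_far.substr(printed.size()) << std::flush;  // print only the new part
        printed = so_far;
    }
    std::cout << std::endl;
}
```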