From 7f8c5b410b3c4443535f534d9a15441f7f2e6d14 Mon Sep 17 00:00:00 2001 From: Shaojun Liu <61072813+liu-shaojun@users.noreply.github.com> Date: Tue, 14 May 2024 12:58:31 +0800 Subject: [PATCH] Quickstart: Run PyTorch Inference on Intel GPU using Docker (on Linux or WSL) (#10970) * add entrypoint.sh * add quickstart * remove entrypoint * update * Install related library of benchmarking * update * print out results * update docs * minor update * update * update quickstart * update * update * update * update * update * update * add chat & example section * add more details * minor update * rename quickstart * update * minor update * update * update config.yaml * update readme * use --gpu * add tips * minor update * update --- docker/llm/README.md | 51 +++-- docker/llm/inference/xpu/docker/Dockerfile | 13 +- docker/llm/inference/xpu/docker/benchmark.sh | 53 +++++ .../source/_templates/sidebar_quicklinks.html | 3 + .../docker_pytorch_inference_gpu.md | 210 ++++++++++++++++++ .../source/doc/LLM/Quickstart/index.rst | 1 + .../llm/dev/benchmark/all-in-one/config.yaml | 42 ++-- 7 files changed, 331 insertions(+), 42 deletions(-) create mode 100644 docker/llm/inference/xpu/docker/benchmark.sh create mode 100644 docs/readthedocs/source/doc/LLM/Quickstart/docker_pytorch_inference_gpu.md diff --git a/docker/llm/README.md b/docker/llm/README.md index 52cc64c6..bc7e0f54 100644 --- a/docker/llm/README.md +++ b/docker/llm/README.md @@ -159,9 +159,12 @@ Run the following command to pull image from dockerhub: docker pull intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT ``` -### 2. Start ipex-llm-xpu Docker Container +### 2. Start Chat Inference + +We provide `chat.py` for conversational AI. If your model is Llama-2-7b-chat-hf and mounted on /llm/models, you can execute the following command to initiate a conversation: To map the xpu into the container, you need to specify --device=/dev/dri when booting the container. + ```bash #/bin/bash export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT @@ -175,35 +178,43 @@ sudo docker run -itd \ --name=$CONTAINER_NAME \ --shm-size="16g" \ -v $MODEL_PATH:/llm/models \ - $DOCKER_IMAGE + $DOCKER_IMAGE bash -c "python chat.py --model-path /llm/models/Llama-2-7b-chat-hf" ``` -Access the container: -``` -docker exec -it $CONTAINER_NAME bash -``` -To verify the device is successfully mapped into the container, run `sycl-ls` to check the result. In a machine with Arc A770, the sampled output is: +### 3. Quick Performance Benchmark +Execute a quick performance benchmark by starting the ipex-llm-xpu container, specifying the model, test API, and device, then running the benchmark.sh script. + +To map the XPU into the container, specify `--device=/dev/dri` when booting the container. 
```bash
-root@arda-arc12:/# sycl-ls
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device 1.2 [2023.16.7.0.21_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i9-13900K 3.0 [2023.16.7.0.21_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics 3.0 [23.17.26241.33]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26241]
+#!/bin/bash
+export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT
+export CONTAINER_NAME=my_container
+export MODEL_PATH=/llm/models [change to your model path]
+
+sudo docker run -itd \
+        --net=host \
+        --device=/dev/dri \
+        --memory="32G" \
+        --name=$CONTAINER_NAME \
+        --shm-size="16g" \
+        -v $MODEL_PATH:/llm/models \
+        -e REPO_IDS="meta-llama/Llama-2-7b-chat-hf" \
+        -e TEST_APIS="transformer_int4_gpu" \
+        -e DEVICE=Arc \
+        $DOCKER_IMAGE /llm/benchmark.sh
 ```

-### 3. Start Inference
-**Chat Interface**: Use `chat.py` for conversational AI.
+Customize the environment variables to specify:

-For example, if your model is Llama-2-7b-chat-hf and mounted on /llm/models, you can excute the following command to initiate a conversation:
- ```bash
- cd /llm
- python chat.py --model-path /llm/models/Llama-2-7b-chat-hf
- ```
+- **REPO_IDS:** Model's organization and name, separated by commas if multiple values exist.
+- **TEST_APIS:** Test functions matching your machine, separated by commas if multiple values exist.
+- **DEVICE:** Type of device: Max, Flex, or Arc.

-To run inference using `IPEX-LLM` using xpu, you could refer to this [documentation](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU).
+**Result**
+Upon completion, you can obtain a CSV result file, whose contents are also printed out. You can mainly look at the `1st token avg latency (ms)` and `2+ avg latency (ms/token)` columns for the benchmark results (a sketch for copying these files out of the container is shown below).

 ## IPEX-LLM Serving on CPU

 FastChat is an open platform for training, serving, and evaluating large language model based chatbots. You can find the detailed information at their [homepage](https://github.com/lm-sys/FastChat).
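For the quick benchmark container above, the following is a minimal sketch for following the run and copying the CSV results back to the host. The `./benchmark-results` destination folder is only an illustrative name; the `/benchmark/all-in-one` path is where `benchmark.sh` in this patch runs `run.py`.

```bash
# Follow the benchmark run; benchmark.sh prints each CSV at the end
sudo docker logs -f $CONTAINER_NAME

# Copy the generated CSV result files back to the host
# (run.py writes them into /benchmark/all-in-one inside the container)
mkdir -p ./benchmark-results
sudo docker cp $CONTAINER_NAME:/benchmark/all-in-one/. ./benchmark-results/
```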
diff --git a/docker/llm/inference/xpu/docker/Dockerfile b/docker/llm/inference/xpu/docker/Dockerfile index b3269627..2862a463 100644 --- a/docker/llm/inference/xpu/docker/Dockerfile +++ b/docker/llm/inference/xpu/docker/Dockerfile @@ -9,6 +9,7 @@ ENV USE_XETLA=OFF ENV SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 COPY chat.py /llm/chat.py +COPY benchmark.sh /llm/benchmark.sh # Disable pip's cache behavior ARG PIP_NO_CACHE_DIR=false @@ -44,10 +45,20 @@ RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-P apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \ # Install related libary of chat.py pip install --upgrade colorama && \ + # Download all-in-one benchmark and examples + git clone https://github.com/intel-analytics/ipex-llm && \ + cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \ + cp -r ./ipex-llm/python/llm/example/GPU/HF-Transformers-AutoModels/Model ./examples && \ # Install vllm dependencies pip install --upgrade fastapi && \ pip install --upgrade "uvicorn[standard]" && \ # Download vLLM-Serving git clone https://github.com/intel-analytics/IPEX-LLM && \ cp -r ./IPEX-LLM/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \ - rm -rf ./IPEX-LLM + rm -rf ./IPEX-LLM && \ + # Install related library of benchmarking + pip install pandas && \ + pip install omegaconf && \ + chmod +x /llm/benchmark.sh + +WORKDIR /llm/ \ No newline at end of file diff --git a/docker/llm/inference/xpu/docker/benchmark.sh b/docker/llm/inference/xpu/docker/benchmark.sh new file mode 100644 index 00000000..06161d80 --- /dev/null +++ b/docker/llm/inference/xpu/docker/benchmark.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +echo "Repo ID is: $REPO_IDS" +echo "Test API is: $TEST_APIS" +echo "Device is: $DEVICE" + +cd /benchmark/all-in-one + +# Replace local_model_hub +sed -i "s/'path to your local model hub'/'\/llm\/models'/" config.yaml + +# Comment out repo_id +sed -i -E "/repo_id:/,/local_model_hub/ s/^(\s*-)/ #&/" config.yaml + +# Modify config.yaml with repo_id +if [ -n "$REPO_IDS" ]; then + for REPO_ID in $(echo "$REPO_IDS" | tr ',' '\n'); do + # Add each repo_id value as a subitem of repo_id list + sed -i -E "/^(repo_id:)/a \ - '$REPO_ID'" config.yaml + done +fi + +# Comment out test_api +sed -i -E "/test_api:/,/cpu_embedding/ s/^(\s*-)/ #&/" config.yaml + +# Modify config.yaml with test_api +if [ -n "$TEST_APIS" ]; then + for TEST_API in $(echo "$TEST_APIS" | tr ',' '\n'); do + # Add each test_api value as a subitem of test_api list + sed -i -E "/^(test_api:)/a \ - '$TEST_API'" config.yaml + done +fi + + +if [[ "$DEVICE" == "Arc" || "$DEVICE" == "ARC" ]]; then + source ipex-llm-init -g --device Arc + python run.py +elif [[ "$DEVICE" == "Flex" || "$DEVICE" == "FLEX" ]]; then + source ipex-llm-init -g --device Flex + python run.py +elif [[ "$DEVICE" == "Max" || "$DEVICE" == "MAX" ]]; then + source ipex-llm-init -g --device Max + python run.py +else + echo "Invalid DEVICE specified." +fi + +# print out results +for file in *.csv; do + echo "" + echo "filename: $file" + cat "$file" +done diff --git a/docs/readthedocs/source/_templates/sidebar_quicklinks.html b/docs/readthedocs/source/_templates/sidebar_quicklinks.html index b720c4f7..fd7f865b 100644 --- a/docs/readthedocs/source/_templates/sidebar_quicklinks.html +++ b/docs/readthedocs/source/_templates/sidebar_quicklinks.html @@ -28,6 +28,9 @@
   • Install IPEX-LLM in Docker on Windows with Intel GPU
+  • Run PyTorch Inference on Intel GPU using Docker (on Linux or WSL)
   • Run Local RAG using Langchain-Chatchat on Intel GPU
  • diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/docker_pytorch_inference_gpu.md b/docs/readthedocs/source/doc/LLM/Quickstart/docker_pytorch_inference_gpu.md new file mode 100644 index 00000000..7902bcde --- /dev/null +++ b/docs/readthedocs/source/doc/LLM/Quickstart/docker_pytorch_inference_gpu.md @@ -0,0 +1,210 @@ +# Run PyTorch Inference on Intel GPU using Docker (on Linux or WSL) + +We can run PyTorch Inference Benchmark, Chat Service and PyTorch Examples on Intel GPUs within Docker (on Linux or WSL). + +## Install Docker + +1. Linux Installation + + Follow the instructions in this [guide](https://www.docker.com/get-started/) to install Docker on Linux. + +2. Windows Installation + + For Windows installation, refer to this [guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/docker_windows_gpu.html#install-docker-on-windows). + +## Launch Docker + +Prepare ipex-llm-xpu Docker Image: +```bash +docker pull intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT +``` + +Start ipex-llm-xpu Docker Container: +```bash +export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT +export CONTAINER_NAME=my_container +export MODEL_PATH=/llm/models[change to your model path] + +docker run -itd \ + --net=host \ + --device=/dev/dri \ + --memory="32G" \ + --name=$CONTAINER_NAME \ + --shm-size="16g" \ + -v $MODEL_PATH:/llm/models \ + $DOCKER_IMAGE +``` + +Access the container: +``` +docker exec -it $CONTAINER_NAME bash +``` + +To verify the device is successfully mapped into the container, run `sycl-ls` to check the result. In a machine with Arc A770, the sampled output is: + +```bash +root@arda-arc12:/# sycl-ls +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device 1.2 [2023.16.7.0.21_160000] +[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i9-13900K 3.0 [2023.16.7.0.21_160000] +[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics 3.0 [23.17.26241.33] +[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26241] +``` + +```eval_rst +.. tip:: + + You can run the Env-Check script to verify your ipex-llm installation and runtime environment. + + .. code-block:: bash + + cd /ipex-llm/python/llm/scripts + bash env-check.sh + + +``` + +## Run Inference Benchmark + +Navigate to benchmark directory, and modify the `config.yaml` under the `all-in-one` folder for benchmark configurations. +```bash +cd /benchmark/all-in-one +vim config.yaml +``` + +**Modify config.yaml** +```eval_rst +.. note:: + + ``dtype``: The model is originally loaded in this data type. After ipex-llm conversion, all the non-linear layers remain to use this data type. + + ``qtype``: ipex-llm will convert all the linear-layers' weight to this data type. +``` + + +```yaml +repo_id: + # - 'THUDM/chatglm2-6b' + - 'meta-llama/Llama-2-7b-chat-hf' + # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now +local_model_hub: 'path to your local model hub' +warm_up: 1 # must set >=2 when run "pipeline_parallel_gpu" test_api +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) +batch_size: 1 # default to 1 +in_out_pairs: + - '32-32' + - '1024-128' +test_api: + - "transformer_int4_gpu" # on Intel GPU, transformer-like API, (qtype=int4) + # - "transformer_int4_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4) + # - "transformer_int4_fp16_gpu" # on Intel GPU, transformer-like API, (qtype=int4), (dtype=fp16) + # - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), (dtype=fp16) + # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), use load_low_bit API. Please make sure you have used the save.py to save the converted low bit model + # - "ipex_fp16_gpu" # on Intel GPU, use native transformers API, (dtype=fp16) + # - "bigdl_fp16_gpu" # on Intel GPU, use ipex-llm transformers API, (dtype=fp16), (qtype=fp16) + # - "optimize_model_gpu" # on Intel GPU, can optimize any pytorch models include transformer model + # - "deepspeed_optimize_model_gpu" # on Intel GPU, deepspeed autotp inference + # - "pipeline_parallel_gpu" # on Intel GPU, pipeline parallel inference + # - "speculative_gpu" # on Intel GPU, inference with self-speculative decoding + # - "transformer_int4" # on Intel CPU, transformer-like API, (qtype=int4) + # - "native_int4" # on Intel CPU + # - "optimize_model" # on Intel CPU, can optimize any pytorch models include transformer model + # - "pytorch_autocast_bf16" # on Intel CPU + # - "transformer_autocast_bf16" # on Intel CPU + # - "bigdl_ipex_bf16" # on Intel CPU, (qtype=bf16) + # - "bigdl_ipex_int4" # on Intel CPU, (qtype=int4) + # - "bigdl_ipex_int8" # on Intel CPU, (qtype=int8) + # - "speculative_cpu" # on Intel CPU, inference with self-speculative decoding + # - "deepspeed_transformer_int4_cpu" # on Intel CPU, deepspeed autotp inference +cpu_embedding: False # whether put embedding to CPU +streaming: False # whether output in streaming way (only avaiable now for gpu win related test_api) +use_fp16_torch_dtype: True # whether use fp16 for non-linear layer (only avaiable now for "pipeline_parallel_gpu" test_api) +n_gpu: 2 # number of GPUs to use (only avaiable now for "pipeline_parallel_gpu" test_api) +``` + +Some parameters in the yaml file that you can configure: + + +- `repo_id`: The name of the model and its organization. +- `local_model_hub`: The folder path where the models are stored on your machine. Replace 'path to your local model hub' with /llm/models. +- `warm_up`: The number of warmup trials before performance benchmarking (must set to >= 2 when using "pipeline_parallel_gpu" test_api). +- `num_trials`: The number of runs for performance benchmarking (the final result is the average of all trials). +- `low_bit`: The low_bit precision you want to convert to for benchmarking. +- `batch_size`: The number of samples on which the models make predictions in one forward pass. +- `in_out_pairs`: Input sequence length and output sequence length combined by '-'. +- `test_api`: Different test functions for different machines. +- `cpu_embedding`: Whether to put embedding on CPU (only available for windows GPU-related test_api). +- `streaming`: Whether to output in a streaming way (only available for GPU Windows-related test_api). +- `use_fp16_torch_dtype`: Whether to use fp16 for the non-linear layer (only available for "pipeline_parallel_gpu" test_api). +- `n_gpu`: Number of GPUs to use (only available for "pipeline_parallel_gpu" test_api). + + +```eval_rst +.. 
note::

+   If you want to benchmark the performance without warmup, you can set ``warm_up: 0`` and ``num_trials: 1`` in ``config.yaml``, and run each model and in_out_pair separately.
+```
+
+
+After configuring `config.yaml`, run the following commands:
+```bash
+source ipex-llm-init --gpu --device [Arc|Flex|Max]
+python run.py
+```
+
+
+**Result**
+
+After the benchmarking is completed, you can obtain a CSV result file under the current folder. You can mainly look at the `1st token avg latency (ms)` and `2+ avg latency (ms/token)` columns for the benchmark results. You can also check whether the `actual input/output tokens` column is consistent with the `input/output tokens` column and whether the parameters you specified in `config.yaml` have been successfully applied in the benchmarking.
+
+
+## Run Chat Service
+
+We provide `chat.py` for conversational AI.
+
+For example, if your model is Llama-2-7b-chat-hf and mounted on /llm/models, you can execute the following command to initiate a conversation:
+  ```bash
+  cd /llm
+  python chat.py --model-path /llm/models/Llama-2-7b-chat-hf
+  ```
+
+Here is a demonstration:
+
+*(demo video of the chat service)*
+
+## Run PyTorch Examples
+
+We provide several PyTorch examples that apply IPEX-LLM INT4 optimizations to models on Intel GPUs.
+
+For example, if your model is Llama-2-7b-chat-hf and mounted on /llm/models, you can navigate to the /examples/llama2 directory and execute the following command to run the example:
+  ```bash
+  cd /examples/llama2
+  python ./generate.py --repo-id-or-model-path /llm/models/Llama-2-7b-chat-hf --prompt PROMPT --n-predict N_PREDICT
+  ```
+
+Arguments info:
+- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the Hugging Face repo id of the Llama 2 model to be downloaded (e.g. `meta-llama/Llama-2-7b-chat-hf` or `meta-llama/Llama-2-13b-chat-hf`), or the path to the Hugging Face checkpoint folder. It defaults to `'meta-llama/Llama-2-7b-chat-hf'`.
+- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It defaults to `'What is AI?'`.
+- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
+
+**Sample Output**
+```log
+Inference time: xxxx s
+-------------------- Prompt --------------------
+[INST] <<SYS>>
+
+<</SYS>>
+
+What is AI? [/INST]
+-------------------- Output --------------------
+[INST] <<SYS>>
+
+<</SYS>>
+
+What is AI? [/INST] Artificial intelligence (AI) is the broader field of research and development aimed at creating machines that can perform tasks that typically require human intelligence,
+```
\ No newline at end of file
diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/index.rst b/docs/readthedocs/source/doc/LLM/Quickstart/index.rst
index a5c58771..472c352e 100644
--- a/docs/readthedocs/source/doc/LLM/Quickstart/index.rst
+++ b/docs/readthedocs/source/doc/LLM/Quickstart/index.rst
@@ -12,6 +12,7 @@ This section includes efficient guide to show you how to:
 * `Install IPEX-LLM on Linux with Intel GPU <./install_linux_gpu.html>`_
 * `Install IPEX-LLM on Windows with Intel GPU <./install_windows_gpu.html>`_
 * `Install IPEX-LLM in Docker on Windows with Intel GPU <./docker_windows_gpu.html>`_
+* `Run PyTorch Inference on Intel GPU using Docker (on Linux or WSL) <./docker_pytorch_inference_gpu.html>`_
 * `Run Performance Benchmarking with IPEX-LLM <./benchmark_quickstart.html>`_
 * `Run Local RAG using Langchain-Chatchat on Intel GPU <./chatchat_quickstart.html>`_
 * `Run Text Generation WebUI on Intel GPU <./webui_quickstart.html>`_
diff --git a/python/llm/dev/benchmark/all-in-one/config.yaml b/python/llm/dev/benchmark/all-in-one/config.yaml
index 30227b0e..9df48a01 100644
--- a/python/llm/dev/benchmark/all-in-one/config.yaml
+++ b/python/llm/dev/benchmark/all-in-one/config.yaml
@@ -12,27 +12,27 @@ in_out_pairs:
   - '32-32'
   - '1024-128'
 test_api:
-  - "transformer_int4_gpu"                  # on Intel GPU
-  # - "transformer_int4_fp16_gpu"           # on Intel GPU, use fp16 for non-linear layer
-  # - "ipex_fp16_gpu"                       # on Intel GPU
-  # - "bigdl_fp16_gpu"                      # on Intel GPU
-  # - "optimize_model_gpu"                  # on Intel GPU
-  # - "transformer_int4_gpu_win"            # on Intel GPU for Windows
-  # - "transformer_int4_fp16_gpu_win"       # on Intel GPU for Windows, use fp16 for non-linear layer
-  # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows using load_low_bit API.
Please make sure you have used the save.py to save the converted low bit model - # - "deepspeed_optimize_model_gpu" # deepspeed autotp on Intel GPU - # - "pipeline_parallel_gpu" # pipeline parallel inference on Intel GPU - # - "speculative_gpu" - # - "transformer_int4" - # - "native_int4" - # - "optimize_model" - # - "pytorch_autocast_bf16" - # - "transformer_autocast_bf16" - # - "bigdl_ipex_bf16" - # - "bigdl_ipex_int4" - # - "bigdl_ipex_int8" - # - "speculative_cpu" - # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server + - "transformer_int4_gpu" # on Intel GPU, transformer-like API, (qtype=int4) + # - "transformer_int4_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4) + # - "transformer_int4_fp16_gpu" # on Intel GPU, transformer-like API, (qtype=int4), (dtype=fp16) + # - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), (dtype=fp16) + # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), use load_low_bit API. Please make sure you have used the save.py to save the converted low bit model + # - "ipex_fp16_gpu" # on Intel GPU, use native transformers API, (dtype=fp16) + # - "bigdl_fp16_gpu" # on Intel GPU, use ipex-llm transformers API, (dtype=fp16), (qtype=fp16) + # - "optimize_model_gpu" # on Intel GPU, can optimize any pytorch models include transformer model + # - "deepspeed_optimize_model_gpu" # on Intel GPU, deepspeed autotp inference + # - "pipeline_parallel_gpu" # on Intel GPU, pipeline parallel inference + # - "speculative_gpu" # on Intel GPU, inference with self-speculative decoding + # - "transformer_int4" # on Intel CPU, transformer-like API, (qtype=int4) + # - "native_int4" # on Intel CPU + # - "optimize_model" # on Intel CPU, can optimize any pytorch models include transformer model + # - "pytorch_autocast_bf16" # on Intel CPU + # - "transformer_autocast_bf16" # on Intel CPU + # - "bigdl_ipex_bf16" # on Intel CPU, (qtype=bf16) + # - "bigdl_ipex_int4" # on Intel CPU, (qtype=int4) + # - "bigdl_ipex_int8" # on Intel CPU, (qtype=int8) + # - "speculative_cpu" # on Intel CPU, inference with self-speculative decoding + # - "deepspeed_transformer_int4_cpu" # on Intel CPU, deepspeed autotp inference cpu_embedding: False # whether put embedding to CPU streaming: False # whether output in streaming way (only avaiable now for gpu win related test_api) use_fp16_torch_dtype: True # whether use fp16 for non-linear layer (only avaiable now for "pipeline_parallel_gpu" test_api)
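For reference, the `test_api` entries above are consumed by the all-in-one benchmark. A minimal manual equivalent of `/llm/benchmark.sh` inside the ipex-llm-xpu container is sketched below; the `Arc` device and the `/llm/models` mount follow the examples used earlier in this patch.

```bash
# Minimal manual equivalent of /llm/benchmark.sh inside the ipex-llm-xpu container
cd /benchmark/all-in-one

# Point local_model_hub at the mounted model folder (same sed used by benchmark.sh);
# repo_id: and test_api: can be edited by hand in config.yaml
sed -i "s/'path to your local model hub'/'\/llm\/models'/" config.yaml

# Set up the GPU runtime environment for your device (Arc here; Flex and Max also supported)
source ipex-llm-init -g --device Arc

# Run the benchmark; a CSV result file is written to the current folder
python run.py

# Print the results
cat *.csv
```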