LLM: Add CPU vLLM entrypoint (#11083)

Add CPU vLLM entrypoint and update CPU vLLM serving example.
Xiangyu Tian 2024-05-24 09:16:59 +08:00 committed by GitHub
parent 7ed270a4d8
commit b3f6faa038
28 changed files with 1288 additions and 277 deletions

View file

@ -7,20 +7,31 @@ ARG TINI_VERSION=v0.18.0
# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
COPY ./entrypoint.sh /opt/entrypoint.sh
COPY ./model_adapter.py.patch /llm/model_adapter.py.patch
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
# Install Serving Dependencies
RUN cd /llm && \
apt-get update && \
apt-get install -y wrk && \
pip install --pre --upgrade ipex-llm[serving] && \
# Fix Trivy CVE Issues
pip install Jinja2==3.1.3 transformers==4.36.2 gradio==4.19.2 cryptography==42.0.4 && \
# Fix Qwen model adapter in fastchat
patch /usr/local/lib/python3.11/dist-packages/fastchat/model/model_adapter.py < /llm/model_adapter.py.patch && \
chmod +x /opt/entrypoint.sh && \
chmod +x /sbin/tini && \
cp /sbin/tini /usr/bin/tini
cp /sbin/tini /usr/bin/tini && \
# Install vllm
git clone https://github.com/vllm-project/vllm.git && \
cd ./vllm && \
git checkout v0.4.2 && \
pip install wheel packaging ninja "setuptools>=49.4.0" numpy && \
pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu && \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
ADD ./vllm_offline_inference.py /llm/
ADD ./payload-1024.lua /llm/
ADD ./start-vllm-service.sh /llm/
ADD ./benchmark_vllm_throughput.py /llm/
ADD ./start-fastchat-service.sh /llm/
WORKDIR /llm/
ENTRYPOINT [ "/opt/entrypoint.sh" ]

View file

@ -95,3 +95,61 @@ curl -X POST -H "Content-Type: application/json" -d '{
"stream": false
}' http://localhost:8000/v1/completions
```
#### vLLM serving engine
To run the vLLM engine with `IPEX-LLM` as the backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/GPU/vLLM-Serving/README.md).
We have included multiple example files in `/llm/` (see the usage sketch after this list):
1. `vllm_offline_inference.py`: Used for the vLLM offline inference example
2. `benchmark_vllm_throughput.py`: Used for benchmarking throughput
3. `payload-1024.lua`: Used for testing requests per second with 1024-token-input/128-token-output requests
4. `start-vllm-service.sh`: Template script for starting the vLLM service
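A minimal usage sketch for the offline example and the service script is shown below, assuming you have already replaced the placeholder model path and served model name inside each file with your own:
```bash
cd /llm
# Offline inference: edit the model path in vllm_offline_inference.py first, then run it.
python3 vllm_offline_inference.py
# Online serving: edit the model path and served model name in start-vllm-service.sh, then start the service.
bash start-vllm-service.sh
```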
##### Online benchmark through api_server
We can benchmark the api_server to get an estimate of TPS (transactions per second). To do so, you need to start the service first according to the instructions in this [section](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/GPU/vLLM-Serving/README.md#service).
In the container, do the following:
1. Modify `/llm/payload-1024.lua` so that the "model" attribute matches your served model. By default, we use a prompt that is roughly 1024 tokens long; you can change it if needed.
2. Start the benchmark with `wrk` using the script below:
```bash
cd /llm
# You can change -t and -c to control the concurrency.
# By default, we use 4 connections to benchmark the service.
wrk -t4 -c4 -d15m -s payload-1024.lua http://localhost:8000/v1/completions --timeout 1h
```
#### Offline benchmark through benchmark_vllm_throughput.py
We have included the benchmark_throughput script provided by `vllm` in our image as `/llm/benchmark_vllm_throughput.py`. To use the benchmark_throughput script, you will need to download the test dataset first:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
The full example looks like this:
```bash
cd /llm/
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
export MODEL="YOUR_MODEL"
# You can set load-in-low-bit to one of [sym_int4, fp8, fp16]
python3 /llm/benchmark_vllm_throughput.py \
--backend vllm \
--dataset /llm/ShareGPT_V3_unfiltered_cleaned_split.json \
--model $MODEL \
--num-prompts 1000 \
--seed 42 \
--trust-remote-code \
--enforce-eager \
--dtype bfloat16 \
--device cpu \
--load-in-low-bit sym_int4
```

View file

@ -0,0 +1,357 @@
"""Benchmark offline inference throughput."""
import argparse
import json
import random
import time
from typing import List, Optional, Tuple
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from tqdm import tqdm
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids
completions = [completion for _, completion in dataset]
completion_token_ids = tokenizer(completions).input_ids
tokenized_dataset = []
for i in range(len(dataset)):
output_len = len(completion_token_ids[i])
if fixed_output_len is not None:
output_len = fixed_output_len
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
# Filter out too long sequences.
filtered_dataset: List[Tuple[str, int, int]] = []
for prompt, prompt_token_ids, output_len in tokenized_dataset:
prompt_len = len(prompt_token_ids)
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
# Sample the requests.
sampled_requests = random.sample(filtered_dataset, num_requests)
return sampled_requests
def run_vllm(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
device: str,
enable_prefix_caching: bool,
gpu_memory_utilization: float = 0.9,
load_in_low_bit: str = "sym_int4",
) -> float:
from vllm import SamplingParams
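# IPEXLLMClass wraps vLLM's LLM class and adds the load_in_low_bit option so
# IPEX-LLM low-bit optimization is applied when the model is loaded.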
from ipex_llm.vllm.cpu.engine import IPEXLLMClass as LLM
llm = LLM(model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
device=device,
enable_prefix_caching=enable_prefix_caching,
load_in_low_bit=load_in_low_bit)
# Add the requests to the engine.
for prompt, _, output_len in requests:
sampling_params = SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
)
# FIXME(woosuk): Do not use internal method.
llm._add_request(
prompt=prompt,
prompt_token_ids=None,
sampling_params=sampling_params,
)
start = time.perf_counter()
# FIXME(woosuk): Do not use internal method.
llm._run_engine(use_tqdm=True)
end = time.perf_counter()
return end - start
def run_hf(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
use_beam_search: bool,
max_batch_size: int,
trust_remote_code: bool,
) -> float:
assert not use_beam_search
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
llm = llm.cuda()
pbar = tqdm(total=len(requests))
start = time.perf_counter()
batch: List[str] = []
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
prompt, prompt_len, output_len = requests[i]
# Add the prompt to the batch.
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch.
_, next_prompt_len, next_output_len = requests[i + 1]
if (max(max_prompt_len, next_prompt_len) +
max(max_output_len, next_output_len)) <= 2048:
# We can add more requests to the batch.
continue
# Generate the sequences.
input_ids = tokenizer(batch, return_tensors="pt",
padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=not use_beam_search,
num_return_sequences=n,
temperature=1.0,
top_p=1.0,
use_cache=True,
max_new_tokens=max_output_len,
)
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
pbar.update(len(batch))
# Clear the batch.
batch = []
max_prompt_len = 0
max_output_len = 0
end = time.perf_counter()
return end - start
def run_mii(
requests: List[Tuple[str, int, int]],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import pipeline
llm = pipeline(model, tensor_parallel=tensor_parallel_size)
prompts = [prompt for prompt, _, _ in requests]
start = time.perf_counter()
llm(prompts, max_new_tokens=output_len)
end = time.perf_counter()
return end - start
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len)
for _ in range(args.num_prompts)]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
if args.backend == "vllm":
elapsed_time = run_vllm(
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype, args.device,
args.enable_prefix_caching, args.gpu_memory_utilization, args.load_in_low_bit)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.use_beam_search, args.hf_max_batch_size,
args.trust_remote_code)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
for _, prompt_len, output_len in requests)
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
default="vllm")
parser.add_argument("--dataset",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=['awq', 'gptq', 'squeezellm', None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-prompts",
type=int,
default=1000,
help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.")
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument("--enforce-eager",
action="store_true",
help="enforce eager execution")
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8_e5m2"],
default="auto",
help=
'Data type for kv cache storage. If "auto", will use model data type.')
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda", "xpu"],
help='device type for vLLM execution, supporting CUDA only currently.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument(
"--load-in-low-bit",
type=str,
choices=["sym_int4", "fp8", "fp16"],
default="sym_int4",
help="Low-bit format quantization with IPEX-LLM")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf":
if args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.use_beam_search:
raise ValueError("Beam search is not supported for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII "
"backend.")
main(args)

View file

@ -1,232 +0,0 @@
#!/bin/bash
usage() {
echo "Usage: $0 [-m --mode <controller|worker>] [-h --help] [-w --worker <model_worker|vllm_worker>]"
echo "-h: Print help message."
echo "Controller mode reads the following env:"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "API_HOST (default: localhost)."
echo "API_PORT (default: 8000)."
echo "Worker mode reads the following env:"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "WORKER_HOST (default: localhost)."
echo "WORKER_PORT (default: 21002)."
echo "MODEL_PATH (default: empty)."
echo "STREAM_INTERVAL (default: 1)."
exit 1
}
# Acquire correct core_nums if using cpuset-cpus, return -1 if file not exist
calculate_total_cores() {
local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"
if [[ -f "$cpuset_file" ]]; then
local cpuset_cpus=$(cat "$cpuset_file")
cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
local total_cores=0
IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
for cpu in "${cpu_list[@]}"; do
if [[ $cpu =~ - ]]; then
# Range of CPUs
local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
local range_cores=$((end_cpu - start_cpu + 1))
total_cores=$((total_cores + range_cores))
else
# Single CPU
total_cores=$((total_cores + 1))
fi
done
echo $total_cores
return
fi
# Kubernetes core-binding will use this file
cpuset_file="/sys/fs/cgroup/cpuset.cpus"
if [[ -f "$cpuset_file" ]]; then
local cpuset_cpus=$(cat "$cpuset_file")
cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
local total_cores=0
IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
for cpu in "${cpu_list[@]}"; do
if [[ $cpu =~ - ]]; then
# Range of CPUs
local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
local range_cores=$((end_cpu - start_cpu + 1))
total_cores=$((total_cores + range_cores))
else
# Single CPU
total_cores=$((total_cores + 1))
fi
done
echo $total_cores
return
else
echo -1
return
fi
}
# Default values
controller_host="localhost"
controller_port="21001"
gradio_port="8002"
api_host="localhost"
api_port="8000"
worker_host="localhost"
worker_port="21002"
model_path=""
mode=""
omp_num_threads=""
dispatch_method="shortest_queue" # shortest_queue or lottery
stream_interval=1
worker_type="model_worker"
# Update rootCA config if needed
update-ca-certificates
# Remember the value of `OMP_NUM_THREADS`:
if [[ -n "${OMP_NUM_THREADS}" ]]; then
omp_num_threads="${OMP_NUM_THREADS}"
fi
# We do not have any arguments, just run bash
if [ "$#" == 0 ]; then
echo "[INFO] no command is passed in"
echo "[INFO] enter pass-through mode"
exec /usr/bin/tini -s -- "bash"
else
# Parse command-line options
options=$(getopt -o "m:hw:" --long "mode:,help,worker:" -n "$0" -- "$@")
if [ $? != 0 ]; then
usage
fi
eval set -- "$options"
while true; do
case "$1" in
-m|--mode)
mode="$2"
[[ $mode == "controller" || $mode == "worker" ]] || usage
shift 2
;;
-w|--worker)
worker_type="$2"
[[ $worker_type == "model_worker" || $worker_type == "vllm_worker" ]] || usage
shift 2
;;
-h|--help)
usage
;;
--)
shift
break
;;
*)
usage
;;
esac
done
if [ "$worker_type" == "model_worker" ]; then
worker_type="ipex_llm.serving.model_worker"
elif [ "$worker_type" == "vllm_worker" ]; then
worker_type="ipex_llm.serving.vllm_worker"
fi
if [[ -n $CONTROLLER_HOST ]]; then
controller_host=$CONTROLLER_HOST
fi
if [[ -n $CONTROLLER_PORT ]]; then
controller_port=$CONTROLLER_PORT
fi
if [[ -n $API_HOST ]]; then
api_host=$API_HOST
fi
if [[ -n $API_PORT ]]; then
api_port=$API_PORT
fi
if [[ -n $GRADIO_PORT ]]; then
gradio_port=$GRADIO_PORT
fi
if [[ -n $WORKER_HOST ]]; then
worker_host=$WORKER_HOST
fi
if [[ -n $WORKER_PORT ]]; then
worker_port=$WORKER_PORT
fi
if [[ -n $MODEL_PATH ]]; then
model_path=$MODEL_PATH
fi
if [[ -n $DISPATCH_METHOD ]]; then
dispatch_method=$DISPATCH_METHOD
fi
if [[ -n $STREAM_INTERVAL ]]; then
stream_interval=$STREAM_INTERVAL
fi
controller_address="http://$controller_host:$controller_port"
# Execute logic based on options
if [[ $mode == "controller" ]]; then
# Logic for controller mode
# Boot Controller
api_address="http://$api_host:$api_port"
echo "Controller address: $controller_address"
echo "OpenAI API address: $api_address"
python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
# Boot openai api server
python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address &
# Boot gradio_web_server
python3 -m fastchat.serve.gradio_web_server --host $controller_host --port $gradio_port --controller-url $controller_address --model-list-mode reload
else
# Logic for non-controller(worker) mode
worker_address="http://$worker_host:$worker_port"
# Apply optimizations from ipex-llm
source ipex-llm-init -t
# First check if user have set OMP_NUM_THREADS by themselves
if [[ -n "${omp_num_threads}" ]]; then
echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
export OMP_NUM_THREADS=$omp_num_threads
else
# Use calculate_total_cores to acquire cpuset settings
# Set OMP_NUM_THREADS to correct numbers
cores=$(calculate_total_cores)
if [[ $cores == -1 || $cores == 0 ]]; then
echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
else
echo "Setting OMP_NUM_THREADS to $cores"
export OMP_NUM_THREADS=$cores
fi
fi
if [[ -z "${model_path}" ]]; then
echo "Please set env MODEL_PATH used for worker"
usage
fi
echo "Worker type: $worker_type"
echo "Worker address: $worker_address"
echo "Controller address: $controller_address"
if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
fi
fi
fi
exec /usr/bin/bash -s -- "bash"

View file

@ -0,0 +1,20 @@
wrk.method = "POST"
wrk.headers["Content-Type"] = "application/json"
wrk.body = [[
{
"model": "llama2",
"prompt": "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. However, her parents were always telling her to stay close to home, to be careful, and to avoid any danger. But the little girl was stubborn, and she wanted to see what was on the other side of the mountain. So she sneaked out of the house one night, leaving a note for her parents, and set off on her journey. As she climbed the mountain, the little girl felt a sense of excitement and wonder. She had never been this far away from home before, and she couldnt wait to see what she would find on the other side. She climbed higher and higher, her lungs burning from the thin air, until she finally reached the top of the mountain. And there, she found a beautiful meadow filled with wildflowers and a sparkling stream. The little girl danced and played in the meadow, feeling free and alive. She knew she had to return home eventually, but for now, she was content to enjoy her adventure. As the sun began to set, the little girl reluctantly made her way back down the mountain, but she knew that she would never forget her adventure and the joy of discovering something new and exciting. And whenever she felt scared or unsure, she would remember the thrill of climbing the mountain and the beauty of the meadow on the other side, and she would know that she could face any challenge that came her way, with courage and determination. She carried the memories of her journey in her heart, a constant reminder of the strength she possessed. The little girl returned home to her worried parents, who had discovered her note and anxiously awaited her arrival. They scolded her for disobeying their instructions and venturing into the unknown. But as they looked into her sparkling eyes and saw the glow on her face, their anger softened. They realized that their little girl had grown, that she had experienced something extraordinary. The little girl shared her tales of the mountain and the meadow with her parents, painting vivid pictures with her words. She spoke of the breathtaking view from the mountaintop, where the world seemed to stretch endlessly before her. She described the delicate petals of the wildflowers, vibrant hues that danced in the gentle breeze. And she recounted the soothing melody of the sparkling stream, its waters reflecting the golden rays of the setting sun. Her parents listened intently, captivated by her story. They realized that their daughter had discovered a part of herself on that journey—a spirit of curiosity and a thirst for exploration. They saw that she had learned valuable lessons about independence, resilience, and the beauty that lies beyond ones comfort zone. From that day forward, the little girls parents encouraged her to pursue her dreams and embrace new experiences. They understood that while there were risks in the world, there were also rewards waiting to be discovered. They supported her as she continued to embark on adventures, always reminding her to stay safe but never stifling her spirit. As the years passed, the little girl grew into a remarkable woman, fearlessly exploring the world and making a difference wherever she went. The lessons she had learned on that fateful journey stayed with her, guiding her through challenges and inspiring her to live life to the fullest. 
And so, the once timid little girl became a symbol of courage and resilience, a reminder to all who knew her that the greatest joys in life often lie just beyond the mountains we fear to climb. Her story spread far and wide, inspiring others to embrace their own journeys and discover the wonders that awaited them. In the end, the little girls adventure became a timeless tale, passed down through generations, reminding us all that sometimes, the greatest rewards come to those who dare to step into the unknown and follow their hearts. With each passing day, the little girls story continued to inspire countless individuals, igniting a spark within their souls and encouraging them to embark on their own extraordinary adventures. The tale of her bravery and determination resonated deeply with people from all walks of life, reminding them of the limitless possibilities that awaited them beyond the boundaries of their comfort zones. People marveled at the little girls unwavering spirit and her unwavering belief in the power of dreams. They saw themselves reflected in her journey, finding solace in the knowledge that they too could overcome their fears and pursue their passions. The little girl's story became a beacon of hope, a testament to the human spirit",
"max_tokens": 128,
"temperature": 0.5,
"n": 1,
"use_beam_search": false
}
]]
logfile = io.open("wrk.log", "w");
response = function(status, header, body)
logfile:write("status:" .. status .. "\n" .. body .. "\n-------------------------------------------------\n");
end

View file

@ -0,0 +1,125 @@
#!/bin/bash
usage() {
echo "Usage: $0 [-w --worker <model_worker|vllm_worker>] [--help]"
echo "--help: Print help message."
echo "The following environment variables can be set."
echo "MODEL_PATH (default: empty)."
echo "LOW_BIT_FORMAT (default: sym_int4)"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "WORKER_HOST (default: localhost)."
echo "WORKER_PORT (default: 21002)."
echo "API_HOST (default: localhost)."
echo "API_PORT (default: 8000)."
exit 1
}
# Default values
controller_host="localhost"
controller_port="21001"
worker_host="localhost"
worker_port="21002"
api_host="localhost"
api_port="8000"
model_path=""
mode=""
dispatch_method="shortest_queue" # shortest_queue or lottery
stream_interval=1
worker_type="model_worker"
low_bit_format="sym_int4"
# We do not have any arguments, just run bash
# Parse command-line options
options=$(getopt -o "hw:" --long "help,worker:" -n "$0" -- "$@")
if [ $? != 0 ]; then
usage
fi
eval set -- "$options"
while true; do
case "$1" in
-w|--worker)
worker_type="$2"
[[ $worker_type == "model_worker" || $worker_type == "vllm_worker" ]] || usage
shift 2
;;
-h|--help)
usage
;;
--)
shift
break
;;
*)
usage
;;
esac
done
if [ "$worker_type" == "model_worker" ]; then
worker_type="ipex_llm.serving.fastchat.ipex_llm_worker"
elif [ "$worker_type" == "vllm_worker" ]; then
worker_type="ipex_llm.serving.fastchat.vllm_worker"
fi
if [[ -n $CONTROLLER_HOST ]]; then
controller_host=$CONTROLLER_HOST
fi
if [[ -n $CONTROLLER_PORT ]]; then
controller_port=$CONTROLLER_PORT
fi
if [[ -n $LOW_BIT_FORMAT ]]; then
low_bit_format=$LOW_BIT_FORMAT
fi
if [[ -n $WORKER_HOST ]]; then
worker_host=$WORKER_HOST
fi
if [[ -n $WORKER_PORT ]]; then
worker_port=$WORKER_PORT
fi
if [[ -n $MODEL_PATH ]]; then
model_path=$MODEL_PATH
fi
if [[ -n $API_HOST ]]; then
api_host=$API_HOST
fi
if [[ -n $API_PORT ]]; then
api_port=$API_PORT
fi
if [[ -n $DISPATCH_METHOD ]]; then
dispatch_method=$DISPATCH_METHOD
fi
if [[ -n $STREAM_INTERVAL ]]; then
stream_interval=$STREAM_INTERVAL
fi
controller_address="http://$controller_host:$controller_port"
echo "Controller address: $controller_address"
python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
worker_address="http://$worker_host:$worker_port"
echo "Worker type: $worker_type"
echo "Worker address: $worker_address"
if [ "$worker_type" == "ipex_llm.serving.fastchat.ipex_llm_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device cpu --low-bit $low_bit_format --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval &
elif [ "$worker_type" == "ipex_llm.serving.fastchat.vllm_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device cpu --load-in-low-bit $low_bit_format --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --enforce-eager &
fi
sleep 10
api_address="http://$api_host:$api_port"
echo "OpenAI API address: $api_address"
python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address

View file

@ -0,0 +1,18 @@
#!/bin/bash
model="YOUR_MODEL_PATH"
served_model_name="YOUR_MODEL_NAME"
python -m ipex_llm.vllm.cpu.entrypoints.openai.api_server \
--served-model-name $served_model_name \
--port 8000 \
--model $model \
--trust-remote-code \
--device cpu \
--dtype bfloat16 \
--enforce-eager \
--load-in-low-bit sym_int4 \
--max-model-len 4096 \
--max-num-batched-tokens 10240 \
--max-num-seqs 12 \
--tensor-parallel-size 1

View file

@ -0,0 +1,61 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/vllm-project/vllm/blob/v0.2.1.post1/examples/offline_inference.py
# which is licensed under Apache License 2.0
#
# Copyright 2023 The vLLM team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from vllm import SamplingParams
from ipex_llm.vllm.cpu.engine import IPEXLLMClass as LLM
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="YOUR_MODEL",
device="cpu",
dtype="bfloat16",
enforce_eager=True,
load_in_low_bit="sym_int4",
tensor_parallel_size=1)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View file

@ -78,7 +78,7 @@ def run_vllm(
load_in_low_bit: str = "sym_int4",
) -> float:
from vllm import SamplingParams
from ipex_llm.vllm.engine import IPEXLLMClass as LLM
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
llm = LLM(model=model,
tokenizer=tokenizer,
quantization=quantization,

View file

@ -3,7 +3,7 @@ model="YOUR_MODEL_PATH"
served_model_name="YOUR_MODEL_NAME"
python -m ipex_llm.vllm.entrypoints.openai.api_server \
python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
--served-model-name $served_model_name \
--port 8000 \
--model $model \

View file

@ -32,7 +32,7 @@
# limitations under the License.
from vllm import SamplingParams
from ipex_llm.vllm.engine import IPEXLLMClass as LLM
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
# Sample prompts.
prompts = [

View file

@ -111,7 +111,7 @@ served_model_name="YOUR_MODEL_NAME"
# --max-model-len, --max-num-batched-tokens, --max-num-seqs
# to acquire the best performance
python -m ipex_llm.vllm.entrypoints.openai.api_server \
python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
--served-model-name $served_model_name \
--port 8000 \
--model $model \
@ -202,7 +202,7 @@ export CCL_ATL_SHM=1
# --max-model-len, --max-num-batched-tokens, --max-num-seqs
# to acquire the best performance
python -m ipex_llm.vllm.entrypoints.openai.api_server \
python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
--served-model-name $served_model_name \
--port 8000 \
--model $model \

View file

@ -24,6 +24,13 @@ pip3 install sentencepiece # Required for LLaMA tokenizer.
pip3 install fastapi
pip3 install "uvicorn[standard]"
pip3 install "pydantic<2" # Required for OpenAI server.
# Install vllm
git clone https://github.com/vllm-project/vllm.git && \
cd ./vllm && \
pip install wheel packaging ninja "setuptools>=49.4.0" numpy && \
pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu && \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
```
### 2. Configure recommended environment variables
@ -56,8 +63,8 @@ To fully utilize the continuous batching feature of the `vLLM`, you can send req
#!/bin/bash
# You may also want to adjust the `--max-num-batched-tokens` argument, it indicates the hard limit
# of batched prompt length the server will accept
numactl -C 48-95 -m 1 python -m ipex_llm.vllm.entrypoints.openai.api_server \
--model /MODEL_PATH/Llama-2-7b-chat-hf-ipex/ --port 8000 \
numactl -C 48-95 -m 1 python -m ipex_llm.vllm.cpu.entrypoints.openai.api_server \
--model /MODEL_PATH/Llama-2-7b-chat-hf/ --port 8000 \
--load-format 'auto' --device cpu --dtype bfloat16 \
--load-in-low-bit sym_int4 \
--max-num-batched-tokens 4096
@ -70,25 +77,9 @@ Then you can access the api server as follows:
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "/MODEL_PATH/Llama-2-7b-chat-hf-ipex/",
"model": "/MODEL_PATH/Llama-2-7b-chat-hf/",
"prompt": "San Francisco is a",
"max_tokens": 128,
"temperature": 0
}' &
```
### 4. (Optional) Add a new model
Currently we only support LLaMA-family models (including `llama`, `vicuna`, `llama-2`, etc.). To use another model, you may need to add some adaptations.
#### 4.1 Add model code
Create or clone the PyTorch model code into `IPEX/python/llm/src/ipex/llm/vllm/model_executor/models`.
#### 4.2 Rewrite the forward methods
Referring to `IPEX/python/llm/src/ipex/llm/vllm/model_executor/models/ipex_llama.py`, it is necessary to maintain a `kv_cache`, a nested list of dictionaries that maps `req_id` to a three-dimensional tensor **(the structure may vary between models)**. Before the model's actual `forward` method, prepare `past_key_values` according to the current `req_id`, and afterwards update the `kv_cache` with `output.past_key_values`. The cache is cleared when the request is finished.
#### 4.3 Register the new model
Finally, register your `*ForCausalLM` class in the `_MODEL_REGISTRY` in `IPEX/python/llm/src/ipex/llm/vllm/model_executor/model_loader.py`.

View file

@ -31,8 +31,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from ipex_llm.vllm.entrypoints.llm import LLM
from ipex_llm.vllm.sampling_params import SamplingParams
from ipex_llm.vllm.cpu.engine import IPEXLLMClass as LLM
from vllm import SamplingParams
# Sample prompts.
prompts = [
@ -46,7 +46,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
# llm = LLM(model="facebook/opt-125m")
llm = LLM(model="YOUR_MODEL_PATH", load_in_low_bit="sym_int4")
llm = LLM(model="YOUR_MODEL_PATH", device="cpu", load_in_low_bit="sym_int4")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

View file

@ -91,7 +91,7 @@ served_model_name="YOUR_MODEL_NAME"
# --max-model-len, --max-num-batched-tokens, --max-num-seqs
# to acquire the best performance
python -m ipex_llm.vllm.entrypoints.openai.api_server \
python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
--served-model-name $served_model_name \
--port 8000 \
--model $model \
@ -158,7 +158,7 @@ export CCL_ATL_SHM=1
# --max-model-len, --max-num-batched-tokens, --max-num-seqs
# to acquire the best performance
python -m ipex_llm.vllm.entrypoints.openai.api_server \
python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
--served-model-name $served_model_name \
--port 8000 \
--model $model \

View file

@ -32,7 +32,7 @@
# limitations under the License.
from vllm import SamplingParams
from ipex_llm.vllm.engine import IPEXLLMClass as LLM
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
# Sample prompts.
prompts = [

View file

@ -30,7 +30,7 @@ from typing import List
from fastapi import FastAPI, Request, BackgroundTasks
from fastapi.responses import StreamingResponse, JSONResponse
import uvicorn
from ipex_llm.vllm.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
@ -300,6 +300,10 @@ if __name__ == "__main__":
args.tensor_parallel_size = args.num_gpus
engine_args = AsyncEngineArgs.from_cli_args(args)
if args.device == 'cpu':
from ipex_llm.vllm.cpu.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine
elif args.device == 'xpu':
from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine
engine = AsyncLLMEngine.from_engine_args(engine_args, load_in_low_bit=args.load_in_low_bit)
worker = VLLMWorker(
args.controller_address,

View file

@ -0,0 +1,15 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View file

@ -0,0 +1,171 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Optional, Union
from vllm.engine.llm_engine import LLMEngine
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.llm import LLM
from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
usage_message)
from vllm.utils import Counter
from ipex_llm.utils.common import invalidInputError
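# The classes below mirror vLLM's stock AsyncLLMEngine / LLMEngine / LLM entry
# points, but run IPEX-LLM's model conversion before constructing the engine so
# that low-bit optimizations are applied to the model.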
class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@classmethod
def from_engine_args(
cls,
engine_args: AsyncEngineArgs,
start_engine_loop: bool = True,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
load_in_low_bit: str = "sym_int4",
) -> "AsyncLLMEngine":
"""Creates an async LLM engine from the engine arguments."""
# Enable ipex-llm optimizations
engine_config = engine_args.create_engine_config()
from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
_ipex_llm_convert(load_in_low_bit)
if engine_config.device_config.device_type == "neuron":
from vllm.executor.neuron_executor import NeuronExecutorAsync
executor_class = NeuronExecutorAsync
elif engine_config.device_config.device_type == "cpu":
invalidInputError(not engine_config.parallel_config.worker_use_ray, (
"Ray is not supported with the CPU backend."))
from vllm.executor.cpu_executor import CPUExecutorAsync
executor_class = CPUExecutorAsync
elif engine_config.parallel_config.worker_use_ray:
initialize_ray_cluster(engine_config.parallel_config)
from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
executor_class = RayGPUExecutorAsync
else:
invalidInputError(engine_config.parallel_config.world_size == 1, (
"Ray is required if parallel_config.world_size > 1."))
from vllm.executor.gpu_executor import GPUExecutorAsync
executor_class = GPUExecutorAsync
# Create the async LLM engine.
engine = cls(
engine_config.parallel_config.worker_use_ray,
engine_args.engine_use_ray,
**engine_config.to_dict(),
executor_class=executor_class,
log_requests=not engine_args.disable_log_requests,
log_stats=not engine_args.disable_log_stats,
max_log_len=engine_args.max_log_len,
start_engine_loop=start_engine_loop,
usage_context=usage_context,
)
return engine
class IPEXLLMClass(LLM):
def __init__(
self,
model: str,
tokenizer: Optional[str] = None,
tokenizer_mode: str = "auto",
skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: Optional[str] = None,
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: int = 4,
enforce_eager: bool = False,
max_context_len_to_capture: Optional[int] = None,
max_seq_len_to_capture: int = 8192,
disable_custom_all_reduce: bool = False,
load_in_low_bit: str = "sym_int4",
**kwargs,
) -> None:
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
engine_args = EngineArgs(
model=model,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init,
trust_remote_code=trust_remote_code,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
enforce_eager=enforce_eager,
max_context_len_to_capture=max_context_len_to_capture,
max_seq_len_to_capture=max_seq_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
**kwargs,
)
self.llm_engine = IPEXLLMLLMEngine.from_engine_args(engine_args,
load_in_low_bit=load_in_low_bit)
self.request_counter = Counter()
class IPEXLLMLLMEngine(LLMEngine):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@classmethod
def from_engine_args(
cls,
engine_args: EngineArgs,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
load_in_low_bit: str = "sym_int4",
# ipex_llm_optimize_mode: str = 'NATIVE',
) -> "LLMEngine":
"""Creates an LLM engine from the engine arguments."""
# Create the engine configs.
engine_config = engine_args.create_engine_config()
from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
_ipex_llm_convert(load_in_low_bit)
# Initialize the cluster and specify the executor class.
if engine_config.device_config.device_type == "neuron":
from vllm.executor.neuron_executor import NeuronExecutor
executor_class = NeuronExecutor
elif engine_config.device_config.device_type == "cpu":
from vllm.executor.cpu_executor import CPUExecutor
executor_class = CPUExecutor
elif engine_config.parallel_config.worker_use_ray:
initialize_ray_cluster(engine_config.parallel_config)
from vllm.executor.ray_gpu_executor import RayGPUExecutor
executor_class = RayGPUExecutor
else:
invalidInputError(engine_config.parallel_config.world_size == 1, (
"Ray is required if parallel_config.world_size > 1."))
from vllm.executor.gpu_executor import GPUExecutor
executor_class = GPUExecutor
# Create the LLM engine.
engine = cls(**engine_config.to_dict(),
executor_class=executor_class,
log_stats=not engine_args.disable_log_stats,
usage_context=usage_context,
)
return engine

View file

@ -0,0 +1,195 @@
import asyncio
import importlib
import inspect
import os
import re
from contextlib import asynccontextmanager
from http import HTTPStatus
from typing import Any, Set
import fastapi
import uvicorn
from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
from starlette.routing import Mount
import vllm
import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest, ErrorResponse)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from ipex_llm.vllm.cpu.engine import IPEXLLMAsyncLLMEngine
from ipex_llm.utils.common import invalidInputError
TIMEOUT_KEEP_ALIVE = 5 # seconds
openai_serving_chat: OpenAIServingChat
openai_serving_completion: OpenAIServingCompletion
logger = init_logger(__name__)
_running_tasks: Set[asyncio.Task[Any]] = set()
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
async def _force_log():
while True:
await asyncio.sleep(10)
await engine.do_log_stats()
if not engine_args.disable_log_stats:
task = asyncio.create_task(_force_log())
_running_tasks.add(task)
task.add_done_callback(_running_tasks.remove)
yield
app = fastapi.FastAPI(lifespan=lifespan)
def parse_args():
parser = make_arg_parser()
parser.add_argument(
"--load-in-low-bit",
type=str,
default="sym_int4",
help="Low-bit quantization for IPEX-LLM models")
return parser.parse_args()
# Add prometheus asgi middleware to route /metrics requests
route = Mount("/metrics", make_asgi_app())
# Workaround for 307 Redirect for /metrics
route.path_regex = re.compile('^/metrics(?P<path>.*)$')
app.routes.append(route)
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
err = openai_serving_chat.create_error_response(message=str(exc))
return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
@app.get("/health")
async def health() -> Response:
"""Health check."""
await openai_serving_chat.engine.check_health()
return Response(status_code=200)
@app.get("/v1/models")
async def show_available_models():
models = await openai_serving_chat.show_available_models()
return JSONResponse(content=models.model_dump())
@app.get("/version")
async def show_version():
ver = {"version": vllm.__version__}
return JSONResponse(content=ver)
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
generator = await openai_serving_chat.create_chat_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
invalidInputError(isinstance(generator, ChatCompletionResponse),
"Expected a ChatCompletionResponse for non-streaming requests")
return JSONResponse(content=generator.model_dump())
@app.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
generator = await openai_serving_completion.create_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
return JSONResponse(content=generator.model_dump())
if __name__ == "__main__":
args = parse_args()
app.add_middleware(
CORSMiddleware,
allow_origins=args.allowed_origins,
allow_credentials=args.allow_credentials,
allow_methods=args.allowed_methods,
allow_headers=args.allowed_headers,
)
token = os.environ.get("VLLM_API_KEY") or args.api_key
if token:
@app.middleware("http")
async def authentication(request: Request, call_next):
root_path = "" if args.root_path is None else args.root_path
if not request.url.path.startswith(f"{root_path}/v1"):
return await call_next(request)
if request.headers.get("Authorization") != "Bearer " + token:
return JSONResponse(content={"error": "Unauthorized"},
status_code=401)
return await call_next(request)
for middleware in args.middleware:
module_path, object_name = middleware.rsplit(".", 1)
imported = getattr(importlib.import_module(module_path), object_name)
if inspect.isclass(imported):
app.add_middleware(imported)
elif inspect.iscoroutinefunction(imported):
app.middleware("http")(imported)
else:
invalidInputError(False, (f"Invalid middleware {middleware}. "
f"Must be a function or a class."))
logger.info("vLLM API server version %s", vllm.__version__)
logger.info("args: %s", args)
if args.served_model_name is not None:
served_model_names = args.served_model_name
else:
served_model_names = [args.model]
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = IPEXLLMAsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER,
load_in_low_bit=args.load_in_low_bit)
openai_serving_chat = OpenAIServingChat(engine, served_model_names,
args.response_role,
args.lora_modules,
args.chat_template)
openai_serving_completion = OpenAIServingCompletion(
engine, served_model_names, args.lora_modules)
app.root_path = args.root_path
uvicorn.run(app,
host=args.host,
port=args.port,
log_level=args.uvicorn_log_level,
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs)

View file

@ -0,0 +1,181 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models.llama import LlamaMLP, LlamaAttention
from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Attention
from vllm.model_executor.models.qwen import QWenMLP, QWenAttention
from vllm.model_executor.models.baichuan import BaiChuanMLP, BaiChuanAttention
from vllm.model_executor.models.chatglm import GLMMLP, GLMAttention
from vllm.attention import Attention, AttentionMetadata
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.config import DeviceConfig
from typing import Tuple
from ipex_llm.utils.common import invalidInputError
def _MLP_forward(self, x):
gate_up = self.gate_up_proj(x)
x = self.act_fn(gate_up)
x = self.down_proj(x)
return x
def _Attention_forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
qkv = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v, kv_cache, attn_metadata, self.kv_scale)
output = self.o_proj(attn_output)
return output
def _QWen_Attention_forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
kv_cache: Tuple[torch.Tensor, torch.Tensor],
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
qkv = self.c_attn(hidden_states)
q, k, v = qkv.chunk(chunks=3, dim=-1)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v, kv_cache, attn_metadata, self.kv_scale)
output = self.c_proj(attn_output)
return output
def _QWen_MLP_forward(self, x):
gate_up = self.gate_up_proj(x)
x = self.act_fn(gate_up)
x = self.c_proj(x)
return x
def _ChatGLM_MLP_forward(self, hidden_states):
# [s, b, 4hp]
intermediate_parallel = self.dense_h_to_4h(hidden_states)
intermediate_parallel = self.activation_func(intermediate_parallel)
# [s, b, h]
output = self.dense_4h_to_h(intermediate_parallel)
return output
def _Baichuan_Attention_forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
kv_cache: Tuple[torch.Tensor, torch.Tensor],
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
qkv = self.W_pack(hidden_states)
q, k, v = qkv.chunk(chunks=3, dim=-1)
if self.postion_embedding != "ALIBI":
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v, kv_cache, attn_metadata, self.kv_scale)
output = self.o_proj(attn_output)
return output
def _ChatGLM_Attention_forward(
self,
hidden_states: torch.Tensor,
position_ids: torch.Tensor,
kv_cache: Tuple[torch.Tensor, torch.Tensor],
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
qkv = self.query_key_value(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
q, k = self.rotary_emb(position_ids, q, k)
context_layer = self.attn(
q,
k,
v,
kv_cache,
attn_metadata,
)
attn_output = self.dense(context_layer)
return attn_output
_REPLACED_MLP_LAYERS = {
LlamaMLP: _MLP_forward,
Qwen2MLP: _MLP_forward,
BaiChuanMLP: _MLP_forward,
QWenMLP: _QWen_MLP_forward,
GLMMLP: _ChatGLM_MLP_forward
}
_REPLACED_ATTENTION_LAYERS = {
LlamaAttention: _Attention_forward,
Qwen2Attention: _Attention_forward,
QWenAttention: _QWen_Attention_forward,
BaiChuanAttention: _Baichuan_Attention_forward,
GLMAttention: _ChatGLM_Attention_forward
}
def _model_mlp_convert():
for module, replaced_func in _REPLACED_MLP_LAYERS.items():
setattr(module, "forward", replaced_func)
def _model_attention_convert():
for module, replaced_func in _REPLACED_ATTENTION_LAYERS.items():
setattr(module, "forward", replaced_func)
def _ipex_llm_convert(load_in_low_bit):
from vllm.worker.model_runner import ModelRunner
import vllm.model_executor.model_loader as model_loader
setattr(ModelRunner, "load_model", get_load_function(load_in_low_bit))
def get_load_function(low_bit):
def _ipex_llm_load_model(self) -> None:
_model_mlp_convert()
_model_attention_convert()
self.model = get_model(self.model_config,
self.device_config,
lora_config=self.lora_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config)
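# Apply IPEX-LLM low-bit optimization to the freshly loaded model.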
from ipex_llm import optimize_model
optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype)
if self.lora_config:
invalidInputError(hasattr(self.model, "supported_lora_modules")
and self.model.supported_lora_modules,
"Model does not support LoRA")
invalidInputError(hasattr(self.model, "embedding_modules"),
"Model does not have embedding_modules")
invalidInputError(hasattr(self.model, "embedding_padding_modules"),
"Model does not have embedding_padding_modules")
self.lora_manager = LRUCacheWorkerLoRAManager(
self.scheduler_config.max_num_seqs,
self.scheduler_config.max_num_batched_tokens +
self.scheduler_config.max_paddings, self.vocab_size,
self.lora_config, self.device, self.model.embedding_modules,
self.model.embedding_padding_modules)
self.model = self.lora_manager.create_lora_manager(self.model)
return _ipex_llm_load_model

View file

@ -0,0 +1,15 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View file

@ -0,0 +1,21 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass
__all__ = [
"IPEXLLMAsyncLLMEngine",
"IPEXLLMLLMEngine",
"IPEXLLMClass",
]

View file

@ -21,8 +21,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.ray_utils import initialize_ray_cluster
from vllm.entrypoints.llm import LLM
from vllm.utils import Counter
from ipex_llm.vllm.model_convert import _ipex_llm_convert
from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
from ipex_llm.utils.common import invalidInputError
@ -40,13 +39,14 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
) -> "AsyncLLMEngine":
"""Creates an async LLM engine from the engine arguments."""
# Enable ipex-llm optimizations
_ipex_llm_convert(load_in_low_bit)
engine_configs = engine_args.create_engine_configs()
_ipex_llm_convert(load_in_low_bit)
parallel_config = engine_configs[2]
if parallel_config.worker_use_ray or engine_args.engine_use_ray:
initialize_ray_cluster(parallel_config)
# from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
from ipex_llm.vllm.ipex_llm_gpu_executor import get_gpu_executor_class_async
from ipex_llm.vllm.xpu.ipex_llm_gpu_executor import get_gpu_executor_class_async
executor_class = get_gpu_executor_class_async(load_in_low_bit)
else:
invalidInputError(parallel_config.world_size == 1, (
@ -124,15 +124,15 @@ class IPEXLLMLLMEngine(LLMEngine):
) -> "LLMEngine":
"""Creates an LLM engine from the engine arguments."""
# Create the engine configs.
_ipex_llm_convert(load_in_low_bit)
engine_configs = engine_args.create_engine_configs()
_ipex_llm_convert(load_in_low_bit)
parallel_config = engine_configs[2]
# Initialize the cluster and specify the executor class.
if parallel_config.worker_use_ray:
initialize_ray_cluster(parallel_config)
# from vllm.executor.ray_gpu_executor import RayGPUExecutor
from ipex_llm.vllm.ipex_llm_gpu_executor import get_gpu_executor_class
from ipex_llm.vllm.xpu.ipex_llm_gpu_executor import get_gpu_executor_class
executor_class = get_gpu_executor_class(load_in_low_bit)
else:
invalidInputError(parallel_config.world_size == 1,

View file

@ -25,7 +25,7 @@ from vllm.logger import init_logger
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import LoRA
from ipex_llm.vllm.engine import IPEXLLMAsyncLLMEngine
from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncLLMEngine
from ipex_llm.utils.common import invalidInputError
TIMEOUT_KEEP_ALIVE = 5 # seconds

View file

@ -17,6 +17,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.utils import (set_cuda_visible_devices, get_ip, get_open_port,
get_distributed_init_method, make_async)
import functools
from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
from ipex_llm.utils.common import invalidInputError
if ray is not None:
@ -175,7 +176,6 @@ class IPEXLLMGPUExecutor(ExecutorBase):
start=1,
):
local_rank = node_workers[node_id].index(rank)
from ipex_llm.vllm.model_convert import _ipex_llm_convert
def create_worker_function(rank, local_rank, load_in_low_bit):
def worker_function():

View file

@ -21,9 +21,8 @@ from vllm.model_executor.models.qwen import QWenMLP, QWenAttention
from vllm.model_executor.models.baichuan import BaiChuanMLP, BaiChuanAttention
from vllm.model_executor.models.chatglm import GLMMLP, GLMAttention
from vllm.model_executor.model_loader import get_model
from vllm.utils import measure_device_memory
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.utils import measure_device_memory
from vllm.model_executor.input_metadata import InputMetadata
from vllm.config import DeviceConfig
from typing import Tuple
@ -162,6 +161,7 @@ def get_load_function(low_bit):
_model_mlp_convert()
_model_attention_convert()
from vllm.utils import measure_device_memory
with measure_device_memory() as m:
# only support xpu for now
# We have to create a new DeviceConfig.