diff --git a/python/llm/example/CPU/vLLM-Serving/README.md b/python/llm/example/CPU/vLLM-Serving/README.md
index af619562..44162d30 100644
--- a/python/llm/example/CPU/vLLM-Serving/README.md
+++ b/python/llm/example/CPU/vLLM-Serving/README.md
@@ -17,13 +17,12 @@ To run vLLM continuous batching on Intel CPUs, install the dependencies as follo
 conda create -n bigdl-vllm python==3.9
 conda activate bigdl-vllm
 # Install dependencies
-pip install --pre --upgrade bigdl-llm[all]
+pip3 install numpy
+pip3 install --pre --upgrade bigdl-llm[all]
 pip3 install psutil
 pip3 install sentencepiece # Required for LLaMA tokenizer.
-pip3 install numpy
 pip3 install "torch==2.0.1"
 pip3 install "transformers>=4.33.1" # Required for Code Llama.
-pip3 install "xformers == 0.0.22"
 pip3 install fastapi
 pip3 install "uvicorn[standard]"
 pip3 install "pydantic<2" # Required for OpenAI server.
diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py b/python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py
index 9d88fcd4..0a7a24e5 100644
--- a/python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py
+++ b/python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py
@@ -33,7 +33,7 @@
 from typing import Dict, List, Optional, Tuple
 
 import torch
-from xformers.ops import AttentionBias
+# from xformers.ops import AttentionBias
 
 from bigdl.llm.vllm.sequence import SequenceData
 from bigdl.llm.vllm.sampling_params import SamplingParams
@@ -74,7 +74,7 @@ class InputMetadata:
 
         # Set during the execution of the first attention op.
         # TODO(gc): we might want to delete this
-        self.attn_bias: List[AttentionBias] = []
+        # self.attn_bias: List[AttentionBias] = []
 
     def __repr__(self) -> str:
         # Print only useful metadata.
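
Side note on the input_metadata.py change: commenting out the xformers import removes the CPU build's only hard dependency on xformers, which is why the "xformers == 0.0.22" pin was also dropped from the README. A common alternative pattern, sketched below, is to guard the import so a single source file serves both CPU and GPU installs. This is only an illustration of that pattern, not code from this PR; InputMetadataSketch and _HAS_XFORMERS are hypothetical names.

from typing import List

try:
    # Present only on installs that ship xformers (GPU setups).
    from xformers.ops import AttentionBias
    _HAS_XFORMERS = True
except ImportError:
    # CPU install: xformers was dropped from the dependency list.
    AttentionBias = None  # placeholder so the module still imports
    _HAS_XFORMERS = False


class InputMetadataSketch:
    def __init__(self) -> None:
        # Mirrors the commented-out attn_bias field; quoting the annotation
        # keeps AttentionBias out of runtime evaluation when it is absent.
        self.attn_bias: List["AttentionBias"] = []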