Reenable pp and lightweight serving on 0.6.6 (#12814)
* reenable pp and lightweight serving on 0.6.6
* update readme
* update
* update tag
parent af693425f1
commit 1083fe5508

5 changed files with 22 additions and 5 deletions
@@ -93,6 +93,12 @@ RUN apt-get update && \
     cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
     cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
     cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
+    # Download pp_serving
+    mkdir -p /llm/pp_serving && \
+    cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/*.py /llm/pp_serving/ && \
+    # Download lightweight_serving
+    mkdir -p /llm/lightweight_serving && \
+    cp ./ipex-llm/python/llm/example/GPU/Lightweight-Serving/*.py /llm/lightweight_serving/ && \
     rm -rf ./ipex-llm && \
     # Install vllm dependencies
     pip install --upgrade fastapi && \

@@ -120,7 +126,7 @@ RUN apt-get update && \
     cd /llm && \
     rm -rf /tmp/neo && \
     # Install vllm
-    git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \
+    git clone -b 0.6.6 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
     cd /llm/vllm && \
     pip install setuptools-scm && \
     pip install --upgrade cmake && \
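
The two hunks above change the serving image build: the Pipeline-Parallel-Serving and Lightweight-Serving example scripts are now copied into /llm/pp_serving and /llm/lightweight_serving, and the vLLM clone switches from the 0.6.6-pre branch to 0.6.6. A quick way to confirm the new directories exist in a built image (the image name is a placeholder, and depending on the image's ENTRYPOINT you may need the override shown here):

    docker run --rm --entrypoint ls <ipex-llm-serving-xpu-image> /llm/pp_serving /llm/lightweight_serving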
@@ -49,14 +49,14 @@ Currently, we provide two different serving engines in the image, which are Fast
 
 To run Lightweight serving on one intel gpu using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Lightweight-Serving).
 
-For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image.
+For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image. You need to install the appropriate transformers version first, e.g. `pip install transformers==4.37.0`.
 
 
 #### Pipeline parallel serving engine
 
 To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).
 
-For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
+For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image. You need to install the appropriate transformers version first, e.g. `pip install transformers==4.37.0`.
 
 
 #### vLLM serving engine
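
With the Dockerfile changes above, both start scripts referenced in this README ship in the image. A minimal launch sequence from inside the serving container, assuming the model paths expected by the scripts are mounted (the script names and the pip command are taken from the README lines above):

    pip install transformers==4.37.0
    bash /llm/start-lightweight_serving-service      # lightweight serving on one GPU
    # or, for pipeline parallel serving:
    bash /llm/start-pp_serving-service.sh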
@@ -0,0 +1,7 @@
+# need to update transformers version first
+# pip install transformers==4.37.0
+cd /llm/lightweight_serving
+export IPEX_LLM_NOT_USE_VLLM=True
+model_path="/llm/models/Llama-2-7b-chat-hf"
+low_bit="sym_int4"
+python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
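
This new start script pins the served checkpoint and quantization format in two variables, so serving a different model only requires editing those lines. A sketch of such an edit (the model path is hypothetical; fp8 is another low-bit format ipex-llm accepts):

    model_path="/llm/models/Qwen2-7B-Instruct"   # hypothetical path; mount your own model here
    low_bit="fp8"
    python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit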
@@ -1,9 +1,12 @@
+# update transformers version first
+# pip install transformers==4.37.0
 source /opt/intel/oneapi/setvars.sh --force
+export IPEX_LLM_NOT_USE_VLLM=True
 export no_proxy=localhost
 export FI_PROVIDER=tcp
 export OMP_NUM_THREADS=32
 
-export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
+#export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
 # source $basekit_root/ccl/latest/env/vars.sh --force
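
Note that this hunk comments out the tcmalloc preload by default. If you want it back, re-export the same line (copied verbatim from the script) before starting the service:

    export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so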
@@ -70,8 +70,9 @@ def is_auto_awq_available():
 
 def is_vllm_available():
     global _IS_VLLM_AVAILABLE
+    _IS_VLLM_AVAILABLE = os.getenv("IPEX_LLM_NOT_USE_VLLM", None)
     if _IS_VLLM_AVAILABLE is not None:
-        return _IS_VLLM_AVAILABLE
+        return False
     import sys
     original_path = sys.path
     # Temporally remove current directory
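
This is the switch the two start scripts rely on: when IPEX_LLM_NOT_USE_VLLM is set to any value, is_vllm_available() returns False, so ipex-llm skips its vLLM integration even though vLLM is installed in the same image. A minimal check, assuming the helper is importable from ipex_llm.transformers.convert (the module path is an assumption, not stated in this diff):

    export IPEX_LLM_NOT_USE_VLLM=True
    python -c "from ipex_llm.transformers.convert import is_vllm_available; print(is_vllm_available())"  # expected: False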