From a2718038f7f4beb49123ecbbb0c9dc375001406d Mon Sep 17 00:00:00 2001
From: Lilac09 <74996885+Zhengjin-Wang@users.noreply.github.com>
Date: Wed, 24 Jan 2024 11:01:29 +0800
Subject: [PATCH] Fix qwen model adapter in docker (#9969)

* fix qwen in docker

* add patch for model_adapter.py in fastchat
---
 docker/llm/serving/cpu/docker/Dockerfile      |  3 +++
 .../serving/cpu/docker/model_adapter.py.patch | 21 +++++++++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 docker/llm/serving/cpu/docker/model_adapter.py.patch

diff --git a/docker/llm/serving/cpu/docker/Dockerfile b/docker/llm/serving/cpu/docker/Dockerfile
index 9047c03f..c549248a 100644
--- a/docker/llm/serving/cpu/docker/Dockerfile
+++ b/docker/llm/serving/cpu/docker/Dockerfile
@@ -8,10 +8,13 @@ ARG TINI_VERSION=v0.18.0
 ARG PIP_NO_CACHE_DIR=false
 
 COPY ./entrypoint.sh /opt/entrypoint.sh
+COPY ./model_adapter.py.patch /llm/model_adapter.py.patch
 ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
 # Install Serving Dependencies
 RUN cd /llm && \
     pip install --pre --upgrade bigdl-llm[serving] && \
+# Fix Qwen model adapter in fastchat
+    patch /usr/local/lib/python3.9/dist-packages/fastchat/model/model_adapter.py < /llm/model_adapter.py.patch && \
     chmod +x /opt/entrypoint.sh && \
     chmod +x /sbin/tini && \
     cp /sbin/tini /usr/bin/tini
diff --git a/docker/llm/serving/cpu/docker/model_adapter.py.patch b/docker/llm/serving/cpu/docker/model_adapter.py.patch
new file mode 100644
index 00000000..2957468d
--- /dev/null
+++ b/docker/llm/serving/cpu/docker/model_adapter.py.patch
@@ -0,0 +1,21 @@
+--- model_adapter.py.old	2024-01-24 01:56:23.903144335 +0000
++++ model_adapter.py	2024-01-24 01:59:22.605062765 +0000
+@@ -1346,15 +1346,17 @@
+         )
+         # NOTE: if you use the old version of model file, please remove the comments below
+         # config.use_flash_attn = False
+-        config.fp16 = True
++        # config.fp16 = True
+         generation_config = GenerationConfig.from_pretrained(
+             model_path, trust_remote_code=True
+         )
++        from bigdl.llm.transformers import AutoModelForCausalLM
+         model = AutoModelForCausalLM.from_pretrained(
+             model_path,
+             config=config,
+             low_cpu_mem_usage=True,
+             trust_remote_code=True,
++            load_in_4bit=True,
+             **from_pretrained_kwargs,
+         ).eval()
+         if hasattr(model.config, "use_dynamic_ntk") and model.config.use_dynamic_ntk:
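
For context, the patched FastChat adapter is equivalent to loading a Qwen checkpoint directly through BigDL-LLM's 4-bit loader. Below is a minimal sketch of that loading path, outside the Docker image; the checkpoint name is an illustrative placeholder, and the tokenizer line is added for completeness, while the from_pretrained calls mirror the patch above.

    # Load a Qwen checkpoint with BigDL-LLM INT4 quantization, mirroring what
    # the patched load_model() in fastchat/model/model_adapter.py ends up doing.
    from transformers import AutoTokenizer
    from bigdl.llm.transformers import AutoModelForCausalLM  # BigDL-LLM drop-in replacement

    model_path = "Qwen/Qwen-7B-Chat"  # illustrative placeholder checkpoint

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        load_in_4bit=True,       # quantize weights to INT4 while loading
        low_cpu_mem_usage=True,
        trust_remote_code=True,  # Qwen ships custom modeling code
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)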