Fix qwen model adapter in docker (#9969)
* fix qwen in docker
* add patch for model_adapter.py in fastchat
* add patch for model_adapter.py in fastchat
This commit is contained in:
		
							parent
							
								
									50a851e3b3
								
							
						
					
					
						commit
						a2718038f7
					
				
					 2 changed files with 24 additions and 0 deletions
				
			
		| 
						 | 
					@ -8,10 +8,13 @@ ARG TINI_VERSION=v0.18.0
 | 
				
			||||||
ARG PIP_NO_CACHE_DIR=false
 | 
					ARG PIP_NO_CACHE_DIR=false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
COPY ./entrypoint.sh /opt/entrypoint.sh
 | 
					COPY ./entrypoint.sh /opt/entrypoint.sh
 | 
				
			||||||
 | 
					COPY ./model_adapter.py.patch /llm/model_adapter.py.patch
 | 
				
			||||||
ADD  https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
 | 
					ADD  https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
 | 
				
			||||||
# Install Serving Dependencies
 | 
					# Install Serving Dependencies
 | 
				
			||||||
RUN cd /llm && \
 | 
					RUN cd /llm && \
 | 
				
			||||||
    pip install --pre --upgrade bigdl-llm[serving] && \
 | 
					    pip install --pre --upgrade bigdl-llm[serving] && \
 | 
				
			||||||
 | 
					# Fix Qwen model adapter in fastchat
 | 
				
			||||||
 | 
					    patch /usr/local/lib/python3.9/dist-packages/fastchat/model/model_adapter.py < /llm/model_adapter.py.patch && \
 | 
				
			||||||
    chmod +x /opt/entrypoint.sh && \
 | 
					    chmod +x /opt/entrypoint.sh && \
 | 
				
			||||||
    chmod +x /sbin/tini && \
 | 
					    chmod +x /sbin/tini && \
 | 
				
			||||||
    cp /sbin/tini /usr/bin/tini
 | 
					    cp /sbin/tini /usr/bin/tini
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										21
									
								
								docker/llm/serving/cpu/docker/model_adapter.py.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								docker/llm/serving/cpu/docker/model_adapter.py.patch
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,21 @@
 | 
				
			||||||
 | 
					--- model_adapter.py.old	2024-01-24 01:56:23.903144335 +0000
 | 
				
			||||||
 | 
					+++ model_adapter.py	2024-01-24 01:59:22.605062765 +0000
 | 
				
			||||||
 | 
					@@ -1346,15 +1346,17 @@
 | 
				
			||||||
 | 
					         )
 | 
				
			||||||
 | 
					         # NOTE: if you use the old version of model file, please remove the comments below
 | 
				
			||||||
 | 
					         # config.use_flash_attn = False
 | 
				
			||||||
 | 
					-        config.fp16 = True
 | 
				
			||||||
 | 
					+        # config.fp16 = True
 | 
				
			||||||
 | 
					         generation_config = GenerationConfig.from_pretrained(
 | 
				
			||||||
 | 
					             model_path, trust_remote_code=True
 | 
				
			||||||
 | 
					         )
 | 
				
			||||||
 | 
					+        from bigdl.llm.transformers import AutoModelForCausalLM
 | 
				
			||||||
 | 
					         model = AutoModelForCausalLM.from_pretrained(
 | 
				
			||||||
 | 
					             model_path,
 | 
				
			||||||
 | 
					             config=config,
 | 
				
			||||||
 | 
					             low_cpu_mem_usage=True,
 | 
				
			||||||
 | 
					             trust_remote_code=True,
 | 
				
			||||||
 | 
					+            load_in_4bit=True,
 | 
				
			||||||
 | 
					             **from_pretrained_kwargs,
 | 
				
			||||||
 | 
					         ).eval()
 | 
				
			||||||
 | 
					         if hasattr(model.config, "use_dynamic_ntk") and model.config.use_dynamic_ntk:
 | 
				
			||||||
		Loading…
	
		Reference in a new issue