Add load low-bit in model-serving to reduce EPC (#9239)
* init load low-bit * fix * fix
This commit is contained in:
		
							parent
							
								
									0383306688
								
							
						
					
					
						commit
						c14a61681b
					
				
					 1 changed files with 18 additions and 0 deletions
				
			
		| 
						 | 
				
			
			@ -256,10 +256,28 @@ class BigDLLLMAdapter(BaseModelAdapter):
 | 
			
		|||
        return model, tokenizer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BigDLLMLOWBITAdapter(BaseModelAdapter):
    """Model adapter for low-bit checkpoints saved by the bigdl-llm backend.

    Paths tagged with "bigdl-lowbit" are routed here so the model is
    restored via ``load_low_bit`` instead of the regular loader.
    """

    def match(self, model_path: str):
        # Route any checkpoint whose path carries the low-bit marker.
        return "bigdl-lowbit" in model_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        # Only the revision is honored for the tokenizer; the remaining
        # kwargs are not forwarded to the low-bit model loader.
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=False, revision=revision
        )
        print("Customized bigdl-llm loader")
        # Imported lazily so bigdl-llm is only required when this
        # adapter actually loads a model.
        from bigdl.llm.transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.load_low_bit(model_path)
        return model, tokenizer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def patch_fastchat():
 | 
			
		||||
    global is_fastchat_patched
 | 
			
		||||
    if is_fastchat_patched:
 | 
			
		||||
        return
 | 
			
		||||
    register_model_adapter(BigDLLMLOWBITAdapter)
 | 
			
		||||
    register_model_adapter(BigDLLLMAdapter)
 | 
			
		||||
    mapping_fastchat = _get_patch_map()
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue