LLM: Fix ipex torchscript=True error (#10832)
* remove
* update
* remove torchscript
parent: fc33aa3721
commit: 23c6a52fb0

9 changed files with 14 additions and 20 deletions
@@ -1210,15 +1210,15 @@ def run_bigdl_ipex_bf16(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
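For reference, the post-change load path in these benchmark functions reduces to a plain low-bit from_pretrained call with no torchscript flag. A minimal sketch, assuming the ipex_llm.transformers wrapper classes (the imports are not shown in these hunks) and a placeholder model_path:

# Minimal sketch of the post-change load path, not the full benchmark script.
# Assumptions: ipex_llm.transformers wrapper import and a placeholder model_path.
import torch
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "/path/to/model"  # placeholder

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit='bf16',
                                             trust_remote_code=True,
                                             torch_dtype=torch.bfloat16,
                                             use_cache=True)  # no torchscript=True
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# as in the hunk above: enable per-token latency reporting if the config lacks it
if not hasattr(model.config, "token_latency"):
    model.config.token_latency = True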
@@ -1280,15 +1280,15 @@ def run_bigdl_ipex_int4(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1350,15 +1350,15 @@ def run_bigdl_ipex_int8(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1537,15 +1537,15 @@ def run_speculative_cpu(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True, speculative=True)
+                                          use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
@@ -59,7 +59,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -57,7 +57,6 @@ if __name__ == '__main__':
                                       load_in_low_bit="bf16",
                                       speculative=True,
                                       trust_remote_code=True,
-                                      torchscript=True,
                                       use_cache=True)

     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -74,7 +74,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -65,7 +65,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -69,7 +69,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
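The example-script hunks above all make the same edit: keep the speculative BF16 load and drop torchscript=True. A minimal end-to-end sketch of the resulting call, with a placeholder model path and prompt; the ipex_llm import path is an assumption, not something shown in these hunks:

# Sketch of the speculative BF16 load as the example scripts now do it.
# model_path, the prompt, and the ipex_llm import path are placeholders/assumptions.
import torch
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "/path/to/model"  # placeholder

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             optimize_model=True,
                                             torch_dtype=torch.bfloat16,
                                             load_in_low_bit="bf16",
                                             speculative=True,      # torchscript=True removed
                                             trust_remote_code=True,
                                             use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# standard transformers generation around the loaded model
inputs = tokenizer("What is AI?", return_tensors="pt")  # placeholder prompt
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))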
@@ -150,7 +150,8 @@ def _ipex_optimize_model(model, rms_classes, qtype):
     _ipex_optimize_attention(model, is_tpp=is_tpp, is_woq=is_woq)
     _ipex_optimize_decoder(model, is_tpp=is_tpp, is_woq=is_woq)

-    model.register_forward_hook(output_hook, with_kwargs=True)
+    # need to register_forward_hook after torch.jit.trace
+    # model.register_forward_hook(output_hook, with_kwargs=True)
     return model
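The new comment says the hook must be registered after torch.jit.trace. A minimal sketch of that ordering, assuming PyTorch >= 2.0 (for with_kwargs=True) and a hypothetical output_hook and example_inputs; it only illustrates attaching the hook to the traced module rather than before tracing, not the repo's actual implementation:

# Illustration only: register the forward hook after tracing, per the comment above.
# `output_hook` and `example_inputs` are hypothetical; the real hook lives in the
# ipex optimization code and is not shown in this diff.
import torch
import torch.nn as nn


def output_hook(module, args, kwargs, output):
    # hypothetical hook body: pass the output through unchanged
    return output


def trace_then_register(model: nn.Module, example_inputs: tuple):
    model.eval()
    # trace first, then attach the hook, mirroring the ordering the comment asks for
    traced = torch.jit.trace(model, example_inputs, strict=False)
    traced.register_forward_hook(output_hook, with_kwargs=True)
    return traced

The call being attached is the same one the hunk comments out, just moved to after tracing.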