LLM: change fp16 benchmark to model.half (#10477)
* LLM: change fp16 benchmark to model.half

* fix
parent 749bedaf1e
commit e41d556436

1 changed file with 8 additions and 4 deletions
@@ -930,27 +930,31 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
-        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True, torch_dtype=torch.float16,
+        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                           trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.half()
         model = model.to('xpu')
     elif repo_id in LLAMA_IDS:
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True, torch_dtype=torch.float16,
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.half()
         model = model.to('xpu')
     elif repo_id in LLAVA_IDS:
         llava_repo_dir = os.environ.get('LLAVA_REPO_DIR')
         sys.path.append(rf"{llava_repo_dir}")
         from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True, torch_dtype=torch.float16,
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                           trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.half()
         model = model.to('xpu')
     else:
-        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit, torch_dtype=torch.float16,
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.half()
         model = model.to('xpu')
         if isinstance(model, GPTJForCausalLM):
             # For gpt-j model family, this optimization can provide a better performance.
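In short, the fp16 benchmark no longer requests torch_dtype=torch.float16 at load time; it loads the model in the configured low-bit format and then casts it with model.half() before moving it to the XPU device. Below is a minimal sketch of the new loading pattern, not part of the commit itself: the import path assumes the ipex-llm transformers wrapper used by the benchmark, and model_path, low_bit, and cpu_embedding are placeholder values standing in for the benchmark's parameters.

# Minimal sketch of the post-change loading pattern (placeholders, see note above).
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM  # assumed low-bit wrapper import path

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint path
low_bit = "sym_int4"                          # placeholder low-bit weight format
cpu_embedding = False                         # placeholder benchmark option

# Load without torch_dtype=torch.float16; the relevant layers are converted
# to the low-bit format during from_pretrained.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit=low_bit,
                                             optimize_model=True,
                                             trust_remote_code=True,
                                             use_cache=True,
                                             cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Cast the loaded model to half precision afterwards, then move it to the
# Intel GPU ("xpu") device, as the updated benchmark does.
model = model.half()
model = model.to('xpu')

Compared with the previous code, half-precision casting now happens once on the already-loaded model via model.half() instead of being requested at load time through torch_dtype=torch.float16.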