add minicpm-v models to transformers_int4_npu_win api (#12352)

* add minicpm npu
* optimize model

parent a7b66683f1
commit 79f2877413

1 changed file with 5 additions and 10 deletions
@@ -629,18 +629,13 @@ def transformers_int4_npu_win(repo_id,
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
-    if repo_id in CHATGLM_IDS:
-        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                          optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
+    if repo_id in MINICPM_V_IDS:
+        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model,
+                                          trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
                                           quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                          torch_dtype=torch.float16, attn_implementation="eager").eval()
+                                          attn_implementation="eager", modules_to_not_convert=["vpm", "resampler"]).eval()
+        model = model.llm
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    elif repo_id in LLAMA_IDS:
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
-                                                     optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
-                                                     quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                                     use_cache=True, attn_implementation="eager").eval()
-        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
                                                      optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
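For reference, the new MINICPM_V_IDS branch boils down to the standalone load sequence below. This is a minimal sketch, not the benchmark code itself: it assumes AutoModel and AutoTokenizer are the ipex-llm NPU classes (ipex_llm.transformers.npu_model and transformers, respectively), that MINICPM_V_IDS contains a repo id such as openbmb/MiniCPM-V-2_6, and it substitutes placeholder literals for the benchmark parameters low_bit, max_context_len, in_out_len, npu_group_size and transpose_value_cache.

# Minimal sketch of the new MiniCPM-V branch outside the benchmark harness.
# Assumptions (not from this diff): import paths below, the example repo id,
# and all literal parameter values.
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModel

model_path = "openbmb/MiniCPM-V-2_6"    # hypothetical member of MINICPM_V_IDS

model = AutoModel.from_pretrained(
    model_path,
    load_in_low_bit="sym_int4",          # low_bit in the benchmark
    optimize_model=True,                  # optimize_model
    trust_remote_code=True,
    use_cache=True,
    max_context_len=1024,                 # max_context_len
    max_prompt_len=512,                   # int(in_out_len[0])
    quantization_group_size=0,            # npu_group_size
    transpose_value_cache=True,           # transpose_value_cache
    attn_implementation="eager",
    # Exclude the vision encoder (vpm) and resampler from INT4 conversion.
    modules_to_not_convert=["vpm", "resampler"],
).eval()

# Keep only the language-model part for the text benchmark.
model = model.llm

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

The two MiniCPM-V-specific pieces are modules_to_not_convert, which leaves the vision tower and resampler unconverted, and model = model.llm, which strips them off so the run exercises only the language model.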