Add dummy model in iGPU perf (#12341)
* Add dummy model in iGPU perf
* Add dummy model in iGPU perf
* Fix
parent 8e9a3a1158
commit 7240c283a3

8 changed files with 86 additions and 36 deletions
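What this change does: a new DUMMY_IDS list ('dummy/dummy-1.5B', 'dummy/dummy-4B') is added to the all-in-one benchmark script, and the three Windows iGPU runners (run_transformer_int4_gpu_win, run_transformer_int4_fp16_gpu_win, run_transformer_int4_fp16_loadlowbit_gpu_win) gain a matching branch: dummy models are still loaded through AutoModelForCausalLM and moved to 'xpu', but the tokenizer-based prompt construction, the TextStreamer, and the final batch_decode are skipped. Input ids are generated with torch.randint instead, and streaming is forced off. The seven iGPU perf YAML configs add the two dummy ids to their repo_id lists.

A minimal sketch of the dummy-input idea (not the repository code; the build_dummy_input_ids helper name, batch size, input length, and device below are made up for illustration, while run.py takes these from the YAML config and runs on 'xpu'):

    import torch

    DUMMY_IDS = ['dummy/dummy-1.5B', 'dummy/dummy-4B']

    def build_dummy_input_ids(batch_size, in_len, device='cpu'):
        # Random token ids in an arbitrary range stand in for a tokenized prompt;
        # this is enough when only prefill/decode latency is being measured.
        return torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to(device)

    input_ids = build_dummy_input_ids(batch_size=1, in_len=1024)
    print(input_ids.shape)  # torch.Size([1, 1024])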
@@ -53,6 +53,8 @@ QWENVL_IDS = ['Qwen/Qwen-VL-Chat']
 
 MINICPM_V_IDS = ['openbmb/MiniCPM-V-2_6', 'openbmb/MiniCPM-Llama3-V-2_5']
 
+DUMMY_IDS = ['dummy/dummy-1.5B', 'dummy/dummy-4B']
+
 results = []
 excludes = []
 
@@ -1245,6 +1247,10 @@ def run_transformer_int4_gpu_win(repo_id,
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
         model = model.llm
+    elif repo_id in DUMMY_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
+                                                     trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1255,7 +1261,10 @@ def run_transformer_int4_gpu_win(repo_id,
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
 
     model = BenchmarkWrapper(model)
-    streamer = TextStreamer(tokenizer, skip_prompt=True)
+    if repo_id not in DUMMY_IDS:
+        streamer = TextStreamer(tokenizer, skip_prompt=True)
+    else:
+        streaming = False
 
     result = {}
     with torch.inference_mode():
@@ -1264,14 +1273,17 @@ def run_transformer_int4_gpu_win(repo_id,
                 in_out_len = in_out.split("-")
                 in_len = int(in_out_len[0])
                 out_len = int(in_out_len[1])
-                input_str = get_continuation_input_str(in_len, tokenizer)
-                # As different tokenizer has different encodings,
-                # slice the input_ids to ensure the prompt length is required length.
-                input_ids = tokenizer.encode(input_str, return_tensors="pt")
-                input_ids = input_ids[:, :in_len]
-                true_str = tokenizer.batch_decode(input_ids)[0]
-                input_list = [true_str] * batch_size
-                input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+                if repo_id not in DUMMY_IDS:
+                    input_str = get_continuation_input_str(in_len, tokenizer)
+                    # As different tokenizer has different encodings,
+                    # slice the input_ids to ensure the prompt length is required length.
+                    input_ids = tokenizer.encode(input_str, return_tensors="pt")
+                    input_ids = input_ids[:, :in_len]
+                    true_str = tokenizer.batch_decode(input_ids)[0]
+                    input_list = [true_str] * batch_size
+                    input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+                else:
+                    input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
                 actual_in_len = input_ids.shape[1]
                 result[in_out] = []
                 for i in range(num_trials + warm_up):
@@ -1288,9 +1300,10 @@ def run_transformer_int4_gpu_win(repo_id,
                     end = time.perf_counter()
                     output_ids = output_ids.cpu()
                     print("model generate cost: " + str(end - st))
-                    output = tokenizer.batch_decode(output_ids)
-                    if not streaming:
-                        print(output[0])
+                    if repo_id not in DUMMY_IDS:
+                        output = tokenizer.batch_decode(output_ids)
+                        if not streaming:
+                            print(output[0])
                     actual_out_len = output_ids.shape[1] - actual_in_len
                     if i >= warm_up:
                         result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
@@ -1369,6 +1382,11 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
         model = model.llm
+    elif repo_id in DUMMY_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
+                                                     trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
+                                                     torch_dtype=torch.float16).eval()
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
@@ -1380,7 +1398,10 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
 
     model = BenchmarkWrapper(model)
-    streamer = TextStreamer(tokenizer, skip_prompt=True)
+    if repo_id not in DUMMY_IDS:
+        streamer = TextStreamer(tokenizer, skip_prompt=True)
+    else:
+        streaming = False
 
     result = {}
     with torch.inference_mode():
@@ -1389,14 +1410,17 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
                 in_out_len = in_out.split("-")
                 in_len = int(in_out_len[0])
                 out_len = int(in_out_len[1])
-                input_str = get_continuation_input_str(in_len, tokenizer)
-                # As different tokenizer has different encodings,
-                # slice the input_ids to ensure the prompt length is required length.
-                input_ids = tokenizer.encode(input_str, return_tensors="pt")
-                input_ids = input_ids[:, :in_len]
-                true_str = tokenizer.batch_decode(input_ids)[0]
-                input_list = [true_str] * batch_size
-                input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+                if repo_id not in DUMMY_IDS:
+                    input_str = get_continuation_input_str(in_len, tokenizer)
+                    # As different tokenizer has different encodings,
+                    # slice the input_ids to ensure the prompt length is required length.
+                    input_ids = tokenizer.encode(input_str, return_tensors="pt")
+                    input_ids = input_ids[:, :in_len]
+                    true_str = tokenizer.batch_decode(input_ids)[0]
+                    input_list = [true_str] * batch_size
+                    input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+                else:
+                    input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
                 actual_in_len = input_ids.shape[1]
                 result[in_out] = []
                 for i in range(num_trials + warm_up):
@@ -1413,9 +1437,10 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
                     end = time.perf_counter()
                     output_ids = output_ids.cpu()
                     print("model generate cost: " + str(end - st))
-                    output = tokenizer.batch_decode(output_ids)
-                    if not streaming:
-                        print(output[0])
+                    if repo_id not in DUMMY_IDS:
+                        output = tokenizer.batch_decode(output_ids)
+                        if not streaming:
+                            print(output[0])
                     actual_out_len = output_ids.shape[1] - actual_in_len
                     if i >= warm_up:
                         result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
@@ -1590,6 +1615,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
         model = model.half().to('xpu')
+    elif repo_id in DUMMY_IDS:
+        model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
+                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1600,7 +1629,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
 
     model = BenchmarkWrapper(model)
-    streamer = TextStreamer(tokenizer, skip_prompt=True)
+    if repo_id not in DUMMY_IDS:
+        streamer = TextStreamer(tokenizer, skip_prompt=True)
+    else:
+        streaming = False
 
     result = {}
     with torch.inference_mode():
@@ -1609,14 +1641,17 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
                 in_out_len = in_out.split("-")
                 in_len = int(in_out_len[0])
                 out_len = int(in_out_len[1])
-                input_str = get_continuation_input_str(in_len, tokenizer)
-                # As different tokenizer has different encodings,
-                # slice the input_ids to ensure the prompt length is required length.
-                input_ids = tokenizer.encode(input_str, return_tensors="pt")
-                input_ids = input_ids[:, :in_len]
-                true_str = tokenizer.batch_decode(input_ids)[0]
-                input_list = [true_str] * batch_size
-                input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+                if repo_id not in DUMMY_IDS:
+                    input_str = get_continuation_input_str(in_len, tokenizer)
+                    # As different tokenizer has different encodings,
+                    # slice the input_ids to ensure the prompt length is required length.
+                    input_ids = tokenizer.encode(input_str, return_tensors="pt")
+                    input_ids = input_ids[:, :in_len]
+                    true_str = tokenizer.batch_decode(input_ids)[0]
+                    input_list = [true_str] * batch_size
+                    input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
+                else:
+                    input_ids = torch.randint(1000, 2000, [batch_size, in_len], dtype=torch.int64).to('xpu')
                 actual_in_len = input_ids.shape[1]
                 result[in_out] = []
                 for i in range(num_trials + warm_up):
@@ -1633,9 +1668,10 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
                     end = time.perf_counter()
                     output_ids = output_ids.cpu()
                     print("model generate cost: " + str(end - st))
-                    output = tokenizer.batch_decode(output_ids)
-                    if not streaming:
-                        print(output[0])
+                    if repo_id not in DUMMY_IDS:
+                        output = tokenizer.batch_decode(output_ids)
+                        if not streaming:
+                            print(output[0])
                     actual_out_len = output_ids.shape[1] - actual_in_len
                     if i >= warm_up:
                         result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
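The remaining seven files in this commit are the iGPU perf YAML configs; each adds the same two dummy ids to its repo_id list alongside the existing models, as shown in the hunks below.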
@@ -19,6 +19,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
@@ -18,6 +18,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
@@ -17,6 +17,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-4k-instruct'
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
@@ -18,6 +18,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
@@ -17,6 +17,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
@@ -18,6 +18,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 3
 num_trials: 5
@@ -17,6 +17,8 @@ repo_id:
   - 'microsoft/Phi-3-mini-128k-instruct'
   - 'microsoft/phi-3-vision-128k-instruct'
   - 'openbmb/MiniCPM-V-2_6'
+  - 'dummy/dummy-1.5B'
+  - 'dummy/dummy-4B'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3