LLM: add more models and skip runtime error (#9349)

* add more models and skip runtime errors
* upgrade transformers
* temporarily remove Mistral-7B-v0.1
* temporarily disable the upload of the Arc perf results
parent fae6db3ddc
commit 84ab614aab

3 changed files with 43 additions and 34 deletions

.github/workflows/llm_performance_tests.yml
@@ -108,6 +108,7 @@ jobs:
           python -m pip install --upgrade einops
           python -m pip install --upgrade transformers_stream_generator
           python -m pip install --upgrade tiktoken
+          python -m pip install transformers==4.34.0

       - name: Download llm binary
         uses: ./.github/actions/llm/download-llm-binary
@@ -134,7 +135,6 @@ jobs:
           export http_proxy=${HTTP_PROXY}
           export https_proxy=${HTTPS_PROXY}
           python run.py
-          curl -T ./*.csv ${LLM_FTP_URL}/llm/ggml-actions/perf/
           cp ./*.csv /mnt/disk1/nightly_perf_gpu/
           cd ../../../test/benchmark
           python csv_to_html.py -f /mnt/disk1/nightly_perf_gpu/
@@ -59,7 +59,7 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
         result = run_deepspeed_transformer_int4_cpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)

     for in_out_pair in in_out_pairs:
-        if result:
+        if result and result[in_out_pair]:
             results.append([repo_id,
                             round(np.mean(result[in_out_pair], axis=0)[0]*1000.0, 2),
                             round(np.mean(result[in_out_pair], axis=0)[1]*1000.0, 2),
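Note on the `if result and result[in_out_pair]` guard above: once a model can fail with a RuntimeError, its trial list may be left empty, and aggregating an empty list with np.mean would produce a NaN row instead of a usable measurement. A minimal standalone sketch of what the guard skips (the result values below are hypothetical, not from this run):

import numpy as np

# Hypothetical per-pair results: the '1024-128' pair hit a RuntimeError,
# so the benchmark loop left its trial list empty.
result = {'32-32': [[0.05, 0.01, 0.0, 32, 32]], '1024-128': []}

for in_out_pair in result:
    if result and result[in_out_pair]:   # the guard added in this commit
        first_token_ms = round(np.mean(result[in_out_pair], axis=0)[0] * 1000.0, 2)
        print(in_out_pair, first_token_ms)
    else:
        print(in_out_pair, 'skipped: no successful trials')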
@@ -357,38 +357,41 @@ def run_transformer_int4_gpu(repo_id,
     result = {}
     with torch.inference_mode():
         for in_out in in_out_pairs:
-            in_out_len = in_out.split("-")
-            in_len = int(in_out_len[0])
-            out_len = int(in_out_len[1])
-            # As different tokenizer has different encodings,
-            # in_len.txt maybe shorter than we need,
-            # use much longer context to make sure input length
-            test_length = min(in_len*2, 8192)
-            while test_length not in [32, 256, 1024, 2048, 8192]:
-                test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
-            # As different tokenizer has different encodings,
-            # slice the input_ids to ensure the prompt length is required length.
-            input_ids = tokenizer.encode(input_str, return_tensors="pt")
-            input_ids = input_ids[:, :in_len]
-            true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
-            actual_in_len = input_ids.shape[1]
-            result[in_out] = []
-            for i in range(num_trials + warm_up):
-                st = time.perf_counter()
-                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
-                                            num_beams=num_beams)
-                torch.xpu.synchronize()
-                end = time.perf_counter()
-                output_ids = output_ids.cpu()
-                print("model generate cost: " + str(end - st))
-                output = tokenizer.batch_decode(output_ids)
-                print(output[0])
-                actual_out_len = output_ids.shape[1] - actual_in_len
-                if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
-                                           actual_in_len, actual_out_len])
+            try:
+                in_out_len = in_out.split("-")
+                in_len = int(in_out_len[0])
+                out_len = int(in_out_len[1])
+                # As different tokenizer has different encodings,
+                # in_len.txt maybe shorter than we need,
+                # use much longer context to make sure input length
+                test_length = min(in_len*2, 8192)
+                while test_length not in [32, 256, 1024, 2048, 8192]:
+                    test_length = test_length * 2
+                input_str = open(f"prompt/{test_length}.txt", 'r').read()
+                # As different tokenizer has different encodings,
+                # slice the input_ids to ensure the prompt length is required length.
+                input_ids = tokenizer.encode(input_str, return_tensors="pt")
+                input_ids = input_ids[:, :in_len]
+                true_str = tokenizer.batch_decode(input_ids)[0]
+                input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+                actual_in_len = input_ids.shape[1]
+                result[in_out] = []
+                for i in range(num_trials + warm_up):
+                    st = time.perf_counter()
+                    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                                num_beams=num_beams)
+                    torch.xpu.synchronize()
+                    end = time.perf_counter()
+                    output_ids = output_ids.cpu()
+                    print("model generate cost: " + str(end - st))
+                    output = tokenizer.batch_decode(output_ids)
+                    print(output[0])
+                    actual_out_len = output_ids.shape[1] - actual_in_len
+                    if i >= warm_up:
+                        result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                               actual_in_len, actual_out_len])
+            except RuntimeError:
+                pass
     torch.xpu.empty_cache()
     return result
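The hunk above is mostly re-indentation: each in/out pair is wrapped in try/except RuntimeError so that one failing case (for example, an out-of-memory error on a long prompt) no longer aborts the whole benchmark. A minimal sketch of the pattern, with a hypothetical run_one_case() standing in for the tokenize/generate/measure steps:

# Sketch of the skip-on-RuntimeError pattern; run_one_case() is a
# hypothetical stand-in for the tokenize/generate/measure code above.
def run_one_case(in_out):
    if in_out == '2048-256':
        raise RuntimeError('simulated XPU out-of-memory')
    return [0.05, 0.01]  # e.g. first-token and rest-token latency

def benchmark(in_out_pairs):
    result = {}
    for in_out in in_out_pairs:
        try:
            result[in_out] = []
            result[in_out].append(run_one_case(in_out))
        except RuntimeError:
            pass  # leave the empty entry; the caller's guard skips it
    return result

print(benchmark(['32-32', '2048-256', '1024-128']))
# -> {'32-32': [[0.05, 0.01]], '2048-256': [], '1024-128': [[0.05, 0.01]]}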
@@ -4,13 +4,19 @@ repo_id:
   - 'THUDM/chatglm2-6b'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
+  # - 'bigscience/bloomz-7b1' # temporarily removed
   - 'redpajama/gptneox-7b-redpajama-bf16'
+  - 'bigcode/starcoder-15.5b'
   - 'databricks/dolly-v1-6b'
   - 'databricks/dolly-v2-7b'
   - 'databricks/dolly-v2-12b'
   - 'internlm/internlm-chat-7b-8k'
+  - 'baichuan-inc/Baichuan-13B-Chat'
+  - 'fnlp/moss-moon-003-sft'
   - 'Qwen/Qwen-7B-Chat-10-12'
   - 'BAAI/AquilaChat-7B'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  # - 'mistralai/Mistral-7B-v0.1' # temporarily removed
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
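For context on why commenting a model out "temporarily removes" it: the benchmark driver presumably loads this YAML and iterates only over the parsed repo_id list, so commented entries never reach run_model. A rough sketch assuming PyYAML and a placeholder file name (neither is confirmed by this commit):

import yaml

# 'arc-perf-config.yaml' is a placeholder name for the config shown above.
with open('arc-perf-config.yaml') as f:
    conf = yaml.safe_load(f)

# Commented-out entries (bloomz-7b1, Mistral-7B-v0.1) are absent from this list.
for repo_id in conf['repo_id']:
    # A real driver would call run_model(repo_id, ...); printing keeps the
    # sketch self-contained.
    print(repo_id, conf['local_model_hub'], conf['warm_up'], conf['num_trials'])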