LLM: add benchmark tool for gpu (#8760)
* add benchmark tool for gpu
* update
parent 97283c033c
commit 8805186f2f

2 changed files with 4720 additions and 2 deletions
@@ -1,8 +1,10 @@
 # Benchmark tool for transformers int4 (separate 1st token and rest)
 
-`benchmark_util.py` provides a simple benchmark tool for transformers int4 models, measuring 1st token performance separately from the rest.
+`benchmark_util.py` provides a simple benchmark tool for transformers int4 models, measuring 1st token performance separately from the rest on CPU.
+
+`gpu_benchmark_util.py` provides a simple benchmark tool for transformers int4 models, measuring 1st token performance separately from the rest on GPU.
 
-## Usage
+## CPU Usage
 Just put this file into your benchmark directory, then wrap your transformers int4 model with `BenchmarkWrapper` (`model = BenchmarkWrapper(model)`).
 Take `chatglm-6b` as an example:
 ```python
@@ -30,3 +32,34 @@ Output will be like:
 =========First token cost xx.xxxxs=========
 =========Last token cost average xx.xxxxs (31 tokens in all)=========
 ```
+
+## GPU Usage
+Just put this file into your benchmark directory, then wrap your transformers int4 model with `BenchmarkWrapper` (`model = BenchmarkWrapper(model)`).
+Take `chatglm-6b` as an example:
+```python
+import torch
+import intel_extension_for_pytorch as ipex  # registers the 'xpu' device with PyTorch
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+from gpu_benchmark_util import BenchmarkWrapper
+
+model_path = 'THUDM/chatglm-6b'
+model = AutoModel.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
+model = model.half().to('xpu')
+model = BenchmarkWrapper(model)  # wrap the model so generate() reports per-token timings
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+prompt = "今天睡不着怎么办"  # "What should I do if I can't fall asleep tonight?"
+
+with torch.inference_mode():
+    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+    # with max_new_tokens=32, the 1st token is timed separately and the other 31 are averaged
+    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+```
+Output will be like:
+```bash
+=========First token cost xx.xxxxs=========
+=========Last token cost average xx.xxxxs (31 tokens in all)=========
+```
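The CPU example's code is elided between the two hunks above. Judging from the GPU snippet, it presumably mirrors that code minus the IPEX import and the `.half().to('xpu')` moves; a minimal sketch, assuming `benchmark_util.py` exports the same `BenchmarkWrapper` and sits in the same directory:

```python
import torch
from bigdl.llm.transformers import AutoModel
from transformers import AutoTokenizer
from benchmark_util import BenchmarkWrapper  # CPU-side wrapper; import path assumed

model_path = 'THUDM/chatglm-6b'
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
model = BenchmarkWrapper(model)  # wrap so generate() reports per-token timings
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
prompt = "What should I do if I can't fall asleep tonight?"

with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
```

For stable numbers on either device, it is also common to run one warm-up `generate()` call before the measured one, since the first call can pay one-off allocation and compilation costs.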
python/llm/dev/benchmark/gpu_benchmark_util.py (new file, 4685 additions)
File diff suppressed because it is too large.
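Since the diff for `gpu_benchmark_util.py` itself is suppressed, a rough illustration of the mechanism the README relies on: a wrapper only needs to time each decoding step and report the 1st step separately from the average of the rest. The class below is a hypothetical sketch under that assumption, much simpler than the real 4685-line file, which presumably hooks the model's actual generation loop (with KV cache) rather than re-running the prefix each step:

```python
import time
import torch

class SimpleBenchmarkWrapper:
    """Hypothetical stand-in for BenchmarkWrapper: times the 1st generated
    token separately and averages the rest (names assumed for illustration)."""

    def __init__(self, model):
        self.model = model

    def generate(self, input_ids, max_new_tokens=32, **kwargs):
        step_times = []
        output = input_ids
        for _ in range(max_new_tokens):
            t0 = time.perf_counter()
            # naive greedy step: re-runs the whole sequence every iteration
            logits = self.model(output).logits
            next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            # on a GPU/XPU device a synchronize would be needed here before
            # reading the clock, since kernels launch asynchronously
            step_times.append(time.perf_counter() - t0)
            output = torch.cat([output, next_id], dim=-1)
        first, rest = step_times[0], step_times[1:]
        print(f"=========First token cost {first:.4f}s=========")
        print(f"=========Last token cost average {sum(rest) / len(rest):.4f}s "
              f"({len(rest)} tokens in all)=========")
        return output
```

With `max_new_tokens=32` this prints the 1st token cost plus an average over the remaining 31 tokens, matching the sample output format above.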