LLM: reduce GPU 1st token latency and update example (#8763)
* reduce 1st token latency
* update example
* fix
* fix style
* update readme of gpu benchmark
parent 06609d9260
commit e9aa2bd890

4 changed files with 15 additions and 6 deletions
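For context on the first bullet, "1st token latency" refers to the prefill phase: the time until `generate` emits its first new token. A minimal sketch of how that number can be isolated is shown below. It is not part of this commit; it assumes a `model` and `tokenizer` already loaded on `'xpu'` as in the examples changed further down, and that `torch.xpu.synchronize()` is available once `intel_extension_for_pytorch` is imported.

```python
# Minimal sketch (not part of this commit): isolate first-token (prefill)
# latency by timing a generate() call capped at a single new token.
# Assumes `model` and `tokenizer` are already loaded on 'xpu'.
import time

import torch
import intel_extension_for_pytorch as ipex  # noqa: F401, registers the 'xpu' device


def first_token_latency(model, tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
    with torch.inference_mode():
        torch.xpu.synchronize()
        start = time.perf_counter()
        model.generate(input_ids, do_sample=False, max_new_tokens=1)
        torch.xpu.synchronize()
    return time.perf_counter() - start
```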
				
			
@@ -54,9 +54,16 @@ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 prompt = "今天睡不着怎么办"
 
 with torch.inference_mode():
-    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
-    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
-    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+    # warmup two times as ipex is used
+    for i in range(2):
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+    # collect performance data now
+    for i in range(5):
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
 ```
 Output will be like:
 ```bash
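The updated readme snippet warms up twice (the in-diff comment attributes the need for warmup to IPEX) and then runs five measured iterations, but does not itself show any timing code. Purely as an illustration of the pattern, and not what the readme itself does, the measurement loop could be wrapped like this (again assuming `torch.xpu.synchronize()` from `intel_extension_for_pytorch`):

```python
# Hypothetical timing wrapper around the warmup-then-measure loop shown above.
# Warm-up iterations are discarded; each measured run reports end-to-end
# generation latency for 32 new tokens.
import time

import torch


def run_benchmark(model, tokenizer, prompt, warmup=2, runs=5):
    latencies = []
    with torch.inference_mode():
        for i in range(warmup + runs):
            input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
            torch.xpu.synchronize()
            start = time.perf_counter()
            output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
            torch.xpu.synchronize()
            elapsed = time.perf_counter() - start
            if i >= warmup:  # skip warm-up iterations
                latencies.append(elapsed)
    return latencies
```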
@@ -46,7 +46,7 @@ if __name__ == '__main__':
                                       load_in_4bit=True,
                                       optimize_model=False,
                                       trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
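This example now keeps the model in its default dtype and only moves it to the XPU device; the explicit `.half()` cast is dropped (the same one-line change appears in the next example). A self-contained sketch of the resulting load flow, assuming the `bigdl.llm.transformers` API these examples appear to use and with a placeholder model path:

```python
# Sketch of the updated example flow: load with 4-bit weight quantization and
# move to Intel GPU without an explicit .half() cast.
# The model path is a placeholder; any trust_remote_code model path works here.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401, registers the 'xpu' device
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "THUDM/chatglm2-6b"  # placeholder path for illustration
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=False,
                                             trust_remote_code=True)
model = model.to('xpu')  # move to Intel GPU; no explicit .half() cast (see diff above)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```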
@@ -49,7 +49,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  optimize_model=False,
                                                  trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
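The second example differs only in using `LlamaTokenizer`. Once the model is loaded the same way, a generation step on `'xpu'` would look roughly like the sketch below; the prompt and token budget are illustrative and not taken from the example file.

```python
# Illustrative generation step for the LlamaTokenizer-based example above.
# Assumes `model` and `tokenizer` were created as in the diff (model on 'xpu').
import torch

prompt = "What is AI?"  # illustrative prompt, not from the example file
with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```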
@@ -169,7 +169,6 @@ class ParamsQuant(torch.nn.Parameter):
                                     quantized=self.quantized,
                                     _shape=self._shape,
                                     qtype=self.qtype)
-
             return new_param
 
 
@@ -244,6 +243,9 @@ class LinearQuant(nn.Linear):
 
             if x_2d.is_contiguous() is False:
                 x_2d = x_2d.contiguous()
+            # current workaround to reduce first token latency of fp32 input
+            if x_2d.shape[0] > 1 and x_2d.dtype == torch.float32:
+                x_2d = x_2d.half()
             # input format of linear_q4.forward is 1: input, 2: weight
             result = linear_q4_0.forward(x_2d, x0)
             new_shape = x_shape[:-1] + (self.out_len,)
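The three added lines only fire on the prefill call, where `x_2d` typically holds all prompt tokens at once (`shape[0] > 1`) and the activations are still fp32; single-token decode steps are left untouched. A standalone sketch of the same idea, with a stand-in `quantized_matmul` in place of the real `linear_q4_0.forward` kernel:

```python
# Standalone sketch of the prefill-only fp32 -> fp16 cast added above.
# `quantized_matmul` is a stand-in argument, not the real linear_q4_0 kernel.
import torch


def forward_2d(x_2d: torch.Tensor, weight, quantized_matmul):
    if not x_2d.is_contiguous():
        x_2d = x_2d.contiguous()
    # prefill only: more than one token row and fp32 activations
    if x_2d.shape[0] > 1 and x_2d.dtype == torch.float32:
        x_2d = x_2d.half()  # cheaper input for the quantized kernel
    return quantized_matmul(x_2d, weight)
```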