[LLM] Move embedding layer to CPU for iGPU inference (#9343)
* Move the embedding layer to CPU for iGPU LLM inference * Empty the cache after moving to CPU * Remove the cache-emptying step, as it seems to have a negative effect on first-token latency
This commit is contained in:
		
							parent
							
								
									8f23fb04dc
								
							
						
					
					
						commit
						a0150bb205
					
				
					 1 changed files with 3 additions and 2 deletions
				
			
		| 
						 | 
				
			
			@ -21,5 +21,6 @@ from torch import Tensor
 | 
			
		|||
 | 
			
		||||
class LLMEmbedding(torch.nn.Embedding):
    """Embedding layer pinned to CPU.

    For iGPU LLM inference the embedding table is kept (and looked up) on the
    CPU to save device memory; only the resulting activations are moved back
    to the input's device.  Behaves like ``torch.nn.Embedding`` otherwise.
    """

    def forward(self, x: Tensor) -> Tensor:
        """Look up embeddings for ``x`` on CPU, return them on ``x.device``.

        Args:
            x: integer index tensor of any shape.

        Returns:
            Tensor of shape ``(*x.shape, embedding_dim)`` on ``x.device``.
        """
        # BUG FIX: a torch.device never compares equal to a plain string, so
        # the original `self.weight.device != 'cpu'` was always True and
        # re-issued `self.to('cpu')` on every call.  Compare the device *type*.
        if self.weight.device.type != 'cpu':
            self.to('cpu')
        # Lookup happens on CPU; move the result back to the caller's device.
        return super().forward(x.to('cpu')).to(x.device)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue