[LLM] Move embedding layer to CPU for iGPU inference (#9343)
* Move embedding layer to CPU for iGPU llm inference * Empty cache after to cpu * Remove empty cache as it seems to have some negative effect to first token
This commit is contained in:
parent
8f23fb04dc
commit
a0150bb205
1 changed file with 3 additions and 2 deletions
|
|
@ -21,5 +21,6 @@ from torch import Tensor
|
|||
|
||||
class LLMEmbedding(torch.nn.Embedding):
    """Embedding layer whose weight table is kept on CPU.

    Intended for iGPU LLM inference, where device memory is shared and
    scarce: the (large) embedding table stays in host memory, the lookup
    runs on CPU, and only the looked-up rows are moved back to the
    caller's device. Callers see the same interface and output device as
    a plain ``torch.nn.Embedding``.
    """

    def forward(self, x: Tensor) -> Tensor:
        # Lazily migrate the weight to CPU on first use.
        # NOTE: compare the device *type*, not the device object against
        # the string 'cpu' -- `torch.device('cpu') != 'cpu'` is
        # version-dependent in PyTorch (always True on older releases,
        # which would re-run `self.to('cpu')` on every forward call),
        # and an indexed device such as 'cpu:0' would also mis-compare.
        if self.weight.device.type != 'cpu':
            self.to('cpu')
        # Index on CPU, then return the result on the input's device so
        # downstream layers are unaffected by the CPU placement.
        return super().forward(x.to('cpu')).to(x.device)
|
||||
|
|
|
|||
Loading…
Reference in a new issue