[LLM] Move embedding layer to CPU for iGPU inference (#9343)
* Move embedding layer to CPU for iGPU LLM inference
* Empty cache after moving to CPU
* Remove empty cache, as it seems to have a negative effect on first-token latency
This commit is contained in:
parent
8f23fb04dc
commit
a0150bb205
1 changed files with 3 additions and 2 deletions
|
|
@ -21,5 +21,6 @@ from torch import Tensor
|
||||||
|
|
||||||
class LLMEmbedding(torch.nn.Embedding):
    """Embedding layer whose weight is kept on the CPU.

    Intended for iGPU LLM inference (per the commit message): the large
    embedding table is lazily moved to CPU on the first forward call, the
    lookup is performed on CPU, and the result is copied back to the
    caller's device.
    """

    def forward(self, x: Tensor) -> Tensor:
        """Look up embeddings for ``x`` on CPU and return them on ``x``'s device.

        Args:
            x: Tensor of token indices (any shape).

        Returns:
            Embedding tensor of shape ``(*x.shape, embedding_dim)`` on
            ``x.device``.
        """
        # Lazily migrate the weight to CPU on first use. Compare
        # ``device.type`` rather than the device object against the string
        # 'cpu' — torch.device-vs-str equality is version-dependent, while
        # ``.type`` is a plain documented string attribute.
        if self.weight.device.type != 'cpu':
            self.to('cpu')
        # Index on CPU, then move the (much smaller) result to the input's
        # device so callers see no interface change.
        return super().forward(x.to('cpu')).to(x.device)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue