[LLM] Move embedding layer to CPU for iGPU inference (#9343)

* Move embedding layer to CPU for iGPU llm inference

* Empty cache after to cpu

* Remove empty cache as it seems to have a negative effect on the first token
This commit is contained in:
Yuwen Hu 2023-11-03 11:13:45 +08:00 committed by GitHub
parent 8f23fb04dc
commit a0150bb205

View file

@@ -21,5 +21,6 @@ from torch import Tensor
class LLMEmbedding(torch.nn.Embedding):
    """Embedding layer whose lookup always runs on CPU.

    Used for iGPU LLM inference: the (large) embedding weight is kept on
    CPU, the index lookup happens there, and only the resulting activations
    are moved back to the input's device.
    """

    def forward(self, x: Tensor):
        """Look up embeddings for *x* on CPU, returning them on x's device.

        :param x: index tensor of arbitrary shape (integer dtype).
        :return: embedding tensor of shape ``(*x.shape, embedding_dim)``
            on the same device as ``x``.
        """
        # Fix: compare the device *type*, not the device object against a
        # string — `torch.device(...) != 'cpu'` is unreliable across torch
        # versions (historically always unequal), which would make this
        # guard fire on every call.
        if self.weight.device.type != 'cpu':
            # One-time lazy migration of the whole module (weight) to CPU.
            self.to('cpu')
        # Perform the lookup on CPU, then move only the output back.
        return super().forward(x.to('cpu')).to(x.device)