[LLM] Move embedding layer to CPU for iGPU inference (#9343)

* Move embedding layer to CPU for iGPU llm inference

* Empty cache after to cpu

* Remove empty cache as it seems to have a negative effect on the first token
This commit is contained in:
Yuwen Hu 2023-11-03 11:13:45 +08:00 committed by GitHub
parent 8f23fb04dc
commit a0150bb205

View file

@@ -21,5 +21,6 @@ from torch import Tensor
class LLMEmbedding(torch.nn.Embedding):
    """Embedding layer whose lookup always runs on CPU.

    Used for iGPU LLM inference: the (large) embedding weight is kept on
    CPU, the index lookup happens there, and only the resulting activations
    are moved back to the input's device.
    """

    def forward(self, x: Tensor):
        """Look up embeddings for *x* on CPU, returning them on x's device.

        :param x: index tensor of arbitrary shape (integer dtype).
        :return: embedding tensor of shape ``(*x.shape, embedding_dim)``
            on the same device as ``x``.
        """
        # Fix: compare the device *type*, not the device object against a
        # string — `torch.device(...) != 'cpu'` is unreliable across torch
        # versions (historically always unequal), which would make this
        # guard fire on every call.
        if self.weight.device.type != 'cpu':
            # One-time lazy migration of the whole module (weight) to CPU.
            self.to('cpu')
        # Perform the lookup on CPU, then move only the output back.
        return super().forward(x.to('cpu')).to(x.device)