From a0150bb205117b75e8b8c807c33e73eaaa4f8d38 Mon Sep 17 00:00:00 2001
From: Yuwen Hu <54161268+Oscilloscope98@users.noreply.github.com>
Date: Fri, 3 Nov 2023 11:13:45 +0800
Subject: [PATCH] [LLM] Move embedding layer to CPU for iGPU inference (#9343)

* Move embedding layer to CPU for iGPU llm inference

* Empty cache after to cpu

* Remove empty cache as it seems to have some negative effect to first token
---
 python/llm/src/bigdl/llm/transformers/embedding.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/llm/src/bigdl/llm/transformers/embedding.py b/python/llm/src/bigdl/llm/transformers/embedding.py
index 2764d01e..38aa4db7 100644
--- a/python/llm/src/bigdl/llm/transformers/embedding.py
+++ b/python/llm/src/bigdl/llm/transformers/embedding.py
@@ -21,5 +21,6 @@ from torch import Tensor
 
 class LLMEmbedding(torch.nn.Embedding):
     def forward(self, x: Tensor):
-        x_shape = x.shape
-        return self.weight[x.reshape(-1)].reshape(*x_shape, -1)
+        # Compare device *type* ('cpu'/'cuda'/'xpu'), not the torch.device
+        # object against a str: device-vs-str equality is not reliable across
+        # PyTorch versions, and a false mismatch would re-run self.to('cpu')
+        # on every forward call.
+        if self.weight.device.type != 'cpu':
+            self.to('cpu')
+        # Run the embedding lookup on CPU, then return the result on the
+        # caller's original device so downstream layers are unaffected.
+        return super().forward(x.to('cpu')).to(x.device)