diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py
index 9c9022e7..d38b1e43 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py
@@ -26,6 +26,7 @@ from intel_npu_acceleration_library.nn.autograd import AutogradMatMul
 from intel_npu_acceleration_library.backend import run_matmul
 from intel_npu_acceleration_library.dtypes import NPUDtype
 from typing import Optional, Union
+import os
 import torch
 from torch.nn import Parameter
 import uuid
@@ -177,6 +178,14 @@ class QuantizedLinear(torch.nn.Module):
         Returns:
             torch.Tensor: result
         """
+
+        # We assume a Linear is lm_head when its out_features > 30000;
+        # if out_features > 100000, enable the lm_head optimization automatically.
+        if x.size(1) > 500 and (
+            (self.outC > 100_000 and os.environ.get("IPEX_LLM_LAST_LM_HEAD") != "0") or
+            (self.outC > 30_000 and os.environ.get("IPEX_LLM_LAST_LM_HEAD") == "1")
+        ):
+            x = x[:, -1:, :]  # compute logits only for the last token
         if self.training:
             invalidInputError(
                 False,
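
The gating above has two tiers: heads with out_features > 100000 get the last-token slice by default (opt-out via IPEX_LLM_LAST_LM_HEAD=0), while heads with out_features > 30000 require an explicit opt-in (IPEX_LLM_LAST_LM_HEAD=1), and in both cases only for inputs longer than 500 tokens. Below is a minimal standalone sketch of that logic under the same thresholds; maybe_keep_last_token is a hypothetical helper for illustration, not part of the ipex-llm API:

import os
import torch

def maybe_keep_last_token(x: torch.Tensor, outC: int) -> torch.Tensor:
    """Slice x (batch, seq, hidden) down to its last token when the layer
    looks like an lm_head and the optimization is enabled, mirroring the
    condition in QuantizedLinear.forward above."""
    flag = os.environ.get("IPEX_LLM_LAST_LM_HEAD")
    auto_on = outC > 100_000 and flag != "0"  # on by default, opt-out
    opt_in = outC > 30_000 and flag == "1"    # off by default, opt-in
    if x.size(1) > 500 and (auto_on or opt_in):
        # Next-token generation only needs logits at the final position.
        return x[:, -1:, :]
    return x

x = torch.randn(1, 1024, 4096)
print(maybe_keep_last_token(x, outC=32_000).shape)  # torch.Size([1, 1024, 4096])
os.environ["IPEX_LLM_LAST_LM_HEAD"] = "1"
print(maybe_keep_last_token(x, outC=32_000).shape)  # torch.Size([1, 1, 4096])

This is safe for decoding because sampling the next token uses only the last position's logits, so skipping the matmul for all earlier positions changes the output shape but not the generated text.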