diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index 2b59d29f..efc5aaf2 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -145,8 +145,8 @@ python minicpm.py --disable-transpose-value-cache
 python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --disable-transpose-value-cache
 ```
 
-#### High CPU Utilization
-You can reduce CPU utilization by setting the environment variable with `set IPEX_LLM_CPU_LM_HEAD=0`.
+#### Better Performance with High CPU Utilization
+You can achieve better performance by setting the environment variable with `set IPEX_LLM_CPU_LM_HEAD=1`, which runs the lm_head computation on CPU. Note that this will cause high CPU utilization.
 
 ### Sample Output
 
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index a1b07a8c..ba40729a 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -43,7 +43,8 @@ def optimize_llm_pre(model: torch.nn.Module, qtype):
         model.apply(pre_compute_inv_freq)
 
     # lm_head to cpu optimization
-    if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "1") != "0":
+    # disabled by default; set IPEX_LLM_CPU_LM_HEAD=1 to enable
+    if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0":
         from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8
         if qtype == "sym_int4_rtn":
             lm_qtype = SYM_INT4
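
For reference, a minimal sketch of the gating semantics this diff introduces: the lm_head-to-CPU optimization now only runs when `IPEX_LLM_CPU_LM_HEAD` is explicitly set to a non-`"0"` value, mirroring the `os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0"` check in `optimize_llm_pre`. The helper name `cpu_lm_head_enabled` below is hypothetical, for illustration only.

```python
import os

# Hypothetical helper mirroring the check in optimize_llm_pre:
# the lm_head-to-CPU optimization runs only when the variable is
# set to something other than "0"; unset now means disabled.
def cpu_lm_head_enabled() -> bool:
    return os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0"

# Default after this change: optimization off.
os.environ.pop("IPEX_LLM_CPU_LM_HEAD", None)
assert cpu_lm_head_enabled() is False

# Opting in (e.g. `set IPEX_LLM_CPU_LM_HEAD=1` on Windows before
# launching): better performance, but high CPU utilization.
os.environ["IPEX_LLM_CPU_LM_HEAD"] = "1"
assert cpu_lm_head_enabled() is True
```

Flipping the default from `"1"` to `"0"` makes the high-CPU path opt-in, so users only pay the CPU cost when they explicitly request the speedup.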