Disable lm head (#11972)

This commit is contained in:
binbin Deng 2024-08-30 11:05:18 +08:00 committed by GitHub
parent 7d103417b8
commit cd077881f1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 4 additions and 3 deletions

View file

@@ -145,8 +145,8 @@ python minicpm.py --disable-transpose-value-cache
 python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --disable-transpose-value-cache
 ```
-#### High CPU Utilization
-You can reduce CPU utilization by setting the environment variable with `set IPEX_LLM_CPU_LM_HEAD=0`.
+#### Better Performance with High CPU Utilization
+You could enable optimization by setting the environment variable with `set IPEX_LLM_CPU_LM_HEAD=1` for better performance. But this will cause high CPU utilization.
 ### Sample Output

View file

@@ -43,7 +43,8 @@ def optimize_llm_pre(model: torch.nn.Module, qtype):
         model.apply(pre_compute_inv_freq)
     # lm_head to cpu optimization
-    if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "1") != "0":
+    if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0":
+        # disable the optimization by default
         from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8
         if qtype == "sym_int4_rtn":
             lm_qtype = SYM_INT4