Disable lm head (#11972)
This commit is contained in:
parent
7d103417b8
commit
cd077881f1
2 changed files with 4 additions and 3 deletions
|
|
@ -145,8 +145,8 @@ python minicpm.py --disable-transpose-value-cache
|
|||
python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --disable-transpose-value-cache
|
||||
```
|
||||
|
||||
#### High CPU Utilization
|
||||
You can reduce CPU utilization by setting the environment variable with `set IPEX_LLM_CPU_LM_HEAD=0`.
|
||||
#### Better Performance with High CPU Utilization
|
||||
You can enable this optimization for better performance by setting the environment variable with `set IPEX_LLM_CPU_LM_HEAD=1`, but note that this will cause high CPU utilization.
|
||||
|
||||
|
||||
### Sample Output
|
||||
|
|
|
|||
|
|
@ -43,7 +43,8 @@ def optimize_llm_pre(model: torch.nn.Module, qtype):
|
|||
model.apply(pre_compute_inv_freq)
|
||||
|
||||
# lm_head to cpu optimization
|
||||
if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "1") != "0":
|
||||
if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0":
|
||||
# disable the optimization by default
|
||||
from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8
|
||||
if qtype == "sym_int4_rtn":
|
||||
lm_qtype = SYM_INT4
|
||||
|
|
|
|||
Loading…
Reference in a new issue