Add recommended LNL NPU driver version and enable cpu_lm_head on Llama3 (#11952)
* update lnl npu driver version and enable cpu_lm_head on llama3
* update
* fix style
* typo
* address comments
* update
* add qwen2-7b
parent 71f03dcc39
commit 882f4a5ff7
2 changed files with 29 additions and 25 deletions
@@ -86,11 +86,18 @@ The example below shows how to run the **_optimized model implementations_** on
 - [MiniCPM-2B](./minicpm.py)
 - [Baichuan2-7B](./baichuan2.py)
+
+### Recommended NPU Driver Version for LNL Users
+#### 32.0.100.2625
+Supported models: Llama2-7B, Qwen2-1.5B, Qwen2-7B, MiniCPM-1B, Baichuan2-7B
+#### 32.0.101.2715
+Supported models: Llama3-8B, MiniCPM-2B
+
 ### Run Models
 ```bash
 # to run Llama-2-7b-chat-hf
 python llama.py
 
-# to run Meta-Llama-3-8B-Instruct
+# to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
 python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 # to run Qwen2-1.5B-Instruct
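For quick reference, the driver recommendations added above amount to a lookup from model to recommended LNL NPU driver version. A minimal illustrative sketch (the mapping is copied from the README text above; the dictionary itself is not part of ipex-llm):

```python
# Illustrative only: the LNL NPU driver recommendations from the README section
# above, expressed as a table a setup script could consult.
RECOMMENDED_LNL_NPU_DRIVER = {
    "Llama2-7B": "32.0.100.2625",
    "Qwen2-1.5B": "32.0.100.2625",
    "Qwen2-7B": "32.0.100.2625",
    "MiniCPM-1B": "32.0.100.2625",
    "Baichuan2-7B": "32.0.100.2625",
    "Llama3-8B": "32.0.101.2715",
    "MiniCPM-2B": "32.0.101.2715",
}

print(RECOMMENDED_LNL_NPU_DRIVER["Llama3-8B"])  # 32.0.101.2715
```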
@@ -124,7 +131,7 @@ If you encounter output problem, please try to disable the optimization of trans
 # to run Llama-2-7b-chat-hf
 python llama.py --disable-transpose-value-cache
 
-# to run Meta-Llama-3-8B-Instruct
+# to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
 python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --disable-transpose-value-cache
 
 # to run Qwen2-1.5B-Instruct
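The lm_head change in the Python file below is gated on the `IPEX_LLM_CPU_LM_HEAD` environment variable, which defaults to enabled. A minimal sketch of how a caller could opt out before the model is loaded; only the variable name and its default come from the diff, the surrounding script is hypothetical:

```python
import os

# Sketch: optimize_llm_pre treats any value other than "0" (including the
# default "1") as "apply the CPU lm_head replacement"; "0" skips it entirely.
os.environ["IPEX_LLM_CPU_LM_HEAD"] = "0"

enabled = os.environ.get("IPEX_LLM_CPU_LM_HEAD", "1") != "0"
print(f"CPU lm_head optimization enabled: {enabled}")  # False after the line above
```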
@@ -44,29 +44,26 @@ def optimize_llm_pre(model: torch.nn.Module, qtype):
 
     # lm_head to cpu optimization
     if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "1") != "0":
-        is_unsupported_model = (model.config.model_type == "llama"
-                                and model.vocab_size > 32000)
-        if not is_unsupported_model:
-            from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8
-            if qtype == "sym_int4_rtn":
-                lm_qtype = SYM_INT4
-            else:
-                lm_qtype = SYM_INT8
-            # lm_head opt to mp opt (llama, qwen2)
-            optimize_lm_head = model.config.model_type not in ["llama", "qwen2"]
-            new_linear = LowBitLinear(model.lm_head.in_features,
-                                      model.lm_head.out_features,
-                                      lm_qtype,
-                                      False,
-                                      optimize_lm_head=optimize_lm_head)
-            paramsLowBit = FP4Params(data=model.lm_head.weight.data,
-                                     requires_grad=False,
-                                     quantized=False,
-                                     _shape=None,
-                                     qtype=lm_qtype,
-                                     in_features=model.lm_head.in_features).to("cpu")
-            new_linear._parameters['weight'] = paramsLowBit
-            model.lm_head = new_linear
+        from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8
+        if qtype == "sym_int4_rtn":
+            lm_qtype = SYM_INT4
+        else:
+            lm_qtype = SYM_INT8
+        # lm_head opt to mp opt (llama, qwen2)
+        optimize_lm_head = model.config.model_type not in ["llama", "qwen2"]
+        new_linear = LowBitLinear(model.lm_head.in_features,
+                                  model.lm_head.out_features,
+                                  lm_qtype,
+                                  False,
+                                  optimize_lm_head=optimize_lm_head)
+        paramsLowBit = FP4Params(data=model.lm_head.weight.data,
+                                 requires_grad=False,
+                                 quantized=False,
+                                 _shape=None,
+                                 qtype=lm_qtype,
+                                 in_features=model.lm_head.in_features).to("cpu")
+        new_linear._parameters['weight'] = paramsLowBit
+        model.lm_head = new_linear
 
 
 def optimize_llm(
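Why this enables cpu_lm_head on Llama3: Meta-Llama-3-8B uses a 128256-token vocabulary, so the removed `is_unsupported_model` check (`model_type == "llama" and vocab_size > 32000`) previously excluded it, while Llama2's 32000-token vocabulary passed. A small self-contained sketch of the removed condition; the stub classes are illustrative, not ipex-llm code:

```python
# Illustrative stubs standing in for a loaded Hugging Face model.
class _Config:
    model_type = "llama"

class _Llama3Stub:
    config = _Config()
    vocab_size = 128256  # Meta-Llama-3-8B vocabulary size

class _Llama2Stub:
    config = _Config()
    vocab_size = 32000   # Llama-2-7B vocabulary size

def was_excluded(model) -> bool:
    # The guard removed by this commit: llama models with a vocabulary larger
    # than 32000 tokens were skipped, which ruled out Llama3 but not Llama2.
    return model.config.model_type == "llama" and model.vocab_size > 32000

print(was_excluded(_Llama3Stub()))  # True  -> lm_head was left untouched before
print(was_excluded(_Llama2Stub()))  # False -> Llama2 already used the CPU lm_head
```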