Add recommended LNL NPU driver version and enable cpu_lm_head on llama3 (#11952)

* update lnl npu driver version and enable cpu_lm_head on llama3

* update

* fix style

* typo

* address comments

* update

* add qwen2-7b
Yina Chen 2024-08-29 10:01:18 +03:00 committed by GitHub
parent 71f03dcc39
commit 882f4a5ff7
2 changed files with 29 additions and 25 deletions

@@ -86,11 +86,18 @@ The example below shows how to run the **_optimized model implementations_** on
 - [MiniCPM-2B](./minicpm.py)
 - [Baichuan2-7B](./baichuan2.py)
+### Recommended NPU Driver Version for LNL Users
+#### 32.0.100.2625
+Supported models: Llama2-7B, Qwen2-1.5B, Qwen2-7B, MiniCPM-1B, Baichuan2-7B
+#### 32.0.101.2715
+Supported models: Llama3-8B, MiniCPM-2B
+### Run Models
 ```bash
 # to run Llama-2-7b-chat-hf
 python llama.py
-# to run Meta-Llama-3-8B-Instruct
+# to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
 python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct
 # to run Qwen2-1.5B-Instruct
@@ -124,7 +131,7 @@ If you encounter output problem, please try to disable the optimization of trans
 # to run Llama-2-7b-chat-hf
 python llama.py --disable-transpose-value-cache
-# to run Meta-Llama-3-8B-Instruct
+# to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
 python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --disable-transpose-value-cache
 # to run Qwen2-1.5B-Instruct
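As a usage note (not part of this commit), the CPU lm_head optimization enabled here is gated by the `IPEX_LLM_CPU_LM_HEAD` environment variable checked in the code change below. A minimal sketch of opting out before loading one of the example models follows; the import path and `from_pretrained` arguments mirror the typical NPU example scripts and should be treated as assumptions:

```python
import os

# Any value other than "0" keeps the lm_head-to-CPU optimization enabled
# (see the IPEX_LLM_CPU_LM_HEAD check in optimize_llm_pre below).
os.environ["IPEX_LLM_CPU_LM_HEAD"] = "0"  # opt out explicitly

# Assumed import path and arguments, mirroring the llama.py example style.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",  # illustrative model id from the README
    load_in_low_bit="sym_int4",             # assumed low-bit setting
    trust_remote_code=True,
)
```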

@@ -44,9 +44,6 @@ def optimize_llm_pre(model: torch.nn.Module, qtype):
     # lm_head to cpu optimization
     if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "1") != "0":
-        is_unsupported_model = (model.config.model_type == "llama"
-                                and model.vocab_size > 32000)
-        if not is_unsupported_model:
         from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8
         if qtype == "sym_int4_rtn":
             lm_qtype = SYM_INT4
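For context, a simplified sketch of the gate this hunk changes: before the commit, llama-family models with a vocabulary larger than 32000 entries (i.e. Llama3-8B) were excluded from the CPU lm_head optimization; afterwards, only the environment variable and the quantization type are checked. Anything beyond the names visible in the diff (the `sym_int8_rtn` branch, the boolean return shape) is an assumption:

```python
import os

def lm_head_runs_on_cpu(model_type: str, vocab_size: int, qtype: str) -> bool:
    """Simplified sketch of the check around the removed guard."""
    if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "1") == "0":
        return False  # user explicitly disabled the optimization

    # Removed by this commit: llama models with vocab_size > 32000
    # (Llama3-8B has 128256) used to be skipped here.
    # if model_type == "llama" and vocab_size > 32000:
    #     return False

    # The diff shows sym_int4_rtn mapping to SYM_INT4; the sym_int8_rtn
    # branch is assumed by symmetry.
    return qtype in ("sym_int4_rtn", "sym_int8_rtn")

# After this commit, Llama3-8B (vocab size 128256) also takes the CPU lm_head path.
print(lm_head_runs_on_cpu("llama", 128256, "sym_int4_rtn"))  # True
```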