[NPU] Add minicpm-2b support for npu multi-processing (#11949)
* add minicpm-2b support
* update example for minicpm-2b
* add LNL NPU driver requirement in readme
This commit is contained in:
parent
0fbb10259a
commit
5ca7390082
2 changed files with 16 additions and 2 deletions
|
|
@@ -82,6 +82,7 @@ The example below shows how to run the **_optimized model implementations_** on
|
||||||
- [Llama3-8B](./llama.py)
|
- [Llama3-8B](./llama.py)
|
||||||
- [Qwen2-1.5B](./qwen2.py)
|
- [Qwen2-1.5B](./qwen2.py)
|
||||||
- [MiniCPM-1B](./minicpm.py)
|
- [MiniCPM-1B](./minicpm.py)
|
||||||
|
- [MiniCPM-2B](./minicpm.py)
|
||||||
- [Baichuan2-7B](./baichuan2.py)
|
- [Baichuan2-7B](./baichuan2.py)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
@@ -97,6 +98,9 @@ python qwen2.py
|
||||||
# to run MiniCPM-1B-sft-bf16
|
# to run MiniCPM-1B-sft-bf16
|
||||||
python minicpm.py
|
python minicpm.py
|
||||||
|
|
||||||
|
# to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
|
||||||
|
python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16
|
||||||
|
|
||||||
# to run Baichuan2-7B-Chat
|
# to run Baichuan2-7B-Chat
|
||||||
python baichuan2.py
|
python baichuan2.py
|
||||||
```
|
```
|
||||||
|
|
@@ -124,6 +128,9 @@ python qwen2.py --disable-transpose-value-cache
|
||||||
|
|
||||||
# to run MiniCPM-1B-sft-bf16
|
# to run MiniCPM-1B-sft-bf16
|
||||||
python minicpm.py --disable-transpose-value-cache
|
python minicpm.py --disable-transpose-value-cache
|
||||||
|
|
||||||
|
# to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
|
||||||
|
python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --disable-transpose-value-cache
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -151,18 +151,25 @@ def optimize_llm(
|
||||||
modeling_module_name = model.__class__.__module__
|
modeling_module_name = model.__class__.__module__
|
||||||
module = importlib.import_module(modeling_module_name)
|
module = importlib.import_module(modeling_module_name)
|
||||||
|
|
||||||
|
if model.config.num_hidden_layers == 52:
|
||||||
|
# for minicpm-1b
|
||||||
|
transpose_cache = transpose_value_cache
|
||||||
|
elif model.config.num_hidden_layers == 40:
|
||||||
|
# for minicpm-2b
|
||||||
|
transpose_cache = False
|
||||||
|
|
||||||
decode_runner = DecodeRunner(
|
decode_runner = DecodeRunner(
|
||||||
model,
|
model,
|
||||||
max_seq_len=max_output_len,
|
max_seq_len=max_output_len,
|
||||||
inter_pp=inter_pp,
|
inter_pp=inter_pp,
|
||||||
intra_pp=intra_pp,
|
intra_pp=intra_pp,
|
||||||
transpose_value_cache=transpose_value_cache,
|
transpose_value_cache=transpose_cache,
|
||||||
)
|
)
|
||||||
prefill_runner = PrefillRunner(
|
prefill_runner = PrefillRunner(
|
||||||
model,
|
model,
|
||||||
max_output_len=max_output_len,
|
max_output_len=max_output_len,
|
||||||
max_prompt_len=max_prompt_len,
|
max_prompt_len=max_prompt_len,
|
||||||
transpose_value_cache=transpose_value_cache,
|
transpose_value_cache=transpose_cache,
|
||||||
)
|
)
|
||||||
minicpm_model_forward = gen_minicpm_fused_model_forward(
|
minicpm_model_forward = gen_minicpm_fused_model_forward(
|
||||||
prefill_runner=prefill_runner, decode_runner=decode_runner
|
prefill_runner=prefill_runner, decode_runner=decode_runner
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue