[NPU pipeline] Support save & load and update examples (#12293)

* support save & load, update llama examples

* update baichuan2 example

* update readme
Ruonan Wang 2024-10-30 10:02:00 +08:00 committed by GitHub
parent 5a15098835
commit 2b2cb9c693
8 changed files with 147 additions and 56 deletions


@@ -51,9 +51,12 @@ python baichuan2.py
Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load the lowbit version of the model (see the sketch after this list). If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. The default is `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be inferred. The default is `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. The default is `32`.
- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. The default is `1024`.
- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. The default is `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. The default is `512`.
- `--disable-transpose-value-cache`: Disable the optimization of transposing the value cache.
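The save/load behaviour of `--lowbit-path` boils down to a small branch, sketched below. This is a simplified illustration of what the updated example script does; the import path, folder name, and the reduced argument list are assumptions taken from the examples rather than part of this README.

```python
import os
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import path

lowbit_path = "./npu-lowbit-model"  # hypothetical value of --lowbit-path

if not lowbit_path or not os.path.exists(lowbit_path):
    # Empty or non-existing path: load and convert the original pretrained model.
    # Other arguments (max_context_len, torch_dtype, ...) are omitted; see the full example.
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                                 optimize_model=True,
                                                 pipeline=True)
else:
    # Existing path: load the previously converted lowbit model directly.
    model = AutoModelForCausalLM.load_low_bit(lowbit_path,
                                              pipeline=True)

if lowbit_path and not os.path.exists(lowbit_path):
    # Non-existing path: save the converted lowbit model for future runs.
    model.save_low_bit(lowbit_path)
```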
### Sample Output
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)


@@ -15,6 +15,7 @@
#
import os
import torch
import time
import argparse
@@ -48,28 +49,49 @@ if __name__ == "__main__":
help="The huggingface repo id for the Baichuan2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
args = parser.parse_args()
model_path = args.repo_id_or_model_path
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache,
trust_remote_code=True)
if not args.lowbit_path or not os.path.exists(args.lowbit_path):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache,
trust_remote_code=True)
else:
model = AutoModelForCausalLM.load_low_bit(
args.lowbit_path,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
pipeline=True,
transpose_value_cache=not args.disable_transpose_value_cache,
trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if args.lowbit_path and not os.path.exists(args.lowbit_path):
model.save_low_bit(args.lowbit_path)
DEFAULT_SYSTEM_PROMPT = """\
"""


@@ -15,6 +15,7 @@
#
import os
import torch
import time
import argparse
@@ -48,29 +49,49 @@ if __name__ == "__main__":
help="The huggingface repo id for the Llama2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
args = parser.parse_args()
model_path = args.repo_id_or_model_path
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
if not args.lowbit_path or not os.path.exists(args.lowbit_path):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
else:
model = AutoModelForCausalLM.load_low_bit(
args.lowbit_path,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
pipeline=True,
transpose_value_cache=not args.disable_transpose_value_cache,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if args.lowbit_path and not os.path.exists(args.lowbit_path):
model.save_low_bit(args.lowbit_path)
DEFAULT_SYSTEM_PROMPT = """\
"""


@@ -15,6 +15,7 @@
#
import os
import torch
import time
import argparse
@@ -54,29 +55,49 @@ if __name__ == "__main__":
help="The huggingface repo id for the Llama3 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
args = parser.parse_args()
model_path = args.repo_id_or_model_path
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
if not args.lowbit_path or not os.path.exists(args.lowbit_path):
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
else:
model = AutoModelForCausalLM.load_low_bit(
args.lowbit_path,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
pipeline=True,
transpose_value_cache=not args.disable_transpose_value_cache,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if args.lowbit_path and not os.path.exists(args.lowbit_path):
model.save_low_bit(args.lowbit_path)
print("-" * 80)
print("done")
with torch.inference_mode():


@@ -127,7 +127,7 @@ Arguments info:
- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. The default is `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be inferred (with the integrated prompt format for chat). The default is `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. The default is `32`.
- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. The default is `1024`.
- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. The default is `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain (see the sketch after this list). The default is `512`.
- `--disable-transpose-value-cache`: Disable the optimization of transposing the value cache.
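For a quick sanity check of how `--max-prompt-len` relates to `--max-context-len`, the defaults above satisfy the constraint that the loading code enforces (see the `invalidInputError` check further down). A minimal sketch using the default values:

```python
max_context_len = 1024  # maximum sequence length for input + output tokens
max_prompt_len = 512    # maximum number of input (prompt) tokens

# The loader requires the prompt budget to stay strictly below the context budget.
assert max_prompt_len < max_context_len, (
    f"max_prompt_len ({max_prompt_len}) should be less than "
    f"max_context_len ({max_context_len})"
)

# Rough budget left for generated tokens when the prompt uses its full allowance.
print(max_context_len - max_prompt_len)  # 512
```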


@@ -166,6 +166,8 @@ class _BaseAutoModelClass:
logger.info(f"Converting model, it may takes up to several minutes ...")
model.config.update({"optimize_model": optimize_model})
if mock_device == "cpu":
with torch.no_grad():
# Only mock quantization_group_size=0 for now
@@ -262,7 +264,6 @@ class _BaseAutoModelClass:
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
model.save_low_bit = types.MethodType(save_low_bit, model)
else:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
@@ -271,7 +272,7 @@ class _BaseAutoModelClass:
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size)
model.save_low_bit = types.MethodType(save_low_bit, model)
return model
@classmethod
@@ -304,8 +305,10 @@ class _BaseAutoModelClass:
ignore_argument(kwargs, "pipeline_parallel_stages")
ignore_argument(kwargs, "mixed_precision")
ignore_argument(kwargs, "quantization_group_size")
optimize_model = kwargs.pop("optimize_model", False)
max_output_len = kwargs.pop("max_output_len", 1024)
ignore_argument(kwargs, "optimize_model")
pipeline = kwargs.pop("pipeline", False)
max_context_len = kwargs.pop("max_context_len", 1024)
max_context_len = max_context_len - 1
max_prompt_len = kwargs.pop("max_prompt_len", 512)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)
@@ -355,6 +358,7 @@ class _BaseAutoModelClass:
bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True)
mixed_precision = config_dict.pop("mixed_precision", False)
quantization_group_size = config_dict.pop("group_size", 0)
optimize_model = config_dict.pop("optimize_model", False)
invalidInputError(
qtype,
@@ -450,13 +454,12 @@ class _BaseAutoModelClass:
quant_device = "meta" if bigdl_lcmu_enabled else "cpu"
logger.info(f"Converting model, it may takes up to several minutes ...")
from intel_npu_acceleration_library.compiler import create_npu_kernels
if optimize_model:
invalidInputError(
max_prompt_len < max_output_len,
max_prompt_len < max_context_len,
(
f"max_prompt_len ({max_prompt_len}) should be less"
" than max_output_len ({max_output_len})"
" than max_context_len ({max_context_len})"
),
)
from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre
@@ -468,7 +471,8 @@ class _BaseAutoModelClass:
with torch.no_grad():
optimize_llm_pre(model, qtype, mixed_precision,
quantization_group_size=quantization_group_size)
quantization_group_size=quantization_group_size,
load=bigdl_lcmu_enabled)
cls.load_convert(qtype, model, quant_device, modules_to_not_convert,
quantization_group_size, *model_args, **kwargs)
create_npu_kernels(llm)
@@ -541,17 +545,25 @@ class _BaseAutoModelClass:
for param in model.parameters():
param.requires_grad_(False)
if optimize_model:
if optimize_model and not pipeline:
from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
optimize_llm(
llm,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
elif optimize_model and pipeline:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
convert_llm(llm,
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size)
return model
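Taken together with the example changes above, a pipeline-optimized NPU model can now be converted once and reloaded later through `load_low_bit`. A minimal round-trip sketch, assuming the import path used by the examples and an illustrative model id and save folder:

```python
import torch
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import path

save_dir = "./llama2-npu-lowbit"  # hypothetical folder

# First run: convert the HF checkpoint for the NPU pipeline and save the lowbit copy.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             optimize_model=True,
                                             pipeline=True,
                                             max_context_len=1024,
                                             max_prompt_len=512,
                                             torch_dtype=torch.float16,
                                             attn_implementation="eager")
model.save_low_bit(save_dir)

# Later runs: load the converted copy directly; the saved config carries
# optimize_model, and pipeline=True routes the load through convert_llm.
model = AutoModelForCausalLM.load_low_bit(save_dir,
                                          pipeline=True,
                                          max_context_len=1024,
                                          max_prompt_len=512,
                                          torch_dtype=torch.float16,
                                          attn_implementation="eager")
```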


@@ -43,7 +43,7 @@ def reshape_lm_head_input(x):
return x
def split_linear(module, module_name, n_splits=2):
def split_linear(module, module_name, n_splits=2, load=False):
in_features = module.in_features
invalidInputError(in_features % n_splits == 0,
f"in_features of the linear layer {module_name} must be divisible by"
@@ -51,17 +51,27 @@ def split_linear(module, module_name, n_splits=2):
weight_split = torch.tensor_split(module.weight, n_splits, dim=1)
linear_list = torch.nn.ModuleList()
bias = module.bias
for idx, weight in enumerate(weight_split):
new_linear = torch.nn.Linear(weight.size(1),
weight.size(0),
bias=False if bias is None else True)
new_linear.bias = bias
new_linear.weight = torch.nn.Parameter(weight.contiguous(), requires_grad=False)
linear_list.add_module(f"{module_name}_dq_{idx}", new_linear)
from transformers.utils.generic import ContextManagers
init_contexts = []
if load:
from transformers.modeling_utils import no_init_weights
from accelerate.big_modeling import init_empty_weights
init_contexts.append(no_init_weights(_enable=load))
init_contexts.append(init_empty_weights())
with ContextManagers(init_contexts):
for idx, weight in enumerate(weight_split):
new_linear = torch.nn.Linear(weight.size(1),
weight.size(0),
bias=False if bias is None else True)
new_linear.bias = bias
new_linear.weight = torch.nn.Parameter(weight.contiguous(), requires_grad=False)
linear_list.add_module(f"{module_name}_dq_{idx}", new_linear)
return linear_list
def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down_proj=2):
def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down_proj=2,
load=False):
from transformers.models.qwen2.modeling_qwen2 import Qwen2MLP, Qwen2Attention
from transformers.models.llama.modeling_llama import LlamaMLP, LlamaAttention
attn_module_names = ["q_proj", "k_proj", "v_proj", "o_proj"]
@@ -69,7 +79,8 @@ def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down
if isinstance(module, (Qwen2Attention, LlamaAttention)):
for name in attn_module_names:
setattr(module, f"{name}_dq_list", split_linear(getattr(module, name), name,
n_splits=n_splits_hidden_size))
n_splits=n_splits_hidden_size,
load=load))
delattr(module, name)
elif isinstance(module, (Qwen2MLP, LlamaMLP)):
for name in mlp_module_names:
@@ -77,5 +88,6 @@ def split_linears(module: torch.nn.Module, n_splits_hidden_size=2, n_splits_down
if name == 'down_proj':
n_splits_mlp = n_splits_down_proj
setattr(module, f"{name}_dq_list", split_linear(getattr(module, name), name,
n_splits=n_splits_mlp))
n_splits=n_splits_mlp,
load=load))
delattr(module, name)
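As a standalone check of what `split_linear` does to the weights, the sketch below splits a small `torch.nn.Linear` along `in_features` (dim=1) and verifies that summing the pieces' outputs over matching input slices reproduces the original projection. Names and sizes are illustrative; the real code additionally wraps the loop in `no_init_weights`/`init_empty_weights` when `load=True` so that reloaded low-bit weights are not re-initialized.

```python
import torch

full = torch.nn.Linear(8, 4, bias=False)  # toy projection: in_features=8, out_features=4
n_splits = 2
weight_split = torch.tensor_split(full.weight, n_splits, dim=1)  # two (4, 4) weight chunks

splits = torch.nn.ModuleList()
for idx, w in enumerate(weight_split):
    lin = torch.nn.Linear(w.size(1), w.size(0), bias=False)
    lin.weight = torch.nn.Parameter(w.contiguous(), requires_grad=False)
    splits.add_module(f"proj_dq_{idx}", lin)

# Summing the split outputs over the matching input slices equals the original output.
x = torch.randn(1, 8)
x_chunks = torch.tensor_split(x, n_splits, dim=1)
recombined = sum(lin(xc) for lin, xc in zip(splits, x_chunks))
assert torch.allclose(full(x), recombined, atol=1e-6)
```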


@@ -31,7 +31,7 @@ def convert_forward(m, target_m, new_forward):
def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
quantization_group_size=0):
quantization_group_size=0, load=False):
if model.config.model_type == "baichuan":
# process NormHead module in Baichuan2 7B
if hasattr(model, 'lm_head') and model.lm_head is not None:
@@ -104,9 +104,9 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
)
n_splits_linear = model.config.hidden_size // quantization_group_size
n_splits_down_proj = model.config.intermediate_size // quantization_group_size
model.apply(lambda m: split_linears(m, n_splits_hidden_size=n_splits_linear,
n_splits_down_proj=n_splits_down_proj))
n_splits_down_proj=n_splits_down_proj,
load=load))
if quantization_group_size != 0:
split_num = model.config.hidden_size // quantization_group_size
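To make the split-count arithmetic concrete, a hedged example with illustrative Llama-2-7B-style dimensions and an assumed non-zero `quantization_group_size` of 64; the values are for illustration only.

```python
hidden_size = 4096            # illustrative Llama-2-7B hidden size
intermediate_size = 11008     # illustrative Llama-2-7B intermediate size
quantization_group_size = 64  # assumed non-zero group size

n_splits_linear = hidden_size // quantization_group_size           # 64
n_splits_down_proj = intermediate_size // quantization_group_size  # 172

# split_linear() later checks that each layer's in_features is divisible
# by its split count (4096 % 64 == 0 and 11008 % 172 == 0 here).
print(n_splits_linear, n_splits_down_proj)
```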