[NPU] Support split lm_head for Qwen2 with CPP (#12491)
* Use split for Qwen2 lm_head instead of slice in optimize_pre
* Support split lm_head in Qwen2 Python CPP backend
* Fit with Python acc lib pipeline
* Removed default mixed_precision=True in all-in-one and related examples
* Small fix
* Style fix
* Fix based on comments
* Fix based on comments
* Style fix
parent 5629fdd518
commit ef4028ac2d
9 changed files with 30 additions and 23 deletions
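Before the diff itself, a minimal sketch of the underlying idea (my illustration, not code from this commit): a large lm_head projection can be split along its input dimension into several narrower matmuls whose partial products are summed, which is numerically equivalent to the single full matmul. The even split_size arithmetic mirrors what the diff below uses; how the remainder columns are handled is my assumption.

# Illustration only (not from this commit): splitting an LM head along the
# input dimension and summing the partial matmuls reproduces the full matmul.
import torch

def split_lm_head_matmul(x, weight, split_num):
    # x: [batch, inC], weight: [outC, inC]
    outC, inC = weight.shape
    split_size = inC // split_num // 2 * 2        # even-sized slices, as in the diff below
    out = torch.zeros(x.shape[0], outC, dtype=x.dtype)
    start = 0
    for i in range(split_num):
        # assumption: the last slice absorbs any remaining columns
        end = inC if i == split_num - 1 else start + split_size
        out += x[:, start:end] @ weight[:, start:end].T
        start = end
    return out   # equals x @ weight.T up to floating-point rounding

On the NPU backend each slice can then be quantized and handled by the split linear path (the dq_split_linear call visible further down), which is where the new group_size plumbing in this commit comes in.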
@@ -641,7 +641,7 @@ def transformers_int4_npu_win(repo_id,
     model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
                                                  optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
                                                  quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                                 mixed_precision=True, save_directory=save_directory, use_cache=True, attn_implementation="eager").eval()
+                                                 save_directory=save_directory, use_cache=True, attn_implementation="eager").eval()
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     load_time = end - st
@@ -701,7 +701,6 @@ def transformers_int4_npu_pipeline_win(repo_id,
     model_path = get_model_path(repo_id, local_model_hub)
     in_out_len = in_out_pairs[0].split("-")
     max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
-    mixed_precision = True if npu_group_size == 0 else False
     save_directory = "./save_converted_model_dir"
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
@@ -710,7 +709,7 @@ def transformers_int4_npu_pipeline_win(repo_id,
     model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, pipeline=True, torch_dtype=torch.float16,
                                                  optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
                                                  quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                                 use_cache=True, attn_implementation="eager", mixed_precision=mixed_precision,
+                                                 use_cache=True, attn_implementation="eager",
                                                  save_directory=save_directory).eval()
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

@ -68,7 +68,6 @@ if __name__ == "__main__":
|
||||||
torch_dtype=torch.float16,
|
torch_dtype=torch.float16,
|
||||||
attn_implementation="eager",
|
attn_implementation="eager",
|
||||||
transpose_value_cache=not args.disable_transpose_value_cache,
|
transpose_value_cache=not args.disable_transpose_value_cache,
|
||||||
mixed_precision=True,
|
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
convert_model=True,
|
convert_model=True,
|
||||||
save_directory=save_dir)
|
save_directory=save_dir)
|
||||||
|
|
|
||||||
|
|
@ -67,7 +67,6 @@ if __name__ == "__main__":
|
||||||
torch_dtype=torch.float16,
|
torch_dtype=torch.float16,
|
||||||
attn_implementation="eager",
|
attn_implementation="eager",
|
||||||
transpose_value_cache=not args.disable_transpose_value_cache,
|
transpose_value_cache=not args.disable_transpose_value_cache,
|
||||||
mixed_precision=True,
|
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
save_directory=args.save_directory)
|
save_directory=args.save_directory)
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||||
|
|
|
||||||
|
|
@ -67,7 +67,6 @@ if __name__ == "__main__":
|
||||||
max_context_len=args.max_context_len,
|
max_context_len=args.max_context_len,
|
||||||
max_prompt_len=args.max_prompt_len,
|
max_prompt_len=args.max_prompt_len,
|
||||||
transpose_value_cache=not args.disable_transpose_value_cache,
|
transpose_value_cache=not args.disable_transpose_value_cache,
|
||||||
mixed_precision=True,
|
|
||||||
quantization_group_size=args.quantization_group_size,
|
quantization_group_size=args.quantization_group_size,
|
||||||
save_directory=args.save_directory
|
save_directory=args.save_directory
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@@ -153,16 +153,19 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
         if model.config.model_type == "minicpm" and model.config.num_hidden_layers == 40:
             # workaround for MiniCPM-2B
             new_lm_head_0 = SlicedLMHead(model.lm_head_0.weight, split_num=split_num,
-                                         bias=model.lm_head_0.bias, use_split=True)
+                                         bias=model.lm_head_0.bias, use_split=True,
+                                         group_size=quantization_group_size)
             del model.lm_head_0
             model.lm_head_0 = new_lm_head_0
             new_lm_head_1 = SlicedLMHead(model.lm_head_1.weight, split_num=split_num,
-                                         bias=model.lm_head_1.bias, use_split=True)
+                                         bias=model.lm_head_1.bias, use_split=True,
+                                         group_size=quantization_group_size)
             del model.lm_head_1
             model.lm_head_1 = new_lm_head_1
         else:
             new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num,
-                                       bias=model.lm_head.bias, use_split=True)
+                                       bias=model.lm_head.bias, use_split=True,
+                                       group_size=quantization_group_size)
             del model.lm_head
             model.lm_head = new_lm_head
@@ -176,7 +179,8 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
         is_split = (not mixed_precision) and qtype == "sym_int4_rtn"
         split_num = 14 if is_split else 1
         new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num,
-                                   bias=model.lm_head.bias, use_split=False)
+                                   bias=model.lm_head.bias, use_split=True,
+                                   group_size=quantization_group_size)
         del model.lm_head
         model.lm_head = new_lm_head

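Read together, the two hunks above mean this branch (presumably the Qwen2 path, given the commit title) now always constructs the sliced head with use_split=True and forwards the quantization group size instead of falling back to the slice-only path. A minimal sketch of the resulting call shape, paraphrased from the diff rather than copied from the repo:

# Sketch of the lm_head construction after this change (paraphrase of the diff).
from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead

def build_split_lm_head(model, mixed_precision, qtype, quantization_group_size):
    # split only for pure sym_int4_rtn; otherwise keep a single slice
    is_split = (not mixed_precision) and qtype == "sym_int4_rtn"
    split_num = 14 if is_split else 1
    return SlicedLMHead(model.lm_head.weight, split_num=split_num,
                        bias=model.lm_head.bias, use_split=True,
                        group_size=quantization_group_size)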
@@ -35,6 +35,7 @@ class LMHeadLinear(NNFactory):
         device: str = "NPU",
         dtype: np.dtype = np.int8,
         use_split: bool = False,
+        group_size: int = 0,
     ):
         """Initialize the LMHeadLinear class.

@@ -57,7 +58,7 @@ class LMHeadLinear(NNFactory):
         if use_split:
             input = self.parameter((1, self.batch, self.inC))
             res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
-                                       scale_factor=False)
+                                       scale_factor=(group_size == 0))
         else:
             input = self.parameter((self.batch, self.inC))
             split_size = self.inC // split_num // 2 * 2
@@ -108,12 +109,13 @@ class LMHeadLinear(NNFactory):


 class SlicedLMHead(nn.Module):
-    def __init__(self, weight, bias, split_num, use_split=False):
+    def __init__(self, weight, bias, split_num, use_split=False, group_size=0):
         super().__init__()
         self.split_num = split_num
         self.outC, self.inC = weight.shape
         split_size = weight.size(1) // split_num // 2 * 2
         self.lm_heads = nn.Sequential()
+        self.group_size = group_size
         for i in range(split_num):
             new_linear = torch.nn.Linear(0, 0, bias=False)
             start_idx = i * split_size
@@ -159,7 +161,8 @@ class SlicedLMHead(nn.Module):
     def get_fused_lm_head(self):
         np_dtype = np.uint8 if self.get_weight_dtype() == torch.uint8 else np.int8
         self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num,
-                                          False, "NPU", dtype=np_dtype, use_split=self.use_split)
+                                          False, "NPU", dtype=np_dtype, use_split=self.use_split,
+                                          group_size=self.group_size)
         if self.use_split:
             weights = []
             scales = []
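When the head is split, get_fused_lm_head hands per-slice quantized weights and scales to LMHeadLinear. A rough sketch of the stacking step that the surrounding context (weights/scales lists here, torch.stack in the qwen converter further down) suggests; the shapes in the comments are illustrative assumptions, not taken from the repo:

# Rough sketch (shapes are illustrative assumptions): stack per-slice weights
# and scales along a new leading split axis before building the fused NPU head.
import torch

def stack_sliced_head(lm_heads):
    weights = torch.stack([l.weight for l in lm_heads], dim=0)  # e.g. [split_num, outC, split_size]
    scales = torch.stack([l.scale for l in lm_heads], dim=0)    # e.g. [split_num, outC]
    return weights, scales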
@@ -85,6 +85,7 @@ class LowBitLLMLMHead(LLMBaseNNFactory):
         profile: bool = False,
         device: str = "NPU",
         n_splits: int = 1,
+        group_size: int = 0,
     ):
         super().__init__(max_seq_len=max_seq_len,
                          transpose_value=transpose_value,
@@ -117,7 +118,7 @@ class LowBitLLMLMHead(LLMBaseNNFactory):
         hidden_states = self.linear(
             hidden_states, self.vocab_size, self.hidden_size, bias=False, wt_dtype=self.dtype,
             n_splits=n_splits,
-            scale_factor=(n_splits == 1),
+            scale_factor=(group_size == 0),
         )

         # define outputs
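The scale_factor switch above now keys off group_size rather than n_splits: group_size == 0 appears to denote channel-wise quantization, where a single per-output-channel scale can be applied as an explicit scale factor, while a non-zero group_size means grouped quantization with one scale per group of input channels. A minimal numeric sketch of the difference, using generic int-weight dequantization math for intuition only (not the NPU kernel):

# Generic dequantization math, for intuition only (not the NPU implementation).
import torch

def dequant_per_channel(q, scale):
    # q: [outC, inC] integer codes, scale: [outC] -> one scale per output channel
    return q.float() * scale[:, None]

def dequant_grouped(q, scale, group_size):
    # q: [outC, inC], scale: [outC, inC // group_size] -> one scale per group;
    # assumes inC is divisible by group_size
    return q.float() * scale.repeat_interleave(group_size, dim=1)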
@@ -355,9 +355,10 @@ def convert_llm(model: torch.nn.Module,
             os.mkdir(weight_dir)
         layer_num = len(model.model.layers)
         from .qwen import convert_qwen_layer, convert_lm_head_and_embedding
-        first_blob_path, last_blob_path = convert_lm_head_and_embedding(model, n_splits_linear,
-                                                                        temp_dir, weight_dir,
-                                                                        convert_model)
+        first_blob_path, last_blob_path = convert_lm_head_and_embedding(model, temp_dir,
+                                                                        weight_dir,
+                                                                        convert_model,
+                                                                        group_size=group_size)

         param_list = []
         for layer_idx in range(0, layer_num):
@@ -470,9 +471,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                                 save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                                 group_size, layernorm_const, "prefill")
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, n_splits_linear,
-                                      save_directory, weight_dir,
-                                      convert_model=True)
+        convert_lm_head_and_embedding(model, save_directory, weight_dir,
+                                      convert_model=True, group_size=group_size)
     elif model.config.model_type == "llama":
         embedding_post = False
         cos_sin_input = False
@@ -22,14 +22,15 @@ from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead


-def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False):
+def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
+                                  convert_model=False, group_size=0):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
     vocab_size = model.config.vocab_size
     model_norm = model.model.norm
     lm_head = model.lm_head
+    lm_head_n_splits = 1
     if not isinstance(lm_head, SlicedLMHead):
         weights = [(lm_head.weight, lm_head.scale)]
     else:
@@ -41,6 +42,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
             scales.append(l.scale)
         weights = [(torch.stack(lm_head_weights, axis=0),
                     torch.stack(scales, axis=0))]
+        lm_head_n_splits = lm_head.split_num
     if isinstance(weights[0], tuple):
         np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8
     else:  # FP16 Linear
@@ -56,7 +58,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         dtype=np_dtype,
         model_norm_weight=model_norm.weight.to(torch.float16),
         vocab_size=vocab_size,
-        n_splits=n_splits_linear
+        n_splits=lm_head_n_splits,
+        group_size=group_size,
     )

     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",