Add initial support for modeling_xlm encoder on NPU (#12393)

* Add initial support for modeling_xlm encoder on NPU

* Add EmbeddingModel class to keep the same usage as bce, and add NPU fp16 linear convert

* Optimize current implementation to support the EmbeddingModel.encode API and convert other torch modules to NPU

* Add related example and documentation
SONG Ge 2024-11-14 10:50:27 +08:00 committed by GitHub
parent 6726b198fd
commit d2cbcb060c
7 changed files with 982 additions and 1 deletion

View file

@ -8,6 +8,7 @@ In this directory, you will find examples on how you could apply IPEX-LLM INT4 o
| Phi-3-Vision | [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) |
| MiniCPM-Llama3-V-2_5 | [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) |
| MiniCPM-V-2_6 | [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) |
| Bce-Embedding-Base-V1 | [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) |
| Speech_Paraformer-Large | [iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) |
## Requirements
@ -32,6 +33,9 @@ pip install torchvision
# [optional] for MiniCPM-V-2_6
pip install timm torch==2.1.2 torchvision==0.16.2
# [optional] for Bce-Embedding-Base-V1
pip install BCEmbedding==0.1.5 transformers==4.40.0
# [optional] for Speech_Paraformer-Large
pip install -U funasr
pip install modelscope torch==2.1.2 torchaudio==2.1.2
@ -148,3 +152,23 @@ rtf_avg: 0.090: 100%|███████████████████
rtf_avg: 0.232: 100%|███████████████████████████████████| 1/1 [00:01<00:00, 1.29s/it]
[{'key': 'asr_example_zh', 'text': '欢 迎 大 家 来 体 验 达 摩 院 推 出 的 语 音 识 别 模 型'}]
```
### 4.3 Run Bce-Embedding-Base-V1
```bash
# to run Bce-Embedding-Base-V1
python bce-embedding.py
```
Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (i.e. `maidalun1020/bce-embedding-base_v1`) to be downloaded, or the path to the huggingface checkpoint folder.
#### Sample Output
##### [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1)
```log
Inference time: xxx s
[[-0.00674987 -0.01700369 -0.0028928 ... -0.05296675 -0.00352772
0.00827096]
[-0.04398304 0.00023038 0.00643183 ... -0.02717186 0.00483789
0.02298774]]
```
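The two rows above are the embeddings of the two input sentences. As a follow-up, a minimal sketch (with random stand-in vectors; in practice keep the array returned by `model.encode`) of turning them into a similarity score, assuming the default unit normalization in `encode()`:
```python
import numpy as np

# stand-ins for the two embedding rows printed above (bce-embedding-base_v1 outputs 768 dims)
embeddings = np.random.rand(2, 768).astype(np.float32)
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

# encode() unit-normalizes by default, so a dot product is the cosine similarity
print(float(embeddings[0] @ embeddings[1]))
```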

View file

@ -0,0 +1,77 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import time
import torch
import argparse
from ipex_llm.transformers.npu_model import EmbeddingModel
from transformers.utils import logging
logger = logging.get_logger(__name__)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Predict Tokens using `generate()` API for npu model"
)
parser.add_argument(
"--repo-id-or-model-path",
type=str,
default="maidalun1020/bce-embedding-base_v1",
help="The huggingface repo id for the bce-embedding model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="'sentence_0', 'sentence_1'",
help='Prompt to infer')
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=2)
parser.add_argument("--inter-pp", type=int, default=2)
args = parser.parse_args()
model_path = args.repo_id_or_model_path
model = EmbeddingModel(
model_path,
torch_dtype=torch.float16,
trust_remote_code=True,
attn_implementation="eager",
optimize_model=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
transpose_value_cache=not args.disable_transpose_value_cache,
)
# list of sentences
split_items = args.prompt.split(',')
sentences = [item.strip().strip("'") for item in split_items]
# extract embeddings
st = time.time()
embeddings = model.encode(sentences)
end = time.time()
print(f'Inference time: {end-st} s')
print(embeddings)

View file

@ -25,4 +25,5 @@ This folder contains examples of running IPEX-LLM on Intel NPU:
| Phi-3-Vision | [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) |
| MiniCPM-Llama3-V-2_5 | [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) |
| MiniCPM-V-2_6 | [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) |
| Speech_Paraformer-Large | [iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) |
| Bce-Embedding-Base-V1 | [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) |
| Speech_Paraformer-Large | [iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) |

View file

@ -659,3 +659,107 @@ class FunAsrAutoModel(_BaseAutoModelClass):
model.save_low_bit = types.MethodType(save_low_bit, model)
return model
class EmbeddingModel(_BaseAutoModelClass):
def __init__(self, *args, **kwargs):
self.model = self.from_pretrained(*args, **kwargs)
self.model_name = args[0]
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
def __getattr__(self, name):
return getattr(self.model, name)
@classmethod
def get_cls_model(cls):
cls_model = transformers.AutoModel
return cls_model
@classmethod
def optimize_npu_model(cls, *args, **kwargs):
from ipex_llm.transformers.npu_models.convert_mp import optimize_llm, optimize_llm_pre
from intel_npu_acceleration_library.compiler import create_npu_kernels
model = kwargs.pop("model")
qtype = kwargs.pop("qtype", "sym_int4")
mixed_precision = kwargs.pop("mixed_precision", False)
quantization_group_size = kwargs.pop("quantization_group_size", 0)
modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
pipeline = kwargs.pop("pipeline", False)
max_context_len = kwargs.pop("max_context_len", 1024)
max_prompt_len = kwargs.pop("max_prompt_len", 512)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)
transpose_value_cache = kwargs.pop("transpose_value_cache", True)
with torch.no_grad():
optimize_llm_pre(model, qtype, mixed_precision,
quantization_group_size=quantization_group_size)
cls.load_convert_fp16(qtype, model.encoder, "cpu", modules_to_not_convert,
quantization_group_size, *args, **kwargs)
create_npu_kernels(model.encoder)
model = model.eval()
logger.info(f"Finish to convert model")
optimize_llm(
model,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)
return model
@classmethod
def load_convert_fp16(cls, q_k, optimize_model, device, modules_to_not_convert,
group_size=0, *arg, **kwarg):
from ipex_llm.transformers.npu_models.xlm_mp import replace_with_FP16Linear
replace_with_FP16Linear(optimize_model, q_k, device=device,
modules_to_not_convert=modules_to_not_convert,
group_size=group_size)
def encode(self,
sentences,
batch_size: int=256,
max_length: int=512,
normalize_to_unit: bool=True,
return_numpy: bool=True,
enable_tqdm: bool=True,
query_instruction: str="",
**kwargs):
from tqdm import tqdm
from numpy import ndarray
if isinstance(sentences, str):
sentences = [sentences]
with torch.no_grad():
embeddings_collection = []
for sentence_id in tqdm(range(0, len(sentences), batch_size),
desc='Extract embeddings', disable=not enable_tqdm):
if isinstance(query_instruction, str) and len(query_instruction) > 0:
sentence_batch = [query_instruction+sent for sent in
sentences[sentence_id:sentence_id+batch_size]]
else:
sentence_batch = sentences[sentence_id:sentence_id+batch_size]
inputs = self.tokenizer(sentence_batch,
padding=True,
truncation=True,
max_length=max_length,
return_tensors="pt",
)
outputs = self.model(**inputs, return_dict=True)
embeddings = outputs.last_hidden_state[:, 0]
if normalize_to_unit:
embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
embeddings_collection.append(embeddings)
embeddings = torch.cat(embeddings_collection, dim=0)
if return_numpy and not isinstance(embeddings, ndarray):
embeddings = embeddings.numpy()
return embeddings
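A minimal usage sketch of the class above; the model id, sentences, and NPU-related arguments are illustrative and mirror the bce-embedding example:
```python
import torch
from ipex_llm.transformers.npu_model import EmbeddingModel

# illustrative setup; see the bce-embedding example for the full argument list
model = EmbeddingModel(
    "maidalun1020/bce-embedding-base_v1",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation="eager",
    optimize_model=True,
    max_context_len=1024,
    max_prompt_len=512,
)

queries = ["what is ipex-llm?"]
passages = ["IPEX-LLM is a library for running LLMs on Intel CPUs, GPUs and NPUs."]

# encode() also accepts batch_size, max_length and an optional query_instruction prefix
q_emb = model.encode(queries)
p_emb = model.encode(passages)

# rows are unit-normalized by default, so the dot product is a cosine similarity
print(q_emb @ p_emb.T)
```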

View file

@ -88,6 +88,14 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
return DequantizedLinear(qweights, scale, layer.bias)
@module_optimization
def replace_with_FP16Linear(layer, qtype, device, modules_to_not_convert,
group_size):
from ipex_llm.transformers.npu_models.linear import Linear
if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
return Linear(layer.weight, layer.bias)
def convert_forward(m, target_m, new_forward):
if m.__class__ == target_m:
bound_method = new_forward.__get__(m, m.__class__)
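For context, `convert_forward` relies on Python's descriptor protocol to install a replacement `forward` as a bound method on a matching module instance; a standalone sketch with a toy module (not part of this diff):
```python
import torch

class Toy(torch.nn.Module):
    def forward(self, x):
        return x + 1

def new_forward(self, x):
    # the replacement behaves like a normal method and can use `self`
    return x * 2

m = Toy()
# bind the free function to the instance, then install it as forward
bound_method = new_forward.__get__(m, m.__class__)
setattr(m, "forward", bound_method)
print(m(torch.tensor([3.0])))  # tensor([6.])
```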

View file

@ -176,6 +176,19 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
del model.lm_head
model.lm_head = new_lm_head
if model.config.model_type == "xlm-roberta":
from ipex_llm.transformers.npu_models.xlm_mp import XLMPoolLayer, replace_with_Layernorm
pooler_dense = model.pooler.dense
opt_linear = XLMPoolLayer(
weight=pooler_dense.weight.to(torch.float16),
bias=pooler_dense.bias.to(torch.float16),
output_channel=model.config.hidden_size,
input_channel=model.config.hidden_size
)
model.pooler.dense = opt_linear
replace_with_Layernorm(model.embeddings, qtype=None, device='NPU',
modules_to_not_convert=[], group_size=0)
# lm_head to cpu optimization
if cpu_lm_head:
# disable the optimization by default
@ -363,6 +376,27 @@ def convert_qwen(
convert_forward(model, Qwen2ForCausalLM, qwen2_casullm_forward)
def convert_bce(
model: torch.nn.Module,
max_context_len=1024,
max_prompt_len=1024,
transpose_value_cache=True,
):
from ipex_llm.transformers.npu_models.xlm_mp import gen_xlm_fused_encoder_forward
from ipex_llm.transformers.npu_models.xlm_mp import PrefillRunner
prefill_runner = PrefillRunner(
model,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)
encoder_forward = gen_xlm_fused_encoder_forward(
prefill_runner=prefill_runner
)
from transformers.models.xlm_roberta.modeling_xlm_roberta import XLMRobertaEncoder
convert_forward(model, XLMRobertaEncoder, encoder_forward)
def optimize_llm(
model: torch.nn.Module,
max_context_len=1024,
@ -439,6 +473,12 @@ def optimize_llm(
intra_pp=intra_pp,
decoder=True,
transpose_value_cache=transpose_value_cache)
elif model.config.model_type == "xlm-roberta":
# for bce-embedding-base_v1
convert_bce(model,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache)
if hasattr(model, 'lm_head') and isinstance(model.lm_head, SlicedLMHead):
model.lm_head.get_fused_lm_head()
# MiniCPM-2b

View file

@ -0,0 +1,727 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import torch
import time
import ctypes
from typing import Optional, Sequence, List, Union, Any, Tuple, Generator
import uuid
from functools import partial
from colorama import Fore, Back, Style
import math
import numpy as np
import torch.nn.functional as F
import torch.nn.parallel
import torch.distributed as dist
import torch.multiprocessing as mp
from transformers.cache_utils import Cache
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from transformers.pytorch_utils import apply_chunking_to_forward
from transformers.utils import logging
from ipex_llm.utils.common import invalidInputError
from ipex_llm.transformers.npu_models.convert import module_optimization
from ipex_llm.transformers.npu_models.mp_models_base import run_model
from ipex_llm.transformers.npu_models.mp_models_base import LLMBaseNNFactory
from intel_npu_acceleration_library.backend.factory import NNFactory
from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
logger = logging.get_logger(__name__)
class LowBitMultiEncoderlayer(LLMBaseNNFactory):
def __init__(
self,
hidden_shape: Sequence[int],
*shapes,
num_layers: int,
rms_norm_eps,
attn_output_norm_weights=None,
attn_output_norm_biases=None,
encoder_output_norm_weights=None,
encoder_output_norm_biases=None,
attn_self_query_biases=None,
attn_self_key_biases=None,
attn_self_value_biases=None,
attn_output_dense_biases=None,
encoder_inter_dense_biases=None,
encoder_output_dense_biases=None,
intermediate_size=None,
num_attention_heads=None,
hidden_size=None,
mode: str = "prefill",
dtype: np.dtype = np.int8,
max_seq_len: int = 1024,
transpose_value: bool = False,
profile: bool = False,
device: str = "NPU",
):
super().__init__(max_seq_len=max_seq_len,
transpose_value=transpose_value,
dtype=dtype,
profile=profile,
device=device)
self.mode = mode
self.dtype = dtype
self.num_layers = num_layers
self.rms_norm_eps = rms_norm_eps
self.inter_size = intermediate_size
self.batch_size, self.seq_len, self.hidden_size = hidden_shape
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / self.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
input = self.create_input_op((self.batch_size, self.seq_len, self.hidden_size))
attention_mask = self.create_input_op((self.batch_size, 1, 1, self.seq_len))
hidden_states = input
if attn_output_norm_weights is None:
attn_output_norm_weights = []
attn_output_norm_biases = []
encoder_output_norm_weights = []
encoder_output_norm_biases = []
attn_self_query_biases = []
attn_self_key_biases = []
attn_self_value_biases = []
attn_output_dense_biases = []
encoder_inter_dense_biases = []
encoder_output_dense_biases = []
for i in range(self.num_layers):
attn_output_norm_weights.append(
self.create_input_op((1, self.hidden_size,))
)
attn_output_norm_biases.append(
self.create_input_op((1, self.hidden_size,))
)
encoder_output_norm_weights.append(
self.create_input_op((1, self.hidden_size,))
)
encoder_output_norm_biases.append(
self.create_input_op((1, self.hidden_size,))
)
attn_self_query_biases.append(
self.create_input_op((1, self.hidden_size,))
)
attn_self_key_biases.append(
self.create_input_op((1, self.hidden_size,))
)
attn_self_value_biases.append(
self.create_input_op((1, self.hidden_size,))
)
attn_output_dense_biases.append(
self.create_input_op((1, self.hidden_size,))
)
encoder_inter_dense_biases.append(
self.create_input_op((1, self.inter_size,))
)
encoder_output_dense_biases.append(
self.create_input_op((1, self.hidden_size,))
)
else:
attn_output_norm_weights = [self.constant(w) for w in attn_output_norm_weights]
attn_output_norm_biases = [self.constant(w) for w in attn_output_norm_biases]
encoder_output_norm_weights = [self.constant(w) for w in encoder_output_norm_weights]
encoder_output_norm_biases = [self.constant(w) for w in encoder_output_norm_biases]
attn_self_query_biases = [self.constant(w) for w in attn_self_query_biases]
attn_self_key_biases = [self.constant(w) for w in attn_self_key_biases]
attn_self_value_biases = [self.constant(w) for w in attn_self_value_biases]
attn_output_dense_biases = [self.constant(w) for w in attn_output_dense_biases]
encoder_inter_dense_biases = [self.constant(w) for w in encoder_inter_dense_biases]
encoder_output_dense_biases = [self.constant(w) for w in encoder_output_dense_biases]
for i in range(self.num_layers):
outputs = self.build_encoder(
hidden_states=hidden_states,
attention_mask=attention_mask,
attn_output_norm_weight=attn_output_norm_weights[i],
attn_output_norm_bias=attn_output_norm_biases[i],
encoder_output_norm_weight=encoder_output_norm_weights[i],
encoder_output_norm_bias=encoder_output_norm_biases[i],
attn_self_query_bias=attn_self_query_biases[i],
attn_self_key_bias=attn_self_key_biases[i],
attn_self_value_bias=attn_self_value_biases[i],
attn_output_dense_bias=attn_output_dense_biases[i],
encoder_inter_dense_bias=encoder_inter_dense_biases[i],
encoder_output_dense_bias=encoder_output_dense_biases[i],
)
# define outputs
outputs = self.convert_to_fp32(outputs)
print("start compiling")
self.compile()
def build_encoder(self,
hidden_states,
attention_mask,
attn_output_norm_weight,
attn_output_norm_bias,
encoder_output_norm_weight,
encoder_output_norm_bias,
attn_self_query_bias,
attn_self_key_bias,
attn_self_value_bias,
attn_output_dense_bias,
encoder_inter_dense_bias,
encoder_output_dense_bias,):
# XLMRobertaAttention
self_outputs = self.self_attention(
hidden_states=hidden_states,
attention_mask=attention_mask,
query_bias=attn_self_query_bias,
key_bias=attn_self_key_bias,
value_bias=attn_self_value_bias,)
attention_output = self.self_output(input_tensor=self_outputs,
hidden_states=hidden_states,
output_bias=attn_output_dense_bias,
layer_norm_weight=attn_output_norm_weight,
layer_norm_bias=attn_output_norm_bias,)
# XLMRobertaAttention End
intermediate_output = self.self_intermediate(inter_tensor=attention_output,
inter_bias=encoder_inter_dense_bias,)
layer_output = self.encoder_output(input_tensor=intermediate_output,
hidden_states=attention_output,
output_bias=encoder_output_dense_bias,
layer_norm_weight=encoder_output_norm_weight,
layer_norm_bias=encoder_output_norm_bias,)
outputs = layer_output
return outputs
def self_attention(self, hidden_states, attention_mask, query_bias, key_bias, value_bias):
mixed_query_states = self.linear(
hidden_states, self.all_head_size, self.hidden_size, bias=False, wt_dtype=self.dtype,
)
mixed_query_states = mixed_query_states + query_bias
key_states = self.linear(
hidden_states, self.all_head_size, self.hidden_size, bias=False, wt_dtype=self.dtype,
)
key_states = key_states + key_bias
value_states = self.linear(
hidden_states, self.all_head_size, self.hidden_size, bias=False, wt_dtype=self.dtype,
)
value_states = value_states + value_bias
mixed_query_states = self.reshape(mixed_query_states,
[self.batch_size,
self.seq_len,
self.num_attention_heads,
self.attention_head_size])
key_states = self.reshape(key_states,
[self.batch_size,
self.seq_len,
self.num_attention_heads,
self.attention_head_size])
value_states = self.reshape(value_states,
[self.batch_size,
self.seq_len,
self.num_attention_heads,
self.attention_head_size])
query_states = self.transpose(mixed_query_states, [0, 2, 1, 3])
key_states = self.transpose(key_states, [0, 2, 1, 3])
value_states = self.transpose(value_states, [0, 2, 1, 3])
attention_scores = self.matmul(query_states, key_states,
False, True) / math.sqrt(self.attention_head_size)
if attention_mask is not None:
attention_scores = self.eltwise_add(attention_scores, attention_mask)
attention_probs = self.softmax(attention_scores, -1)
context_states = self.matmul(attention_probs, value_states, False, False)
context_states = self.transpose(context_states, [0, 2, 1, 3])
context_states = self.reshape(context_states,
[self.batch_size, self.seq_len, self.all_head_size])
context_states = self.convert_to_fp16(context_states)
return context_states
def self_output(self,
input_tensor,
hidden_states,
output_bias,
layer_norm_weight,
layer_norm_bias):
output_states = self.linear(input_tensor,
self.hidden_size,
self.hidden_size,
bias=False,
wt_dtype=self.dtype,)
output_states = output_states + output_bias
output_states = self.eltwise_add(output_states, hidden_states)
output_states = self.paraformer_layer_norm(output_states,
layer_norm_weight,
layer_norm_bias)
return output_states
def self_intermediate(self, inter_tensor, inter_bias):
inter_states = self.linear(inter_tensor,
self.inter_size,
self.hidden_size,
bias=False,
wt_dtype=self.dtype,)
inter_states = self.convert_to_fp32(inter_states)
inter_bias = self.convert_to_fp32(inter_bias)
inter_states = inter_states + inter_bias
return inter_states
def encoder_output(self,
input_tensor,
hidden_states,
output_bias,
layer_norm_weight,
layer_norm_bias):
input_tensor = self.convert_to_fp16(input_tensor)
output_states = self.linear(self.gelu(input_tensor),
self.hidden_size,
self.inter_size,
bias=False,
wt_dtype=self.dtype)
output_states = output_states + output_bias
output_states = self.eltwise_add(output_states, hidden_states)
output_states = self.paraformer_layer_norm(output_states,
layer_norm_weight,
layer_norm_bias)
return output_states
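For reference, a plain-PyTorch sketch of the per-layer computation the graph above mirrors (a standard XLM-RoBERTa/BERT encoder layer; parameter names, dtypes, and the omitted dropout are illustrative simplifications):
```python
import math
import torch
import torch.nn.functional as F

def reference_encoder_layer(hidden, mask, p, num_heads, eps=1e-5):
    # hidden: (B, S, H); mask: additive mask broadcastable to (B, heads, S, S)
    # p: dict of per-layer weights/biases with illustrative keys
    B, S, H = hidden.shape
    d = H // num_heads

    def split(x):  # (B, S, H) -> (B, heads, S, d)
        return x.view(B, S, num_heads, d).transpose(1, 2)

    q = split(F.linear(hidden, p["q_w"], p["q_b"]))
    k = split(F.linear(hidden, p["k_w"], p["k_b"]))
    v = split(F.linear(hidden, p["v_w"], p["v_b"]))

    scores = q @ k.transpose(-1, -2) / math.sqrt(d) + mask
    ctx = (scores.softmax(dim=-1) @ v).transpose(1, 2).reshape(B, S, H)

    # attention output projection + residual + LayerNorm (self_output above)
    attn_out = F.layer_norm(F.linear(ctx, p["o_w"], p["o_b"]) + hidden,
                            (H,), p["attn_ln_w"], p["attn_ln_b"], eps)

    # intermediate GELU + output projection + residual + LayerNorm
    inter = F.gelu(F.linear(attn_out, p["i_w"], p["i_b"]))
    return F.layer_norm(F.linear(inter, p["out_w"], p["out_b"]) + attn_out,
                        (H,), p["out_ln_w"], p["out_ln_b"], eps)
```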
class FusedLlamaLowBitDecoderlayer(torch.nn.Module):
"""LLAMA MLP operation NPU backend."""
def __init__(
self,
parameters: List[torch.Tensor],
attn_output_norm_weight,
attn_output_norm_bias,
encoder_output_norm_weight,
encoder_output_norm_bias,
attn_self_query_bias,
attn_self_key_bias,
attn_self_value_bias,
attn_output_dense_bias,
encoder_inter_dense_bias,
encoder_output_dense_bias,
intermediate_size,
num_attention_heads,
hidden_size,
rms_norm_eps,
layer_idx,
max_seq_len=128,
transpose_value=False,
):
super().__init__()
self.op_parameters = parameters
self.op_id = str(uuid.uuid4())
self.layer_idx = layer_idx
self.max_seq_len = max_seq_len
self.transpose_value = transpose_value
if isinstance(parameters[0], tuple): # weight, scale from QuantizedLinear
np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8
else: # FP16 Linear
np_dtype = np.float16
self.backend_cls_prefill = partial(
LowBitMultiEncoderlayer,
num_layers=1,
rms_norm_eps=rms_norm_eps,
attn_output_norm_weights=None,
attn_output_norm_biases=None,
encoder_output_norm_weights=None,
encoder_output_norm_biases=None,
attn_self_query_biases=None,
attn_self_key_biases=None,
attn_self_value_biases=None,
attn_output_dense_biases=None,
encoder_inter_dense_biases=None,
encoder_output_dense_biases=None,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
hidden_size=hidden_size,
mode="prefill",
transpose_value=self.transpose_value,
dtype=np_dtype,
)
self.attn_output_norm_weight = attn_output_norm_weight
self.attn_output_norm_bias = attn_output_norm_bias
self.encoder_output_norm_weight = encoder_output_norm_weight
self.encoder_output_norm_bias = encoder_output_norm_bias
self.attn_self_query_bias = attn_self_query_bias
self.attn_self_key_bias = attn_self_key_bias
self.attn_self_value_bias = attn_self_value_bias
self.attn_output_dense_bias = attn_output_dense_bias
self.encoder_inter_dense_bias = encoder_inter_dense_bias
self.encoder_output_dense_bias = encoder_output_dense_bias
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
) -> Tuple[torch.Tensor]:
"""Torch module forward method.
Args:
hidden_states (torch.Tensor): input hidden states
attention_mask (torch.FloatTensor, optional): extended attention mask
Returns:
torch.Tensor: result
"""
backend_cls = self.backend_cls_prefill
inputs = (hidden_states.to(torch.float16),
attention_mask.to(torch.float16),
self.attn_output_norm_weight.to(torch.float16),
self.attn_output_norm_bias.to(torch.float16),
self.encoder_output_norm_weight.to(torch.float16),
self.encoder_output_norm_bias.to(torch.float16),
self.attn_self_query_bias.to(torch.float16),
self.attn_self_key_bias.to(torch.float16),
self.attn_self_value_bias.to(torch.float16),
self.attn_output_dense_bias.to(torch.float16),
self.encoder_inter_dense_bias.to(torch.float16),
self.encoder_output_dense_bias.to(torch.float16),
)
outputs = run_model(
inputs, self.op_parameters, backend_cls, self.op_id, replica=2
)
return outputs
def run_prefill(
model, max_output_len, max_prompt_len, transpose_value_cache, input_queue, result_queue
):
layer_start = 0
layer_end = model.config.num_hidden_layers
decoderlayers = []
layer_weights = []
layer_indexs = range(layer_start, layer_end)
rms_norm_eps = 1e-05
for layer_idx in layer_indexs:
curr_layer = model.encoder.layer[layer_idx]
attn_layer = curr_layer.attention
weights = [
(attn_layer.self.query.weight),
(attn_layer.self.key.weight),
(attn_layer.self.value.weight),
(attn_layer.output.dense.weight),
(curr_layer.intermediate.dense.weight),
(curr_layer.output.dense.weight),
]
attn_output_norm_weight = attn_layer.output.LayerNorm.weight.to(torch.float16)
attn_output_norm_bias = attn_layer.output.LayerNorm.bias.to(torch.float16)
encoder_output_norm_weight = curr_layer.output.LayerNorm.weight.to(torch.float16)
encoder_output_norm_bias = curr_layer.output.LayerNorm.bias.to(torch.float16)
attn_self_query_bias = attn_layer.self.query.bias.to(torch.float16)
attn_self_key_bias = attn_layer.self.key.bias.to(torch.float16)
attn_self_value_bias = attn_layer.self.value.bias.to(torch.float16)
attn_output_dense_bias = attn_layer.output.dense.bias.to(torch.float16)
encoder_inter_dense_bias = curr_layer.intermediate.dense.bias.to(torch.float16)
encoder_output_dense_bias = curr_layer.output.dense.bias.to(torch.float16)
new_decoderlayer = FusedLlamaLowBitDecoderlayer(
weights,
attn_output_norm_weight=attn_output_norm_weight,
attn_output_norm_bias=attn_output_norm_bias,
encoder_output_norm_weight=encoder_output_norm_weight,
encoder_output_norm_bias=encoder_output_norm_bias,
attn_self_query_bias=attn_self_query_bias,
attn_self_key_bias=attn_self_key_bias,
attn_self_value_bias=attn_self_value_bias,
attn_output_dense_bias=attn_output_dense_bias,
encoder_inter_dense_bias=encoder_inter_dense_bias,
encoder_output_dense_bias=encoder_output_dense_bias,
intermediate_size=model.config.intermediate_size,
num_attention_heads=model.config.num_attention_heads,
hidden_size=model.config.hidden_size,
rms_norm_eps=rms_norm_eps,
layer_idx=layer_idx,
max_seq_len=max_output_len,
transpose_value=transpose_value_cache,
)
layer_weights.extend(weights)
model.encoder.layer[layer_idx] = new_decoderlayer
decoderlayers.append(new_decoderlayer)
print("finish creating all decode layers in prefill")
result_queue.put("loading finish")
while True:
result = input_queue.get()
if result == "stop":
break
hidden_states, attention_mask = result
with torch.inference_mode():
for encoder_layer in decoderlayers:
layer_outputs = encoder_layer(
hidden_states.to(torch.float16),
attention_mask.to(torch.float16),
)
hidden_states = layer_outputs
result_queue.put((hidden_states))
class PrefillRunner:
def __init__(self, model, max_output_len, max_prompt_len, transpose_value_cache):
self.model = model
self.max_output_len = max_output_len
self.max_prompt_len = max_prompt_len
self.transpose_value_cache = transpose_value_cache
self.prefill_result_queue = mp.Queue()
self.prefill_input_queue = mp.Queue()
self.p = mp.Process(
target=run_prefill,
args=(
model,
max_output_len,
max_prompt_len,
transpose_value_cache,
self.prefill_input_queue,
self.prefill_result_queue,
),
)
self.p.daemon = True
self.p.start()
output = self.prefill_result_queue.get()
print(Fore.GREEN + f"prefill process output: {output}")
print(Style.RESET_ALL)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
) -> Tuple[torch.Tensor]:
args = (hidden_states, attention_mask)
self.prefill_input_queue.put(args)
outputs = self.prefill_result_queue.get()
return outputs
def shutdown(self):
self.prefill_input_queue.put("stop")
self.p.join(3)
if self.p.exitcode is None:
self.p.kill()
def __del__(self):
self.shutdown()
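PrefillRunner keeps the NPU encoder in a separate process and talks to it through a pair of queues; a stripped-down sketch of that pattern with an illustrative worker:
```python
import torch.multiprocessing as mp

def worker(input_queue, result_queue):
    # illustrative worker: signal readiness, then serve requests until "stop"
    result_queue.put("loading finish")
    while True:
        item = input_queue.get()
        if item == "stop":
            break
        result_queue.put(item * 2)

if __name__ == "__main__":
    in_q, out_q = mp.Queue(), mp.Queue()
    p = mp.Process(target=worker, args=(in_q, out_q), daemon=True)
    p.start()
    print(out_q.get())   # worker is ready
    in_q.put(21)
    print(out_q.get())   # 42
    in_q.put("stop")
    p.join(3)
```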
def gen_xlm_fused_encoder_forward(prefill_runner):
def xlm_fused_encoder_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
next_decoder_cache = () if use_cache else None
layer_outputs = prefill_runner.forward(hidden_states, attention_mask)
layer_outputs = layer_outputs.to(torch.float32)
hidden_states = layer_outputs
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
return xlm_fused_encoder_forward
class XLMPoolLinear(NNFactory):
def __init__(
self,
input_shape,
output_channels,
input_channels,
device: str = "NPU",
):
super().__init__(False, device)
# define input
input_node = self.parameter(input_shape, dtype=np.float16)
res = self.linear(input_node, output_channels, input_channels, bias=True)
# define outputs
res = self.convert_to_fp16(res)
print("start compiling")
self.compile()
class XLMPoolLayer(torch.nn.Module):
def __init__(
self,
weight,
bias,
output_channel,
input_channel,
):
super().__init__()
self.op_id = str(uuid.uuid4())
self.parameters = [weight, bias]
self.backend_cls_pooler = partial(
XLMPoolLinear,
output_channels=output_channel,
input_channels=input_channel,
)
def forward(self, hidden_states):
backend_cls = self.backend_cls_pooler
hidden_states = hidden_states.to(torch.float16)
return run_model(hidden_states, self.parameters, backend_cls, self.op_id)
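Numerically, the pooler replacement above offloads a single dense layer; a CPU reference sketch with illustrative shapes (fp32 here, while the NPU path runs in fp16):
```python
import torch
import torch.nn.functional as F

hidden_size = 768  # XLM-RoBERTa-base pooler maps hidden_size -> hidden_size
pooled_token = torch.randn(1, hidden_size)
weight = torch.randn(hidden_size, hidden_size)
bias = torch.randn(hidden_size)

# what XLMPoolLayer.forward asks run_model to compute on the NPU
reference = F.linear(pooled_token, weight, bias)
print(reference.shape)  # torch.Size([1, 768])
```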
class LayerNorm(NNFactory):
def __init__(
self,
input_shape,
weight_shape,
bias_shape,
eps,
device: str = "NPU",
):
super().__init__(False, device)
# define input
input = self.parameter(input_shape, dtype=np.float16)
weight = self.parameter(weight_shape, dtype=np.float16)
bias = self.parameter(bias_shape, dtype=np.float16)
input = self.convert_to_fp32(input)
mean_res = self.reduce_mean(input, -1, keep_dims=True,)
variance = self.reduce_mean(
self.power(input - mean_res, self.constant(np.array([[2]], dtype=np.float32))),
-1,
keep_dims=True,
)
eps = self.constant(eps)
input = self.eltwise_div(input - mean_res, self.sqrt(self.eltwise_add(variance, eps)))
weight = self.convert_to_fp32(weight)
input = self.eltwise_mul(weight, input)
bias = self.convert_to_fp32(bias)
input = self.eltwise_add(bias, input)
# define outputs
input = self.convert_to_fp16(input)
print("start compiling")
self.compile()
class XLMLayerNorm(torch.nn.Module):
def __init__(
self,
weight,
bias,
eps=1e-05,
):
super().__init__()
self.op_id = str(uuid.uuid4())
self.parameters = [weight, bias]
self.backend_cls = partial(
LayerNorm,
weight_shape=weight.shape,
bias_shape=bias.shape,
eps=eps,
)
def forward(self, x):
x = x.to(torch.float16)
return run_model(x, self.parameters, self.backend_cls, self.op_id)
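The LayerNorm factory above builds the usual mean/variance normalization; a quick CPU check of the same math (sizes illustrative):
```python
import torch
import torch.nn.functional as F

hidden_size, eps = 768, 1e-5
x = torch.randn(2, 8, hidden_size)
weight, bias = torch.ones(hidden_size), torch.zeros(hidden_size)

# manual mean/variance normalization, as composed in the NPU graph above
mean = x.mean(dim=-1, keepdim=True)
var = ((x - mean) ** 2).mean(dim=-1, keepdim=True)
manual = (x - mean) / torch.sqrt(var + eps) * weight + bias

print(torch.allclose(manual, F.layer_norm(x, (hidden_size,), weight, bias, eps), atol=1e-5))
```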
@module_optimization
def replace_with_Layernorm(layer, qtype=None, device='NPU',
modules_to_not_convert=[], group_size=0):
if isinstance(layer, torch.nn.LayerNorm):
return XLMLayerNorm(
weight=layer.weight.to(torch.float16),
bias=layer.bias.to(torch.float16),
)
@module_optimization
def replace_with_FP16Linear(layer, qtype, device, modules_to_not_convert,
group_size):
from ipex_llm.transformers.npu_models.linear import Linear
if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
return Linear(layer.weight, layer.bias)