Refactor `from_pretrained` API for NPU (#11927)

This commit is contained in:
Zijie Li 2024-08-27 09:50:30 +08:00 committed by GitHub
parent 7ca557aada
commit 6c3eb1e1e8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 19 additions and 9 deletions

View file

@ -68,7 +68,7 @@ if __name__ == "__main__":
trust_remote_code=True,
attn_implementation="eager",
load_in_low_bit="sym_int4",
enable_mp=True,
optimize_model=True,
max_output_len=args.max_output_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,

View file

@ -55,7 +55,7 @@ if __name__ == "__main__":
trust_remote_code=True,
attn_implementation="eager",
load_in_low_bit="sym_int4",
enable_mp=True,
optimize_model=True,
max_output_len=args.max_output_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,

View file

@ -110,16 +110,16 @@ class _BaseAutoModelClass:
ignore_argument(kwargs, "mixed_precision")
ignore_argument(kwargs, "cpu_embedding")
ignore_argument(kwargs, "embedding_qtype")
ignore_argument(kwargs, "optimize_model")
ignore_argument(kwargs, "enable_mp")
ignore_argument(kwargs, "modules_to_not_convert")
ignore_argument(kwargs, "quantization_config")
ignore_argument(kwargs, "speculative")
ignore_argument(kwargs, "pipeline_parallel_stages")
enable_mp = kwargs.pop("enable_mp", False)
optimize_model = kwargs.pop("optimize_model", False)
max_output_len = kwargs.pop("max_output_len", 1024)
max_prompt_len = kwargs.pop("max_prompt_len", max_output_len)
inter_pp = kwargs.pop("inter_pp", 2)
intra_pp = kwargs.pop("intra_pp", 2)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)
transpose_value_cache = kwargs.pop("transpose_value_cache", True)
_args = copy.deepcopy(args)
@ -140,7 +140,7 @@ class _BaseAutoModelClass:
logger.info(f"Converting model, it may takes up to several minutes ...")
from intel_npu_acceleration_library.compiler import create_npu_kernels
if enable_mp:
if optimize_model:
invalidInputError(
max_prompt_len < max_output_len,
(

View file

@ -29,11 +29,16 @@ def optimize_llm(
model: torch.nn.Module,
max_output_len=1024,
max_prompt_len=1024,
inter_pp=2,
intra_pp=2,
inter_pp=None,
intra_pp=None,
transpose_value_cache=True,
):
if model.config.model_type == "llama":
if intra_pp is None:
intra_pp = 2
if inter_pp is None:
inter_pp = 2
from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward
from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner
from transformers.models.llama.modeling_llama import LlamaModel
@ -60,6 +65,11 @@ def optimize_llm(
convert_forward(model, LlamaForCausalLM, llama2_casullm_forward)
elif model.config.model_type == "qwen2" and model.config.intermediate_size == 8960:
# for qwen2-1.5B
if intra_pp is None:
intra_pp = 2
if inter_pp is None:
inter_pp = 1
from ipex_llm.transformers.npu_models.qwen2_mp import gen_qwen2_fused_model_forward
from ipex_llm.transformers.npu_models.qwen2_mp import DecodeRunner, PrefillRunner
from transformers.models.qwen2.modeling_qwen2 import Qwen2Model