diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 9744e2f8..b2b08fe2 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -440,6 +440,15 @@ class _BaseAutoModelClass:
             model.kv_len = config_dict['kv_len']
             model.vocab_size = config_dict['vocab_size']
             model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+            if model.can_generate():
+                try:
+                    model.generation_config = GenerationConfig.from_pretrained(
+                        pretrained_model_name_or_path,
+                        subfolder=subfolder,
+                        **kwargs,
+                    )
+                except (OSError, TypeError):
+                    pass
         except:
             invalidInputError(False, "Fail to InitLLMPipeline.")
 
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
index 337736a7..3b223017 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -466,6 +466,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "lm_head_low_bit": lm_head_low_bit}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
+        if model.can_generate():
+            model.generation_config.save_pretrained(save_directory)
 
         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
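
Taken together, the two hunks round-trip the Hugging Face GenerationConfig through the NPU pipeline: convert_llm_for_deploy() now writes generation_config.json next to config.json at export time, and the pipeline loader restores it for models that can generate, swallowing OSError/TypeError when the file is absent or unreadable so older exports keep loading. A minimal sketch of the resulting round trip, assuming the usual ipex-llm NPU entry points (AutoModelForCausalLM.from_pretrained with pipeline=True and save_directory, then load_low_bit); the model id and save path below are placeholders, not part of this patch:

    from ipex_llm.transformers.npu_model import AutoModelForCausalLM

    # Convert and export for NPU deployment; with this patch the exported
    # folder also contains generation_config.json (when the model can generate).
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2-1.5B-Instruct",    # placeholder model id
        optimize_model=True,
        pipeline=True,
        load_in_low_bit="sym_int4",
        save_directory="./qwen2-npu",  # placeholder save path
        trust_remote_code=True,
    )

    # Reload the converted model; the loader now repopulates
    # model.generation_config (eos/pad token ids, sampling defaults)
    # instead of leaving transformers' library defaults in place.
    model = AutoModelForCausalLM.load_low_bit("./qwen2-npu", pipeline=True)
    print(model.generation_config)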