[NPU] Fix hf generate with save/load generation config for Python (cpp backend) (#12509)

* Fix hf generate with save/load generation config

* Small fix

* Fix based on comments
Yuwen Hu authored 2024-12-05 19:19:58 +08:00, committed by GitHub
parent 49ab8974fa
commit 0918d3baca
2 changed files with 11 additions and 0 deletions


@@ -440,6 +440,15 @@ class _BaseAutoModelClass:
             model.kv_len = config_dict['kv_len']
             model.vocab_size = config_dict['vocab_size']
             model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+            if model.can_generate():
+                try:
+                    model.generation_config = GenerationConfig.from_pretrained(
+                        pretrained_model_name_or_path,
+                        subfolder=subfolder,
+                        **kwargs,
+                    )
+                except (OSError, TypeError):
+                    pass
         except:
             invalidInputError(False,
                               "Fail to InitLLMPipeline.")


@@ -466,6 +466,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "lm_head_low_bit": lm_head_low_bit}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
+        if model.can_generate():
+            model.generation_config.save_pretrained(save_directory)
         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
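And a hedged check of what the save-side hunk adds, assuming a deployment conversion has already run; save_directory is a placeholder for the directory passed to convert_llm_for_deploy:

import os

save_directory = "./npu-deploy-out"  # placeholder

# convert_llm_for_deploy already wrote config.json via
# model.config.save_pretrained(save_directory); with this commit, models
# that can generate also get a generation_config.json alongside it.
print(os.path.exists(os.path.join(save_directory, "config.json")))
print(os.path.exists(os.path.join(save_directory, "generation_config.json")))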