[NPU] Fix hf generate with save/load generation config for Python (cpp backend) (#12509)
* Fix hf generate with save/load generation config
* Small fix
* Fix based on comments
This commit is contained in:
parent 49ab8974fa
commit 0918d3baca
2 changed files with 11 additions and 0 deletions
```diff
@@ -440,6 +440,15 @@ class _BaseAutoModelClass:
             model.kv_len = config_dict['kv_len']
             model.vocab_size = config_dict['vocab_size']
             model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+            if model.can_generate():
+                try:
+                    model.generation_config = GenerationConfig.from_pretrained(
+                        pretrained_model_name_or_path,
+                        subfolder=subfolder,
+                        **kwargs,
+                    )
+                except (OSError, TypeError):
+                    pass
         except:
             invalidInputError(False,
                               "Fail to InitLLMPipeline.")
```
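On the load side, the fix restores the saved generation config whenever the model supports generation, and tolerates save directories created before this change (which have no `generation_config.json`) by catching `OSError`/`TypeError`. Below is a minimal standalone sketch of the same fallback pattern; `load_generation_config` and its parameters are illustrative names, while `GenerationConfig.from_pretrained` is the real transformers API:

```python
from transformers import GenerationConfig

def load_generation_config(save_directory, default_config):
    """Restore a saved generation config, keeping the default on failure."""
    try:
        # Reads generation_config.json from save_directory, as the patched
        # loader does for the converted model.
        return GenerationConfig.from_pretrained(save_directory)
    except (OSError, TypeError):
        # Directories saved before this fix lack generation_config.json;
        # keep whatever generation config the model already carries.
        return default_config
```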
```diff
@@ -466,6 +466,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "lm_head_low_bit": lm_head_low_bit}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
+        if model.can_generate():
+            model.generation_config.save_pretrained(save_directory)
 
         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
```
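On the save side, `convert_llm_for_deploy` now writes `generation_config.json` next to `config.json`, so the load path above has something to restore. A hedged round-trip sketch using only the public transformers `GenerationConfig` API; the directory path and config values are illustrative:

```python
from transformers import GenerationConfig

save_directory = "./npu_converted_model"  # illustrative path

# Save side: persist generation defaults next to config.json, mirroring
# model.generation_config.save_pretrained(save_directory) in the patch.
gen_config = GenerationConfig(max_new_tokens=128, do_sample=False)
gen_config.save_pretrained(save_directory)  # writes generation_config.json

# Load side: from_pretrained restores the same defaults after reload,
# so a later model.generate() call uses the saved settings.
restored = GenerationConfig.from_pretrained(save_directory)
assert restored.max_new_tokens == 128
```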