[NPU] Fix hf generate with save/load generation config for Python (cpp backend) (#12509)
* Fix hf generate with save/load generation config
* Small fix
* Fix based on comments
This commit is contained in:
parent 49ab8974fa
commit 0918d3baca

2 changed files with 11 additions and 0 deletions
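For context, the behaviour this commit fixes: transformers keeps generation defaults in a separate generation_config.json, so a save path that only writes config.json silently drops them, and generate() then runs with library defaults after reload. A minimal round-trip sketch using plain transformers objects (the directory name is illustrative, not taken from the diff):

from transformers import GenerationConfig

save_directory = "./npu-converted-model"  # illustrative path

# Save side: persist generation defaults next to the model config.
GenerationConfig(max_new_tokens=32, do_sample=False).save_pretrained(save_directory)

# Load side: restore them, falling back to defaults if the file is missing.
try:
    generation_config = GenerationConfig.from_pretrained(save_directory)
except (OSError, TypeError):
    generation_config = GenerationConfig()

print(generation_config.max_new_tokens)  # 32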
@@ -440,6 +440,15 @@ class _BaseAutoModelClass:
             model.kv_len = config_dict['kv_len']
             model.vocab_size = config_dict['vocab_size']
             model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+            if model.can_generate():
+                try:
+                    model.generation_config = GenerationConfig.from_pretrained(
+                        pretrained_model_name_or_path,
+                        subfolder=subfolder,
+                        **kwargs,
+                    )
+                except (OSError, TypeError):
+                    pass
         except:
             invalidInputError(False,
                               "Fail to InitLLMPipeline.")
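The guarded load above treats a missing or unreadable generation config as non-fatal, similar to how transformers' own model loading handles it: the model simply keeps whatever GenerationConfig it already has. A small sketch of that fallback in isolation (the temporary directory stands in for a model folder without a generation_config.json):

import tempfile
from transformers import GenerationConfig

with tempfile.TemporaryDirectory() as model_dir:
    try:
        generation_config = GenerationConfig.from_pretrained(model_dir)
    except (OSError, TypeError):
        # No generation_config.json on disk: keep library defaults,
        # mirroring the guarded block in the hunk above.
        generation_config = GenerationConfig()

print(generation_config.max_length)  # 20, the transformers default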
@@ -466,6 +466,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "lm_head_low_bit": lm_head_low_bit}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
+        if model.can_generate():
+            model.generation_config.save_pretrained(save_directory)
 
         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
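On the save side, the two added lines write a generation_config.json next to the config.json that convert_llm_for_deploy already produces, which is the file the load path above looks for. A sketch of the resulting directory layout, using a stock GPT2Config purely as a stand-in for the model config:

import os
from transformers import GPT2Config, GenerationConfig

save_directory = "./converted-model"  # illustrative path

GPT2Config().save_pretrained(save_directory)        # writes config.json
GenerationConfig().save_pretrained(save_directory)  # writes generation_config.json

print(sorted(os.listdir(save_directory)))
# ['config.json', 'generation_config.json']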