update example to reduce peak memory usage (#12274)

2024-10-25 17:09:26 +08:00 · 2024-10-25 17:09:26 +08:00 · 854398f6e0
commit 854398f6e0
parent e713296090
1 changed file with 1 addition and 0 deletions
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py
@@ -60,6 +60,7 @@ if __name__ == "__main__":
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 max_output_len=args.max_output_len,
+                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager")

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)