From 854398f6e09f4c185c2680d65d4a904391a40cd8 Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Fri, 25 Oct 2024 17:09:26 +0800
Subject: [PATCH] update example to reduce peak memory usage (#12274)

---
 .../NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py
index 469081c3..08bbb55e 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py
@@ -60,6 +60,7 @@ if __name__ == "__main__":
                                              optimize_model=True,
                                              pipeline=True,
                                              max_output_len=args.max_output_len,
+                                             torch_dtype=torch.float16,
                                              attn_implementation="eager")
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
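
Note on the change: passing torch_dtype=torch.float16 asks the loader to materialize the checkpoint weights directly in half precision rather than the float32 default, which is what reduces peak memory during model loading. Below is a minimal sketch of how the patched call plausibly reads in context; the import path and the surrounding keyword arguments are assumptions reconstructed from the hunk context, and only the torch_dtype line is what this patch adds.

```python
import torch
# Assumed import used by this NPU pipeline example; not shown in the hunk.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

# model_path and args come from argument parsing elsewhere in llama.py.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             optimize_model=True,
                                             pipeline=True,
                                             max_output_len=args.max_output_len,
                                             torch_dtype=torch.float16,  # added by this patch: load weights in fp16 to cut peak memory
                                             attn_implementation="eager")
```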