From 1e27e08322b01991b51dcaa4cd2b82f17e9732a0 Mon Sep 17 00:00:00 2001
From: "Keyan (Kyrie) Zhang" <79576162+Zhangky11@users.noreply.github.com>
Date: Tue, 9 Apr 2024 00:45:49 -0700
Subject: [PATCH] Modify example from fp32 to fp16 (#10528)

* Modify example from fp32 to fp16

* Remove Falcon from fp16 example for now

* Remove MPT from fp16 example
---
 .../GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py | 2 +-
 .../GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py  | 2 +-
 .../GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py  | 2 +-
 .../GPU/HF-Transformers-AutoModels/Model/llama2/generate.py    | 2 +-
 .../GPU/HF-Transformers-AutoModels/Model/mistral/generate.py   | 2 +-
 .../GPU/HF-Transformers-AutoModels/Model/qwen/generate.py      | 2 +-
 .../GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py   | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py
index 476d0946..68be4b1a 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py
@@ -51,7 +51,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')

     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py
index 53272834..f2e65b8f 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py
@@ -48,7 +48,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')

     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py
index 92190bca..da730d70 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py
@@ -48,7 +48,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')

     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py
index 09f389ad..f678dec3 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py
@@ -61,7 +61,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')

     # Load tokenizer
     tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py
index 3734724a..585b1936 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py
@@ -47,7 +47,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')

     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py
index 2d0a5f8a..3e47ab47 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py
@@ -54,7 +54,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')

     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py
index 557b0d55..902b9170 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py
@@ -44,7 +44,7 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  trust_remote_code=True)
-    model = model.to("xpu")
+    model = model.half().to("xpu")

     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
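Note: every hunk in this patch applies the same one-line change, so the full pattern is easiest to see in one place. Below is a minimal end-to-end sketch of what these generate.py examples do after the patch. It assumes the ipex_llm transformers-style AutoModel API that the touched files import; the model path, prompt, and max_new_tokens value are placeholders, not taken from the patch.

import torch
from ipex_llm.transformers import AutoModelForCausalLM  # assumed import, per these examples
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # hypothetical placeholder path

# Weights are quantized to 4 bit at load time; parameters and buffers that
# stay in floating point (embeddings, norms, etc.) default to fp32.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=True,
                                             trust_remote_code=True,
                                             use_cache=True)

# The change this patch makes: cast the remaining fp32 parameters and
# buffers to fp16 before moving the model to the Intel GPU ('xpu'),
# halving their memory footprint. The 4-bit weights are unaffected.
model = model.half().to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    input_ids = tokenizer.encode("What is AI?", return_tensors="pt").to('xpu')
    output = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))

Calling .half() before .to('xpu') means the cast happens once on the host rather than on the device; Falcon and MPT were dropped from this change for now, per the commit message, so their examples keep the fp32 path.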