diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py
index 476d0946..68be4b1a 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py
@@ -51,7 +51,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py
index 53272834..f2e65b8f 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py
@@ -48,7 +48,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py
index 92190bca..da730d70 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py
@@ -48,7 +48,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py
index 09f389ad..f678dec3 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py
@@ -61,7 +61,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py
index 3734724a..585b1936 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py
@@ -47,7 +47,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py
index 2d0a5f8a..3e47ab47 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py
@@ -54,7 +54,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py
index 557b0d55..902b9170 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py
@@ -44,7 +44,7 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  trust_remote_code=True)
-    model = model.to("xpu")
+    model = model.half().to("xpu")
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
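
All seven hunks apply the same change: cast the low-bit model to half precision (fp16) with .half() before moving it to the Intel GPU with .to('xpu'), instead of moving the remaining fp32 tensors directly. A minimal sketch of the resulting load path, assuming the bigdl-llm transformers-style API these examples use; the model path and prompt are placeholders, not values from the diff:

    import torch
    import intel_extension_for_pytorch as ipex  # registers the 'xpu' device
    from transformers import AutoTokenizer
    from bigdl.llm.transformers import AutoModelForCausalLM

    model_path = "path/to/model"  # hypothetical; any of the models patched above

    # Load with 4-bit weight quantization, as in the examples.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 load_in_4bit=True,
                                                 trust_remote_code=True,
                                                 use_cache=True)

    # The change each hunk makes: convert leftover fp32 tensors to fp16
    # before placing the model on the Intel GPU.
    model = model.half().to('xpu')

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    with torch.inference_mode():
        input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to('xpu')
        output = model.generate(input_ids, max_new_tokens=32)
        print(tokenizer.decode(output[0], skip_special_tokens=True))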