diff --git a/python/llm/example/GPU/HuggingFace/LLM/codellama/generate.py b/python/llm/example/GPU/HuggingFace/LLM/codellama/generate.py
index c772d8c2..05b14c2f 100644
--- a/python/llm/example/GPU/HuggingFace/LLM/codellama/generate.py
+++ b/python/llm/example/GPU/HuggingFace/LLM/codellama/generate.py
@@ -47,7 +47,7 @@ if __name__ == '__main__':
                                                  optimize_model=False,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = CodeLlamaTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HuggingFace/LLM/internlm/generate.py b/python/llm/example/GPU/HuggingFace/LLM/internlm/generate.py
index 7c3b2dac..d08fca0a 100644
--- a/python/llm/example/GPU/HuggingFace/LLM/internlm/generate.py
+++ b/python/llm/example/GPU/HuggingFace/LLM/internlm/generate.py
@@ -47,7 +47,7 @@ if __name__ == '__main__':
                                                  optimize_model=False,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/HuggingFace/LLM/solar/generate.py b/python/llm/example/GPU/HuggingFace/LLM/solar/generate.py
index eb5e4aa4..25b692ca 100644
--- a/python/llm/example/GPU/HuggingFace/LLM/solar/generate.py
+++ b/python/llm/example/GPU/HuggingFace/LLM/solar/generate.py
@@ -47,7 +47,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py
index d676666b..04af3a02 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py
+++ b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py
@@ -50,7 +50,7 @@ if __name__ == '__main__':
     # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
     model = optimize_model(model)
 
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = CodeLlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py
index 51690317..e8098e49 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py
+++ b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py
@@ -46,7 +46,7 @@ if __name__ == '__main__':
                                                  use_cache=True)
 
     model = optimize_model(model)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py
index 0e717730..377e3156 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py
+++ b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py
@@ -49,7 +49,7 @@ if __name__ == '__main__':
     # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
     model = optimize_model(model)
 
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py
index b612bc2f..b328b51e 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py
+++ b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py
@@ -49,7 +49,7 @@ if __name__ == '__main__':
     # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function.
     # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
     model = optimize_model(model)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
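All seven hunks apply the same one-line change: cast the optimized model to fp16 with `.half()` before moving it to the Intel GPU, so the copy into device memory already uses the smaller dtype instead of fp32. A minimal sketch of the resulting load-and-generate flow is below; it assumes the `ipex_llm.transformers` API these examples are built on, and the model path and prompt are hypothetical placeholders:

```python
import torch
from ipex_llm.transformers import AutoModelForCausalLM  # assumed API, per these examples
from transformers import AutoTokenizer

model_path = "codellama/CodeLlama-7b-hf"  # hypothetical placeholder

# Load with low-bit optimization, as in the HuggingFace/LLM examples.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True)
# The pattern this PR applies: half() before to('xpu'), so any remaining
# fp32 tensors are cast to fp16 before being copied to the Intel GPU.
model = model.half().to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    input_ids = tokenizer("def fib(n):", return_tensors="pt").input_ids.to('xpu')
    output = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```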