change 5 pytorch/huggingface models to fp16 (#11894)

parent 5c4ed00593
commit 18662dca1c

7 changed files with 7 additions and 7 deletions
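Each of the seven diffs below makes the same one-line change: the example script now casts the model to half precision (fp16) with .half() before moving it to the Intel GPU ('xpu') device. A minimal sketch of the resulting load path, assuming the ipex-llm AutoModelForCausalLM wrapper used by these examples; the model id is a placeholder, not taken from the diff:

# Minimal sketch of the pattern this commit applies in each example script.
# Assumes the ipex-llm transformers wrapper; model_path is illustrative only.
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/CodeLlama-7b-hf"  # placeholder model id

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True)

# Before this commit: model = model.to('xpu')
# After: cast fp32 weights down to fp16 first, then move to the Intel GPU.
model = model.half().to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)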
@@ -47,7 +47,7 @@ if __name__ == '__main__':
                                              optimize_model=False,
                                              trust_remote_code=True,
                                              use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = CodeLlamaTokenizer.from_pretrained(model_path,
@@ -47,7 +47,7 @@ if __name__ == '__main__':
                                              optimize_model=False,
                                              trust_remote_code=True,
                                              use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -47,7 +47,7 @@ if __name__ == '__main__':
                                              load_in_4bit=True,
                                              trust_remote_code=True,
                                              use_cache=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -50,7 +50,7 @@ if __name__ == '__main__':
     # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
     model = optimize_model(model)
 
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = CodeLlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -46,7 +46,7 @@ if __name__ == '__main__':
                                              use_cache=True)
     model = optimize_model(model)
 
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -49,7 +49,7 @@ if __name__ == '__main__':
     # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
     model = optimize_model(model)
 
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -49,7 +49,7 @@ if __name__ == '__main__':
     # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function.
     # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
     model = optimize_model(model)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
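The hunks that call optimize_model carry a comment recommending cpu_embedding=True for Windows users on Intel iGPUs, so the memory-intensive embedding layer runs on the CPU. A sketch of that variant with the new fp16 cast, assuming ipex-llm's optimize_model accepts the cpu_embedding flag the comment refers to; the model id is again a placeholder:

# Sketch of the optimize_model variant, under the same assumptions as above.
from ipex_llm import optimize_model
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "Qwen/Qwen-7B-Chat"  # placeholder model id

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             trust_remote_code=True,
                                             use_cache=True)

# cpu_embedding=True keeps the embedding layer on the CPU, as the in-diff
# comment recommends when running on an Intel iGPU under Windows.
model = optimize_model(model, cpu_embedding=True)

model = model.half().to('xpu')  # the one-line change this commit makes

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)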