diff --git a/python/llm/example/GPU/HuggingFace/LLM/qwen2/generate.py b/python/llm/example/GPU/HuggingFace/LLM/qwen2/generate.py
index 25fdaeec..7d0d1ed0 100644
--- a/python/llm/example/GPU/HuggingFace/LLM/qwen2/generate.py
+++ b/python/llm/example/GPU/HuggingFace/LLM/qwen2/generate.py
@@ -19,7 +19,6 @@
 import time
 import argparse
 from transformers import AutoTokenizer
-from ipex_llm import optimize_model
 import numpy as np
 
 
@@ -36,7 +35,7 @@ if __name__ == '__main__':
     args = parser.parse_args()
 
     model_path = args.repo_id_or_model_path
-
+    from ipex_llm.transformers import AutoModelForCausalLM
 
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
@@ -45,7 +44,7 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-    model = model.to("xpu")
+    model = model.half().to("xpu")
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/src/ipex_llm/transformers/lookup.py b/python/llm/src/ipex_llm/transformers/lookup.py
index e5725ff7..c17f76af 100644
--- a/python/llm/src/ipex_llm/transformers/lookup.py
+++ b/python/llm/src/ipex_llm/transformers/lookup.py
@@ -149,7 +149,7 @@ class PromptLookupCandidateGenerator():
                               input_ids: torch.LongTensor):
         for ngram_size in range(self.max_matching_ngram_size, 0, -1):
             # Create sliding windows of size ngram_size
-            windows = input_ids.unfold(dimension=1, size=ngram_size, step=1)
+            windows = input_ids.cpu().unfold(dimension=1, size=ngram_size, step=1)
         for idx in range(windows.size(1)):
                 window = tensor2key(windows[0, idx])
                 if window not in self.lookup_table:
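
Note on the generate.py change: the updated example defers the ipex_llm.transformers import until after argument parsing and casts the model to half precision before moving it to the XPU, so the layers left unquantized by the INT4 conversion (norms, embeddings, lm head) run in FP16 on the Intel GPU. A minimal sketch of the resulting load path, assuming a Qwen2 repo id chosen here only for illustration:

import torch
from transformers import AutoTokenizer

model_path = "Qwen/Qwen2-7B-Instruct"  # assumed checkpoint id, any Qwen2 repo works

# Imported after CLI parsing, matching the updated example
from ipex_llm.transformers import AutoModelForCausalLM

# Load in 4 bit; relevant layers are converted to INT4 format
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=True,
                                             trust_remote_code=True,
                                             use_cache=True)
# Cast the remaining non-INT4 layers to FP16, then move to the Intel GPU
model = model.half().to("xpu")

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
with torch.inference_mode():
    inputs = tokenizer("What is AI?", return_tensors="pt").to("xpu")
    output = model.generate(inputs.input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))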
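
Note on the lookup.py change: moving input_ids to the CPU before unfold matters because the loop body converts each window into a hashable key one at a time; on an XPU tensor every such conversion triggers a separate device-to-host transfer, while a single upfront .cpu() pays that cost once. A standalone sketch of the same sliding-window n-gram pattern, assuming a plain tuple key in place of the repo's tensor2key helper and a simplified table that maps each n-gram to the position just past its first occurrence:

import torch

def build_ngram_table(input_ids: torch.Tensor, max_ngram: int) -> dict:
    # Move the whole tensor to host once; the per-window loop below would
    # otherwise sync device-to-host on every iteration for an XPU tensor.
    input_ids = input_ids.cpu()
    table = {}
    for ngram_size in range(max_ngram, 0, -1):
        # Sliding windows of shape (batch, num_windows, ngram_size)
        windows = input_ids.unfold(dimension=1, size=ngram_size, step=1)
        for idx in range(windows.size(1)):
            key = tuple(windows[0, idx].tolist())  # hashable n-gram key
            # Keep the position right after the n-gram's first occurrence
            table.setdefault(key, idx + ngram_size)
    return table

# Example: n-gram positions for a toy sequence
ids = torch.tensor([[1, 2, 3, 1, 2, 4]])
print(build_ngram_table(ids, max_ngram=2))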