diff --git a/python/llm/example/gpu/chatglm2/generate.py b/python/llm/example/gpu/chatglm2/generate.py
index b52057a8..e0a0b716 100644
--- a/python/llm/example/gpu/chatglm2/generate.py
+++ b/python/llm/example/gpu/chatglm2/generate.py
@@ -44,7 +44,7 @@ if __name__ == '__main__':
     # which convert the relevant layers in the model into INT4 format
     model = AutoModel.from_pretrained(model_path,
                                       load_in_4bit=True,
-                                      optimize_model=False,
+                                      optimize_model=True,
                                       trust_remote_code=True)
     model = model.to('xpu')
 
diff --git a/python/llm/example/transformers/transformers_int4/GPU/gpt-j/generate.py b/python/llm/example/gpu/gpt-j/generate.py
similarity index 97%
rename from python/llm/example/transformers/transformers_int4/GPU/gpt-j/generate.py
rename to python/llm/example/gpu/gpt-j/generate.py
index 208d068f..7f98f15e 100644
--- a/python/llm/example/transformers/transformers_int4/GPU/gpt-j/generate.py
+++ b/python/llm/example/gpu/gpt-j/generate.py
@@ -45,6 +45,7 @@ if __name__ == '__main__':
                                                  optimize_model=False,
                                                  trust_remote_code=True)
     model = model.to('xpu')
+    model = ipex.optimize(model.eval(), dtype="float16", inplace=True)
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/transformers/transformers_int4/GPU/gpt-j/readme.md b/python/llm/example/gpu/gpt-j/readme.md
similarity index 100%
rename from python/llm/example/transformers/transformers_int4/GPU/gpt-j/readme.md
rename to python/llm/example/gpu/gpt-j/readme.md
diff --git a/python/llm/example/gpu/llama2/generate.py b/python/llm/example/gpu/llama2/generate.py
index 78d16246..9d4e8e58 100644
--- a/python/llm/example/gpu/llama2/generate.py
+++ b/python/llm/example/gpu/llama2/generate.py
@@ -57,7 +57,7 @@ if __name__ == '__main__':
     # which convert the relevant layers in the model into INT4 format
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
-                                                 optimize_model=False,
+                                                 optimize_model=True,
                                                  trust_remote_code=True)
     model = model.to('xpu')
 
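
For context, a minimal sketch of the load pattern these GPU examples share after this change, assuming the BigDL-LLM transformers-style API used in these files; the model path is a placeholder, and the ipex.optimize() call is copied verbatim from the gpt-j hunk above:

    import intel_extension_for_pytorch as ipex  # required by the ipex.optimize() call below
    from bigdl.llm.transformers import AutoModelForCausalLM
    from transformers import AutoTokenizer

    model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint

    # Load in INT4 with BigDL-LLM's layer-level optimizations enabled,
    # mirroring the optimize_model=False -> optimize_model=True flip above.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 load_in_4bit=True,
                                                 optimize_model=True,
                                                 trust_remote_code=True)
    model = model.to('xpu')

    # The gpt-j example additionally applies IPEX optimizations after the
    # move to XPU (argument form taken as-is from the added line above).
    model = ipex.optimize(model.eval(), dtype="float16", inplace=True)

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)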