From ae7302a654a85006974549450bfe51b7275294af Mon Sep 17 00:00:00 2001
From: "Chu,Youcheng"
Date: Fri, 30 Aug 2024 13:43:48 +0800
Subject: [PATCH] add gptq option for ppl test (#11921)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat:add gptq for ppl

* fix: add an empty line

* fix: add an empty line

* fix: remove an empty line

* Resolve comments

* Resolve comments

* Resolve comments
---
 .../dev/benchmark/perplexity/run_wikitext.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/python/llm/dev/benchmark/perplexity/run_wikitext.py b/python/llm/dev/benchmark/perplexity/run_wikitext.py
index 92426a86..061c87ba 100644
--- a/python/llm/dev/benchmark/perplexity/run_wikitext.py
+++ b/python/llm/dev/benchmark/perplexity/run_wikitext.py
@@ -38,12 +38,24 @@ args = parser.parse_args()
 
 if args.precision == "fp16": # ipex fp16
     from transformers import AutoModelForCausalLM
-    model = AutoModelForCausalLM.from_pretrained(args.model_path, use_cache=args.use_cache, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(args.model_path,
+                                                 use_cache=args.use_cache,
+                                                 trust_remote_code=True)
     model = model.half()
+elif 'gptq' in args.model_path.lower(): # ipex-llm gptq
+    from ipex_llm.transformers import AutoModelForCausalLM
+    model = AutoModelForCausalLM.from_pretrained(args.model_path,
+                                                 load_in_4bit=True,
+                                                 torch_dtype=torch.float,
+                                                 use_cache=args.use_cache,
+                                                 trust_remote_code=True)
 else: # ipex-llm
     from ipex_llm.transformers import AutoModelForCausalLM
-    model = AutoModelForCausalLM.from_pretrained(args.model_path, load_in_low_bit=args.precision,
-                                                 use_cache=args.use_cache, trust_remote_code=True, mixed_precision= args.mixed_precision)
+    model = AutoModelForCausalLM.from_pretrained(args.model_path,
+                                                 load_in_low_bit=args.precision,
+                                                 use_cache=args.use_cache,
+                                                 trust_remote_code=True,
+                                                 mixed_precision=args.mixed_precision)
     model = model.half()
 model = model.to(args.device)
 model = model.eval()
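
Reference note (not part of the patch): a minimal standalone sketch of how the new GPTQ branch loads a checkpoint. Only the from_pretrained arguments are taken from the diff above; the model path is a placeholder, and use_cache=True stands in for the script's --use-cache argument.

    # Sketch: load a GPTQ checkpoint through ipex-llm, mirroring the new elif branch.
    import torch
    from ipex_llm.transformers import AutoModelForCausalLM

    model_path = "path/to/some-gptq-model"  # placeholder: any directory whose name contains "gptq"

    if 'gptq' in model_path.lower():
        # GPTQ weights are loaded as 4-bit with fp32 compute (torch_dtype=torch.float);
        # note that, unlike the other branches, model.half() is not called afterwards.
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     load_in_4bit=True,
                                                     torch_dtype=torch.float,
                                                     use_cache=True,
                                                     trust_remote_code=True)
        model = model.eval()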