From 3d1f2b44f8af3604b127e1f957adea1a36c3d345 Mon Sep 17 00:00:00 2001
From: Yishuo Wang
Date: Fri, 18 Aug 2023 15:46:19 +0800
Subject: [PATCH] LLM: change default n_threads of native int4 models (#8776)

---
 python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py         | 4 ++--
 python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py     | 4 ++--
 python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py     | 4 ++--
 python/llm/src/bigdl/llm/ggml/model/llama/llama.py         | 4 ++--
 python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py b/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py
index ec33ddaa..349cec3a 100644
--- a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py
+++ b/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py
@@ -71,7 +71,7 @@ class Bloom(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -92,7 +92,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling bloom_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
index 8ef53c63..66341b0e 100644
--- a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
@@ -72,7 +72,7 @@ class ChatGLM(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -93,7 +93,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
index 019e55ed..6d07e216 100644
--- a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
+++ b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
@@ -139,7 +139,7 @@ class Gptneox(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -160,7 +160,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling gptneox_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py b/python/llm/src/bigdl/llm/ggml/model/llama/llama.py
index aaf60fa9..9319d390 100644
--- a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py
+++ b/python/llm/src/bigdl/llm/ggml/model/llama/llama.py
@@ -137,7 +137,7 @@ class Llama(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -158,7 +158,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
diff --git a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py b/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py
index 2a26d94e..c00935cb 100644
--- a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py
+++ b/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py
@@ -72,7 +72,7 @@ class Starcoder(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -93,7 +93,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling starcoder_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
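Editor's note (not part of the patch above): the new default of -1 is documented as "auto". A minimal sketch of how such a sentinel might be resolved to the host's CPU count before the value is handed to the native int4 backend; the helper name resolve_n_threads is hypothetical and only illustrates the intent of the "-1 means auto" convention.

import os

def resolve_n_threads(n_threads: int = -1) -> int:
    # Any non-positive value is treated as "auto": use all CPUs the OS reports.
    if n_threads is None or n_threads <= 0:
        return os.cpu_count() or 1
    return n_threads

print(resolve_n_threads())    # e.g. 16 on a 16-thread machine
print(resolve_n_threads(4))   # an explicit positive value is kept as-is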