LLM: change default n_threads of native int4 models (#8776)
parent 2ba2133613
commit 3d1f2b44f8

5 changed files with 10 additions and 10 deletions
@@ -71,7 +71,7 @@ class Bloom(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -92,7 +92,7 @@ class Bloom(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling bloom_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
@@ -72,7 +72,7 @@ class ChatGLM(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -93,7 +93,7 @@ class ChatGLM(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
@@ -139,7 +139,7 @@ class Gptneox(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -160,7 +160,7 @@ class Gptneox(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling gptneox_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
@@ -137,7 +137,7 @@ class Llama(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -158,7 +158,7 @@ class Llama(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
@@ -72,7 +72,7 @@ class Starcoder(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -93,7 +93,7 @@ class Starcoder(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling starcoder_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
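
Note: the hunks above only change the default value of n_threads; how the -1 sentinel is interpreted is not shown in this diff. As a rough sketch of the intended "auto" behaviour, a non-positive n_threads would typically be resolved to the machine's logical CPU count when the native model is loaded. The helper below, _resolve_n_threads, is hypothetical and for illustration only:

    import os
    from typing import Optional

    def _resolve_n_threads(n_threads: Optional[int] = -1) -> int:
        # Illustrative only: the real resolution logic lives in the native
        # int4 model code and is not part of this patch.
        if n_threads is None or n_threads <= 0:
            # -1 (or None) means "auto": use one thread per logical CPU.
            return os.cpu_count() or 1
        return n_threads

    print(_resolve_n_threads())    # e.g. 16 on a machine with 16 logical CPUs
    print(_resolve_n_threads(4))   # explicit positive values pass through unchanged

With this default, callers that omit n_threads get an automatically chosen thread count instead of the previous fixed value of 2; passing an explicit positive value keeps the old behaviour.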