From 3d1f2b44f8af3604b127e1f957adea1a36c3d345 Mon Sep 17 00:00:00 2001
From: Yishuo Wang
Date: Fri, 18 Aug 2023 15:46:19 +0800
Subject: [PATCH] LLM: change default n_threads of native int4 models (#8776)

---
 python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py         | 4 ++--
 python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py     | 4 ++--
 python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py     | 4 ++--
 python/llm/src/bigdl/llm/ggml/model/llama/llama.py         | 4 ++--
 python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py b/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py
index ec33ddaa..349cec3a 100644
--- a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py
+++ b/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py
@@ -71,7 +71,7 @@ class Bloom(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -92,7 +92,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling bloom_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
index 8ef53c63..66341b0e 100644
--- a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
+++ b/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py
@@ -72,7 +72,7 @@ class ChatGLM(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -93,7 +93,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
index 019e55ed..6d07e216 100644
--- a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
+++ b/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py
@@ -139,7 +139,7 @@ class Gptneox(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -160,7 +160,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling gptneox_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py b/python/llm/src/bigdl/llm/ggml/model/llama/llama.py
index aaf60fa9..9319d390 100644
--- a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py
+++ b/python/llm/src/bigdl/llm/ggml/model/llama/llama.py
@@ -137,7 +137,7 @@ class Llama(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -158,7 +158,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
diff --git a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py b/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py
index 2a26d94e..c00935cb 100644
--- a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py
+++ b/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py
@@ -72,7 +72,7 @@ class Starcoder(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
@@ -93,7 +93,7 @@
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling starcoder_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
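Editor's note (not part of the patch above): the new default of -1 is documented as "auto". A minimal sketch of how such a sentinel might be resolved to the host's CPU count before the value is handed to the native int4 backend; the helper name resolve_n_threads is hypothetical and only illustrates the intent of the "-1 means auto" convention.

import os

def resolve_n_threads(n_threads: int = -1) -> int:
    # Any non-positive value is treated as "auto": use all CPUs the OS reports.
    if n_threads is None or n_threads <= 0:
        return os.cpu_count() or 1
    return n_threads

print(resolve_n_threads())    # e.g. 16 on a 16-thread machine
print(resolve_n_threads(4))   # an explicit positive value is kept as-is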