LLM: change default n_threads of native int4 models (#8776)
parent 2ba2133613
commit 3d1f2b44f8

5 changed files with 10 additions and 10 deletions
@@ -71,7 +71,7 @@ class Bloom(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -92,7 +92,7 @@ class Bloom(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling bloom_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
@@ -72,7 +72,7 @@ class ChatGLM(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -93,7 +93,7 @@ class ChatGLM(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
@@ -139,7 +139,7 @@ class Gptneox(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -160,7 +160,7 @@ class Gptneox(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling gptneox_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
@@ -137,7 +137,7 @@ class Llama(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -158,7 +158,7 @@ class Llama(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
@@ -72,7 +72,7 @@ class Starcoder(GenerationMixin):
         use_mmap: bool = False,
         use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = 2,
+        n_threads: Optional[int] = -1,
         n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,

@@ -93,7 +93,7 @@ class Starcoder(GenerationMixin):
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be 2.
+            n_threads: Number of threads to use. Default to be -1, means auto.
             n_batch: Maximum number of prompt tokens to batch together when calling starcoder_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and
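
Note: the hunks above only change the default value of n_threads; how the -1 sentinel is interpreted is not shown in this diff. As a rough sketch of the intended "auto" behaviour, a non-positive n_threads would typically be resolved to the machine's logical CPU count when the native model is loaded. The helper below, _resolve_n_threads, is hypothetical and for illustration only:

    import os
    from typing import Optional

    def _resolve_n_threads(n_threads: Optional[int] = -1) -> int:
        # Illustrative only: the real resolution logic lives in the native
        # int4 model code and is not part of this patch.
        if n_threads is None or n_threads <= 0:
            # -1 (or None) means "auto": use one thread per logical CPU.
            return os.cpu_count() or 1
        return n_threads

    print(_resolve_n_threads())    # e.g. 16 on a machine with 16 logical CPUs
    print(_resolve_n_threads(4))   # explicit positive values pass through unchanged

With this default, callers that omit n_threads get an automatically chosen thread count instead of the previous fixed value of 2; passing an explicit positive value keeps the old behaviour.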