LLM: change default n_threads of native int4 models (#8776)

Yishuo Wang authored 2023-08-18 15:46:19 +08:00, committed by GitHub
parent 2ba2133613
commit 3d1f2b44f8
5 changed files with 10 additions and 10 deletions

@@ -71,7 +71,7 @@ class Bloom(GenerationMixin):
 use_mmap: bool = False,
 use_mlock: bool = False,
 embedding: bool = False,
-n_threads: Optional[int] = 2,
+n_threads: Optional[int] = -1,
 n_batch: int = 512,
 last_n_tokens_size: int = 64,
 lora_base: Optional[str] = None,
@@ -92,7 +92,7 @@ class Bloom(GenerationMixin):
 use_mmap: Use mmap if possible.
 use_mlock: Force the system to keep the model in RAM.
 embedding: Embedding mode only.
-n_threads: Number of threads to use. Default to be 2.
+n_threads: Number of threads to use. Default to be -1, means auto.
 n_batch: Maximum number of prompt tokens to batch together when calling bloom_eval.
 last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
 lora_base: Optional path to base model, useful if using a quantized base model and
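Note: this diff only changes the default and its docstring; how the -1 sentinel is resolved at load time is not shown. A minimal sketch of the likely behavior, assuming the native layer falls back to the machine's logical core count when n_threads is -1 or None (the helper name resolve_n_threads is hypothetical, not part of this commit):

import os
from typing import Optional

def resolve_n_threads(n_threads: Optional[int] = -1) -> int:
    # Hypothetical helper: map the -1 / None "auto" sentinel to the
    # number of logical CPU cores; pass explicit values through unchanged.
    if n_threads is None or n_threads <= 0:
        return os.cpu_count() or 1
    return n_threads

# resolve_n_threads(-1) -> logical core count on this machine
# resolve_n_threads(8)  -> 8 (explicit values keep the old behavior)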

@@ -72,7 +72,7 @@ class ChatGLM(GenerationMixin):
 use_mmap: bool = False,
 use_mlock: bool = False,
 embedding: bool = False,
-n_threads: Optional[int] = 2,
+n_threads: Optional[int] = -1,
 n_batch: int = 512,
 last_n_tokens_size: int = 64,
 lora_base: Optional[str] = None,
@@ -93,7 +93,7 @@ class ChatGLM(GenerationMixin):
 use_mmap: Use mmap if possible.
 use_mlock: Force the system to keep the model in RAM.
 embedding: Embedding mode only.
-n_threads: Number of threads to use. Default to be 2.
+n_threads: Number of threads to use. Default to be -1, means auto.
 n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
 last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
 lora_base: Optional path to base model, useful if using a quantized base model and

@@ -139,7 +139,7 @@ class Gptneox(GenerationMixin):
 use_mmap: bool = False,
 use_mlock: bool = False,
 embedding: bool = False,
-n_threads: Optional[int] = 2,
+n_threads: Optional[int] = -1,
 n_batch: int = 512,
 last_n_tokens_size: int = 64,
 lora_base: Optional[str] = None,
@@ -160,7 +160,7 @@ class Gptneox(GenerationMixin):
 use_mmap: Use mmap if possible.
 use_mlock: Force the system to keep the model in RAM.
 embedding: Embedding mode only.
-n_threads: Number of threads to use. Default to be 2.
+n_threads: Number of threads to use. Default to be -1, means auto.
 n_batch: Maximum number of prompt tokens to batch together when calling gptneox_eval.
 last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
 lora_base: Optional path to base model, useful if using a quantized base model and

@@ -137,7 +137,7 @@ class Llama(GenerationMixin):
 use_mmap: bool = False,
 use_mlock: bool = False,
 embedding: bool = False,
-n_threads: Optional[int] = 2,
+n_threads: Optional[int] = -1,
 n_batch: int = 512,
 last_n_tokens_size: int = 64,
 lora_base: Optional[str] = None,
@@ -158,7 +158,7 @@ class Llama(GenerationMixin):
 use_mmap: Use mmap if possible.
 use_mlock: Force the system to keep the model in RAM.
 embedding: Embedding mode only.
-n_threads: Number of threads to use. Default to be 2.
+n_threads: Number of threads to use. Default to be -1, means auto.
 n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
 last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
 lora_base: Optional path to base model, useful if using a quantized base model and

@@ -72,7 +72,7 @@ class Starcoder(GenerationMixin):
 use_mmap: bool = False,
 use_mlock: bool = False,
 embedding: bool = False,
-n_threads: Optional[int] = 2,
+n_threads: Optional[int] = -1,
 n_batch: int = 512,
 last_n_tokens_size: int = 64,
 lora_base: Optional[str] = None,
@@ -93,7 +93,7 @@ class Starcoder(GenerationMixin):
 use_mmap: Use mmap if possible.
 use_mlock: Force the system to keep the model in RAM.
 embedding: Embedding mode only.
-n_threads: Number of threads to use. Default to be 2.
+n_threads: Number of threads to use. Default to be -1, means auto.
 n_batch: Maximum number of prompt tokens to batch together when calling starcoder_eval.
 last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
 lora_base: Optional path to base model, useful if using a quantized base model and
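For reference, a minimal usage sketch of the new default, assuming the bigdl.llm native-int4 API; the import path and the model file name below are illustrative, not taken from this commit:

# Assumed import path for the native int4 Llama wrapper changed above.
from bigdl.llm.models import Llama

# After this change, n_threads defaults to -1 (auto thread count);
# pass an explicit value to pin the thread count as before.
llm_auto = Llama(model_path="./bigdl_llm_llama_q4_0.bin")
llm_pinned = Llama(model_path="./bigdl_llm_llama_q4_0.bin", n_threads=4)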