LLM: Support GGUF models with low_bit and fix missing JSON (#10408)

* support other models with low_bit
* update README
* update setup.py to also package *.json
Parent: cda38f85a9
Commit: fe8976a00f

5 changed files with 9 additions and 6 deletions
@@ -61,6 +61,7 @@ In the example, several arguments can be passed to satisfy your requirements:
 - `--model`: path to GGUF model, it should be a file with name like `llama-2-7b-chat.Q4_0.gguf`
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
+- `--low_bit`: use what low_bit to run, default is `sym_int4`.
 
 #### 2.4 Sample Output
 #### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
@@ -57,6 +57,7 @@ In the example, several arguments can be passed to satisfy your requirements:
 - `--model`: path to GGUF model, it should be a file with name like `llama-2-7b-chat.Q4_0.gguf`
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
+- `--low_bit`: use what low_bit to run, default is `sym_int4`.
 
 #### 3.4 Sample Output
 #### [llama-2-7b-chat.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main)
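
Both README hunks above document the same flags, in the two GGUF example READMEs touched by this commit. As a rough illustration of how those flags presumably reach bigdl-llm, here is a minimal sketch that assumes the examples load the model through `AutoModelForCausalLM.from_gguf`; the `low_bit` keyword on `from_gguf` and the generation calls below are assumptions for illustration, not code taken from this commit.

```python
# Minimal sketch of what the documented flags feed into; bigdl-llm's from_gguf is
# assumed here to return (model, tokenizer) and to accept a low_bit keyword after this PR.
import torch
from bigdl.llm.transformers import AutoModelForCausalLM

model_path = "llama-2-7b-chat.Q4_0.gguf"   # --model
prompt = "What is AI?"                     # --prompt (default)
n_predict = 32                             # --n-predict (default)
low_bit = "sym_int4"                       # --low_bit (default)

model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit=low_bit)

with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_new_tokens=n_predict)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```
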
@@ -37,6 +37,8 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='what low_bit to run bigdl-llm')
 
     args = parser.parse_args()
 
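
The hunk above adds the `--low_bit` flag to the example's argument parser. Not part of this commit, but since the flag is free-form text, one optional refinement would be to constrain it with argparse `choices` so a typo fails at parse time. The list of low-bit names below is an assumption for illustration; the set the GGUF loader actually accepts may differ.

```python
import argparse

# Hypothetical variant of the new argument (not in the commit): validate --low_bit
# against a fixed list so unsupported values are rejected by argparse itself.
parser = argparse.ArgumentParser(description='GGUF low-bit example (sketch)')
parser.add_argument('--low_bit', type=str, default="sym_int4",
                    choices=["sym_int4", "asym_int4", "sym_int5", "asym_int5", "sym_int8"],
                    help='what low_bit to run bigdl-llm')

args = parser.parse_args(["--low_bit", "sym_int5"])
print(args.low_bit)  # -> sym_int5; an unlisted value would raise a usage error
```
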
@@ -311,7 +311,7 @@ def setup_package():
         packages=get_llm_packages(),
         package_dir={"": "src"},
         package_data={
-            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"]},
+            "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"] + ["transformers/gguf/models/model_implement/*/*.json"]},
         include_package_data=True,
         entry_points={
             "console_scripts": [
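
The hunk above is the setup.py fix for the "no json" part of the title: the per-model `*.json` configs under `transformers/gguf/models/model_implement/` were not being shipped in the wheel. A quick local sanity check (a hypothetical helper, not part of the commit) can confirm the new glob matches those files; it assumes it is run from the directory containing setup.py, with sources under `src/` as declared by `package_dir` above.

```python
# Check that the glob added to package_data matches the per-model JSON configs.
import glob
import os

pattern = os.path.join("src", "bigdl", "llm", "transformers", "gguf",
                       "models", "model_implement", "*", "*.json")
matches = glob.glob(pattern)
print(f"{len(matches)} JSON file(s) would be bundled into the wheel:")
for path in sorted(matches):
    print("  ", os.path.relpath(path, "src"))
```
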
@@ -37,7 +37,6 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str =
     qtype = loader.config["general.file_type"]
 
     invalidInputError(qtype in qtype_map, f"Unsupported gguf quantize type: {qtype}")
-    low_bit = qtype_map.get(qtype, "sym_int4")
 
     with torch.no_grad():
         if model_family == "llama":
@@ -45,19 +44,19 @@ def load_gguf_model(fpath: str, dtype: torch.dtype = torch.float, low_bit: str =
             if "mixtral" in general_name:
                 # mixtral, which also enjoys a general architecture of llama
                 from .models.mixtral import load_gguf_mixtral
-                model, tokenizer = load_gguf_mixtral(loader, dtype)
+                model, tokenizer = load_gguf_mixtral(loader, dtype, low_bit)
             elif "mistral" in general_name:
                 from .models.mistral import load_gguf_mistral
-                model, tokenizer = load_gguf_mistral(loader, dtype)
+                model, tokenizer = load_gguf_mistral(loader, dtype, low_bit)
             elif "yuan" in general_name:
                 from .models.yuan2 import load_gguf_yuan
                 model, tokenizer = load_gguf_yuan(loader, dtype)
             else:
                 from .models.llama import load_gguf_llama
-                model, tokenizer = load_gguf_llama(loader, dtype)
+                model, tokenizer = load_gguf_llama(loader, dtype, low_bit)
         elif model_family == "baichuan":
             from .models.baichuan import load_gguf_baichuan
-            model, tokenizer = load_gguf_baichuan(loader, dtype)
+            model, tokenizer = load_gguf_baichuan(loader, dtype, low_bit)
         elif model_family == "bloom":
             from .models.bloom import load_gguf_bloom
             model, tokenizer = load_gguf_bloom(loader, dtype)
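
The last two hunks change `load_gguf_model` so that the llama, mixtral, mistral and baichuan converters receive the caller-supplied `low_bit` instead of a value derived from the GGUF file's `general.file_type` via `qtype_map` (the yuan2 and bloom paths are unchanged here). A minimal call-site sketch is below; the module path and the returned `(model, tokenizer)` pair are assumptions inferred from the package layout and the hunks above, not something confirmed by this diff.

```python
# Minimal call-site sketch: after this change, the low-bit dtype used for the
# converted model comes from the caller rather than from the GGUF file's qtype.
import torch
from bigdl.llm.transformers.gguf.api import load_gguf_model  # module path assumed

model, tokenizer = load_gguf_model(
    "llama-2-7b-chat.Q4_0.gguf",  # file name taken from the README example
    dtype=torch.float,
    low_bit="sym_int5",           # previously always derived via qtype_map
)
```
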