LLM: add UT for starcoder (convert, inference); update examples and readme (#8379)
* first commit to add path
* update example and readme
* update path
* fix
* update based on comment
parent: e68d631c0a
commit: 4be784a49d
7 changed files with 57 additions and 8 deletions
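For orientation, the flow these new unit tests exercise can be sketched as follows. This is a minimal sketch, not code from the commit: the input and output paths are placeholders, `n_threads` is assumed to be accepted by `Starcoder` the same way the README shows it for `Llama`, and the `llm_convert` keyword arguments and `Starcoder` call mirror the test code in the diffs below.

```python
# Minimal sketch of the convert-then-infer flow covered by the new StarCoder UTs.
from bigdl.llm import llm_convert
from bigdl.llm.models import Starcoder

# Convert an original gpt_bigcode-santacoder checkpoint to a BigDL-LLM INT4 ggml file
# (paths are placeholders; the kwargs match the new test_convert_starcoder test).
converted_model_path = llm_convert(model="./models/gpt_bigcode-santacoder",
                                   outfile="./output_starcoder",
                                   model_family='starcoder',
                                   model_format="pth",
                                   outtype='int4')

# Load the converted checkpoint and run a short completion, as the inference UT does.
llm = Starcoder(converted_model_path, n_threads=4)  # n_threads assumed, as in the Llama example
output = llm("def print_hello_world(", max_tokens=32)
print(output['choices'][0]['text'])
```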
.github/workflows/llm-nightly-test.yml (vendored, 7 changes)

@@ -24,10 +24,12 @@ jobs:
       LLAMA_ORIGIN_PATH: ./llm/models/llama-7b-hf
       GPTNEOX_ORIGIN_PATH: ./llm/models/gptneox-7b-redpajama-bf16
       BLOOM_ORIGIN_PATH: ./llm/models/bloomz-7b1
+      STARCODER_ORIGIN_PATH: ./llm/models/gpt_bigcode-santacoder
       INT4_CKPT_DIR: ./llm/ggml-actions/nightly
       LLAMA_INT4_CKPT_PATH: ./llm/ggml-actions/nightly/bigdl_llm_llama_q4_0.bin
       GPTNEOX_INT4_CKPT_PATH: ./llm/ggml-actions/nightly/bigdl_llm_gptneox_q4_0.bin
       BLOOM_INT4_CKPT_PATH: ./llm/ggml-actions/nightly/bigdl_llm_bloom_q4_0.bin
+      STARCODER_INT4_CKPT_PATH: ./llm/ggml-actions/nightly/bigdl_llm_starcoder_q4_0.bin
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}

@@ -57,6 +59,10 @@ jobs:
            echo "Directory $BLOOM_ORIGIN_PATH not found. Downloading from FTP server..."
            wget -r -nH --no-verbose --cut-dirs=1 $llm_ftp_url/llm/bloomz-7b1 -P $ORIGIN_DIR
          fi
+        if [ ! -d $STARCODER_ORIGIN_PATH ]; then
+           echo "Directory $STARCODER_ORIGIN_PATH not found. Downloading from FTP server..."
+           wget -r -nH --no-verbose --cut-dirs=1 $llm_ftp_url/llm/gpt_bigcode-santacoder -P $ORIGIN_DIR
+        fi

       - name: Run LLM convert test
         shell: bash

@@ -85,3 +91,4 @@ jobs:
           tnftp -u ${llm_ftp_url}/${INT4_CKPT_DIR:1}/bigdl_llm_llama_7b_q4_0.bin $LLAMA_INT4_CKPT_PATH
           tnftp -u ${llm_ftp_url}/${INT4_CKPT_DIR:1}/bigdl_llm_redpajama_7b_q4_0.bin $GPTNEOX_INT4_CKPT_PATH
           tnftp -u ${llm_ftp_url}/${INT4_CKPT_DIR:1}/bigdl_llm_bloom_7b_q4_0.bin $BLOOM_INT4_CKPT_PATH
+          tnftp -u ${llm_ftp_url}/${INT4_CKPT_DIR:1}/bigdl_llm_santacoder_1b_q4_0.bin $STARCODER_INT4_CKPT_PATH
.github/workflows/llm_unit_tests_linux.yml (vendored, 5 changes)

@@ -32,6 +32,7 @@ jobs:
       LLAMA_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_llama_7b_q4_0.bin
       GPTNEOX_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_redpajama_7b_q4_0.bin
       BLOOM_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_bloom_7b_q4_0.bin
+      STARCODER_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_santacoder_1b_q4_0.bin
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}

@@ -93,6 +94,10 @@ jobs:
            echo "Directory $BLOOM_INT4_CKPT_PATH not found. Downloading from FTP server..."
            wget --no-verbose $llm_ftp_url/${BLOOM_INT4_CKPT_PATH:1} -P $INT4_CKPT_DIR
          fi
+        if [ ! -d $STARCODER_INT4_CKPT_PATH ]; then
+           echo "Directory $STARCODER_INT4_CKPT_PATH not found. Downloading from FTP server..."
+           wget --no-verbose $llm_ftp_url/${STARCODER_INT4_CKPT_PATH:1} -P $INT4_CKPT_DIR
+        fi

       - name: Run LLM inference test
         shell: bash
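Both workflows only export the new `STARCODER_*` paths as environment variables; the unit tests further down read them with `os.environ.get`. A small sketch (variable names loosely mirror the test modules):

```python
import os

# Exported by the CI workflows above, consumed by the tests at import time.
starcoder_origin_path = os.environ.get('STARCODER_ORIGIN_PATH')        # original checkpoint, used by the convert test
starcoder_int4_ckpt_path = os.environ.get('STARCODER_INT4_CKPT_PATH')  # converted INT4 file, used by the inference test
output_dir = os.environ.get('INT4_CKPT_DIR')                           # where converted checkpoints are written
```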
@@ -9,7 +9,7 @@ Users could use `bigdl-llm` to
 
 Currently `bigdl-llm` has supported
 - Precision: INT4
-- Model Family: llama, gptneox, bloom
+- Model Family: llama, gptneox, bloom, starcoder
 - Platform: Ubuntu 20.04 or later, CentOS 7 or later, Windows 10/11
 - Device: CPU
 - Python: 3.9 (recommended) or later

@@ -121,7 +121,7 @@ tokenizer.batch_decode(tokens_id)
 `llama-cpp-python` has become a popular pybinding for `llama.cpp` program. Some users may be familiar with this API so `bigdl-llm` reserve this API and extend it to other model families (e.g., gptneox, bloom)
 
 ```python
-from bigdl.llm.models import Llama, Bloom, Gptneox
+from bigdl.llm.models import Llama, Bloom, Gptneox, Starcoder
 
 llm = Llama("/path/to/llama-7b-int4/bigdl-llm-xxx.bin", n_threads=4)
 result = llm("what is ai")

@@ -144,3 +144,5 @@ To avoid difficaulties during the installtion. `bigdl-llm` release the C impleme
 | gptneox | Windows | MSVC 19.36.32532.0 | |
 | bloom | Linux | GCC 9.4.0 | 2.31 |
 | bloom | Windows | MSVC 19.36.32532.0 | |
+| starcoder | Linux | GCC 9.4.0 | 2.31 |
+| starcoder | Windows | MSVC 19.36.32532.0 | |
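With the extended import, a StarCoder checkpoint can be driven through the same llama-cpp-python-style API as the other families. A minimal sketch, assuming a converted INT4 checkpoint at a placeholder path and that `n_threads` is accepted as in the `Llama` example above:

```python
from bigdl.llm.models import Starcoder

# Placeholder path to a converted StarCoder INT4 checkpoint.
llm = Starcoder("/path/to/santacoder-int4/bigdl-llm-xxx.bin", n_threads=4)
result = llm("def print_hello_world(", max_tokens=32)
print(result['choices'][0]['text'])
```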
@@ -2,7 +2,7 @@
 
 In this example, we show a pipeline to convert a large language model to low precision (INT4), and then conduct inference on the converted INT4 model, using BigDL-LLM transformers-like API.
 
-> **Note**: BigDL-LLM currently supports model family LLaMA, GPT-NeoX, and BLOOM.
+> **Note**: BigDL-LLM currently supports model family LLaMA, GPT-NeoX, BLOOM and StarCoder.
 
 ## Prepare Environment
 We suggest using conda to manage environment:

@@ -19,12 +19,13 @@ python ./int4_pipeline.py --thread-num THREAD_NUM --model-family MODEL_FAMILY
 ```
 arguments info:
 - `--thread-num THREAD_NUM`: **required** argument defining the number of threads to use for inference. It is default to be `2`.
-- `--model-family MODEL_FAMILY`: **required** argument defining the model family of the large language model (supported option: `'llama'`, `'gptneox'`, `'bloom'`). It is default to be `'llama'`.
+- `--model-family MODEL_FAMILY`: **required** argument defining the model family of the large language model (supported option: `'llama'`, `'gptneox'`, `'bloom'`, `'starcoder'`). It is default to be `'llama'`.
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: optional argument defining the huggingface repo id from which the large language model is downloaded, or the path to the huggingface checkpoint folder for the model.
 
   - When model family is `'llama'`, it is default to be `'decapoda-research/llama-7b-hf'`.
   - When model family is `'gptneox'`, it is default to be `'togethercomputer/RedPajama-INCITE-7B-Chat'`.
   - When model family is `'bloom'`, it is default to be `'bigscience/bloomz-7b1'`.
+  - When model family is `'starcoder'`, it is default to be `'bigcode/gpt_bigcode-santacoder'`.
 
 > **Note** `REPO_ID_OR_MODEL_PATH` should fits your inputed `MODEL_FAMILY`.
 - `--promp PROMPT`: optional argument defining the prompt to be infered. It is default to be `'Q: What is CPU? A:'`.

@@ -93,4 +94,16 @@ inference: total time = xxxx ms
 Inference time (fast forward): xxxx s
 Output:
 {'id': 'cmpl-a0ab2953-e08c-449c-b476-e21ad5bb84b0', 'object': 'text_completion', 'created': 1686557434, 'model': './bigdl_llm_bloom_q4_0.bin', 'choices': [{'text': 'Q: What is CPU? A: central processing unit</s>', 'index': 0, 'logprobs': None, 'finish_reason': None}], 'usage': {'prompt_tokens': None, 'completion_tokens': None, 'total_tokens': None}}
+```
+
+### Model family StarCoder
+```log
+bigdl-llm: mem per token = 313912 bytes
+bigdl-llm: load time = xxxx ms
+bigdl-llm: sample time = xxxx ms
+bigdl-llm: predict time = xxxx ms / xxxx ms per token
+bigdl-llm: total time = xxxx ms
+Inference time (fast forward): xxxx s
+Output:
+{'id': 'cmpl-c2358898-cad0-47f4-af5b-84bb575eb942', 'object': 'text_completion', 'created': 1687864692, 'model': './output_starcoder/bigdl_llm_starcoder_q4_0.bin', 'choices': [{'text': ' %s" % cpu)\n print("Q: What is RAM? A: %s GB" % ram)\n print("Q: What is BAT? A: %s" % bat)\n print("Q: What is the number of CPU\'s? A: %s" % ncpus)', 'index': 0, 'logprobs': None, 'finish_reason': None}], 'usage': {'prompt_tokens': 8, 'completion_tokens': 64, 'total_tokens': 72}}
 ```
@@ -80,9 +80,9 @@ def inference(llm, repo_id_or_model_path, model_family, prompt):
         print(f'Output:\n{output}')
 
 
-    if model_family in ['llama', 'gptneox', 'bloom']:
+    if model_family in ['llama', 'gptneox', 'bloom', 'starcoder']:
         # Option 3: fast forward
-        # note that currently Bloom family model only supports the fast forward inference method
+        # note that currently Bloom/Starcoder family model only supports the fast forward inference method
         print('-'*20, ' fast forward ', '-'*20)
         st = time.time()
 

@@ -99,7 +99,7 @@ def main():
                         help='Number of threads to use for inference')
     parser.add_argument('--model-family', type=str, default='llama', required=True,
                         help="The model family of the large language model (supported option: 'llama', "
-                             "'gptneox', 'bloom')")
+                             "'gptneox', 'bloom', 'starcoder')")
     parser.add_argument('--repo-id-or-model-path', type=str,
                         help='The huggingface repo id for the larga language model to be downloaded'
                              ', or the path to the huggingface checkpoint folder')

@@ -117,6 +117,8 @@ def main():
         repo_id_or_model_path = 'togethercomputer/RedPajama-INCITE-7B-Chat'
     elif args.model_family == 'bloom':
         repo_id_or_model_path = 'bigscience/bloomz-7b1'
+    elif args.model_family == 'starcoder':
+        repo_id_or_model_path = 'bigcode/gpt_bigcode-santacoder'
 
     # Step 1: convert original model to BigDL llm model
     bigdl_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
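Condensed, the starcoder fast-forward branch that the example walks through behaves roughly like the sketch below. It is a sketch only: the checkpoint path comes from the sample StarCoder log output above, the prompt and thread count are the documented defaults, and `max_tokens=32` is illustrative.

```python
import time
from bigdl.llm.models import Starcoder

# Converted INT4 checkpoint path taken from the sample StarCoder log output.
llm = Starcoder('./output_starcoder/bigdl_llm_starcoder_q4_0.bin', n_threads=2)

# Fast forward is currently the only inference method for the Bloom/Starcoder families.
print('-' * 20, ' fast forward ', '-' * 20)
st = time.time()
output = llm('Q: What is CPU? A:', max_tokens=32)
print(f'Inference time (fast forward): {time.time() - st} s')
print(f'Output:\n{output}')
```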
@@ -25,6 +25,7 @@ from bigdl.llm import llm_convert
 llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH')
 gptneox_model_path = os.environ.get('GPTNEOX_ORIGIN_PATH')
 bloom_model_path = os.environ.get('BLOOM_ORIGIN_PATH')
+starcoder_model_path = os.environ.get('STARCODER_ORIGIN_PATH')
 output_dir = os.environ.get('INT4_CKPT_DIR')
 
 class TestConvertModel(TestCase):

@@ -52,6 +53,15 @@ class TestConvertModel(TestCase):
                                            model_format="pth",
                                            outtype='int4')
         assert os.path.isfile(converted_model_path)
+
+    def test_convert_starcoder(self):
+        converted_model_path = llm_convert(model=starcoder_model_path,
+                                           outfile=output_dir,
+                                           model_family='starcoder',
+                                           model_format="pth",
+                                           outtype='int4')
+        assert os.path.isfile(converted_model_path)
+
 
 if __name__ == '__main__':
     pytest.main([__file__])
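To run the new convert test outside CI, the two environment variables have to be set before the test module is imported, since they are read at module level. A hypothetical local invocation (the test file path is a placeholder):

```python
import os
import pytest

# Placeholders: a local gpt_bigcode-santacoder checkout and a writable output directory.
os.environ['STARCODER_ORIGIN_PATH'] = './llm/models/gpt_bigcode-santacoder'
os.environ['INT4_CKPT_DIR'] = './llm/ggml-actions/nightly'

# The module path is a placeholder; point it at the convert test shown above.
pytest.main(['path/to/test_convert_model.py'])
```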
@@ -15,7 +15,7 @@
 #
 
 
-from bigdl.llm.models import Llama, Bloom, Gptneox
+from bigdl.llm.models import Llama, Bloom, Gptneox, Starcoder
 from bigdl.llm.utils import get_avx_flags
 import pytest
 from unittest import TestCase

@@ -28,6 +28,7 @@ class Test_Models_Basics(TestCase):
         self.llama_model_path = os.environ.get('LLAMA_INT4_CKPT_PATH')
         self.bloom_model_path = os.environ.get('BLOOM_INT4_CKPT_PATH')
         self.gptneox_model_path = os.environ.get('GPTNEOX_INT4_CKPT_PATH')
+        self.starcoder_model_path = os.environ.get('STARCODER_INT4_CKPT_PATH')
 
     def test_llama_completion_success(self):
         llm = Llama(self.llama_model_path)

@@ -58,6 +59,15 @@ class Test_Models_Basics(TestCase):
     def test_gptneox_completion_with_stream_success(self):
         llm = Gptneox(self.gptneox_model_path)
         output = llm("Q: What is the capital of France? A:", max_tokens=32, stream=True)
+
+    def test_starcoder_completion_success(self):
+        llm = Starcoder(self.starcoder_model_path)
+        output = llm("def print_hello_world(", max_tokens=32, stream=False)
+        # assert "Paris" in output['choices'][0]['text']
+
+    def test_starcoder_completion_with_stream_success(self):
+        llm = Starcoder(self.starcoder_model_path)
+        output = llm("def print_hello_world(", max_tokens=32, stream=True)
 
 
 if __name__ == '__main__':
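The streaming test only requests `stream=True` without consuming the result. If the Starcoder binding follows the llama-cpp-python convention used by the other families, the returned object is an iterator of completion chunks; this consumption pattern is an assumption, not something shown in the diff:

```python
from bigdl.llm.models import Starcoder

llm = Starcoder('./bigdl_llm_santacoder_1b_q4_0.bin')  # placeholder checkpoint path

# Assumed llama-cpp-python-style streaming: iterate over completion chunks.
for chunk in llm("def print_hello_world(", max_tokens=32, stream=True):
    print(chunk['choices'][0]['text'], end='', flush=True)
```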