Remove chatglm_C Module to Eliminate LGPL Dependency (#11178)
* remove chatglm_C.**.pyd to resolve the ngsolve weak-copyright (LGPL) vulnerability
* fix style check error
* remove chatglm native int4 from langchain
parent 50b5f4476f, commit 401013a630
14 changed files with 19 additions and 690 deletions
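With the native ChatGLM INT4 path removed, ChatGLM models are expected to go through the Hugging Face transformers-style INT4 API instead (as the updated LangChain docs below point out). A minimal migration sketch, assuming the `ipex_llm.transformers.AutoModel` API and a hypothetical model id; treat it as an illustration rather than the exact replacement wiring:

```python
# Hypothetical sketch: load ChatGLM through the transformers-style INT4 API
# instead of the removed native chatglm_C backend.
from ipex_llm.transformers import AutoModel          # assumed API
from transformers import AutoTokenizer

model_id = "THUDM/chatglm2-6b"                        # hypothetical model id
model = AutoModel.from_pretrained(model_id,
                                  load_in_4bit=True,  # INT4 weight-only quantization
                                  trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("What is AI?", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```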
.github/workflows/llm-binary-build.yml (vendored, 101 changed lines)
@@ -72,12 +72,6 @@ jobs:
           export http_proxy=${HTTP_PROXY}
           export https_proxy=${HTTPS_PROXY}
           yum install -y gcc-toolset-11 cmake git
-          conda remove -n python39 --all -y
-          conda create -n python39 python=3.9 -y
-          conda remove -n python310 --all -y
-          conda create -n python310 python=3.10 -y
-          conda remove -n python311 --all -y
-          conda create -n python311 python=3.11 -y
       - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
         with:
           repository: "intel-analytics/llm.cpp"
@@ -109,42 +103,6 @@ jobs:
           mv build/libstarcoder-api.so release/libstarcoder-api.so
           mv build/quantize-starcoder release/quantize-starcoder
           mv build/libstarcoder.so release/libstarcoder_avxvnni.so
-      - name: Build Chatglm
-        shell: bash
-        run: |
-          source activate python39 || conda activate python39
-          cd src/chatglm
-          scl enable gcc-toolset-11 "cmake -B build"
-          scl enable gcc-toolset-11 "cmake --build build --config Release -j"
-      - name: Move Chatglm binaries
-        shell: bash
-        run: |
-          mv src/chatglm/build/main release/main-chatglm_vnni
-          mv src/chatglm/build/_C.cpython-39-x86_64-linux-gnu.so release/chatglm_C.cpython-39-x86_64-linux-gnu.so
-      - name: Build Chatglm Py310
-        shell: bash
-        run: |
-          source activate python310 || conda activate python310
-          cd src/chatglm
-          rm -r build
-          scl enable gcc-toolset-11 "cmake -B build"
-          scl enable gcc-toolset-11 "cmake --build build --config Release -j"
-      - name: Move Chatglm binaries Py310
-        shell: bash
-        run: |
-          mv src/chatglm/build/_C.cpython-310-x86_64-linux-gnu.so release/chatglm_C.cpython-310-x86_64-linux-gnu.so
-      - name: Build Chatglm Py311
-        shell: bash
-        run: |
-          source activate python311 || conda activate python311
-          cd src/chatglm
-          rm -r build
-          scl enable gcc-toolset-11 "cmake -B build"
-          scl enable gcc-toolset-11 "cmake --build build --config Release -j"
-      - name: Move Chatglm binaries Py311
-        shell: bash
-        run: |
-          mv src/chatglm/build/_C.cpython-311-x86_64-linux-gnu.so release/chatglm_C.cpython-311-x86_64-linux-gnu.so
       - name: Archive build files
         uses: actions/upload-artifact@v3
         with:
@@ -155,9 +113,6 @@ jobs:
         shell: bash
         run: |
           make clean
-          conda remove -n python39 --all -y
-          conda remove -n python310 --all -y
-          conda remove -n python311 --all -y
 
   check-linux-avx512-artifact:
     if: ${{contains(inputs.platform, 'Linux')}}
@@ -286,8 +241,6 @@ jobs:
           export http_proxy=${HTTP_PROXY}
           export https_proxy=${HTTPS_PROXY}
           yum install -y gcc-toolset-11 cmake git
-          conda remove -n python39 --all -y
-          conda create -n python39 python=3.9 -y
       - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
         with:
           repository: "intel-analytics/llm.cpp"
@@ -299,11 +252,6 @@ jobs:
         run: |
           scl enable gcc-toolset-11 "cmake -DONLYAVX=OFF -DONLYAVX2=OFF -B build"
           scl enable gcc-toolset-11 "cmake --build build --config Release -j"
-          # build chatglm
-          source activate python39 || conda activate python39
-          cd src/chatglm
-          scl enable gcc-toolset-11 "cmake -B build"
-          scl enable gcc-toolset-11 "cmake --build build --config Release -j"
       - name: Move amx release binary
         shell: bash
         run: |
@@ -316,9 +264,6 @@ jobs:
           mv build/libgptneox.so amx_release/libgptneox_amx.so
           mv build/quantize-starcoder amx_release/quantize-starcoder_amx
           mv build/libstarcoder.so amx_release/libstarcoder_amx.so
-          # chatglm binary files
-          mv src/chatglm/build/main amx_release/main-chatglm_amx
-          # mv src/chatglm/build/_C.cpython-39-x86_64-linux-gnu.so amx_release/chatglm_C.cpython-39-x86_64-linux-gnu.so
       - name: Archive amx build files
         uses: actions/upload-artifact@v3
         with:
@@ -329,7 +274,6 @@ jobs:
         shell: bash
         run: |
           make clean
-          conda remove -n python39 --all -y
 
   check-windows-avx2-artifact:
     if: ${{contains(inputs.platform, 'Windows')}}
@@ -393,10 +337,6 @@ jobs:
     needs: check-windows-avx-vnni-artifact
     if: needs.check-windows-avx-vnni-artifact.outputs.if-exists == 'false'
     steps:
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.9"
       - name: Set access token
         run: |
           echo "github_access_token=$env:GITHUB_ACCESS_TOKEN" >> $env:GITHUB_ENV
@@ -438,47 +378,6 @@ jobs:
           # mv build/Release/main-starcoder.exe release/main-starcoder_vnni.exe
           mv build/Release/quantize-starcoder.exe release/quantize-starcoder_vnni.exe
           mv build/Release/starcoder.dll release/libstarcoder_vnni.dll
-      - name: Build Chatglm
-        shell: powershell
-        run: |
-          cd src/chatglm
-          cmake -DAVXVNNI=ON -B build
-          cmake --build build --config Release -j
-      - name: Move Chatglm binaries
-        shell: powershell
-        run: |
-          mv src/chatglm/build/Release/main.exe release/main-chatglm_vnni.exe
-          mv src/chatglm/build/Release/_C.cp39-win_amd64.pyd release/chatglm_C.cp39-win_amd64.pyd
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-      - name: Build Chatglm Py310
-        shell: powershell
-        run: |
-          cd src/chatglm
-          rm -r build
-          cmake -DAVXVNNI=ON -B build
-          cmake --build build --config Release -j
-      - name: Move Chatglm binaries Py310
-        shell: powershell
-        run: |
-          mv src/chatglm/build/Release/_C.cp310-win_amd64.pyd release/chatglm_C.cp310-win_amd64.pyd
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - name: Build Chatglm Py311
-        shell: powershell
-        run: |
-          cd src/chatglm
-          rm -r build
-          cmake -DAVXVNNI=ON -B build
-          cmake --build build --config Release -j
-      - name: Move Chatglm binaries Py311
-        shell: powershell
-        run: |
-          mv src/chatglm/build/Release/_C.cp311-win_amd64.pyd release/chatglm_C.cp311-win_amd64.pyd
       - name: Archive build files
         uses: actions/upload-artifact@v3
         with:
@@ -31,7 +31,7 @@ You may also convert Hugging Face *Transformers* models into native INT4 format,
 ```eval_rst
 .. note::
 
-   * Currently only llama/bloom/gptneox/starcoder/chatglm model families are supported; for other models, you may use the Hugging Face ``transformers`` INT4 format as described `above <./langchain_api.html#using-hugging-face-transformers-int4-format>`_.
+   * Currently only llama/bloom/gptneox/starcoder model families are supported; for other models, you may use the Hugging Face ``transformers`` INT4 format as described `above <./langchain_api.html#using-hugging-face-transformers-int4-format>`_.
 
   * You may choose the corresponding API developed for specific native models to load the converted model.
 ```
@@ -41,9 +41,9 @@ from ipex_llm.langchain.llms import LlamaLLM
 from ipex_llm.langchain.embeddings import LlamaEmbeddings
 from langchain.chains.question_answering import load_qa_chain
 
-# switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models
+# switch to GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models
 embeddings = LlamaEmbeddings(model_path='/path/to/converted/model.bin')
-# switch to ChatGLMLLM/GptneoxLLM/BloomLLM/StarcoderLLM to load other models
+# switch to GptneoxLLM/BloomLLM/StarcoderLLM to load other models
 ipex_llm = LlamaLLM(model_path='/path/to/converted/model.bin')
 
 doc_chain = load_qa_chain(ipex_llm, ...)
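Since the ChatGLM native LangChain wrappers are dropped by this commit, ChatGLM users of the LangChain integration would switch to the transformers-backed wrapper mentioned in the API reference below. A hedged sketch, assuming `TransformersLLM.from_model_id` behaves as in the transformers INT4 LangChain API and using a hypothetical model id:

```python
# Sketch only: LangChain usage for ChatGLM via the transformers-backed wrapper,
# replacing the removed ChatGLMLLM/ChatGLMEmbeddings native wrappers.
from ipex_llm.langchain.llms import TransformersLLM    # assumed to remain available
from langchain.chains.question_answering import load_qa_chain

llm = TransformersLLM.from_model_id(                   # assumed constructor
    model_id="THUDM/chatglm2-6b",                      # hypothetical model id
    model_kwargs={"trust_remote_code": True},
)
doc_chain = load_qa_chain(llm, chain_type="stuff")
```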
@@ -31,7 +31,7 @@ IPEX-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which imp
 Native Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, you could also use the following LLM wrappers with the native (cpp) implementation for maximum performance.
+For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also use the following LLM wrappers with the native (cpp) implementation for maximum performance.
 
 .. tabs::
 
@@ -47,18 +47,6 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo
          .. automethod:: stream
          .. automethod:: get_num_tokens
 
-   .. tab:: ChatGLM
-
-      .. autoclass:: ipex_llm.langchain.llms.ChatGLMLLM
-         :members:
-         :undoc-members:
-         :show-inheritance:
-         :exclude-members: ggml_model, ggml_module, client, model_path, kwargs
-
-         .. automethod:: validate_environment
-         .. automethod:: stream
-         .. automethod:: get_num_tokens
-
    .. tab:: Bloom
 
       .. autoclass:: ipex_llm.langchain.llms.BloomLLM
@@ -36,8 +36,7 @@ def load(model_path, model_family, n_threads):
         "llama": LlamaForCausalLM,
         "gptneox": GptneoxForCausalLM,
         "bloom": BloomForCausalLM,
-        "starcoder": StarcoderForCausalLM,
-        "chatglm": ChatGLMForCausalLM
+        "starcoder": StarcoderForCausalLM
     }
 
     if model_family in model_family_to_class:
@@ -55,7 +54,7 @@ def load(model_path, model_family, n_threads):
 
 def inference(llm, repo_id_or_model_path, model_family, prompt):
 
-    if model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm']:
+    if model_family in ['llama', 'gptneox', 'bloom', 'starcoder']:
         # ------ Option 1: Use IPEX-LLM based tokenizer
         print('-'*20, ' IPEX-LLM based tokenizer ', '-'*20)
         st = time.time()
@@ -109,9 +108,9 @@ def main():
     parser.add_argument('--thread-num', type=int, default=2, required=True,
                         help='Number of threads to use for inference')
     parser.add_argument('--model-family', type=str, default='llama', required=True,
-                        choices=["llama", "llama2", "bloom", "gptneox", "starcoder", "chatglm"],
+                        choices=["llama", "llama2", "bloom", "gptneox", "starcoder"],
                         help="The model family of the large language model (supported option: 'llama', 'llama2', "
-                             "'gptneox', 'bloom', 'starcoder', 'chatglm')")
+                             "'gptneox', 'bloom', 'starcoder')")
     parser.add_argument('--repo-id-or-model-path', type=str, required=True,
                         help='The path to the huggingface checkpoint folder')
     parser.add_argument('--prompt', type=str, default='Once upon a time, there existed a little girl who liked to have adventures. ',
@@ -86,12 +86,7 @@ windows_binarys = [
     "quantize-llama_vnni.exe",
     "quantize-gptneox_vnni.exe",
     "quantize-bloom_vnni.exe",
-    "quantize-starcoder_vnni.exe",
-
-    "main-chatglm_vnni.exe",
-    "chatglm_C.cp39-win_amd64.pyd",
-    "chatglm_C.cp310-win_amd64.pyd",
-    "chatglm_C.cp311-win_amd64.pyd"
+    "quantize-starcoder_vnni.exe"
 ]
 linux_binarys = [
     "libllama_avx.so",
@@ -125,13 +120,7 @@ linux_binarys = [
     "main-llama",
     "main-gptneox",
     "main-bloom",
-    "main-starcoder",
-
-    "main-chatglm_vnni",
-    "main-chatglm_amx",
-    "chatglm_C.cpython-39-x86_64-linux-gnu.so",
-    "chatglm_C.cpython-310-x86_64-linux-gnu.so",
-    "chatglm_C.cpython-311-x86_64-linux-gnu.so"
+    "main-starcoder"
 ]
 
 ext_lib_urls = [
@@ -76,10 +76,6 @@ def _convert_starcoder(model_path, outfile_dir, outtype):
     _convert_starcoder_hf_to_ggml(model_path, outfile_dir, outtype)
 
 
-def _convert_chatglm(model_path, outfile_dir, outtype):
-    return _convert_chatglm_hf_to_ggml(model_path, outfile_dir, outtype)
-
-
 def _convert_to_ggml(model_path: str, outfile_dir: str,
                      model_family: str = 'llama', outtype: str="fp16"):
     """
@@ -16,7 +16,7 @@
 import os
 import time
 from pathlib import Path
-from ipex_llm.ggml.convert import _convert_to_ggml, _convert_chatglm
+from ipex_llm.ggml.convert import _convert_to_ggml
 from ipex_llm.ggml.quantize import quantize
 from ipex_llm.utils.common import invalidInputError
 import argparse
@@ -54,9 +54,9 @@ def convert_model(input_path: str,
     # make sure directory exists
     os.makedirs(output_path, exist_ok=True)
     # check input value
-    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder', 'chatglm'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                       "Now we only support quantization of model \
-                       family('llama', 'bloom', 'gptneox', 'starcoder', 'chatglm')",
+                       family('llama', 'bloom', 'gptneox', 'starcoder')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.isdir(output_path),
                       "The output_path {} was not a directory".format(output_path))
@@ -78,12 +78,6 @@ def convert_model(input_path: str,
                        family('llama', 'gptneox', 'starcoder')",
                       "{} is not in the list.".format(model_family))
 
-    # chatglm merges convertion and quantization into one operation.
-    if model_family == 'chatglm':
-        return _convert_chatglm(model_path=input_path,
-                                outfile_dir=output_path,
-                                outtype=dtype)
-
     if tmp_path is not None:
         model_name = Path(input_path).stem
         tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}')
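After this change `convert_model` only accepts the four remaining native families and no longer takes the combined chatglm convert-plus-quantize path. A small usage sketch for a still-supported family; the keyword names follow the code visible in the hunks above, while the import path and file paths are assumptions:

```python
# Sketch: converting a still-supported family with the trimmed-down API.
from ipex_llm.ggml.convert_model import convert_model   # module path assumed

convert_model(input_path="/path/to/llama-hf-checkpoint",  # placeholder paths
              output_path="/path/to/output-dir",          # must be an existing directory
              model_family="llama",                       # 'chatglm' is no longer accepted
              dtype="int4")
```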
@@ -1,22 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# This would makes sure Python is aware there is more than one sub-package within bigdl,
-# physically located elsewhere.
-# Otherwise there would be module not found error in non-pip's setting as Python would
-# only search the first bigdl package and end up finding only one sub-package.
-
-from .chatglm import ChatGLM
@@ -1,428 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ===========================================================================
-#
-# This file is adapted from
-# https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py
-#
-# MIT License
-#
-# Copyright (c) 2023 Andrei Betlen
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# This would makes sure Python is aware there is more than one sub-package within bigdl,
-# physically located elsewhere.
-# Otherwise there would be module not found error in non-pip's setting as Python would
-# only search the first bigdl package and end up finding only one sub-package.
-
-
-from .chatglm_cpp import chatglm_load, chatglm_tokenize, chatglm_detokenize, \
-    chatglm_forward, chatglm_eos_token
-from ipex_llm.utils.common import invalidInputError
-from ipex_llm.ggml.model.generation import GenerationMixin
-from typing import List, Optional, Generator, Sequence, Union
-import time
-import uuid
-import warnings
-
-
-class ChatGLM(GenerationMixin):
-    """High-level Python wrapper for a chatglm.cpp model."""
-
-    def __init__(
-        self,
-        model_path: str,
-        n_ctx: int = 512,
-        n_parts: int = -1,
-        n_gpu_layers: int = 0,
-        seed: int = -1,
-        f16_kv: bool = True,
-        logits_all: bool = False,
-        vocab_only: bool = False,
-        use_mmap: bool = False,
-        use_mlock: bool = False,
-        embedding: bool = False,
-        n_threads: Optional[int] = -1,
-        n_batch: int = 512,
-        last_n_tokens_size: int = 64,
-        lora_base: Optional[str] = None,
-        lora_path: Optional[str] = None,
-        verbose: bool = True,
-    ):
-        """Load a chatglm.cpp model from `model_path`.
-
-        Args:
-            model_path: Path to the model.
-            n_ctx: Maximum context size.
-            n_parts: Number of parts to split the model into. If -1, the number of parts
-                is automatically determined.
-            seed: Random seed. For default value -1, current timestamp is used as seed.
-            f16_kv: Use half-precision for key/value cache.
-            logits_all: Return logits for all tokens, not just the last token.
-            vocab_only: Only load the vocabulary no weights.
-            use_mmap: Use mmap if possible.
-            use_mlock: Force the system to keep the model in RAM.
-            embedding: Embedding mode only.
-            n_threads: Number of threads to use. Default to be -1, means auto.
-            n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval.
-            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
-            lora_base: Optional path to base model, useful if using a quantized base model and
-                you want to apply LoRA to an f16 model.
-            lora_path: Path to a LoRA file to apply to the model.
-            verbose: Print verbose output to stderr.
-
-        Raises:
-            ValueError: If the model path does not exist.
-
-        Returns:
-            A ChatGLM instance.
-        """
-
-        self.model_path = model_path
-        self.ctx = chatglm_load(model_path, use_mmap=use_mmap, n_ctx=n_ctx, n_threads=n_threads)
-        self.n_ctx = n_ctx
-        self.n_parts = n_parts
-        self.n_gpu_layers = n_gpu_layers
-        self.f16_kv = f16_kv
-        self.seed = seed
-        self.logits_all = logits_all
-        self.vocab_only = vocab_only
-        self.use_mmap = use_mmap
-        self.use_mlock = use_mlock
-        self.embedding = embedding
-        self.n_threads = n_threads
-        self.n_batch = n_batch
-        self.last_n_tokens_size = last_n_tokens_size
-        self.lora_base = lora_base
-        self.lora_path = lora_path
-        self.verbose = verbose
-        # TODO: Some parameters are temporarily not supported
-        unsupported_arg = {'n_parts': -1, 'n_gpu_layers': 0, 'f16_kv': True, 'logits_all': False,
-                           'vocab_only': False, 'use_mlock': False, 'embedding': False,
-                           'n_batch': 512, 'last_n_tokens_size': 64, 'lora_base': None,
-                           'lora_path': None, 'verbose': True}
-        for arg in unsupported_arg.keys():
-            if getattr(self, arg) != unsupported_arg[arg]:
-                warnings.warn(f"The parameter {arg} is temporarily unsupported, "
-                              "please use the default value.")
-
-    def __call__(
-        self,
-        prompt: str,
-        suffix: Optional[str] = None,
-        max_tokens: int = 128,
-        temperature: float = 0.95,
-        top_p: float = 0.7,
-        logprobs: Optional[int] = None,
-        echo: bool = False,
-        stop: Optional[Union[str, List[str]]]=[],
-        frequency_penalty: float = 0.0,
-        presence_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
-        top_k: int = 0,
-        stream: bool = False,
-        tfs_z: float = 1.0,
-        mirostat_mode: int = 0,
-        mirostat_tau: float = 5.0,
-        mirostat_eta: float = 0.1,
-        model: Optional[str] = None,
-    ):
-        # TODO: Some parameters are temporarily not supported
-        # Unsupported parameters are checked in `_supported_call`
-        return self._supported_call(prompt, max_tokens, stream, temperature, top_p, top_k,
-                                    stop, model, suffix, logprobs, echo, frequency_penalty,
-                                    presence_penalty, repeat_penalty, tfs_z, mirostat_mode,
-                                    mirostat_tau, mirostat_eta)
-
-    def _supported_call(self, prompt: str, max_tokens: int, stream: bool,
-                        temperature: float, top_p: float, top_k: int,
-                        stop: Optional[List[str]] = [], model: Optional[str] = None, *args):
-        # Check unsupporeted parameters
-        unsupported_arg = ['suffix', 'logprobs', 'echo',
-                           'frequency_penalty', 'presence_penalty', 'repeat_penalty',
-                           'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'model']
-        defult_value = {'suffix': None, 'logprobs': None, 'echo': False,
-                        'frequency_penalty': 0.0, 'presence_penalty': 0.0,
-                        'repeat_penalty': 1.1, 'tfs_z': 1.0, 'mirostat_mode': 0,
-                        'mirostat_tau': 5.0, 'mirostat_eta': 0.1}
-        for index in range(len(args)):
-            if args[index] != defult_value[unsupported_arg[index]]:
-                warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily "
-                              "unsupported, please use the default value.")
-
-        if stream:
-            return self.stream(prompt, max_tokens, temperature, top_p, top_k, stop, model)
-        else:
-            return self._eval(prompt, max_tokens, temperature, top_p, top_k, stop, model)
-
-    def _eval(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
-              stop: Optional[List[str]] = [], model: Optional[str] = None):
-
-        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
-        created: int = int(time.time())
-        if model is None:
-            model_name = self.model_path
-        else:
-            model_name = model
-
-        input_tokens = self._tokenize(prompt)
-        prompt_len = len(input_tokens)
-        if max_tokens < 1:
-            return {
-                "id": completion_id,
-                "object": "text_completion",
-                "created": created,
-                "model": model_name,
-                "choices": [
-                    {
-                        "text": prompt,
-                        "index": 0,
-                        "logprobs": None,
-                        "finish_reason": "length",
-                    }
-                ],
-                "usage":
-                {
-                    "prompt_tokens": prompt_len,
-                    "completion_tokens": 0,
-                    "total_tokens": prompt_len,
-                }
-            }
-
-        for i in range(max_tokens):
-            token = self.forward(input_ids=input_tokens,
-                                 top_k=top_k,
-                                 top_p=top_p,
-                                 temperature=temperature)
-            input_tokens.append(token)
-            if token == self.eos_token():
-                break
-
-        text = self.detokenize(input_tokens)
-        split_text = text[len(prompt):]
-        split_text.rstrip('�')  # remove partial emoji
-        if stop != []:
-            for stop_word in stop:
-                split_text = split_text.split(stop_word)[0]
-        if split_text != text:
-            finish_reason = "stop"
-        else:
-            finish_reason = None
-        completion_len = len(input_tokens) - prompt_len
-        return {
-            "id": completion_id,
-            "object": "text_completion",
-            "created": created,
-            "model": model_name,
-            "choices": [
-                {
-                    "text": prompt + split_text,
-                    "index": 0,
-                    "logprobs": None,
-                    "finish_reason": finish_reason,
-                }
-            ],
-            "usage": {
-                "prompt_tokens": prompt_len,
-                "completion_tokens": completion_len,
-                "total_tokens": prompt_len + completion_len,
-            }
-        }
-
-    def stream(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int,
-               stop: Optional[List[str]] = [], model: Optional[str] = None):
-        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
-        created: int = int(time.time())
-        if model is None:
-            model_name = self.model_path
-        else:
-            model_name = model
-        input_tokens = self._tokenize(prompt)
-        prompt_len = len(input_tokens)
-        if max_tokens < 1:
-            yield {
-                "id": completion_id,
-                "object": "text_completion",
-                "created": created,
-                "model": model_name,
-                "choices": [
-                    {
-                        "text": prompt,
-                        "index": 0,
-                        "logprobs": None,
-                        "finish_reason": "length",
-                    }
-                ],
-                "usage": {
-                    "prompt_tokens": prompt_len
-                }
-            }
-        else:
-            history_text = prompt
-            for i in range(max_tokens):
-                token = self.forward(input_ids=input_tokens,
-                                     top_k=top_k,
-                                     top_p=top_p,
-                                     temperature=temperature)
-                input_tokens.append(token)
-                if token == self.eos_token():
-                    print('\n')
-                    break
-                text = self.detokenize(input_tokens)
-                if text.endswith('�'):
-                    # generated new token is part of an emoji
-                    # (some emoji consists of multiple tokens)
-                    # continue to generate more tokens to decode this emoji
-                    continue
-                text = text[len(history_text):]
-                history_text += text
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": text,
-                            "index": 0,
-                            "logprobs": None,
-                            "finish_reason": None,
-                        }
-                    ],
-                    "usage": {
-                        "prompt_tokens": prompt_len
-                    }
-                }
-
-    def _tokenize(self, text: str, *args) -> List[int]:
-        """Tokenize a string.
-
-        Args:
-            text: The string to tokenize.
-
-        Raises:
-            RuntimeError: If the tokenization failed.
-
-        Returns:
-            A list of tokens.
-        """
-        warnings.warn("The parameter `add_bos` is unsupported, please use the default value.")
-        return chatglm_tokenize(self.ctx, text)
-
-    def detokenize(self, tokens: List[int]) -> str:
-        """Detokenize a list of tokens.
-
-        Args:
-            tokens: The list of tokens to detokenize.
-
-        Returns:
-            The detokenized string.
-        """
-        if isinstance(tokens, int):
-            tokens = [tokens]
-        return chatglm_detokenize(self.ctx, tokens)
-
-    def forward(self,
-                input_ids: List[int],
-                do_sample: bool = True,
-                top_k: int = 0,
-                top_p: float = 0.7,
-                temperature: float = 0.95,) -> int:
-        return chatglm_forward(ctx=self.ctx,
-                               input_ids=input_ids,
-                               do_sample=do_sample,
-                               top_k=top_k,
-                               top_p=top_p,
-                               temperature=temperature)
-
-    def eos_token(self) -> int:
-        return chatglm_eos_token(self.ctx)
-
-    def _generate(
-        self,
-        tokens: Sequence[int],
-        top_k: int = 0,
-        top_p: float = 0.7,
-        temp: float = 0.95,
-        repeat_penalty: float = 1.1,
-        reset: bool = True,
-        frequency_penalty: float = 0.0,
-        presence_penalty: float = 0.0,
-        tfs_z: float = 1.0,
-        mirostat_mode: int = 0,
-        mirostat_tau: float = 5.0,
-        mirostat_eta: float = 0.1,
-    ) -> Generator[int, Optional[Sequence[int]], None]:
-        """Create a generator of tokens from a prompt.
-
-        Examples:
-            >>> llm = ChatGLM(your_model_path)
-            >>> tokens = llm._tokenize(b"Learning English is")
-            >>> for token in llm._generate(tokens):
-            >>>     print(llm.detokenize([token]).decode("utf-8", errors="ignore"))
-
-        Args:
-            tokens: The prompt tokens.
-
-        Yields:
-            The generated tokens.
-        """
-        # TODO: Some parameters are temporarily not supported
-        # Unsupported parameters are checked in `_supported_generate`
-        return self._supported_generate(tokens, top_k, top_p, temp, repeat_penalty, reset,
-                                        frequency_penalty, presence_penalty, tfs_z, mirostat_mode,
-                                        mirostat_tau, mirostat_eta)
-
-    def _supported_generate(self, tokens: Sequence[int], top_k: int = 0, top_p: float = 0.7,
-                            temp: float = 0.95, *args):
-        # Check unsupporeted parameters
-        unsupported_arg = ['repeat_penalty', 'reset', 'frequency_penalty', 'presence_penalty',
-                           'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']
-        defult_value = {'repeat_penalty': 1.1, 'reset': True, 'frequency_penalty': 0.0,
-                        'presence_penalty': 0.0, 'tfs_z': 1.0, 'mirostat_mode': 0,
-                        'mirostat_tau': 5.0, 'mirostat_eta': 0.1}
-        for index in range(len(args)):
-            if args[index] != defult_value[unsupported_arg[index]]:
-                warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily "
-                              "unsupported, please use the default value.")
-
-        invalidInputError(self.ctx is not None, "The attribute `ctx` of `ChatGLM` object is None.")
-        while True:
-            token = self.forward(input_ids=tokens,
-                                 top_k=top_k,
-                                 top_p=top_p,
-                                 temperature=temp)
-            tokens_or_none = yield token
-            tokens.append(token)
-            if tokens_or_none is not None:
-                tokens.extend(tokens_or_none)
@@ -1,72 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# This would makes sure Python is aware there is more than one sub-package within bigdl,
-# physically located elsewhere.
-# Otherwise there would be module not found error in non-pip's setting as Python would
-# only search the first bigdl package and end up finding only one sub-package.
-
-
-from typing import List
-from pathlib import Path
-
-from ipex_llm.libs.chatglm_C import Pipeline, GenerationConfig
-
-
-class ChatGLMContext:
-    def __init__(self, pipeline: Pipeline, config: GenerationConfig):
-        self.pipeline = pipeline
-        self.config = config
-
-
-def chatglm_load(path: str,
-                 n_ctx: int,
-                 n_threads: int,
-                 use_mmap: bool = False,
-                 ) -> ChatGLMContext:
-    path = str(Path(path))
-    pipeline = Pipeline(path, use_mmap)
-    config = GenerationConfig(
-        max_length=n_ctx,
-        num_threads=n_threads,
-    )
-    return ChatGLMContext(pipeline, config)
-
-
-def chatglm_tokenize(ctx: ChatGLMContext, prompt: str) -> List[int]:
-    return ctx.pipeline.tokenizer.encode(prompt)
-
-
-def chatglm_detokenize(ctx: ChatGLMContext, input_ids: List[int]) -> str:
-    return ctx.pipeline.tokenizer.decode(input_ids)
-
-
-def chatglm_forward(ctx: ChatGLMContext,
-                    input_ids: List[int],
-                    do_sample: bool = True,
-                    top_k: int = 0,
-                    top_p: float = 0.7,
-                    temperature: float = 0.95,
-                    ) -> int:
-    ctx.config.do_sample = do_sample
-    ctx.config.top_k = top_k
-    ctx.config.top_p = top_p
-    ctx.config.temperature = temperature
-    return ctx.pipeline.forward(input_ids, ctx.config)
-
-
-def chatglm_eos_token(ctx: ChatGLMContext):
-    return ctx.pipeline.model.config.eos_token_id
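The two deleted modules above are thin Python wrappers around the compiled `chatglm_C` extension named in the commit title. For anyone who still wants a native chatglm.cpp route outside this package, the upstream chatglm-cpp project ships its own Python binding with a similar Pipeline/GenerationConfig surface; a hedged sketch, with the package API and paths treated as assumptions to verify against that project's docs:

```python
# Sketch only: drive chatglm.cpp through the upstream chatglm-cpp binding
# instead of the removed ipex_llm.libs.chatglm_C module (API assumed).
import chatglm_cpp

pipeline = chatglm_cpp.Pipeline("/path/to/chatglm-ggml.bin")  # hypothetical model path
print(pipeline.generate("Hello"))                             # assumed method name
```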
@@ -32,7 +32,6 @@ __all__ = [
     "LlamaLLM",
     "BloomLLM",
     "GptneoxLLM",
-    "ChatGLMLLM",
     "StarcoderLLM",
     "TransformersLLM",
     "TransformersPipelineLLM"
@@ -43,7 +42,6 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = {
     "LlamaLLM": LlamaLLM,
     "BloomLLM": BloomLLM,
     "GptneoxLLM": GptneoxLLM,
-    "ChatGLMLLM": ChatGLMLLM,
     "StarcoderLLM": StarcoderLLM,
     "TransformersPipelineLLM": TransformersPipelineLLM,
     "TransformersLLM": TransformersLLM
@@ -70,14 +70,13 @@ class BigdlNativeLLM(LLM):
                     "please switch to the new LLM API for sepcific models.")
 
     model_family: str = "llama"
-    """The model family: currently supports llama, gptneox, bloom, starcoder and chatglm."""
+    """The model family: currently supports llama, gptneox, bloom, starcoder."""
 
     family_info = {
         'llama': {'module': "ipex_llm.models" , 'class': "Llama"},
         'bloom': {'module': "ipex_llm.models", 'class': "Bloom"},
         'gptneox': {'module': "ipex_llm.models", 'class': "Gptneox"},
         'starcoder': {'module':"ipex_llm.models", 'class': "Starcoder"},
-        'chatglm': {'module':"ipex_llm.ggml.model.chatglm", 'class': "ChatGLM"},
     } #: :meta private:
     """Info necessary for different model families initiation and configure."""
 
@@ -688,11 +687,6 @@ class GptneoxLLM(_BaseCausalLM):
     ggml_module = "ipex_llm.models"
 
 
-class ChatGLMLLM(_BaseCausalLM):
-    ggml_model = "ChatGLM"
-    ggml_module = "ipex_llm.ggml.model.chatglm"
-
-
 class StarcoderLLM(_BaseCausalLM):
     ggml_model = "Starcoder"
     ggml_module = "ipex_llm.models"
@@ -23,5 +23,3 @@ from ipex_llm.ggml.model.llama import Llama
 from ipex_llm.ggml.model.gptneox import Gptneox
 from ipex_llm.ggml.model.bloom import Bloom
 from ipex_llm.ggml.model.starcoder import Starcoder
-# temporarily disable until linux binary file for chatglm ready
-# from ipex_llm.ggml.model.chatglm import ChatGLM
|
@ -42,8 +42,7 @@ class BigdlNativeForCausalLM:
|
||||||
:param pretrained_model_name_or_path: Path for converted BigDL-LLM optimized ggml
|
:param pretrained_model_name_or_path: Path for converted BigDL-LLM optimized ggml
|
||||||
binary checkpoint. The checkpoint should be converted by ``ipex_llm.llm_convert``.
|
binary checkpoint. The checkpoint should be converted by ``ipex_llm.llm_convert``.
|
||||||
:param model_family: The model family of the pretrained checkpoint.
|
:param model_family: The model family of the pretrained checkpoint.
|
||||||
Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``, ``"starcoder"``
|
Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``, ``"starcoder"``.
|
||||||
and ``"chatglm"``.
|
|
||||||
:param dtype: Which quantized precision will be converted.
|
:param dtype: Which quantized precision will be converted.
|
||||||
Now only `int4` and `int8` are supported, and `int8` only works for `llama`
|
Now only `int4` and `int8` are supported, and `int8` only works for `llama`
|
||||||
, `gptneox` and `starcoder`.
|
, `gptneox` and `starcoder`.
|
||||||
|
|
@ -58,9 +57,9 @@ class BigdlNativeForCausalLM:
|
||||||
"""
|
"""
|
||||||
logging.warning("BigdlNativeForCausalLM has been deprecated, "
|
logging.warning("BigdlNativeForCausalLM has been deprecated, "
|
||||||
"please switch to the new CausalLM API for sepcific models.")
|
"please switch to the new CausalLM API for sepcific models.")
|
||||||
invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm'],
|
invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder'],
|
||||||
"Now we only support model family: 'llama', 'gptneox', 'bloom',"
|
"Now we only support model family: 'llama', 'gptneox', 'bloom',"
|
||||||
" 'starcoder', 'chatglm', '{}' is not in the list.".format(model_family))
|
" 'starcoder', '{}' is not in the list.".format(model_family))
|
||||||
invalidInputError(dtype.lower() in ['int4', 'int8'],
|
invalidInputError(dtype.lower() in ['int4', 'int8'],
|
||||||
"Now we only support int4 and int8 as date type for weight")
|
"Now we only support int4 and int8 as date type for weight")
|
||||||
|
|
||||||
|
|
@ -78,9 +77,6 @@ class BigdlNativeForCausalLM:
|
||||||
elif model_family == 'starcoder':
|
elif model_family == 'starcoder':
|
||||||
from ipex_llm.ggml.model.starcoder import Starcoder
|
from ipex_llm.ggml.model.starcoder import Starcoder
|
||||||
return Starcoder(model_path=ggml_model_path, **kwargs)
|
return Starcoder(model_path=ggml_model_path, **kwargs)
|
||||||
elif model_family == 'chatglm':
|
|
||||||
from ipex_llm.ggml.model.chatglm import ChatGLM
|
|
||||||
return ChatGLM(model_path=ggml_model_path, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class _BaseGGMLClass:
|
class _BaseGGMLClass:
|
||||||
|
|
@ -110,9 +106,9 @@ class _BaseGGMLClass:
|
||||||
:return: a model instance
|
:return: a model instance
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
module = importlib.import_module(cls.GGML_Module)
|
|
||||||
class_ = getattr(module, cls.GGML_Model)
|
|
||||||
if native:
|
if native:
|
||||||
|
module = importlib.import_module(cls.GGML_Module)
|
||||||
|
class_ = getattr(module, cls.GGML_Model)
|
||||||
invalidInputError(dtype.lower() in ['int4', 'int8'],
|
invalidInputError(dtype.lower() in ['int4', 'int8'],
|
||||||
"Now we only support int4 and int8 as date type for weight")
|
"Now we only support int4 and int8 as date type for weight")
|
||||||
ggml_model_path = pretrained_model_name_or_path
|
ggml_model_path = pretrained_model_name_or_path
|
||||||
|
|
|
||||||
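The last hunk moves the `importlib` lookup inside the `if native:` branch, so the ggml backend module is only resolved when a native model is actually requested and the transformers code path never touches the now-removed chatglm module. The pattern in isolation, as a small generic sketch with hypothetical names:

```python
# Generic sketch of the guarded-import pattern used above: resolve the native
# backend class only when the native code path is taken.
import importlib
from typing import Optional, Type


def resolve_backend(module_name: str, class_name: str, native: bool) -> Optional[Type]:
    if native:
        module = importlib.import_module(module_name)  # deferred import
        return getattr(module, class_name)
    return None  # the transformers-based path resolves its class elsewhere
```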