Support for Mixtral AWQ (#9775)

* Support for Mixtral AWQ

* Update README.md

* Update README.md

* Update awq_config.py

* Update README.md

* Update README.md
Heyang Sun authored 2023-12-25 16:08:09 +08:00, committed by GitHub
parent 1917bbe626
commit 66e286a73d
4 changed files with 23 additions and 4 deletions


@@ -13,6 +13,7 @@ This example shows how to directly run 4-bit AWQ models using BigDL-LLM on Intel
 - [llava-v1.5-13B-AWQ](https://huggingface.co/TheBloke/llava-v1.5-13B-AWQ)
 - [Yi-6B-AWQ](https://huggingface.co/TheBloke/Yi-6B-AWQ)
 - [Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ)
+- [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ)
 ## Requirements
@@ -30,12 +31,16 @@ We suggest using conda to manage environment:
 conda create -n llm python=3.9
 conda activate llm
-pip install autoawq==0.1.6 --no-deps
+pip install autoawq==0.1.8 --no-deps
 pip install --pre --upgrade bigdl-llm[all] # install bigdl-llm with 'all' option
 pip install transformers==4.35.0
-pip install accelerate==0.24.1
+pip install accelerate==0.25.0
 pip install einops
 ```
+**Note: For Mixtral model, please use transformers 4.36.0:**
+```bash
+pip install transformers==4.36.0
+```
 ### 2. Run
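For context (not part of the diff), a minimal sketch of how the newly listed Mixtral AWQ checkpoint would be loaded on CPU with this environment, assuming bigdl-llm's transformers-style `AutoModelForCausalLM` API used by the other models in this example; the prompt and generation settings are placeholders:

```python
# Illustrative only: load the Mixtral AWQ checkpoint listed above with
# bigdl-llm's 4-bit loading path and run a short generation on CPU.
import torch
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM  # assumed import path

model_path = "ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ"
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

inputs = tokenizer("What is AI?", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```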


@@ -13,6 +13,7 @@ This example shows how to directly run 4-bit AWQ models using BigDL-LLM on Intel
 - [llava-v1.5-13B-AWQ](https://huggingface.co/TheBloke/llava-v1.5-13B-AWQ)
 - [Yi-6B-AWQ](https://huggingface.co/TheBloke/Yi-6B-AWQ)
 - [Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ)
+- [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ)
 ## Requirements
@@ -32,10 +33,14 @@ conda activate llm
 pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
 pip install transformers==4.35.0
-pip install autoawq==0.1.6 --no-deps
+pip install autoawq==0.1.8 --no-deps
-pip install accelerate==0.24.1
+pip install accelerate==0.25.0
 pip install einops
 ```
+**Note: For Mixtral model, please use transformers 4.36.0:**
+```bash
+pip install transformers==4.36.0
+```
 ### 2. Configures OneAPI environment variables
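Likewise for the GPU README, a hedged sketch (not part of the diff) of the XPU variant: the same loading path, with the model and inputs moved to the `xpu` device after the OneAPI environment is configured. `intel_extension_for_pytorch` is assumed to come with the `bigdl-llm[xpu]` install above:

```python
# Illustrative only: Intel GPU (XPU) variant of the CPU sketch above.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401 -- registers the 'xpu' device
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM  # assumed import path

model_path = "ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ"
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True).to("xpu")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

inputs = tokenizer("What is AI?", return_tensors="pt").to("xpu")
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0].cpu(), skip_special_tokens=True))
```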


@@ -71,6 +71,7 @@ layer_type_dict = {
     "gpt_neox": "GPTNeoXDecoderLayer",
     "aquila": "AquilaDecoderLayer",
     "Yi": "YiDecoderLayer",
+    "mixtral": "MixtralDecoderLayer",
 }
@@ -136,6 +137,8 @@ def get_blocks(model):
         layers = model.model.layers
     elif "yi" in str(model.__class__).lower():
         layers = model.model.layers
+    elif "mixtral" in str(model.__class__).lower():
+        layers = model.model.layers
     else:
         invalidInputError(False, f"Model type {type(model)} isn't supported.")
     return layers
@@ -213,6 +216,8 @@ def _replace_with_awq_layers(model, awq_config: AwqConfig):
     # Replace nn.Linear with WQLinear
     for name, module in named_linears.items():
+        if any(key in name for key in awq_config.modules_to_not_convert):
+            continue
         if awq_config.version == 'gemm':
             q_linear_module = WQLinear_GEMM
         elif awq_config.version == 'gemv':
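The skip rule added to `_replace_with_awq_layers` is a plain substring match against each linear layer's qualified name. A standalone sketch of the rule (the module names are illustrative Mixtral layer names, not taken from this diff), showing how `modules_to_not_convert=["gate"]` would keep the MoE router unquantized:

```python
# Standalone illustration of the substring-based skip rule added above.
modules_to_not_convert = ["gate"]  # e.g. keep Mixtral's MoE router in full precision

named_linears = [
    "model.layers.0.block_sparse_moe.gate",           # router -> skipped
    "model.layers.0.block_sparse_moe.experts.0.w1",   # expert MLP -> converted
    "model.layers.0.self_attn.q_proj",                # attention -> converted
]

for name in named_linears:
    if any(key in name for key in modules_to_not_convert):
        print(f"skip      {name}")
        continue
    print(f"quantize  {name}")
```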


@@ -64,6 +64,8 @@ class AwqConfig(QuantizationConfigMixin):
             `AwqBackendPackingMethod.AUTOAWQ`):
             The quantization backend. Some models might be quantized using `llm-awq` backend.
             This is useful for users that quantize their own models using `llm-awq` library.
+        modules_to_not_convert (`list`, *optional*, defaults to []):
+            The modules inside the quantized block that should not be converted (kept unquantized).
     """
     def __init__(
@@ -73,6 +75,7 @@ class AwqConfig(QuantizationConfigMixin):
         zero_point: bool = True,
         version: AWQLinearVersion = AWQLinearVersion.GEMM,
         backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ,
+        modules_to_not_convert: list = [],
         **kwargs,
     ):
         self.quant_method = QuantizationMethod.AWQ
@@ -82,6 +85,7 @@ class AwqConfig(QuantizationConfigMixin):
         self.zero_point = zero_point
         self.version = version
         self.backend = backend
+        self.modules_to_not_convert = modules_to_not_convert
         self.post_init()
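A minimal usage sketch for the new field, assuming `AwqConfig` is imported from the module patched above; the `bits`/`group_size` arguments are illustrative transformers-style defaults and are not visible in this hunk:

```python
# Illustrative: build an AwqConfig that leaves Mixtral's MoE router unquantized.
awq_config = AwqConfig(
    bits=4,            # assumed constructor argument, not shown in this hunk
    group_size=128,    # assumed constructor argument, not shown in this hunk
    zero_point=True,
    modules_to_not_convert=["gate"],
)
assert awq_config.modules_to_not_convert == ["gate"]
```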