Support for Mixtral AWQ (#9775)
* Support for Mixtral AWQ
* Update README.md
* Update README.md
* Update awq_config.py
* Update README.md
* Update README.md
parent 1917bbe626
commit 66e286a73d

4 changed files with 23 additions and 4 deletions
@@ -13,6 +13,7 @@ This example shows how to directly run 4-bit AWQ models using BigDL-LLM on Intel
 - [llava-v1.5-13B-AWQ](https://huggingface.co/TheBloke/llava-v1.5-13B-AWQ)
 - [Yi-6B-AWQ](https://huggingface.co/TheBloke/Yi-6B-AWQ)
 - [Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ)
+- [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ)
 
 ## Requirements
@@ -30,12 +31,16 @@ We suggest using conda to manage environment:
 conda create -n llm python=3.9
 conda activate llm
 
-pip install autoawq==0.1.6 --no-deps
+pip install autoawq==0.1.8 --no-deps
 pip install --pre --upgrade bigdl-llm[all] # install bigdl-llm with 'all' option
 pip install transformers==4.35.0
-pip install accelerate==0.24.1
+pip install accelerate==0.25.0
 pip install einops
 ```
+**Note: For Mixtral model, please use transformers 4.36.0:**
+```bash
+pip install transformers==4.36.0
+```
 
 ### 2. Run
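For reference, after these install steps the CPU example's "### 2. Run" step loads an AWQ checkpoint through BigDL-LLM's transformers-style API. A minimal sketch, assuming the usual `bigdl.llm.transformers.AutoModelForCausalLM` entry point with `load_in_4bit=True`; the model id, prompt, and generation settings are illustrative and not taken from this commit:

```python
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM

# Illustrative checkpoint: the Mixtral AWQ model added to the list above.
model_path = "ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ"

# BigDL-LLM converts the 4-bit AWQ weights into its own low-bit format on load.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

prompt = "What is AI?"
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```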
@@ -13,6 +13,7 @@ This example shows how to directly run 4-bit AWQ models using BigDL-LLM on Intel
 - [llava-v1.5-13B-AWQ](https://huggingface.co/TheBloke/llava-v1.5-13B-AWQ)
 - [Yi-6B-AWQ](https://huggingface.co/TheBloke/Yi-6B-AWQ)
 - [Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ)
+- [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ)
 
 ## Requirements
@@ -32,10 +33,14 @@ conda activate llm
 
 pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
 pip install transformers==4.35.0
-pip install autoawq==0.1.6 --no-deps
-pip install accelerate==0.24.1
+pip install autoawq==0.1.8 --no-deps
+pip install accelerate==0.25.0
 pip install einops
 ```
+**Note: For Mixtral model, please use transformers 4.36.0:**
+```bash
+pip install transformers==4.36.0
+```
 
 ### 2. Configures OneAPI environment variables
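The GPU example uses the same loading call but additionally moves the model and inputs to the XPU device. A minimal sketch, assuming the `intel_extension_for_pytorch` / `to("xpu")` pattern used by BigDL-LLM's GPU examples (not part of this diff):

```python
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' device)
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM

model_path = "ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ"  # illustrative checkpoint

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
model = model.to("xpu")  # place the low-bit model on the Intel GPU

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
inputs = tokenizer("What is AI?", return_tensors="pt").to("xpu")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```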
@@ -71,6 +71,7 @@ layer_type_dict = {
     "gpt_neox": "GPTNeoXDecoderLayer",
     "aquila": "AquilaDecoderLayer",
     "Yi": "YiDecoderLayer",
+    "mixtral": "MixtralDecoderLayer",
 }

@@ -136,6 +137,8 @@ def get_blocks(model):
         layers = model.model.layers
     elif "yi" in str(model.__class__).lower():
         layers = model.model.layers
+    elif "mixtral" in str(model.__class__).lower():
+        layers = model.model.layers
     else:
         invalidInputError(False, f"Model type {type(model)} isn't supported.")
     return layers

@@ -213,6 +216,8 @@ def _replace_with_awq_layers(model, awq_config: AwqConfig):
 
     # Replace nn.Linear with WQLinear
     for name, module in named_linears.items():
+        if any(key in name for key in awq_config.modules_to_not_convert):
+            continue
         if awq_config.version == 'gemm':
             q_linear_module = WQLinear_GEMM
         elif awq_config.version == 'gemv':
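The last hunk above adds a skip rule to the AWQ layer-replacement loop. A standalone sketch of that rule, using made-up module names in the style of Mixtral's layers (the names and the `"gate"` entry are assumptions for illustration):

```python
# Linear layers whose names contain any of these substrings keep their original
# nn.Linear implementation instead of being replaced by a WQLinear module.
modules_to_not_convert = ["gate"]  # e.g. Mixtral's MoE router (assumed value)

named_linears = {
    "block_sparse_moe.gate": None,          # router: skipped
    "block_sparse_moe.experts.0.w1": None,  # expert projection: converted
    "self_attn.q_proj": None,               # attention projection: converted
}

for name, module in named_linears.items():
    if any(key in name for key in modules_to_not_convert):
        print(f"skip    {name}")
        continue
    print(f"convert {name}")
```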
@@ -64,6 +64,8 @@ class AwqConfig(QuantizationConfigMixin):
             `AwqBackendPackingMethod.AUTOAWQ`):
             The quantization backend. Some models might be quantized using `llm-awq` backend.
             This is useful for users that quantize their own models using `llm-awq` library.
+        modules_to_not_convert (`list`, *optional*, defaults to []):
+            Modules in the quantized block that should not be converted to quantized layers.
     """

     def __init__(

@@ -73,6 +75,7 @@ class AwqConfig(QuantizationConfigMixin):
         zero_point: bool = True,
         version: AWQLinearVersion = AWQLinearVersion.GEMM,
         backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ,
+        modules_to_not_convert: list = [],
         **kwargs,
     ):
         self.quant_method = QuantizationMethod.AWQ

@@ -82,6 +85,7 @@ class AwqConfig(QuantizationConfigMixin):
         self.zero_point = zero_point
         self.version = version
         self.backend = backend
+        self.modules_to_not_convert = modules_to_not_convert
 
         self.post_init()
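Together with the loop change above, this lets a checkpoint's `quantization_config` (or a caller) name modules that stay in full precision. A minimal sketch, assuming the class is importable from this `awq_config.py`; the import path and the `"gate"` entry for Mixtral's MoE router are assumptions, not taken from the commit:

```python
# Import path is an assumption; adjust to wherever awq_config.py lives in the repo.
from awq_config import AwqConfig

# Mixtral AWQ checkpoints commonly leave the MoE router unquantized, which is
# exactly what the new modules_to_not_convert field expresses.
config = AwqConfig(modules_to_not_convert=["gate"])

# _replace_with_awq_layers() will now leave any linear whose name contains "gate"
# untouched instead of converting it to a WQLinear module.
print(config.modules_to_not_convert)  # ['gate']
```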