diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/README.md
index 736126c2..ef1eb88e 100644
--- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/README.md
+++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/README.md
@@ -13,6 +13,7 @@ This example shows how to directly run 4-bit AWQ models using BigDL-LLM on Intel
 - [llava-v1.5-13B-AWQ](https://huggingface.co/TheBloke/llava-v1.5-13B-AWQ)
 - [Yi-6B-AWQ](https://huggingface.co/TheBloke/Yi-6B-AWQ)
 - [Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ)
+- [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ)
 
 ## Requirements
 
@@ -30,12 +31,16 @@ We suggest using conda to manage environment:
 conda create -n llm python=3.9
 conda activate llm
 
-pip install autoawq==0.1.6 --no-deps
+pip install autoawq==0.1.8 --no-deps
 pip install --pre --upgrade bigdl-llm[all] # install bigdl-llm with 'all' option
 pip install transformers==4.35.0
-pip install accelerate==0.24.1
+pip install accelerate==0.25.0
 pip install einops
 ```
+**Note: For the Mixtral model, please use transformers 4.36.0:**
+```bash
+pip install transformers==4.36.0
+```
 
 ### 2. Run
 
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/README.md
index 32960122..64df39e3 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/README.md
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/README.md
@@ -13,6 +13,7 @@ This example shows how to directly run 4-bit AWQ models using BigDL-LLM on Intel
 - [llava-v1.5-13B-AWQ](https://huggingface.co/TheBloke/llava-v1.5-13B-AWQ)
 - [Yi-6B-AWQ](https://huggingface.co/TheBloke/Yi-6B-AWQ)
 - [Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ)
+- [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ)
 
 ## Requirements
 
@@ -32,10 +33,14 @@ conda activate llm
 
 pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
 pip install transformers==4.35.0
-pip install autoawq==0.1.6 --no-deps
+pip install autoawq==0.1.8 --no-deps
-pip install accelerate==0.24.1
+pip install accelerate==0.25.0
 pip install einops
 ```
+**Note: For the Mixtral model, please use transformers 4.36.0:**
+```bash
+pip install transformers==4.36.0
+```
 
 ### 2. Configures OneAPI environment variables
diff --git a/python/llm/src/bigdl/llm/transformers/awq/awq.py b/python/llm/src/bigdl/llm/transformers/awq/awq.py
index 671b389d..a9cd04c0 100644
--- a/python/llm/src/bigdl/llm/transformers/awq/awq.py
+++ b/python/llm/src/bigdl/llm/transformers/awq/awq.py
@@ -71,6 +71,7 @@ layer_type_dict = {
     "gpt_neox": "GPTNeoXDecoderLayer",
     "aquila": "AquilaDecoderLayer",
     "Yi": "YiDecoderLayer",
+    "mixtral": "MixtralDecoderLayer",
 }
 
 
@@ -136,6 +137,8 @@ def get_blocks(model):
         layers = model.model.layers
     elif "yi" in str(model.__class__).lower():
         layers = model.model.layers
+    elif "mixtral" in str(model.__class__).lower():
+        layers = model.model.layers
     else:
         invalidInputError(False, f"Model type {type(model)} isn't supported.")
     return layers
@@ -213,6 +216,8 @@ def _replace_with_awq_layers(model, awq_config: AwqConfig):
 
         # Replace nn.Linear with WQLinear
         for name, module in named_linears.items():
+            if any(key in name for key in awq_config.modules_to_not_convert):
+                continue
             if awq_config.version == 'gemm':
                 q_linear_module = WQLinear_GEMM
             elif awq_config.version == 'gemv':
diff --git a/python/llm/src/bigdl/llm/transformers/awq/awq_config.py b/python/llm/src/bigdl/llm/transformers/awq/awq_config.py
index 0f60a833..c351ce7b 100644
--- a/python/llm/src/bigdl/llm/transformers/awq/awq_config.py
+++ b/python/llm/src/bigdl/llm/transformers/awq/awq_config.py
@@ -64,6 +64,8 @@ class AwqConfig(QuantizationConfigMixin):
            `AwqBackendPackingMethod.AUTOAWQ`):
            The quantization backend. Some models might be quantized using `llm-awq` backend.
            This is useful for users that quantize their own models using `llm-awq` library.
+        modules_to_not_convert (`list`, *optional*, defaults to []):
+            Modules inside the quantized blocks that are skipped and kept in their original precision.
     """
 
     def __init__(
@@ -73,6 +75,7 @@ class AwqConfig(QuantizationConfigMixin):
         zero_point: bool = True,
         version: AWQLinearVersion = AWQLinearVersion.GEMM,
         backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ,
+        modules_to_not_convert: list = [],
         **kwargs,
     ):
         self.quant_method = QuantizationMethod.AWQ
@@ -82,6 +85,7 @@
         self.zero_point = zero_point
         self.version = version
         self.backend = backend
+        self.modules_to_not_convert = modules_to_not_convert
 
         self.post_init()
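
A note on the behavior added in `_replace_with_awq_layers`, illustrated with a small self-contained sketch: any linear layer whose name contains one of the substrings in `AwqConfig.modules_to_not_convert` is now skipped and left in its original precision instead of being packed into a `WQLinear` module. The module names and the `["gate"]` value below are illustrative assumptions (Mixtral AWQ checkpoints commonly exclude the MoE router), not values taken from this patch.

```python
# Self-contained sketch of the substring-based exclusion rule; all names are hypothetical.
modules_to_not_convert = ["gate"]  # assumed example value, e.g. keeping Mixtral's MoE router unquantized

named_linears = {
    "self_attn.q_proj": None,
    "block_sparse_moe.gate": None,
    "block_sparse_moe.experts.0.w1": None,
}

for name, module in named_linears.items():
    # Same check as the patched loop: a match on any exclusion key skips quantization.
    if any(key in name for key in modules_to_not_convert):
        print(f"keep {name} in original precision")
        continue
    print(f"replace {name} with a 4-bit WQLinear module")
```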
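
For completeness, a minimal usage sketch for the newly listed Mixtral checkpoint, assuming BigDL-LLM's Hugging Face-style `from_pretrained` API with `load_in_4bit=True` as in the existing AWQ examples and `transformers==4.36.0` installed per the README note; the prompt and generation settings are placeholders, not part of this patch.

```python
from transformers import AutoTokenizer

from bigdl.llm.transformers import AutoModelForCausalLM

model_path = "ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ"

# AWQ weights are converted to BigDL-LLM's 4-bit format at load time; per this patch,
# modules named in the config's modules_to_not_convert are left unquantized.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

inputs = tokenizer("What is AI?", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```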