Support for Mixtral AWQ (#9775)
* Support for Mixtral AWQ
* Update README.md
* Update README.md
* Update awq_config.py
* Update README.md
* Update README.md

parent 1917bbe626
commit 66e286a73d

4 changed files with 23 additions and 4 deletions
File 1 of 4: README.md (the example that installs `bigdl-llm[all]`)

@@ -13,6 +13,7 @@ This example shows how to directly run 4-bit AWQ models using BigDL-LLM on Intel
 - [llava-v1.5-13B-AWQ](https://huggingface.co/TheBloke/llava-v1.5-13B-AWQ)
 - [Yi-6B-AWQ](https://huggingface.co/TheBloke/Yi-6B-AWQ)
 - [Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ)
+- [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ)
 
 ## Requirements
 
@@ -30,12 +31,16 @@ We suggest using conda to manage environment:
 conda create -n llm python=3.9
 conda activate llm
 
-pip install autoawq==0.1.6 --no-deps
+pip install autoawq==0.1.8 --no-deps
 pip install --pre --upgrade bigdl-llm[all] # install bigdl-llm with 'all' option
 pip install transformers==4.35.0
-pip install accelerate==0.24.1
+pip install accelerate==0.25.0
 pip install einops
 ```
+**Note: For Mixtral model, please use transformers 4.36.0:**
+```bash
+pip install transformers==4.36.0
+```
 
 ### 2. Run
 
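For context, a minimal sketch of how one of the AWQ checkpoints listed above can be loaded with BigDL-LLM's transformers-style API. The example's actual `generate.py` is not part of this diff, so the loading arguments (`load_in_4bit=True`, `trust_remote_code=True`), the prompt format, and the generation settings below are assumptions for illustration only.

```python
# Hedged sketch, not the example's generate.py: load an AWQ checkpoint with
# BigDL-LLM's transformers-style API and run a short generation on CPU.
import torch
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM

model_path = "ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ"  # any checkpoint from the list above

# load_in_4bit=True / trust_remote_code=True are assumed arguments for illustration
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

prompt = "[INST] What is AI? [/INST]"  # Mixtral-Instruct style prompt (assumed)
with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```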
File 2 of 4: README.md (the example that installs `bigdl-llm[xpu]`)

@@ -13,6 +13,7 @@ This example shows how to directly run 4-bit AWQ models using BigDL-LLM on Intel
 - [llava-v1.5-13B-AWQ](https://huggingface.co/TheBloke/llava-v1.5-13B-AWQ)
 - [Yi-6B-AWQ](https://huggingface.co/TheBloke/Yi-6B-AWQ)
 - [Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ)
+- [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ)
 
 ## Requirements
 
@@ -32,10 +33,14 @@ conda activate llm
 
 pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
 pip install transformers==4.35.0
-pip install autoawq==0.1.6 --no-deps
-pip install accelerate==0.24.1
+pip install autoawq==0.1.8 --no-deps
+pip install accelerate==0.25.0
 pip install einops
 ```
+**Note: For Mixtral model, please use transformers 4.36.0:**
+```bash
+pip install transformers==4.36.0
+```
 
 ### 2. Configures OneAPI environment variables
 
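The same sketch for the `bigdl-llm[xpu]` example differs only in device placement; moving the model and inputs to `'xpu'` follows the pattern of other BigDL-LLM GPU examples and is an assumption here, since the example script itself is not in this diff.

```python
# Hedged sketch for the Intel GPU (xpu) flow: identical loading call, then move the
# model and the input ids to the 'xpu' device before generating.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  # registers the 'xpu' device (assumed, as in other GPU examples)
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM

model_path = "ybelkada/Mixtral-8x7B-Instruct-v0.1-AWQ"
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True,
                                             trust_remote_code=True).to("xpu")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

input_ids = tokenizer.encode("[INST] What is AI? [/INST]", return_tensors="pt").to("xpu")
with torch.inference_mode():
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```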
File 3 of 4: AWQ utility module (file path not shown in this view; contains `layer_type_dict`, `get_blocks`, and `_replace_with_awq_layers`)

@@ -71,6 +71,7 @@ layer_type_dict = {
     "gpt_neox": "GPTNeoXDecoderLayer",
     "aquila": "AquilaDecoderLayer",
     "Yi": "YiDecoderLayer",
+    "mixtral": "MixtralDecoderLayer",
 }
 
 
@@ -136,6 +137,8 @@ def get_blocks(model):
         layers = model.model.layers
     elif "yi" in str(model.__class__).lower():
         layers = model.model.layers
+    elif "mixtral" in str(model.__class__).lower():
+        layers = model.model.layers
     else:
         invalidInputError(False, f"Model type {type(model)} isn't supported.")
     return layers
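The two hunks above register Mixtral the same way as the other supported architectures: `layer_type_dict` maps the model type to its decoder-layer class name, and `get_blocks` resolves the list of transformer blocks. A minimal sketch of the attribute path the new branch relies on, using a deliberately tiny, randomly initialised Mixtral (assumes `transformers>=4.36`, as the README note requires):

```python
# Minimal sketch (not the library's code): show that for Mixtral the decoder blocks
# live at model.model.layers, which is what the new get_blocks branch returns.
from transformers import MixtralConfig, MixtralForCausalLM

cfg = MixtralConfig(
    hidden_size=64, intermediate_size=128,
    num_hidden_layers=2, num_attention_heads=4, num_key_value_heads=2,
    num_local_experts=4, num_experts_per_tok=2, vocab_size=1000,
)
model = MixtralForCausalLM(cfg)  # tiny random model, only for inspecting the structure

# The class-name check added in get_blocks matches this model...
assert "mixtral" in str(model.__class__).lower()
# ...and each block is a MixtralDecoderLayer, matching the layer_type_dict entry.
blocks = model.model.layers
print(len(blocks), type(blocks[0]).__name__)  # -> 2 MixtralDecoderLayer
```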
@@ -213,6 +216,8 @@ def _replace_with_awq_layers(model, awq_config: AwqConfig):
 
         # Replace nn.Linear with WQLinear
         for name, module in named_linears.items():
+            if any(key in name for key in awq_config.modules_to_not_convert):
+                continue
             if awq_config.version == 'gemm':
                 q_linear_module = WQLinear_GEMM
             elif awq_config.version == 'gemv':
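The new check above skips every linear whose qualified module name contains one of the `modules_to_not_convert` substrings, leaving it unquantized. A standalone illustration of that skip rule; the module names and the `"gate"` entry are example values (Mixtral AWQ checkpoints commonly leave the MoE router gate in full precision), not values taken from this commit:

```python
# Standalone illustration of the skip rule added to _replace_with_awq_layers: any linear
# whose qualified name contains one of the modules_to_not_convert substrings is left
# untouched. Module names and the "gate" entry are example values, not from this commit.
named_linears = {
    "block_sparse_moe.gate": "nn.Linear",          # MoE router
    "block_sparse_moe.experts.0.w1": "nn.Linear",  # expert projection
    "self_attn.q_proj": "nn.Linear",               # attention projection
}
modules_to_not_convert = ["gate"]

for name, module in named_linears.items():
    if any(key in name for key in modules_to_not_convert):
        print(f"skip      {name} (kept as plain nn.Linear)")
        continue
    print(f"quantize  {name} -> WQLinear")
```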
File 4 of 4: awq_config.py

@@ -64,6 +64,8 @@ class AwqConfig(QuantizationConfigMixin):
        `AwqBackendPackingMethod.AUTOAWQ`):
             The quantization backend. Some models might be quantized using `llm-awq` backend.
             This is useful for users that quantize their own models using `llm-awq` library.
+        modules_to_not_convert (`list`, *optional*, defaults to []):
+            The modules in qblock while not quantized.
     """
 
     def __init__(
@@ -73,6 +75,7 @@ class AwqConfig(QuantizationConfigMixin):
         zero_point: bool = True,
         version: AWQLinearVersion = AWQLinearVersion.GEMM,
         backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ,
+        modules_to_not_convert: list = [],
         **kwargs,
     ):
         self.quant_method = QuantizationMethod.AWQ
@@ -82,6 +85,7 @@ class AwqConfig(QuantizationConfigMixin):
         self.zero_point = zero_point
         self.version = version
         self.backend = backend
+        self.modules_to_not_convert = modules_to_not_convert
 
         self.post_init()
 
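Taken together, the config change and the skip logic in `_replace_with_awq_layers` let a checkpoint declare modules that should stay unquantized. A minimal sketch of constructing the config with the new field; only parameters visible in the hunks above are used, the import path is hypothetical, and `["gate"]` is an illustrative value rather than one taken from this commit:

```python
# Hedged sketch: build an AwqConfig that excludes modules whose names contain "gate".
# The import path below is hypothetical; point it at wherever AwqConfig lives in your checkout.
from awq_config import AwqConfig  # hypothetical import path

awq_config = AwqConfig(
    zero_point=True,                  # parameter shown in the diff's __init__ signature
    modules_to_not_convert=["gate"],  # example value: keep the MoE router unquantized
)

# _replace_with_awq_layers(model, awq_config) would then leave every linear whose
# qualified name contains "gate" as a plain nn.Linear.
```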