support loading q4_1/q5_0/q5_1/q8_0 gguf model (#9546)
parent b824754256
commit 65121c7997

2 changed files with 66 additions and 6 deletions
@@ -19,7 +19,11 @@ from bigdl.llm.utils.common import invalidInputError
 
 
 qtype_map = {
-    2: "sym_int4"       # q4_0
+    2: "sym_int4",      # q4_0
+    3: "asym_int4",     # q4_1
+    7: "sym_int8",      # q8_0
+    8: "sym_int5",      # q5_0
+    9: "asym_int5",     # q5_1
 }
 
 
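Reviewer note: the new qtype_map entries pair ggml file-type ids with bigdl-llm
low-bit qtype names. A minimal sketch of how a loader might consume the table
defined above; resolve_qtype is a hypothetical helper, not bigdl-llm API:

    # Hypothetical helper (assumes the qtype_map from the hunk above): map a
    # gguf/ggml file-type id to its qtype name, failing loudly on ids that
    # have no low-bit equivalent yet.
    def resolve_qtype(ftype: int) -> str:
        if ftype not in qtype_map:
            raise ValueError(f"unsupported gguf file type: {ftype}")
        return qtype_map[ftype]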
@@ -121,7 +121,7 @@ class GGUFHeader:
         invalidInputError(magic == "GGUF", "not a valid gguf file")
 
         version, n_tensors, n_kv = struct.unpack("<IQQ", data[4:])
-        invalidInputError(version == 2, "only gguf v2 is supported")
+        invalidInputError(version in [2, 3], "only gguf v2 and v3 are supported")
 
         self.magic = magic
         self.version = version
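Reviewer note: the relaxed check accepts both gguf v2 and v3 headers, whose
fixed-size prefix is identical. A standalone sketch of the layout this guards,
mirroring the unpack call above (4-byte magic, uint32 version, uint64 tensor
count, uint64 key/value count, all little-endian):

    import struct

    # Sketch: parse the fixed 24-byte gguf header prefix and apply the same
    # magic/version checks as GGUFHeader.
    def read_gguf_header(path: str):
        with open(path, "rb") as f:
            data = f.read(24)
        magic = data[:4].decode("utf-8")
        version, n_tensors, n_kv = struct.unpack("<IQQ", data[4:])
        if magic != "GGUF":
            raise ValueError("not a valid gguf file")
        if version not in (2, 3):
            raise ValueError("only gguf v2 and v3 are supported")
        return version, n_tensors, n_kv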
@@ -209,9 +209,9 @@ class GGUFTensorLoader:
             1: self.convert_f16_tensor,         # f16
             2: self.convert_q4_0_tensor,        # q4_0
             3: self.convert_q4_1_tensor,        # q4_1
-            6: self.convert_unknown_tensor,     # q5_0
-            7: self.convert_unknown_tensor,     # q5_1
-            8: self.convert_unknown_tensor,     # q8_0
+            6: self.convert_q5_0_tensor,        # q5_0
+            7: self.convert_q5_1_tensor,        # q5_1
+            8: self.convert_q8_0_tensor,        # q8_0
             9: self.convert_unknown_tensor,     # q8_1
             10: self.convert_unknown_tensor,    # q2_k
             11: self.convert_unknown_tensor,    # q3_k
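Reviewer note: convert_q5_0_tensor, convert_q5_1_tensor and
convert_q8_0_tensor replace the convert_unknown_tensor stubs in this
converter table. A minimal sketch of the dispatch pattern the table supports;
the converters are real, but the method and attribute names around them are
hypothetical:

    # Hypothetical dispatcher: pick the converter registered for a ggml type
    # id, falling back to the unknown-tensor stub for unsupported ids.
    def convert(self, ggml_type, tensor, size, ndims, dims):
        converter = self.converters.get(ggml_type, self.convert_unknown_tensor)
        return converter(tensor, size, ndims, dims)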
@@ -265,7 +265,63 @@ class GGUFTensorLoader:
         return result
 
     def convert_q4_1_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
-        invalidInputError(False, "q4_1 conversion is not implemented")
+        # see https://github.com/ggerganov/llama.cpp/blob
+        # /b38a16dfcff88d547f78f52d1bea31b84a05aff7/ggml-quants.c#L1094
+
+        block_size = self.block_size[3]
+        tensor = tensor.reshape((-1, block_size))
+        scales, base, data = tensor[:, :2], tensor[:, 2:4], tensor[:, 4:]
+        scales = scales.view(torch.half)
+        base = base.view(torch.half)
+        data = torch.cat([data & 0xF, data >> 4], dim=-1)
+        result = (data * scales + base).reshape(dims)
+        return result
+
+    def convert_q5_0_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
+        # see https://github.com/ggerganov/llama.cpp/blob
+        # /b38a16dfcff88d547f78f52d1bea31b84a05aff7/ggml-quants.c#L1115
+
+        block_size = self.block_size[6]
+        tensor = tensor.reshape((-1, block_size))
+        scales, hdata, ldata = tensor[:, :2], tensor[:, 2:6], tensor[:, 6:]
+        scales = scales.view(torch.half)
+        # hdata = hdata.view(torch.int)
+        hdata = hdata.clone().view(torch.int)   # clone hdata to fix memory address alignment
+        shift = torch.arange(0, 32, 1)
+        hdata = (((hdata.expand(-1, 32) >> shift) << 4) & 0x10).byte()
+        ldata = torch.cat([ldata & 0xF, ldata >> 4], dim=-1)
+        data = (hdata | ldata).view(torch.int8) - 16
+        result = (data * scales).reshape(dims)
+        return result
+
+    def convert_q5_1_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
+        # https://github.com/ggerganov/llama.cpp/blob
+        # /b38a16dfcff88d547f78f52d1bea31b84a05aff7/ggml-quants.c#L1141
+
+        block_size = self.block_size[7]
+        tensor = tensor.reshape((-1, block_size))
+        scales, base, hdata, ldata = tensor[:, :2], tensor[:, 2:4], tensor[:, 4:8], tensor[:, 8:]
+        scales = scales.view(torch.half)
+        base = base.view(torch.half)
+        hdata = hdata.view(torch.int)
+        shift = torch.arange(0, 32, 1)
+        hdata = (((hdata.expand(-1, 32) >> shift) << 4) & 0x10).byte()
+        ldata = torch.cat([ldata & 0xF, ldata >> 4], dim=-1)
+        data = hdata | ldata
+        result = (data * scales + base).reshape(dims)
+        return result
+
+    def convert_q8_0_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
+        # https://github.com/ggerganov/llama.cpp/blob
+        # /b38a16dfcff88d547f78f52d1bea31b84a05aff7/ggml-quants.c#L1168
+
+        block_size = self.block_size[8]
+        tensor = tensor.reshape((-1, block_size))
+        scales, data = tensor[:, :2], tensor[:, 2:]
+        scales = scales.view(torch.half)
+        data = data.view(torch.int8)
+        result = (data * scales).reshape(dims)
+        return result
+
     def convert_q6_k_tensor(self, tensor: torch.Tensor, size: int, ndims: int, dims: int):
         # see https://github.com/ggerganov/llama.cpp/blob
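Reviewer note: each converter follows the standard ggml block layouts, which
the slicing above assumes. Per 32-weight block: q4_1 is 20 bytes (fp16 scale
d, fp16 min m, 16 nibble-packed bytes; x = d * q + m), q5_0 is 22 bytes (fp16
d, 4 bytes of packed 5th bits, 16 low-nibble bytes; x = d * (q - 16)), q5_1
is 24 bytes (d, m, high bits, low nibbles; x = d * q + m), and q8_0 is 34
bytes (fp16 d plus 32 int8 quants; x = d * q). A standalone sketch that
hand-builds one q8_0 block from made-up values and dequantizes it with the
same steps as convert_q8_0_tensor:

    import struct
    import torch

    # Sketch: one hand-built q8_0 block (fp16 scale, then 32 int8 quants;
    # all values made up), dequantized as x[i] = d * q[i].
    d = 0.5
    quants = list(range(-16, 16))                  # 32 fake int8 weights
    raw = struct.pack("<e32b", d, *quants)         # one 34-byte q8_0 block

    block = torch.frombuffer(bytearray(raw), dtype=torch.uint8).reshape(-1, 34)
    scales = block[:, :2].view(torch.half)         # fp16 scale per block
    data = block[:, 2:].view(torch.int8)           # 32 int8 quants per block
    result = data * scales
    assert torch.allclose(result[0], torch.tensor(quants, dtype=torch.half) * d)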