change xmx condition (#9896)
commit 6637860ddf
parent 0e69bfe6b0
3 changed files with 18 additions and 53 deletions
@@ -470,7 +470,7 @@ class LowBitLinear(nn.Linear):
         try:
             import intel_extension_for_pytorch
             import linear_q4_0
-            from bigdl.llm.utils.xmx_checker import use_xmx
+            from bigdl.llm.transformers.models.utils import use_xmx
         except ModuleNotFoundError:
             invalidInputError(False,
                               "Please `pip install bigdl_core_xe` first.")
@@ -21,6 +21,11 @@ from bigdl.llm.ggml.quantize import ggml_tensor_qtype
 from bigdl.llm.transformers.utils import get_ipex_version, get_xpu_device_type
 
 
+SYM_INT4 = ggml_tensor_qtype["sym_int4"]
+SYM_INT8 = ggml_tensor_qtype["sym_int8"]
+FP8 = ggml_tensor_qtype["fp8"]
+
+
 def init_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, dtype, device):
     key_cache_storage = torch.empty(batch_size, num_heads,
                                     max_length, head_dim,
@@ -263,3 +268,15 @@ def mlp_fusion_check(x, qtype, training):
     if training or x.requires_grad:
         return False
     return True
+
+
+def use_xmx(x: torch.Tensor, qtype: int):
+    device = get_xpu_device_type(x)
+    return (
+        device in ["arc", "flex", "pvc"]
+        and qtype in [SYM_INT4, SYM_INT8, FP8]
+        and (
+            (device != "pvc" and x.dtype == torch.float32 and 1 < x.size(0) <= 64)
+            or 1 < x.size(0) <= 8
+        )
+    )
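For readers of the diff, the new gate is a pure predicate over the device class, the quantization type, the input dtype, and the batch dimension. The following is a minimal, dependency-free restatement of that condition for illustration only, not the shipped code: `device` stands in for the string returned by `get_xpu_device_type(x)`, `is_fp32` for `x.dtype == torch.float32`, `batch` for `x.size(0)`, and the string set `XMX_QTYPES` is a stand-in for the SYM_INT4/SYM_INT8/FP8 integer constants added above.

# Hypothetical, torch-free sketch of the new use_xmx condition.
XMX_QTYPES = {"sym_int4", "sym_int8", "fp8"}  # stand-ins for the ggml integer constants

def use_xmx_sketch(device: str, qtype: str, is_fp32: bool, batch: int) -> bool:
    # XMX is only considered on these device classes and qtypes.
    if device not in ("arc", "flex", "pvc") or qtype not in XMX_QTYPES:
        return False
    # Non-PVC devices get a wider 1 < batch <= 64 window for fp32 inputs;
    # every other case keeps the original 1 < batch <= 8 window.
    if device != "pvc" and is_fp32 and 1 < batch <= 64:
        return True
    return 1 < batch <= 8

# Sample points of the decision table:
assert use_xmx_sketch("arc", "sym_int4", True, 32)      # widened fp32 window on Arc
assert not use_xmx_sketch("pvc", "sym_int4", True, 32)  # PVC stays at batch <= 8
assert use_xmx_sketch("pvc", "fp8", False, 4)
assert not use_xmx_sketch("arc", "sym_int8", True, 1)   # batch of 1 never uses XMX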
@@ -1,52 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import torch
-import intel_extension_for_pytorch as ipex
-from bigdl.llm.ggml.quantize import ggml_tensor_qtype
-
-
-SYM_INT4 = ggml_tensor_qtype["sym_int4"]
-SYM_INT8 = ggml_tensor_qtype["sym_int8"]
-NF4 = ggml_tensor_qtype["nf4"]
-NF3 = ggml_tensor_qtype["nf3"]
-FP8 = ggml_tensor_qtype["fp8"]
-FP4 = ggml_tensor_qtype["fp4"]
-MOFQ4 = ggml_tensor_qtype["mixed_fp4"]
-MOFQ8 = ggml_tensor_qtype["mixed_fp8"]
-
-
-class XMXChecker:
-    def __init__(self):
-        self.support_xmx = self.check_xmx()
-        self.supported_qtype = [SYM_INT4, SYM_INT8, FP8]
-
-    @staticmethod
-    def check_xmx():
-        name = torch.xpu.get_device_name(0)
-        # todo: not sure how to check xmx or how to get device name for now
-        return "Arc(TM)" in name or "GPU Max" in name or "GPU Flex" in name
-
-    def check(self, input_tensor: torch.Tensor, qtype: int):
-        return self.support_xmx and 1 < input_tensor.shape[0] <= 8 and \
-            qtype in self.supported_qtype
-
-
-xmx_checker = XMXChecker()
-
-
-def use_xmx(input_tensor: torch.Tensor, qtype: int):
-    return xmx_checker.check(input_tensor, qtype)
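Net effect of the change, as far as the diff shows: the removed XMXChecker probed `torch.xpu.get_device_name(0)` once at import time and gated every call on `1 < batch <= 8` regardless of dtype, while the replacement `use_xmx` classifies the device per call from the input tensor via `get_xpu_device_type` and, on Arc and Flex, widens the window to `1 < x.size(0) <= 64` for float32 inputs. A float32 batch of 32 on an Arc GPU, for example, now takes the XMX path where the old check returned False.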