Fix speech_paraformer issue with unexpected changes (#12416)

* Fix speech_paraformer issue with unexpected changes

* Pin the funasr and modelscope versions required by Paraformer
SONG Ge 2024-11-18 23:01:20 -08:00 committed by GitHub
parent a9cb70a71c
commit ff3f7cb25f
3 changed files with 20 additions and 16 deletions

@@ -37,8 +37,8 @@ pip install timm torch==2.1.2 torchvision==0.16.2
 pip install BCEmbedding==0.1.5 transformers==4.40.0
 # [optional] for Speech_Paraformer-Large
-pip install -U funasr
-pip install modelscope torch==2.1.2 torchaudio==2.1.2
+pip install funasr==1.1.14
+pip install modelscope==1.20.1 torch==2.1.2 torchaudio==2.1.2
 ```
 ### 2. Runtime Configurations
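
The README change replaces the floating `pip install -U funasr` with pinned releases, so that upstream funasr/modelscope API changes cannot silently break Speech_Paraformer-Large. As a minimal sketch (not part of this commit; the check below is purely illustrative), the pins could be verified at runtime with `importlib.metadata`:

```python
# Illustrative sketch: fail fast if the pinned Speech_Paraformer dependencies
# drift. The versions mirror the README change above; the check itself is an
# assumption, not code from this repository.
from importlib.metadata import version, PackageNotFoundError

PINNED = {"funasr": "1.1.14", "modelscope": "1.20.1"}

for pkg, expected in PINNED.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        raise SystemExit(f"{pkg} is not installed; run `pip install {pkg}=={expected}`")
    if installed != expected:
        print(f"warning: {pkg}=={installed} found, {expected} expected")
```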

@@ -162,11 +162,13 @@ class _BaseAutoModelClass:
             model = cls.HF_Model.from_pretrained(*args, **kwargs)
         else:
             model = cls.HF_Model(*args, **kwargs)
-        model.config.update({"bigdl_lcmu_enabled": False})
+        if hasattr(model, "config"):
+            model.config.update({"bigdl_lcmu_enabled": False})
 
         logger.info(f"Converting model, it may takes up to several minutes ...")
 
-        model.config.update({"optimize_model": optimize_model})
+        if hasattr(model, "config"):
+            model.config.update({"optimize_model": optimize_model})
 
         if mock_device == "cpu":
             with torch.no_grad():
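
The new `hasattr` guards matter because not every model routed through `_BaseAutoModelClass` is a Hugging Face `PreTrainedModel`: a Speech_Paraformer model loaded through funasr exposes no `config` attribute, so the unconditional `model.config.update(...)` raised `AttributeError`. A self-contained sketch of the failure mode and the guarded fix (both classes below are illustrative stand-ins, not the repository's types):

```python
# Illustrative only: mimics why the unconditional config.update broke Paraformer.
class HFStyleModel:
    class _Cfg(dict):
        pass  # HF-style configs expose a dict-like update() method
    config = _Cfg()

class ParaformerStyleModel:
    pass  # funasr-style model: no `config` attribute at all

for model in (HFStyleModel(), ParaformerStyleModel()):
    # The guarded form from this commit works for both model styles;
    # the old unguarded call would raise AttributeError on the second.
    if hasattr(model, "config"):
        model.config.update({"optimize_model": True})
```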

@@ -294,17 +294,17 @@ class FusedLlamaLowBitDecoderlayer(torch.nn.Module):
             torch.Tensor: result
         """
         backend_cls = self.backend_cls_prefill
-        inputs = (x,
-                  masks,
-                  self.layer_norm_0_weight,
-                  self.layer_norm_0_bias,
-                  self.layer_norm_1_weight,
-                  self.layer_norm_1_bias,
-                  self.fsmn_weight,
-                  self.qkv_bias,
-                  self.out_bias,
-                  self.w1_bias,
-                  self.w2_bias,
+        inputs = (x.to(torch.float16),
+                  masks.to(torch.float16),
+                  self.layer_norm_0_weight.to(torch.float16),
+                  self.layer_norm_0_bias.to(torch.float16),
+                  self.layer_norm_1_weight.to(torch.float16),
+                  self.layer_norm_1_bias.to(torch.float16),
+                  self.fsmn_weight.to(torch.float16),
+                  self.qkv_bias.to(torch.float16),
+                  self.out_bias.to(torch.float16),
+                  self.w1_bias.to(torch.float16),
+                  self.w2_bias.to(torch.float16),
                   )
         outputs = run_model(
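
This hunk casts every tensor handed to the prefill backend to float16, since the fused layer runs in half precision while Paraformer's weights and activations arrive as float32. A minimal sketch of the pattern, assuming a generic half-precision backend (the helper below is illustrative, not the repository's API):

```python
import torch

# Illustrative sketch: cast a heterogeneous set of tensors to float16 in one
# place before dispatching to a half-precision backend, as the diff above does.
def to_fp16(*tensors: torch.Tensor) -> tuple:
    return tuple(t.to(torch.float16) for t in tensors)

x = torch.randn(1, 8, dtype=torch.float32)
masks = torch.ones(1, 8, dtype=torch.float32)
inputs = to_fp16(x, masks)
assert all(t.dtype == torch.float16 for t in inputs)
```
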
@@ -431,6 +431,8 @@ class PrefillRunner:
         args = (xs_pad, masks)
         self.prefill_input_queue.put(args)
         xs_pad, masks = self.prefill_result_queue.get()
+        xs_pad = xs_pad.to(torch.float32)
+        masks = masks.to(torch.float32)
         return xs_pad, masks
 
     def shutdown(self):
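
Symmetrically, `PrefillRunner` upcasts the results coming back from the worker queue to float32, so the rest of the fp32 Paraformer pipeline never sees the half-precision intermediates. A self-contained sketch of the round trip, with an in-process queue standing in for the prefill worker (the queue setup is illustrative):

```python
import torch
from queue import Queue

# Illustrative: the worker computes in float16, the caller restores float32,
# mirroring the PrefillRunner change above.
input_q, result_q = Queue(), Queue()

xs_pad = torch.randn(2, 4)                    # float32 on the caller side
input_q.put(xs_pad)

worker_out = input_q.get().to(torch.float16)  # backend works in half precision
result_q.put(worker_out)

xs_pad = result_q.get().to(torch.float32)     # upcast before returning
assert xs_pad.dtype == torch.float32
```
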
@@ -639,7 +641,7 @@ class FusedLlamaLowBitMultiDecoderlayer(torch.nn.Module):
     ):
         super().__init__()
-        self.do_print = True
+        self.do_print = do_print
 
         op_parameters = []
         for w in parameters:
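
The last hunk replaces a hard-coded `self.do_print = True` with the constructor argument, so callers can silence all but one instance of the multi-decoder layer. A tiny illustration of why honoring the argument matters (the `Worker` class is a stand-in, not the repository's class):

```python
# Illustrative: hard-coding do_print=True made every instance print;
# respecting the constructor argument lets the caller silence replicas.
class Worker:
    def __init__(self, do_print: bool = False):
        self.do_print = do_print  # the fix: use the argument, not a constant

workers = [Worker(do_print=(rank == 0)) for rank in range(4)]
assert [w.do_print for w in workers] == [True, False, False, False]
```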