add pipeline parallel support with load_low_bit (#11414)
This commit is contained in:
parent
d0b801d7bc
commit
a414e3ff8a
1 changed files with 15 additions and 0 deletions
|
|
@@ -534,6 +534,9 @@ class _BaseAutoModelClass:
|
||||||
:param pretrained_model_name_or_path: str value, Path to load the optimized model ckpt.
|
:param pretrained_model_name_or_path: str value, Path to load the optimized model ckpt.
|
||||||
:param optimize_model: boolean value, Whether to further optimize the low_bit llm model.
|
:param optimize_model: boolean value, Whether to further optimize the low_bit llm model.
|
||||||
Default to be True.
|
Default to be True.
|
||||||
:param pipeline_parallel_stages: int value, the number of GPUs allocated for
    pipeline parallel. Default to be ``1``. Please set pipeline_parallel_stages > 1
    to run pipeline parallel inference on multiple GPUs.
|
|
||||||
:return: a model instance
|
:return: a model instance
|
||||||
"""
|
"""
|
||||||
|
|
@@ -580,6 +583,8 @@ class _BaseAutoModelClass:
|
||||||
embedding_qtype = kwargs.pop("embedding_qtype", None)
|
embedding_qtype = kwargs.pop("embedding_qtype", None)
|
||||||
sharded_metadata = None
|
sharded_metadata = None
|
||||||
|
|
||||||
|
pipeline_parallel_stages = kwargs.pop("pipeline_parallel_stages", 1)
|
||||||
|
|
||||||
config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path)
|
config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path)
|
||||||
bigdl_transformers_low_bit = config_dict.pop("bigdl_transformers_low_bit", False)
|
bigdl_transformers_low_bit = config_dict.pop("bigdl_transformers_low_bit", False)
|
||||||
bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True)
|
bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True)
|
||||||
|
|
@@ -750,6 +755,16 @@ class _BaseAutoModelClass:
|
||||||
# rwkv model linear layers has been rescaled
|
# rwkv model linear layers has been rescaled
|
||||||
if model.config.model_type == "rwkv":
|
if model.config.model_type == "rwkv":
|
||||||
model.rwkv.layers_are_rescaled = True
|
model.rwkv.layers_are_rescaled = True
|
||||||
|
|
||||||
|
if pipeline_parallel_stages > 1:
|
||||||
|
from .pipeline_parallel import pipeline_parallel, pipeline_parallel_generate
|
||||||
|
model = pipeline_parallel(model, pipeline_parallel_stages)
|
||||||
|
import types
|
||||||
|
# add pipeline_parallel_generate to pretrained model dynamically
|
||||||
|
model.pipeline_parallel_generate = types.MethodType(pipeline_parallel_generate,
|
||||||
|
model)
|
||||||
|
torch.distributed.barrier()
|
||||||
|
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue