diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md
index 65b9545e..5e4e5cae 100644
--- a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md
@@ -40,6 +40,12 @@ bash finetune_llama2_7b_arc_1_card.sh
 bash finetune_llama2_7b_arc_2_card.sh
 ```
 
+#### Finetuning LLaMA2-7B on single Data Center GPU Flex 170
+
+```bash
+bash finetune_llama2_7b_flex_170_1_card.sh
+```
+
 #### Finetuning LLaMA2-7B on three Data Center GPU Flex 170
 
 ```bash
@@ -58,6 +64,18 @@ bash finetune_llama2_7b_pvc_1100_1_card.sh
 bash finetune_llama2_7b_pvc_1100_4_card.sh
 ```
 
+#### Finetuning LLaMA2-7B on single Intel Data Center GPU Max 1550
+
+```bash
+bash finetune_llama2_7b_pvc_1550_1_card.sh
+```
+
+#### Finetuning LLaMA2-7B on four Intel Data Center GPU Max 1550
+
+```bash
+bash finetune_llama2_7b_pvc_1550_4_card.sh
+```
+
 ### 4. Sample Output
 ```log
 {'loss': 1.9231, 'learning_rate': 2.9999945367033285e-05, 'epoch': 0.0}
diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py
index 0af97135..f276b801 100644
--- a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py
@@ -53,6 +53,21 @@ from bigdl.llm.transformers import AutoModelForCausalLM
 # import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model
 from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training
 
+def get_int_from_env(env_keys, default):
+    """Returns the first non-negative env value found in the `env_keys` list or the default."""
+    for e in env_keys:
+        val = int(os.environ.get(e, -1))
+        if val >= 0:
+            return val
+    return default
+
+local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0")
+world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1")
+port = get_int_from_env(["MASTER_PORT"], 29500)
+os.environ["LOCAL_RANK"] = str(local_rank)
+os.environ["WORLD_SIZE"] = str(world_size)
+os.environ["RANK"] = str(local_rank)
+os.environ["MASTER_PORT"] = str(port)
 
 def train(
     # model/data params
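The environment-variable block added to `alpaca_qlora_finetuning.py` above is what lets the new `mpirun`-based launch scripts work: ranks started by Intel MPI see `MPI_LOCALRANKID` and `PMI_SIZE` rather than the `LOCAL_RANK`/`WORLD_SIZE`/`RANK` variables that `torch.distributed` and the HF `Trainer` expect, so the script maps the former onto the latter. A minimal sketch for inspecting those MPI variables, assuming Intel MPI's `mpirun` is on the `PATH` (illustrative only, not part of the patch):

```bash
# Each rank prints the Intel MPI variables that alpaca_qlora_finetuning.py now maps
# onto LOCAL_RANK / WORLD_SIZE / RANK before training starts.
mpirun -n 2 python -c "import os; print(os.environ.get('MPI_LOCALRANKID'), os.environ.get('PMI_SIZE'))"
```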
diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_arc_2_card.sh b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_arc_2_card.sh
index 3ad65d9d..ccb30e42 100644
--- a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_arc_2_card.sh
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_arc_2_card.sh
@@ -14,15 +14,13 @@
 # limitations under the License.
 #
 
-export CCL_ZE_IPC_EXCHANGE=sockets
 export MASTER_ADDR=127.0.0.1
 export OMP_NUM_THREADS=6 # adjust this to 1/4 of total physical cores
 export FI_PROVIDER=tcp
+export CCL_ATL_TRANSPORT=ofi
 
-torchrun --standalone \
-    --nnodes=1 \
-    --nproc-per-node 2 \
-    ./alpaca_qlora_finetuning.py \
-    --base_model "meta-llama/Llama-2-7b-hf" \
-    --data_path "yahma/alpaca-cleaned" \
-    --output_dir "./bigdl-qlora-alpaca"
+mpirun -n 2 \
+    python -u ./alpaca_qlora_finetuning.py \
+    --base_model "meta-llama/Llama-2-7b-hf" \
+    --data_path "yahma/alpaca-cleaned" \
+    --output_dir "./bigdl-qlora-alpaca" > training.log
diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_flex_170_1_card.sh b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_flex_170_1_card.sh
new file mode 100644
index 00000000..542aecae
--- /dev/null
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_flex_170_1_card.sh
@@ -0,0 +1,23 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# You could also specify `--base_model` to the local path of the huggingface model checkpoint folder and `--data_path` to the local path of the dataset JSON file
+python ./alpaca_qlora_finetuning.py \
+    --micro_batch_size 2 \
+    --batch_size 128 \
+    --base_model "meta-llama/Llama-2-7b-hf" \
+    --data_path "yahma/alpaca-cleaned" \
+    --output_dir "./bigdl-qlora-alpaca"
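Unlike the single-GPU `flex_170_1` script above, every `mpirun`-based script in this patch redirects stdout to `training.log`, so the console stays quiet while training runs. A minimal way to follow progress, assuming the BigDL-LLM XPU environment is already activated (illustrative, not part of the patch):

```bash
# Launch a multi-card script in the background and follow its redirected log.
bash finetune_llama2_7b_arc_2_card.sh &
tail -f training.log
```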
diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_flex_170_3_card.sh b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_flex_170_3_card.sh
index c708cc2e..ce6a635c 100644
--- a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_flex_170_3_card.sh
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_flex_170_3_card.sh
@@ -14,17 +14,16 @@
 # limitations under the License.
 #
 
-export CCL_ZE_IPC_EXCHANGE=sockets
 export MASTER_ADDR=127.0.0.1
-export OMP_NUM_THREADS=16 # adjust this to 1/4 of total physical cores
+export OMP_NUM_THREADS=12 # adjust this to 1/4 of total physical cores
+export FI_PROVIDER=tcp
+export CCL_ATL_TRANSPORT=ofi
 
-torchrun --standalone \
-    --nnodes=1 \
-    --nproc-per-node 3 \
-    ./alpaca_qlora_finetuning.py \
-    --base_model "meta-llama/Llama-2-7b-hf" \
-    --data_path "yahma/alpaca-cleaned" \
-    --output_dir "./bigdl-qlora-alpaca" \
-    --gradient_checkpointing False \
-    --micro_batch_size 2 \
-    --batch_size 128
+mpirun -n 3 \
+    python -u ./alpaca_qlora_finetuning.py \
+    --base_model "meta-llama/Llama-2-7b-hf" \
+    --data_path "yahma/alpaca-cleaned" \
+    --output_dir "./bigdl-qlora-alpaca" \
+    --gradient_checkpointing False \
+    --micro_batch_size 2 \
+    --batch_size 128 > training.log
diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1100_4_card.sh b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1100_4_card.sh
index 24a2f50d..95a2ac78 100644
--- a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1100_4_card.sh
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1100_4_card.sh
@@ -14,15 +14,15 @@
 # limitations under the License.
 #
 
-export CCL_ZE_IPC_EXCHANGE=sockets
 export MASTER_ADDR=127.0.0.1
 export OMP_NUM_THREADS=28 # adjust this to 1/4 of total physical cores
-torchrun --standalone \
-    --nnodes=1 \
-    --nproc-per-node 4 \
-    ./alpaca_qlora_finetuning.py \
-    --base_model "meta-llama/Llama-2-7b-hf" \
-    --data_path "yahma/alpaca-cleaned" \
-    --output_dir "./bigdl-qlora-alpaca" \
-    --micro_batch_size 8 \
-    --batch_size 128
+export FI_PROVIDER=tcp
+export CCL_ATL_TRANSPORT=ofi
+
+mpirun -n 4 \
+    python -u ./alpaca_qlora_finetuning.py \
+    --base_model "meta-llama/Llama-2-7b-hf" \
+    --data_path "yahma/alpaca-cleaned" \
+    --output_dir "./bigdl-qlora-alpaca" \
+    --micro_batch_size 8 \
+    --batch_size 128 > training.log
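All of these scripts carry the comment that `OMP_NUM_THREADS` should be roughly 1/4 of the total physical cores (6 on the Arc machine, 12 for Flex, 28 for the PVC systems). If the target machine differs, a hypothetical helper for computing that value on Linux (not part of the patch):

```bash
# Count physical cores as unique (core, socket) pairs and export a quarter of them.
PHYS_CORES=$(lscpu -p=Core,Socket | grep -v '^#' | sort -u | wc -l)
export OMP_NUM_THREADS=$((PHYS_CORES / 4))
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS"
```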
diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1550_1_card.sh b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1550_1_card.sh
new file mode 100644
index 00000000..7fd91937
--- /dev/null
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1550_1_card.sh
@@ -0,0 +1,28 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+export MASTER_ADDR=127.0.0.1
+export OMP_NUM_THREADS=28 # adjust this to 1/4 of total physical cores
+export FI_PROVIDER=tcp
+export CCL_ATL_TRANSPORT=ofi
+
+mpirun -n 2 \
+    python -u ./alpaca_qlora_finetuning.py \
+    --base_model "meta-llama/Llama-2-7b-hf" \
+    --data_path "yahma/alpaca-cleaned" \
+    --output_dir "./bigdl-qlora-alpaca" \
+    --micro_batch_size 8 \
+    --batch_size 128 > training.log
diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1550_4_card.sh b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1550_4_card.sh
new file mode 100644
index 00000000..2461b088
--- /dev/null
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/finetune_llama2_7b_pvc_1550_4_card.sh
@@ -0,0 +1,28 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+export MASTER_ADDR=127.0.0.1
+export OMP_NUM_THREADS=28 # adjust this to 1/4 of total physical cores
+export FI_PROVIDER=tcp
+export CCL_ATL_TRANSPORT=ofi
+
+mpirun -n 8 \
+    python -u ./alpaca_qlora_finetuning.py \
+    --base_model "meta-llama/Llama-2-7b-hf" \
+    --data_path "yahma/alpaca-cleaned" \
+    --output_dir "./bigdl-qlora-alpaca" \
+    --micro_batch_size 8 \
+    --batch_size 128 > training.log
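The rank counts in the two Max 1550 scripts follow from each Intel Data Center GPU Max 1550 having two stacks, so a single card runs `mpirun -n 2` and four cards run `mpirun -n 8`. One way to check which GPU devices the oneAPI runtime actually exposes before launching is `sycl-ls`, assuming the oneAPI environment has been sourced (illustrative, not part of the patch):

```bash
# List the SYCL (Level Zero / OpenCL) devices; a Max 1550 typically appears with its
# two stacks when tiles are exposed as separate devices.
sycl-ls
```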