From a0a80d232ee218ae5b6401d92883ac5afeef6169 Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Thu, 30 Nov 2023 13:42:30 +0800
Subject: [PATCH] LLM: Add qlora cpu distributed readme (#9561)

* init readme

* add distributed guide

* update
---
 .../llm/example/CPU/QLoRA-FineTuning/README.md |  6 ++++++
 .../QLoRA-FineTuning/alpaca-qlora/README.md    | 14 ++++++++++++++
 .../finetune_one_node_two_sockets.sh           | 18 ++++++++++++++++++
 3 files changed, 38 insertions(+)
 create mode 100644 python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/finetune_one_node_two_sockets.sh

diff --git a/python/llm/example/CPU/QLoRA-FineTuning/README.md b/python/llm/example/CPU/QLoRA-FineTuning/README.md
index 7f6a5e79..02b482c9 100644
--- a/python/llm/example/CPU/QLoRA-FineTuning/README.md
+++ b/python/llm/example/CPU/QLoRA-FineTuning/README.md
@@ -3,6 +3,12 @@
 
 This example demonstrates how to finetune a llama2-7b model using BigDL-LLM 4bit optimizations on [Intel CPUs](../README.md).
 
+## Distributed Training Guide
+1. Single node with single socket: [simple example](https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/CPU/QLoRA-FineTuning#example-finetune-llama2-7b-using-qlora)
+or [alpaca example](https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora)
+2. [Single node with multiple sockets](https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora#guide-to-finetuning-qlora-on-one-node-with-multiple-sockets)
+3. Multiple nodes with multiple sockets
+
 ## Example: Finetune llama2-7b using QLoRA
 
 This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k).
diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/README.md b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/README.md
index a256fcc2..b920d66d 100644
--- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/README.md
+++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/README.md
@@ -44,6 +44,20 @@
 python ./alpaca_qlora_finetuning_cpu.py \
 1%|█ | 8/1164 [xx:xx
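Note: the body of the new `finetune_one_node_two_sockets.sh` (18 lines per the diffstat) is cut off in the hunk above. For orientation, here is a minimal sketch of what a one-node, two-socket QLoRA launch might look like, assuming Intel MPI's `mpirun` with oneCCL bindings; the model name, dataset, and the alpaca-lora-style flags (`--base_model`, `--data_path`, `--output_dir`) are assumptions for illustration, not the patch's actual contents:

```bash
#!/bin/bash
# Hypothetical sketch only -- not the actual finetune_one_node_two_sockets.sh
# from this patch, whose body is truncated above.

# Both ranks live on the same node, so rendezvous on localhost.
export MASTER_ADDR=127.0.0.1
# Assumption: 48 physical cores per socket; keep each rank's OpenMP threads on its socket.
export OMP_NUM_THREADS=48
# Number of oneCCL communication worker threads per rank (tunable).
export CCL_WORKER_COUNT=4

# Launch one MPI rank per socket; each rank runs the same finetuning entry point.
mpirun -n 2 \
   python ./alpaca_qlora_finetuning_cpu.py \
       --base_model "meta-llama/Llama-2-7b-hf" \
       --data_path "yahma/alpaca-cleaned" \
       --output_dir "./bigdl-qlora-alpaca"
```

Socket pinning (e.g. via `mpirun` binding options or `numactl`) and the exact argument names should be taken from the merged script rather than from this sketch.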