From 5809a3f5fecbe464ea1cb004177c8380a9e422f9 Mon Sep 17 00:00:00 2001
From: Lilac09 <74996885+Zhengjin-Wang@users.noreply.github.com>
Date: Tue, 12 Mar 2024 16:15:27 +0800
Subject: [PATCH] Add run-hbm.sh & add user guide for spr and hbm (#10357)

* add run-hbm.sh

* add spr and hbm guide

* only support quad mode

* only support quad mode

* update special cases

* update special cases
---
 .../LLM/Quickstart/benchmark_quickstart.md    | 36 ++++++++++++++++++
 .../llm/dev/benchmark/all-in-one/run-hbm.sh   | 38 +++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 python/llm/dev/benchmark/all-in-one/run-hbm.sh

diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/benchmark_quickstart.md b/docs/readthedocs/source/doc/LLM/Quickstart/benchmark_quickstart.md
index 0dc3bc48..138f421e 100644
--- a/docs/readthedocs/source/doc/LLM/Quickstart/benchmark_quickstart.md
+++ b/docs/readthedocs/source/doc/LLM/Quickstart/benchmark_quickstart.md
@@ -104,6 +104,42 @@ Please refer to [here](https://bigdl.readthedocs.io/en/latest/doc/LLM/Overview/i
 
          ./run-max-gpu.sh
 
+   .. tab:: Intel SPR
+
+      For Intel SPR machine, we recommend:
+
+      .. code-block:: bash
+
+         ./run-spr.sh
+
+      The script uses a default numactl strategy. If you want to customize it, please use ``lscpu`` or ``numactl -H`` to check how CPU indexes are assigned to NUMA nodes, and make sure the run command is bound to only one socket.
+
+   .. tab:: Intel HBM
+
+      For Intel HBM machine, we recommend:
+
+      .. code-block:: bash
+
+         ./run-hbm.sh
+
+      The script uses a default numactl strategy. If you want to customize it, please use ``numactl -H`` to check how the indexes of the HBM nodes and CPUs are assigned.
+      
+      For example:
+
+
+      .. code-block:: bash
+
+         node   0   1   2   3
+            0:  10  21  13  23
+            1:  21  10  23  13
+            2:  13  23  10  23
+            3:  23  13  23  10
+
+
+      Here, the HBM node of a given node is the node whose distance from it is 13; for example, node 2 is node 0's HBM node.
+
+      Also make sure the run command is bound to only one socket.
+
 ```
 
 ## Result
diff --git a/python/llm/dev/benchmark/all-in-one/run-hbm.sh b/python/llm/dev/benchmark/all-in-one/run-hbm.sh
new file mode 100644
index 00000000..d57b5ec3
--- /dev/null
+++ b/python/llm/dev/benchmark/all-in-one/run-hbm.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+source bigdl-llm-init
+
+sockets_num=$(lscpu | grep "Socket(s)" | awk -F ':' '{print $2}')
+cores_per_socket=$(lscpu | grep "Core(s) per socket" | awk -F ':' '{print $2}')
+numa_nodes=$(lscpu | grep "NUMA node(s)" | awk -F ':' '{print $2}')
+# Twice the average core count per NUMA node: the factor 2 avoids a fractional result in HBM flat mode, where the NUMA nodes are counted twice. The 2-node and 1-node branches below divide it by 2 again.
+cores_per_numa=$(($sockets_num * $cores_per_socket * 2 / $numa_nodes))
+
+# Only Quad-mode is supported for now. Every branch pins the run to CPUs 0..OMP_NUM_THREADS-1 (numactl -C) and sets a preferred memory node (numactl -p).
+if [ "${numa_nodes}" -eq 4 ]; then
+    # HBM flat Quad-mode. Confirm with "numactl -H" that there are 2 HBM memory nodes and 2 DRAM memory nodes, and that node 2 is the HBM node to prefer.
+    echo "HBM Quad mode"
+    export OMP_NUM_THREADS=${cores_per_numa}
+    echo "OMP_NUM_THREADS: ${cores_per_numa}"
+    last_cpu_index=$(($OMP_NUM_THREADS - 1))
+    numactl -C 0-$last_cpu_index -p 2 python "$(dirname "$0")/run.py"  # path quoted so a directory containing spaces is not word-split
+elif [ "${numa_nodes}" -eq 2 ]; then
+    # SPR, HBM-only or HBM-cache Quad-mode. Confirm with "numactl -H" that there are 2 DRAM memory nodes.
+    echo "Warning: SPR Quad mode, hbm usage is default off, please check if HBM can be on."
+    export OMP_NUM_THREADS=$((${cores_per_numa} / 2))
+    echo "OMP_NUM_THREADS: ${OMP_NUM_THREADS}"
+    last_cpu_index=$(($OMP_NUM_THREADS - 1))
+    numactl -C 0-$last_cpu_index -p 0 python "$(dirname "$0")/run.py"
+elif [ "${numa_nodes}" -eq 1 ]; then
+    # General test mode: a single NUMA node, so memory node 0 is preferred.
+    echo "General Test mode"
+    export OMP_NUM_THREADS=$((${cores_per_numa} / 2))
+    echo "OMP_NUM_THREADS: ${OMP_NUM_THREADS}"
+    last_cpu_index=$(($OMP_NUM_THREADS - 1))
+    numactl -C 0-$last_cpu_index -p 0 python "$(dirname "$0")/run.py"
+else
+    echo "Warning: The number of nodes in this machine is ${numa_nodes}. Node 0 will be used for run. "
+    export OMP_NUM_THREADS=${cores_per_numa}
+    echo "OMP_NUM_THREADS: ${cores_per_numa}"
+    last_cpu_index=$(($OMP_NUM_THREADS - 1))
+    numactl -C 0-$last_cpu_index -p 0 python "$(dirname "$0")/run.py"
+fi