[WIP] LLm llm-cli chat mode (#8440)

* fix timezone * temp * Update linux interactive mode * modify init text for interactive mode * meet comments * update * win script * meet comments
2023-07-05 14:04:17 +08:00 · 2023-07-05 14:04:17 +08:00 · f2bb469847
commit f2bb469847
parent 936d21635f
5 changed files with 187 additions and 4 deletions
--- a/python/llm/README.md
+++ b/python/llm/README.md
@ -74,6 +74,10 @@ Currently `bigdl-llm` CLI supports *LLaMA* (e.g., *vicuna*), *GPT-NeoX* (e.g., *
   #text completion
   #llama/bloom/gptneox/starcoder model family is currently supported
   llm-cli -t 16 -x gptneox -m "/path/to/output/model.bin" -p 'Once upon a time,'
+
+   #chat mode
+   #Note: The chat mode only supports LLaMA (e.g., *vicuna*) and GPT-NeoX (e.g., *redpajama*) for now.
+   llm-chat -m "/path/to/output/model.bin" -x llama
   ```

 #### Hugging Face `transformers`-style API
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@ -214,7 +214,7 @@ def setup_package():
        url='https://github.com/intel-analytics/BigDL',
        packages=get_llm_packages(),
        package_dir={"": "src"},
-        package_data={"bigdl.llm": package_data[platform_name]},
+        package_data={"bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"]},
        include_package_data=True,
        entry_points={
            "console_scripts": [
@ -228,8 +228,8 @@ def setup_package():
            'Programming Language :: Python :: 3.9',
            'Programming Language :: Python :: Implementation :: CPython'],
        scripts={
-            'Linux': ['src/bigdl/llm/cli/llm-cli'],
-            'Windows': ['src/bigdl/llm/cli/llm-cli.ps1'],
+            'Linux': ['src/bigdl/llm/cli/llm-cli', 'src/bigdl/llm/cli/llm-chat'],
+            'Windows': ['src/bigdl/llm/cli/llm-cli.ps1', 'src/bigdl/llm/cli/llm-chat.ps1'],
        }[platform_name],
        platforms=['windows']
    )
--- a/python/llm/src/bigdl/llm/cli/llm-chat
+++ b/python/llm/src/bigdl/llm/cli/llm-chat
@ -0,0 +1,94 @@
+#!/bin/bash
+
+# Default values
+model_family=""
+threads=8
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+n_predict=512
+
+EXTRA_ARGS=('--color')
+
+llm_dir="$(dirname "$(python -c "import bigdl.llm;print(bigdl.llm.__file__)")")"
+lib_dir="$llm_dir/libs"
+prompts_dir="$llm_dir/cli/prompts"
+
+# Pick the AVX variant used to select the native binary.
+# Outputs: writes "avx512" to stdout when lscpu reports avx512_vnni, otherwise
+#          "avx2" (also the fallback when lscpu is not available).
+#          Diagnostics go to stderr so that a caller capturing stdout with
+#          $(get_avx_flags) receives only the flag itself.
+function get_avx_flags() {
+  local avx="avx2"
+  local msg
+  if command -v lscpu &>/dev/null; then
+    msg=$(lscpu)
+    if [[ $msg == *"avx512_vnni"* ]]; then
+      avx="avx512"
+    fi
+  else
+    echo "lscpu command not found. Please make sure it is installed." >&2
+  fi
+  echo "$avx"
+}
+
+# Function to display help message
+function display_help {
+  echo "usage: ./llm-chat -x MODEL_FAMILY [-h] [args]"
+  echo ""
+  echo "options:"
+  echo "  -h, --help           show this help message"
+  echo "  -x, --model_family   {llama,gptneox}"
+  echo "                       family name of model"
+  echo "  -t N, --threads N    number of threads to use during computation (default: 8)"
+  echo "  -n N, --n_predict N  number of tokens to predict (default: 128, -1 = infinity)"
+  echo "  args                 parameters passed to the specified model function"
+}
+
+# Start an interactive chat session with a LLaMA-family model.
+# Globals:  prompts_dir, lib_dir, avx_flag, threads, n_predict,
+#           filteredArguments (read); PROMPT_TEMPLATE, EXTRA_ARGS, command (written)
+# Outputs:  echoes the assembled command line, then runs it
+# Returns:  the status of the evaluated command
+function llama {
+  local quoted_bin quoted_threads quoted_n_predict quoted_prompt
+  PROMPT_TEMPLATE="$prompts_dir/chat-with-llm.txt"
+  # The command is a string that eval parses a second time, so every value
+  # expanded into it is shell-quoted with %q; otherwise an install path that
+  # contains spaces or quotes would be split or mis-parsed.
+  # filteredArguments is expected to hold already-quoted words.
+  printf -v quoted_bin '%q' "$lib_dir/main-llama_$avx_flag"
+  printf -v quoted_threads '%q' "$threads"
+  printf -v quoted_n_predict '%q' "$n_predict"
+  printf -v quoted_prompt '%q' "$PROMPT_TEMPLATE"
+  EXTRA_ARGS+=('-i' '--file' "$quoted_prompt" '--reverse-prompt' "'USER:'" '--in-prefix' "' '")
+  command="$quoted_bin -t $quoted_threads -n $quoted_n_predict ${filteredArguments[*]} ${EXTRA_ARGS[*]}"
+  echo "$command"
+  eval "$command"
+}
+
+function gptneox {
+  command="$lib_dir/main-gptneox_$avx_flag -t $threads -n $n_predict ${filteredArguments[*]} ${EXTRA_ARGS[*]}"
+  echo "$command"
+  eval "$command"
+}
+
+# Parse the command line. -h, -x, -t and -n are handled by this wrapper; every
+# other argument is shell-quoted and collected in filteredArguments, because
+# the model functions splice that array into a command string run through eval.
+filteredArguments=()
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+  -h | --help)
+    display_help
+    # Help was requested: stop instead of going on to launch a model.
+    exit 0
+    ;;
+  -x | --model_family | --model-family | -t | --threads | -n | --n_predict | --n-predict)
+    # "shift 2" fails without shifting anything when the value is missing,
+    # which would make this loop spin forever; reject that case explicitly.
+    if [[ $# -lt 2 ]]; then
+      echo "llm-chat: option $1 requires a value." >&2
+      exit 1
+    fi
+    case "$1" in
+    -x | --model_family | --model-family)
+      model_family="$2"
+      ;;
+    -t | --threads)
+      threads="$2"
+      ;;
+    *)
+      n_predict="$2"
+      ;;
+    esac
+    shift 2
+    ;;
+  *)
+    # %q yields a word that eval turns back into exactly "$1", even when the
+    # argument contains spaces or single quotes.
+    printf -v quoted_arg '%q' "$1"
+    filteredArguments+=("$quoted_arg")
+    shift
+    ;;
+  esac
+done
+
+avx_flag=$(get_avx_flags)
+echo "AVX Flags: $avx_flag"
+
+# Perform actions based on the model_family
+if [[ "$model_family" == "llama" ]]; then
+  llama
+elif [[ "$model_family" == "gptneox" ]]; then
+  # TODO
+  gptneox
+else
+  echo "llm-chat does not support model_family $model_family for now."
+  display_help
+fi
--- a/python/llm/src/bigdl/llm/cli/llm-chat.ps1
+++ b/python/llm/src/bigdl/llm/cli/llm-chat.ps1
@ -0,0 +1,80 @@
+# Resolve the directory of the installed bigdl.llm package; the native
+# binaries and the prompt templates are looked up relative to it.
+$llm_init_file = python -c "import bigdl.llm;print(bigdl.llm.__file__)"
+$llm_dir = Split-Path -Parent $llm_init_file
+$lib_dir = Join-Path -Path $llm_dir -ChildPath "libs"
+$prompt_dir = Join-Path -Path $llm_dir -ChildPath "cli/prompts"
+
+# Defaults for the options this wrapper parses itself.
+$model_family = ""
+$threads = 8
+# Tokens to predict: deliberately larger than the default because a chat
+# session is expected to be a long interaction.
+$n_predict = 512
+
+# Print the usage message for llm-chat.ps1.
+# The model families and defaults shown here must match what this script
+# actually dispatches on (llama, gptneox) and its default values
+# ($threads = 8, $n_predict = 512).
+function Display-Help
+{
+    Write-Host "usage: ./llm-chat.ps1 -x MODEL_FAMILY [-h] [args]"
+    Write-Host ""
+    Write-Host "options:"
+    Write-Host "  -h, --help           show this help message"
+    Write-Host "  -x, --model_family   {llama,gptneox}"
+    Write-Host "                       family name of model"
+    Write-Host "  -t N, --threads N    number of threads to use during computation (default: 8)"
+    Write-Host "  -n N, --n_predict N  number of tokens to predict (default: 512, -1 = infinity)"
+    Write-Host "  args                 parameters passed to the specified model function"
+}
+
+# Start an interactive chat session with a LLaMA-family model.
+# Reads the script-level $prompt_dir, $lib_dir, $threads, $n_predict and
+# $filteredArguments; prints the assembled command, then runs it.
+function llama
+{
+    # The command is a string evaluated by Invoke-Expression, so both paths are
+    # wrapped in single quotes (embedded quotes doubled) to survive directories
+    # that contain spaces. A quoted path is only executed when it is invoked
+    # through the call operator (&).
+    $exe_path = "$lib_dir/main-llama.exe" -replace "'", "''"
+    $prompt_file = (Join-Path $prompt_dir "chat-with-llm.txt") -replace "'", "''"
+    $command = "& '$exe_path' -t $threads -n $n_predict -f '$prompt_file' -i --color --reverse-prompt 'USER:' --in-prefix ' ' $filteredArguments"
+    Write-Host "$command"
+    Invoke-Expression $command
+}
+
+# Run the GPT-NeoX binary with the parsed options.
+# Reads the script-level $lib_dir, $threads, $n_predict and $filteredArguments;
+# prints the assembled command, then runs it.
+function gptneox
+{
+    # TODO
+    # The command is a string evaluated by Invoke-Expression, so the executable
+    # path is wrapped in single quotes (embedded quotes doubled) to survive
+    # directories that contain spaces. A quoted path is only executed when it
+    # is invoked through the call operator (&).
+    $exe_path = "$lib_dir/main-gptneox.exe" -replace "'", "''"
+    $command = "& '$exe_path' -t $threads -n $n_predict $filteredArguments"
+    Write-Host "$command"
+    Invoke-Expression $command
+}
+
+# Parse the command line. -h, -x, -t and -n are handled by this wrapper; every
+# other argument is wrapped in single quotes and collected in
+# $filteredArguments, because the model functions splice that array into a
+# command string run through Invoke-Expression.
+$filteredArguments = @()
+for ($i = 0; $i -lt $args.Length; $i++) {
+    if ($args[$i] -eq '--help' -or $args[$i] -eq '-h')
+    {
+        # Help was requested: show it and stop instead of launching a model.
+        Display-Help
+        exit 0
+    }
+    elseif ($args[$i] -eq '--model_family' -or $args[$i] -eq '--model-family' -or $args[$i] -eq '-x')
+    {
+        if ($i + 1 -lt $args.Length -and $args[$i + 1] -notlike '-*')
+        {
+            $i++
+            $model_family = $args[$i]
+        }
+    }
+    elseif ($args[$i] -eq '--threads' -or $args[$i] -eq '-t')
+    {
+        # Without this check a trailing option would read past the end of
+        # $args and silently set the value to $null.
+        if ($i + 1 -ge $args.Length)
+        {
+            Write-Host "llm-chat: option $($args[$i]) requires a value."
+            exit 1
+        }
+        $i++
+        $threads = $args[$i]
+    }
+    elseif ($args[$i] -eq '--n_predict' -or $args[$i] -eq '--n-predict' -or $args[$i] -eq '-n')
+    {
+        if ($i + 1 -ge $args.Length)
+        {
+            Write-Host "llm-chat: option $($args[$i]) requires a value."
+            exit 1
+        }
+        $i++
+        $n_predict = $args[$i]
+    }
+    else
+    {
+        # Double any embedded single quote so the quoted word is evaluated back
+        # to the original argument.
+        $filteredArguments += "'" + ($args[$i] -replace "'", "''") + "'"
+    }
+}
+
+# Dispatch on the requested model family. A missing or unsupported family
+# prints the error and the usage text, then exits with a failure code.
+switch ($model_family)
+{
+    "llama" {
+        llama
+    }
+    "gptneox" {
+        gptneox
+    }
+    default {
+        Write-Host "llm-chat does not support model_family $model_family for now."
+        Display-Help
+        # Signal the failure to callers instead of ending with a success code.
+        exit 1
+    }
+}
--- a/python/llm/src/bigdl/llm/cli/prompts/chat-with-llm.txt
+++ b/python/llm/src/bigdl/llm/cli/prompts/chat-with-llm.txt
@ -0,0 +1,5 @@
+A chat between a curious human USER and an artificial intelligence assistant ChatLLM. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+USER: Hello, ChatLLM.
+ChatLLM: Hello. 
+USER: