diff --git a/python/llm/dev/benchmark/all-in-one/README.md b/python/llm/dev/benchmark/all-in-one/README.md
index 56bfc730..24b22c7f 100644
--- a/python/llm/dev/benchmark/all-in-one/README.md
+++ b/python/llm/dev/benchmark/all-in-one/README.md
@@ -1,7 +1,7 @@
 # All in One Benchmark Test
 All in one benchmark test allows users to test all the benchmarks and record them in a result CSV. Users can provide models and related information in `config.yaml`.
 
-Before running, make sure to have [bigdl-llm](../../../README.md) installed.
+Before running, make sure to have [bigdl-llm](../../../README.md) and [bigdl-nano](../../../../nano/README.md) installed.
 
 ## Config
 Config YAML file has following format
@@ -28,4 +28,10 @@ test_api:
 run `python run.py`, this will output results to `results.csv`.
 
 For SPR performance, run `bash run-spr.sh`.
-For ARC performance, run `bash run-arc.sh`
+> **Note**
+>
+> In `run-spr.sh`, we set the optimal environment variables via `source bigdl-nano-init -c`; the `-c` flag disables jemalloc. Enabling jemalloc may lead to increased latency after multiple trials.
+>
+> The value of `OMP_NUM_THREADS` should match the number of CPU cores specified by `numactl -C`.
+
+For ARC performance, run `bash run-arc.sh`.
diff --git a/python/llm/dev/benchmark/all-in-one/run-spr.sh b/python/llm/dev/benchmark/all-in-one/run-spr.sh
index dd96962d..790ec242 100644
--- a/python/llm/dev/benchmark/all-in-one/run-spr.sh
+++ b/python/llm/dev/benchmark/all-in-one/run-spr.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
-
+source bigdl-nano-init -c
+export OMP_NUM_THREADS=48
+export TRANSFORMERS_OFFLINE=1
 
 # set following parameters according to the actual specs of the test machine
 numactl -C 0-47 -m 0 python $(dirname "$0")/run.py
\ No newline at end of file
diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 0383f0a2..a985b233 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -116,8 +116,8 @@ def run_transformer_int4(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, torch_dtype='auto')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True)
-        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
 
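
Reviewer note: the README addition ties `OMP_NUM_THREADS` to the core list passed to `numactl -C`. A minimal sketch of how the two values could be kept in sync, assuming a single contiguous core range like the `0-47` hard-coded in `run-spr.sh` (the `CORES` variable and the arithmetic are illustrative, not part of the PR):

```bash
#!/bin/bash
# Illustrative only: derive OMP_NUM_THREADS from one contiguous core range
# so the thread count always matches the cores pinned by numactl -C.
CORES="0-47"                                    # same range passed to numactl -C
FIRST=${CORES%-*}                               # "0"
LAST=${CORES#*-}                                # "47"
export OMP_NUM_THREADS=$(( LAST - FIRST + 1 ))  # 48 threads for cores 0-47
numactl -C "$CORES" -m 0 python run.py
```

This keeps the two settings from drifting apart when the range is edited for a machine with a different core count.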
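
The `run.py` hunk adds `trust_remote_code=True` to the non-ChatGLM branch so that models whose repos ship custom modeling code can also load. A standalone sketch of that 4-bit load path, using the same bigdl-llm transformers-style API the benchmark already imports (the `model_path` value is a placeholder, not from the PR):

```python
# Illustrative sketch of the 4-bit load path touched by the run.py hunk.
from bigdl.llm.transformers import AutoModelForCausalLM  # bigdl-llm drop-in class
from transformers import AutoTokenizer

model_path = "your/model-path"  # placeholder: local dir or hub repo id

# load_in_4bit=True converts the relevant layers to INT4 on load;
# trust_remote_code=True allows repos with custom modeling code to load,
# so only enable it for model sources you trust.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          trust_remote_code=True)
```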