Arc stress test (#9795)
* add arc stress test * trigger ci * trigger CI * trigger ci * disable ci
This commit is contained in:
parent 40eaf76ae3
commit f6bb4ab313
7 changed files with 668 additions and 559 deletions
@@ -102,6 +102,88 @@ jobs:
          cd ../../../test/benchmark
          python -m pip install pandas==1.5.3
          python csv_to_html.py -f $CSV_SAVE_PATH/fp8 -b $CSV_SAVE_PATH/fp8/transformer_int4_gpu-results-1baseline.csv -t 5.0

  llm-stress-test-on-arc:
    needs: llm-perf-regression-test-on-arc
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.9"]
    runs-on: [self-hosted, llm, perf]
    env:
      OMP_NUM_THREADS: 16
      THREAD_NUM: 16
      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
      CSV_SAVE_PATH: '/mnt/disk1/stable_version_stress_test_gpu/'

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        shell: bash
        # pip install transformers_stream_generator for model internlm-chat-7b-8k
        # pip install tiktoken for model Qwen-7B-Chat-10-12
        run: |
          python -m pip install --upgrade pip
          python -m pip install --upgrade wheel
          python -m pip install --upgrade omegaconf
          python -m pip install --upgrade pandas
          python -m pip install --upgrade einops
          python -m pip install --upgrade transformers_stream_generator
          python -m pip install --upgrade tiktoken

      - name: Download llm binary
        uses: ./.github/actions/llm/download-llm-binary

      - name: Run LLM install (all) test
        uses: ./.github/actions/llm/setup-llm-env
        with:
          extra-dependency: "xpu"

      - name: Test installed xpu version
        shell: bash
        run: |
          source /home/arda/intel/oneapi/setvars.sh
          bash python/llm/test/run-llm-install-tests.sh

      - name: Test on xpu (int4)
        shell: bash
        run: |
          source /home/arda/intel/oneapi/setvars.sh
          export USE_XETLA=OFF
          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
          mv python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml python/llm/dev/benchmark/all-in-one/config.yaml
          cd python/llm/dev/benchmark/all-in-one
          # hide time info
          sed -i 's/str(end - st)/"xxxxxx"/g' run-stress-test.py
          python run-stress-test.py
          cp ./*.csv $CSV_SAVE_PATH/int4
          rm ./*.csv
          cd ../../../test/benchmark
          python -m pip install pandas==1.5.3
          python csv_to_html.py -f $CSV_SAVE_PATH/int4

      - name: Test on xpu (fp8)
        shell: bash
        run: |
          source /home/arda/intel/oneapi/setvars.sh
          export USE_XETLA=OFF
          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
          mv python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml python/llm/dev/benchmark/all-in-one/config.yaml
          cd python/llm/dev/benchmark/all-in-one
          # hide time info
          sed -i 's/str(end - st)/"xxxxxx"/g' run-stress-test.py
          python run-stress-test.py
          cp ./*.csv $CSV_SAVE_PATH/fp8
          rm ./*.csv
          cd ../../../test/benchmark
          python -m pip install pandas==1.5.3
          python csv_to_html.py -f $CSV_SAVE_PATH/fp8


  llm-perf-regression-test-on-spr:

@@ -209,4 +291,4 @@ jobs:
          cp ./*.csv /models/stable_version_stress_test_cpu/
          cd ../../../test/benchmark
          python -m pip install pandas==1.5.3
          python csv_to_html.py -f /models/stable_version_stress_test_cpu/
          python csv_to_html.py -f /models/stable_version_stress_test_cpu/
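Both new Arc steps finish by rendering the collected CSVs to HTML with csv_to_html.py. For orientation only, a hypothetical sketch of what such a CSV-to-HTML step amounts to; the project's actual csv_to_html.py is not shown in this diff, and its -b (baseline CSV) and -t (threshold) options used above add regression checking that this sketch omits:

# Hypothetical sketch, not the repository's csv_to_html.py: render every
# benchmark CSV in a folder as an HTML table for review.
import argparse
import glob
import os

import pandas as pd


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--folder", required=True, help="folder holding result CSVs")
    args = parser.parse_args()

    for csv_path in glob.glob(os.path.join(args.folder, "*.csv")):
        df = pd.read_csv(csv_path)
        html_path = os.path.splitext(csv_path)[0] + ".html"
        df.to_html(html_path, index=False)
        print(f"wrote {html_path}")


if __name__ == "__main__":
    main()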
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -45,22 +45,6 @@ LLAVA_IDS = ['liuhaotian/llava-v1.5-7b']
results = []
excludes = []

def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials):
    for i in range(num_trials + warm_up):
        st = time.perf_counter()
        output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                                    num_beams=num_beams)
        torch.xpu.synchronize()
        end = time.perf_counter()
        output_ids = output_ids.cpu()
        print("model generate cost: " + str(end - st))
        output = tokenizer.batch_decode(output_ids)
        print(output[0])
        actual_out_len = output_ids.shape[1] - actual_in_len
        if i >= warm_up:
            result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
                                   actual_in_len, actual_out_len])

def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False):
    # TODO: make a parameter
    result= {}

@@ -82,8 +66,7 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
                        num_beams,
                        low_bit,
                        cpu_embedding if 'win' in test_api else 'N/A',
                        result[in_out_pair][-1][5] if 'win' in test_api else 'N/A']) # currently only peak mem for win gpu is caught here
                        result[in_out_pair][-1][5] if 'int4_gpu' in test_api else 'N/A']) # currently only peak mem for win gpu is caught here

def get_model_path(repo_id, local_model_hub):
    if local_model_hub:

@@ -95,10 +78,6 @@ def get_model_path(repo_id, local_model_hub):
    else:
        return repo_id


def run_transformer_int4(repo_id,
                         local_model_hub,
                         in_out_pairs,

@@ -158,10 +137,11 @@ def run_transformer_int4(repo_id,
                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
                                       actual_in_len, actual_out_len])
            i += 1
            if i >= warm_up+num_trials:
                break

    return result


def run_transformer_int4_gpu(repo_id,
                             local_model_hub,
                             in_out_pairs,

@@ -172,6 +152,7 @@ def run_transformer_int4_gpu(repo_id,
    from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
    from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
    import intel_extension_for_pytorch as ipex
    reserved_mem_list = []
    model_path = get_model_path(repo_id, local_model_hub)
    # Load model in 4 bit,
    # which convert the relevant layers in the model into INT4 format

@@ -196,6 +177,7 @@ def run_transformer_int4_gpu(repo_id,
        model = ipex.optimize(model.eval(), inplace=True)
    end = time.perf_counter()
    print(">> loading of model costs {}s".format(end - st))
    reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))

    model = BenchmarkWrapper(model)

@@ -205,31 +187,42 @@ def run_transformer_int4_gpu(repo_id,
            in_out_len = in_out.split("-")
            in_len = int(in_out_len[0])
            out_len = int(in_out_len[1])
            # As different tokenizer has different encodings,
            # in_len.txt maybe shorter than we need,
            # use much longer context to make sure input length
            test_length = min(in_len*2, 8192)
            while test_length not in [32, 256, 1024, 2048, 8192]:
                test_length = test_length * 2
            input_str = open(f"prompt/{test_length}.txt", 'r').read()
            # As different tokenizer has different encodings,
            # slice the input_ids to ensure the prompt length is required length.
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
            input_ids = input_ids[:, :in_len]
            true_str = tokenizer.batch_decode(input_ids)[0]
            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
            actual_in_len = input_ids.shape[1]
            result[in_out] = []
            thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials))
            thread.start()
            thread.join()
    del model
            i = 0
            with open("prompt/stress_test.txt", 'r') as file:
                for input_str in file:
                    # As different tokenizer has different encodings,
                    # slice the input_ids to ensure the prompt length is required length.
                    input_ids = tokenizer.encode(input_str, return_tensors="pt")
                    input_ids = input_ids[:, :in_len]
                    true_str = tokenizer.batch_decode(input_ids)[0]
                    input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
                    actual_in_len = input_ids.shape[1]
                    result[in_out] = []
                    st = time.perf_counter()
                    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                                                num_beams=num_beams)
                    torch.xpu.synchronize()
                    end = time.perf_counter()
                    reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))
                    gpu_peak_mem = max(reserved_mem_list) # always keep the peak gpu mem at current stage
                    output_ids = output_ids.cpu()
                    print("model generate cost: " + str(end - st))
                    output = tokenizer.batch_decode(output_ids)
                    print(output[0])
                    actual_out_len = output_ids.shape[1] - actual_in_len
                    if i >= warm_up:
                        result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
                                               actual_in_len, actual_out_len, gpu_peak_mem])
                    i += 1
                    if i >= warm_up+num_trials:
                        break
    model.to('cpu')
    torch.xpu.synchronize()
    torch.xpu.empty_cache()
    del model
    gc.collect()
    return result


if __name__ == '__main__':
    from omegaconf import OmegaConf
    conf = OmegaConf.load(f'{current_dir}/config.yaml')
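Taken together, the run_transformer_int4_gpu changes above replace the single-prompt, threaded benchmark with a stress loop: prompts are streamed from prompt/stress_test.txt, each (model, in-out pair) runs warm_up + num_trials generations, and peak reserved XPU memory is tracked alongside latency. A condensed sketch of that measurement pattern, for illustration only (it assumes, as in the diff, a BenchmarkWrapper-wrapped model on an XPU device and a matching Hugging Face tokenizer):

# Condensed, illustrative version of the stress-test loop shown in the diff;
# not the full run-stress-test.py.
import time
import torch


def stress_test_pair(model, tokenizer, in_len, out_len, warm_up, num_trials, num_beams=1):
    records, reserved = [], []
    i = 0
    with open("prompt/stress_test.txt", "r") as file:
        for input_str in file:
            # Slice token ids so every prompt is exactly `in_len` tokens long.
            input_ids = tokenizer.encode(input_str, return_tensors="pt")[:, :in_len].to("xpu")
            st = time.perf_counter()
            output_ids = model.generate(input_ids, do_sample=False,
                                        max_new_tokens=out_len, num_beams=num_beams)
            torch.xpu.synchronize()
            end = time.perf_counter()
            reserved.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
            if i >= warm_up:  # only runs after warm-up are recorded
                records.append((end - st, max(reserved)))  # (latency s, peak reserved GiB)
            i += 1
            if i >= warm_up + num_trials:
                break
    return records

The latency here is plain wall-clock generate time; the real script instead records model.first_cost, model.rest_cost_mean and model.encoder_time exposed by BenchmarkWrapper, as the diff shows.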
@@ -0,0 +1,20 @@
repo_id:
  - 'meta-llama/Llama-2-7b-chat-hf'
  - 'THUDM/chatglm2-6b'
  - 'THUDM/chatglm3-6b'
  - 'baichuan-inc/Baichuan2-7B-Chat'
  - 'Qwen/Qwen-7B-Chat'
local_model_hub: '/mnt/disk1/models'
warm_up: 10
num_trials: 100
num_beams: 1 # default to greedy search
low_bit: 'fp8' # default to use 'sym_int4' (i.e. symmetric int4)
in_out_pairs:
  - '1024-512'
  - '2048-512'
test_api:
  - "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only available now for gpu win related test_api)
exclude:
  - 'baichuan-inc/Baichuan2-7B-Chat:2048'
  - 'Qwen/Qwen-7B-Chat:2048'
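The new configs are consumed by the all-in-one harness through OmegaConf (the run-stress-test.py hunk earlier shows conf = OmegaConf.load(f'{current_dir}/config.yaml')). As a rough sketch of how the fields might be iterated, reading each exclude entry as a 'repo_id:input_length' pair; the actual filtering code is not part of the hunks shown here, so treat the skip logic as an assumption:

# Sketch only: iterate the config fields above. The exclude handling is an
# assumption about the 'repo_id:input_length' format, not the real script.
from omegaconf import OmegaConf

conf = OmegaConf.load("config.yaml")
excludes = set(conf.get("exclude", []) or [])

for api in conf.test_api:
    for model_id in conf.repo_id:
        for in_out in conf.in_out_pairs:
            in_len = in_out.split("-")[0]
            if f"{model_id}:{in_len}" in excludes:
                print(f"skip {model_id} at input length {in_len}")
                continue
            print(f"would run {api}: {model_id}, pair {in_out}, "
                  f"low_bit={conf.low_bit}, warm_up={conf.warm_up}, num_trials={conf.num_trials}")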
@@ -0,0 +1,19 @@
repo_id:
  - 'meta-llama/Llama-2-7b-chat-hf'
  - 'THUDM/chatglm2-6b'
  - 'THUDM/chatglm3-6b'
  - 'baichuan-inc/Baichuan2-7B-Chat'
  - 'Qwen/Qwen-7B-Chat'
local_model_hub: '/mnt/disk1/models'
warm_up: 10
num_trials: 100
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
in_out_pairs:
  - '1024-512'
  - '2048-512'
test_api:
  - "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only available now for gpu win related test_api)
exclude:
  - 'Qwen/Qwen-7B-Chat:2048'
@@ -7,8 +7,8 @@ repo_id:
  - 'baichuan-inc/Baichuan2-13B-Chat'
  - 'Qwen/Qwen-14B-Chat'
local_model_hub: '/models'
warm_up: 1
num_trials: 4
warm_up: 10
num_trials: 100
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
in_out_pairs:
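On the warm_up/num_trials change in this last hunk: with warm_up: 10 and num_trials: 100, each (model, in-out pair) now runs warm_up + num_trials = 110 generations and only the last 100 are recorded, per the "if i >= warm_up" / "if i >= warm_up+num_trials: break" bookkeeping in the benchmark loops above (assuming the prompt source supplies at least that many prompts). A trivial sanity check:

warm_up, num_trials = 10, 100
total_generations = warm_up + num_trials   # 110 runs per (model, in-out pair)
recorded_generations = num_trials          # the first 10 serve as warm-up only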