[REFINE] graphmode code (#12540)

Jun Wang, 2024-12-16 09:17:01 +08:00, committed by GitHub
parent caf15cc5ef
commit 0b953e61ef
2 changed files with 3 additions and 87 deletions


@@ -25,15 +25,11 @@ model_path = args.model_path
 dtype=torch.bfloat16
 num_labels = 5
 model_name=model_path
 save_directory = model_name + "-classification"
 # Initialize the tokenizer
-# Need padding from the left and padding to 1024
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-# tokenizer.padding_side = "left"
 tokenizer.pad_token = tokenizer.eos_token
 tokenizer.save_pretrained(save_directory)


@@ -17,6 +17,7 @@
 import torch
 import time
 import argparse
+import contextlib
 from transformers import GPT2ForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, Qwen2ForSequenceClassification
 from torch.profiler import profile, record_function, ProfilerActivity, schedule
@@ -36,12 +37,6 @@ engine = args.engine
 model_path = args.model_path
 print(f"The batch size is: {batch_size}, device is {device}")
-######################################################################################
-# PyTorch Profiling with IPEX
-# export IPEX_ZE_TRACING=1
-# export ZE_ENABLE_TRACING_LAYER=1
-import contextlib
 def profiler_setup(profiling=False, *args, **kwargs):
     if profiling:
         return torch.profiler.profile(*args, **kwargs)
@ -55,21 +50,15 @@ my_schedule = schedule(
active=1 active=1
) )
# also define a handler for outputing results # define a handler for outputing results
def trace_handler(p): def trace_handler(p):
if(device == 'xpu'): if(device == 'xpu'):
print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=20)) print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=20))
print(p.key_averages().table(sort_by="cpu_time_total", row_limit=20)) print(p.key_averages().table(sort_by="cpu_time_total", row_limit=20))
# p.export_chrome_trace("./trace_" + str(p.step_num) + ".json")
#######################################################################################
dtype = torch.bfloat16 if device == 'cpu' else torch.float16 dtype = torch.bfloat16 if device == 'cpu' else torch.float16
num_labels = 5 num_labels = 5
model_name = model_path model_name = model_path
model_name = model_name + "-classification" model_name = model_name + "-classification"
model_name_ov = model_name + "-ov" model_name_ov = model_name + "-ov"
model_name_ov = model_name_ov + "-fp16" model_name_ov = model_name_ov + "-fp16"
@@ -77,11 +66,9 @@ model_name_ov = model_name_ov + "-fp16"
 if (engine == 'ipex') :
     import torch
     import intel_extension_for_pytorch as ipex
-    # Need padding from the left and padding to 1024
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     tokenizer.padding_side = "left"
     tokenizer.pad_token = tokenizer.eos_token
     model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=dtype,
                                                                 pad_token_id=tokenizer.eos_token_id,
                                                                 low_cpu_mem_usage=True
@@ -106,122 +93,55 @@ else:
     tokenizer.pad_token = tokenizer.eos_token
     model = OVModelForSequenceClassification.from_pretrained(model_name_ov, torch_dtype=dtype).to(device)
 # Intel(R) Extension for PyTorch*
 if engine == 'ipex':
     if device == 'cpu':
-        # model = ipex.llm.optimize(model, dtype=dtype, inplace=True, deployment_mode=True)
-        # ############## TorchDynamo ###############
         model = ipex.optimize(model, dtype=torch.bfloat16, weights_prepack=False)
         model = torch.compile(model, backend='ipex')
-        # ##########################################
     else: # Intel XPU
-        #model = ipex.llm.optimize(model, dtype=dtype, device="xpu", inplace=True)
         model = ipex.optimize(model, dtype=dtype, inplace=True)
         model=torch.compile(model, backend="inductor")
     print(model)
-    # # #######calulate the total num of parameters########
-    # def model_size(model):
-    #     return sum(t.numel() for t in model.parameters())
-    # print(f"GPT2 size: {model_size(model)/1000**2:.1f}M parameters")
-    # # # #######print model information ###################
-    # print(model)
-    # ########Enable the BetterTransformer ###################
-    # only Better Transformer only support GPT2, not support Qwen2
-    # model = BetterTransformer.transform(model)
-#elif engine == 'ipex-llm':
-    # model = ipex.optimize(model, dtype=dtype, inplace=True)
-    # model=torch.compile(model) #backend="inductor")
 elif engine == 'ov':
     print("OV inference")
 prompt = ["this is the first prompt"]
 prompts = prompt * batch_size
-#print(prompts)
-# Tokenize the batch of prompts
 inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=1024, truncation=True)
-# print(inputs)
 if engine == 'ipex' or engine == 'ipex-llm':
-    #ipex need move the inputs to device, but OV doesn't need
     inputs.to(device)
-    # Initialize an empty list to store elapsed times
     elapsed_times = []
-    # Loop for batch processing 10 times and calculate the time for every loop
     with profiler_setup(profiling=enable_profile, activities=[ProfilerActivity.CPU, ProfilerActivity.XPU],
             schedule=my_schedule,
             on_trace_ready=trace_handler,
-            # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/gpt2'),
             record_shapes=True,
             with_stack=True
         ) as prof:
         for i in range(10):
             start_time = time.time()
-            # Perform inference
             with torch.inference_mode():
-                # logits = model(**inputs).logits
                 outputs = model(**inputs)
                 logits = outputs.logits
-            # Get the predicted class for each input in the batch
             predicted_class_ids = logits.argmax(dim=1).tolist()
             end_time = time.time()
             elapsed_time = end_time - start_time
-            # Save the elapsed time in the list
             elapsed_times.append(elapsed_time)
             if(enable_profile):
                 prof.step()
-        # print(outputs)
-        # print(type(outputs))
-        # print("logits.shape is " + str(logits.shape))
-        # print(logits)
-        # print(predicted_class_ids)
 elif engine == 'ov':
     print("OV inference")
-    # Initialize an empty list to store elapsed times
     elapsed_times = []
-    # Loop for batch processing 10 times and calculate the time for every loop
     for i in range(10):
         start_time = time.time()
         outputs = model(**inputs)
         logits = outputs.logits
-        # Get the predicted class for each input in the batch
         predicted_class_ids = logits.argmax(dim=1).tolist()
         end_time = time.time()
         elapsed_time = end_time - start_time
-        # Save the elapsed time in the list
         elapsed_times.append(elapsed_time)
-    # print(outputs)
-    # print(type(outputs))
-    # print("logits.shape is " + str(logits.shape))
-    # print(logits)
-    # print(predictions)
-#print(predicted_class_ids)
 # Skip the first two values and calculate the average of the remaining elapsed times
 average_elapsed_time = sum(elapsed_times[2:]) / len(elapsed_times[2:])
 classfication_per_second = batch_size/average_elapsed_time
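
Note on the with profiler_setup(...) block kept above: when profiling is disabled, profiler_setup has to hand back a do-nothing context manager so the same with statement works either way, which is presumably why this commit keeps import contextlib and only moves it up to the imports. A minimal sketch of that pattern, assuming the non-profiling branch (not visible in this diff) returns contextlib.nullcontext():

import contextlib
import torch

def profiler_setup(profiling=False, *args, **kwargs):
    # Return a real profiler only when profiling is requested
    if profiling:
        return torch.profiler.profile(*args, **kwargs)
    # Assumption: the script falls back to a no-op context manager here
    return contextlib.nullcontext()

# The same benchmark loop then runs with or without profiling enabled
with profiler_setup(profiling=False) as prof:
    _ = torch.ones(4, 4) @ torch.ones(4, 4)
    if prof is not None:
        prof.step()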
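The "graphmode" in the commit title presumably refers to the ipex.optimize plus torch.compile path that this cleanup leaves in place. A condensed sketch of that flow, with a placeholder model path and the same backends the script selects ('ipex' on CPU, 'inductor' on XPU):

import torch
import intel_extension_for_pytorch as ipex  # assumes IPEX is installed
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "./my-model-classification"  # placeholder; the script builds this from --model_path
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.bfloat16,
                                                           pad_token_id=tokenizer.eos_token_id)

# CPU graph mode: IPEX-optimized weights, then torch.compile with the IPEX backend
model = ipex.optimize(model, dtype=torch.bfloat16, weights_prepack=False)
model = torch.compile(model, backend="ipex")
# On XPU the script instead uses ipex.optimize(model, dtype=torch.float16, inplace=True)
# followed by torch.compile(model, backend="inductor")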