```
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput."""
import argparse
import csv
import json
import os
import random
import time
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import torch
import torch_npu  # noqa: F401  # registers the Ascend NPU device with torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
# Dataset classes from the local benchmark_dataset module
from benchmark_dataset import (
AIMODataset,
BurstGPTDataset,
ConversationDataset,
InstructCoderDataset,
RandomDataset,
SampleRequest,
ShareGPTDataset,
SonnetDataset,
VisionArenaDataset,
)
class BenchmarkResult:
"""封装基准测试结果"""
def __init__(self):
self.num_prompts = 0
self.elapsed_time = 0.0
self.total_prompt_tokens = 0
self.total_output_tokens = 0
self.requests_per_second = 0.0
self.tokens_per_second = 0.0
self.output_tokens_per_second = 0.0
@property
def total_tokens(self):
return self.total_prompt_tokens + self.total_output_tokens
def update(self, num_prompts: int, elapsed_time: float,
total_prompt_tokens: int, total_output_tokens: int):
self.num_prompts = num_prompts
self.elapsed_time = elapsed_time
self.total_prompt_tokens = total_prompt_tokens
self.total_output_tokens = total_output_tokens
self.requests_per_second = num_prompts / elapsed_time
self.tokens_per_second = (total_prompt_tokens + total_output_tokens) / elapsed_time
self.output_tokens_per_second = total_output_tokens / elapsed_time
def to_dict(self) -> Dict[str, Any]:
return {
"concurrency": self.num_prompts,
"elapsed_time": self.elapsed_time,
"total_prompt_tokens": self.total_prompt_tokens,
"total_output_tokens": self.total_output_tokens,
"total_tokens": self.total_tokens,
"requests_per_second": self.requests_per_second,
"tokens_per_second": self.tokens_per_second,
"output_tokens_per_second": self.output_tokens_per_second,
"avg_latency": self.elapsed_time # 平均延迟就是总耗时
}
def __str__(self) -> str:
return (f"Concurrency: {self.num_prompts}\n"
f"Throughput: {self.requests_per_second:.2f} requests/s\n"
f"Total tokens/s: {self.tokens_per_second:.2f}\n"
f"Output tokens/s: {self.output_tokens_per_second:.2f}\n"
f"Avg Latency: {self.elapsed_time:.2f}s")
def get_requests(args, tokenizer, num_prompts):
"""生成指定数量的请求样本"""
# 更新请求数量
args.num_prompts = num_prompts
# Common parameters for all dataset types.
common_kwargs = {
"dataset_path": args.dataset_path,
"random_seed": args.seed,
}
    sample_kwargs = {
        "tokenizer": tokenizer,
        # LoRA options are not exposed by this script's parser, so fall back to
        # None via getattr instead of assuming the attributes exist.
        "lora_path": getattr(args, "lora_path", None),
        "max_loras": getattr(args, "max_loras", None),
        "num_requests": num_prompts,
        "input_len": args.input_len,
        "output_len": args.output_len,
    }
if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len
dataset_cls = RandomDataset
elif args.dataset_name == "sharegpt":
dataset_cls = ShareGPTDataset
if args.backend == "vllm-chat":
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset."
)
dataset_cls = SonnetDataset
sample_kwargs["prefix_len"] = args.prefix_len
sample_kwargs["return_prompt_formatted"] = True
elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf":
common_kwargs["no_stream"] = args.no_stream
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset
common_kwargs["dataset_subset"] = None
common_kwargs["dataset_split"] = "train"
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = InstructCoderDataset
common_kwargs["dataset_split"] = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = ConversationDataset
common_kwargs["dataset_subset"] = args.hf_subset
common_kwargs["dataset_split"] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
common_kwargs["dataset_subset"] = None
common_kwargs["dataset_split"] = "train"
else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values
sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
def run_hf_benchmark(
requests: List[SampleRequest],
model_name: str,
tokenizer: PreTrainedTokenizerBase,
max_batch_size: int,
trust_remote_code: bool,
disable_detokenize: bool = False
) -> BenchmarkResult:
"""运行HF模型基准测试并返回性能指标"""
result = BenchmarkResult()
    # Select the device (Ascend NPU if available) and load the model
    device = torch.device("npu:0" if torch.npu.is_available() else "cpu")
    if device.type == "npu":
        torch.npu.set_device(device)
    # Wrap the run so device memory can be released if generation runs out of memory
try:
llm = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
trust_remote_code=trust_remote_code
).to(device)
        if tokenizer.pad_token is None:
            # Batched generation needs padding; reuse the EOS token as pad token
            tokenizer.pad_token = tokenizer.eos_token
        # Progress bar over all requests
        pbar = tqdm(total=len(requests), desc="Processing requests")
        # Start timing
        start_time = time.perf_counter()
        # Process the requests in dynamically sized batches
batch: List[str] = []
max_prompt_len = 0
max_output_len = 0
total_output_tokens = 0
total_prompt_tokens = 0
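        # Dynamic batching: keep adding prompts until the batch reaches
        # max_batch_size or the batch's worst-case prompt length plus
        # generation length would exceed a 2048-token budget, then generate.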
for i in range(len(requests)):
prompt = requests[i].prompt
prompt_len = requests[i].prompt_len
output_len = requests[i].expected_output_len
            # Add the prompt to the current batch
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
            # Check whether the next request still fits in the current batch
if len(batch) < max_batch_size and i != len(requests) - 1:
next_prompt_len = requests[i + 1].prompt_len
next_output_len = requests[i + 1].expected_output_len
if (max(max_prompt_len, next_prompt_len) +
max(max_output_len, next_output_len)) <= 2048:
continue
            # Tokenize the whole batch with padding
            inputs = tokenizer(batch, return_tensors="pt", padding=True)
            # Move the input tensors to the device
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)
torch.npu.synchronize()
            # Generate completions for the whole batch
llm_outputs = llm.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=max_output_len,
do_sample=True,
temperature=1.0,
top_p=1.0,
pad_token_id=tokenizer.eos_token_id
)
            # Update token counts; the attention mask excludes padding from the prompt count
            total_prompt_tokens += int(attention_mask.sum().item())
            if not disable_detokenize:
                # Include detokenization time in the measurement
                tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
            # Count newly generated tokens (generate() returns the padded prompt plus new tokens)
            total_output_tokens += sum(len(ids) - len(input_ids[i])
for i, ids in enumerate(llm_outputs))
pbar.update(len(batch))
            # Reset the per-batch state
batch = []
max_prompt_len = 0
max_output_len = 0
        # Stop timing
end_time = time.perf_counter()
elapsed_time = end_time - start_time
pbar.close()
        # Record the aggregated metrics
result.update(len(requests), elapsed_time,
total_prompt_tokens, total_output_tokens)
except RuntimeError as e:
if "out of memory" in str(e).lower():
print(f"Out of memory (OOM) detected with {len(requests)} requests")
result.num_prompts = len(requests)
result.elapsed_time = 0
            raise  # Re-raise so the caller can record the OOM
else:
raise
finally:
        # Release model memory
if 'llm' in locals():
del llm
torch.npu.empty_cache()
return result
def run_concurrency_range(args: argparse.Namespace) -> List[Dict[str, Any]]:
"""测试并发范围并返回结果列表"""
print(f"\n🔍 Starting concurrency range testing: {args.start_concurrency} to {args.end_concurrency}, step {args.step_size}\n")
results = []
    # Initialise the NPU
if torch.npu.is_available():
torch.npu.init()
torch.npu.set_device(0)
else:
print("Warning: NPU not available, using CPU")
    # Build the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer,
trust_remote_code=True,
local_files_only=True
)
tokenizer.padding_side = "left"
    # Make sure the output directory exists (dirname is empty for a bare filename)
    out_dir = os.path.dirname(args.output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Initialise the CSV file with a header row ("status" flags OOM/error rows)
    with open(args.output_csv, "w", newline="") as csvfile:
        fieldnames = [
            "concurrency", "elapsed_time", "total_prompt_tokens",
            "total_output_tokens", "total_tokens", "requests_per_second",
            "tokens_per_second", "output_tokens_per_second", "avg_latency",
            "status",
        ]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
    # Sweep the concurrency range
concurrency_values = range(
args.start_concurrency,
args.end_concurrency + 1,
args.step_size
)
for concurrency in tqdm(concurrency_values, desc="Testing concurrency levels"):
        # Free cached device memory between levels
torch.npu.empty_cache()
result_dict = {"concurrency": concurrency}
print(f"\nTesting concurrency: {concurrency}")
try:
            # Build the requests for this concurrency level
requests = get_requests(args, tokenizer, concurrency)
            # Run the benchmark
result = run_hf_benchmark(
requests,
args.model,
tokenizer,
args.hf_max_batch_size,
args.trust_remote_code,
args.disable_detokenize
)
            # Record the metrics
result_dict = result.to_dict()
print(f"✔ Success | Throughput: {result.requests_per_second:.2f} req/s | Latency: {result.elapsed_time:.2f}s")
except RuntimeError as e:
if "out of memory" in str(e).lower():
result_dict["status"] = "OOM"
print(f"✘ OOM Error")
else:
result_dict["status"] = "Error"
print(f"✘ Error: {str(e)}")
        # Append the row to the CSV immediately so partial results survive a crash
with open(args.output_csv, "a", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writerow(result_dict)
        # Keep the row for the JSON summary
results.append(result_dict)
    # Release the tokenizer and cached device memory
del tokenizer
torch.npu.empty_cache()
return results
def plot_results(csv_path: str):
"""从CSV文件读取结果并绘制图表"""
concurrency = []
throughput = []
latency = []
tokens_per_sec = []
    # Read the CSV, skipping rows recorded as OOM/Error
with open(csv_path, "r") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if row.get("status") == "OOM" or row.get("status") == "Error":
continue
concurrency.append(int(row["concurrency"]))
throughput.append(float(row["requests_per_second"]))
latency.append(float(row["avg_latency"]))
tokens_per_sec.append(float(row["tokens_per_second"]))
if not concurrency:
print("⚠ No valid data to plot")
return
    # Plot styling
plt.style.use("ggplot")
    # Two stacked panels: throughput and latency
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
    # Throughput panel
ax1.plot(concurrency, throughput, "bo-", linewidth=2, markersize=8)
ax1.set_title("Throughput vs Concurrency")
ax1.set_xlabel("Concurrency Level")
ax1.set_ylabel("Throughput (requests/second)")
ax1.grid(True, linestyle="--", alpha=0.7)
    # Latency panel
ax2.plot(concurrency, latency, "ro-", linewidth=2, markersize=8)
ax2.set_title("Latency vs Concurrency")
ax2.set_xlabel("Concurrency Level")
ax2.set_ylabel("Average Latency (seconds)")
ax2.grid(True, linestyle="--", alpha=0.7)
    # Layout
plt.tight_layout()
    # Save the figure next to the CSV
png_path = csv_path.replace(".csv", "_results.png")
plt.savefig(png_path, dpi=300)
plt.close()
print(f"\n📊 Performance charts saved to: {png_path}")
def run_standard_benchmark(args: argparse.Namespace) -> Dict[str, Any]:
"""运行标准基准测试"""
# 创建tokenizer
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer,
trust_remote_code=True,
local_files_only=True
)
tokenizer.padding_side = "left"
    # Build the requests
requests = get_requests(args, tokenizer, args.num_prompts)
    # Run the benchmark
result = run_hf_benchmark(
requests,
args.model,
tokenizer,
args.hf_max_batch_size,
args.trust_remote_code,
args.disable_detokenize
)
    # Print a summary
print(f"\n📊 Benchmark Results (Concurrency: {args.num_prompts})")
print(f" Throughput: {result.requests_per_second:.2f} requests/s")
print(f" Total tokens/s: {result.tokens_per_second:.2f}")
print(f" Output tokens/s: {result.output_tokens_per_second:.2f}")
print(f" Average Latency: {result.elapsed_time:.2f} seconds")
return result.to_dict()
def validate_args(args: argparse.Namespace):
"""验证命令行参数"""
# 后端验证
valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
if args.backend not in valid_backends:
raise ValueError(f"Unsupported backend: {args.backend}")
    # Dataset configuration: the parser registers --dataset-name (args.dataset_name),
    # so read that attribute rather than a non-existent args.dataset.
    if not args.dataset_name and not args.dataset_path:
        print("Using random dataset by default")
        args.dataset_name = "random"
    if args.dataset_name == "random" and args.input_len is None:
        raise ValueError("input_len must be provided for a random dataset")
    # Default the tokenizer path to the model path
if not getattr(args, "tokenizer", None):
args.tokenizer = args.model
    # Concurrency range validation
if args.start_concurrency < 1:
raise ValueError("Start concurrency must be at least 1")
if args.end_concurrency < args.start_concurrency:
raise ValueError("End concurrency must be greater than or equal to start concurrency")
if args.step_size < 1:
raise ValueError("Step size must be at least 1")
def main(args: argparse.Namespace):
validate_args(args)
    # Seed the RNG for reproducibility
if args.seed is None:
args.seed = 0
random.seed(args.seed)
    # Run the selected benchmark mode
if args.concurrency_range:
results = run_concurrency_range(args)
print("\n✅ Concurrency range testing completed")
        # Plot the performance charts
if args.plot_results:
plot_results(args.output_csv)
else:
results = run_standard_benchmark(args)
    # Write the JSON summary
if args.output_json:
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
print(f"\n📁 Results saved to {args.output_json}")
        # Also save the single-run result in the PyTorch benchmark JSON format
if not args.concurrency_range:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
with open(pt_file, "w") as f:
json.dump([results], f, indent=4)
print(f"📝 PyTorch benchmark format saved to {pt_file}")
def create_argument_parser():
"""创建命令行参数解析器"""
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
    # Benchmark configuration
parser.add_argument(
"--concurrency-range",
action="store_true",
help="Test a range of concurrency levels"
)
parser.add_argument(
"--start-concurrency",
type=int,
default=1,
help="Starting concurrency level for range testing",
)
parser.add_argument(
"--end-concurrency",
type=int,
default=16,
help="Ending concurrency level for range testing",
)
parser.add_argument(
"--step-size",
type=int,
default=1,
help="Step size for concurrency level increases",
)
parser.add_argument(
"--output-csv",
type=str,
default="concurrency_results.csv",
help="Path to save concurrency test results in CSV format",
)
    parser.add_argument(
        "--plot-results",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Plot performance graphs after range testing (disable with --no-plot-results)",
    )
    parser.add_argument(
        "--backend",
        type=str,
        choices=["vllm", "hf", "mii", "vllm-chat"],
        default="hf",
        help="Inference backend; this script only runs the hf (transformers) backend.",
    )
parser.add_argument(
"--model",
type=str,
required=True,
help="Path to the model or Hugging Face model ID",
)
parser.add_argument(
"--tokenizer",
type=str,
default=None,
help="Path to the tokenizer (defaults to model path if not provided)",
)
    parser.add_argument(
        "--dataset-name",
        type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
        default="random",
        help="Name of the dataset to benchmark on.",
    )
parser.add_argument(
"--dataset-path",
type=str,
default=None,
help="Path to the dataset",
)
parser.add_argument(
"--input-len",
type=int,
default=256,
help="Input prompt length for each request",
)
parser.add_argument(
"--output-len",
type=int,
default=256,
help="Output length for each request",
)
parser.add_argument(
"--num-prompts",
type=int,
default=100,
help="Number of prompts to process.",
)
parser.add_argument(
"--hf-max-batch-size",
type=int,
default=4,
help="Maximum batch size for HF backend.",
)
parser.add_argument(
"--output-json",
type=str,
default=None,
help="Path to save the throughput results in JSON format.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=(
"Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"
),
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Random seed for reproducibility",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="Trust remote code when loading models from Hugging Face Hub",
)
    # Dataset-specific options
parser.add_argument(
"--prefix-len",
type=int,
default=None,
help="Number of prefix tokens for RandomDataset and SonnetDataset"
)
parser.add_argument(
"--random-range-ratio",
type=float,
default=None,
help="Range ratio for sampling input/output length in RandomDataset"
)
parser.add_argument(
"--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset"
)
parser.add_argument(
"--hf-split",
type=str,
default=None,
help="Split of the HF dataset"
)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)
```
Error:
Traceback (most recent call last):
File "/models/z50051264/vllm-0.10.0/benchmarks/benchmark_modify.py", line 647, in <module>
main(args)
File "/models/z50051264/vllm-0.10.0/benchmarks/benchmark_modify.py", line 464, in main
validate_args(args)
File "/models/z50051264/vllm-0.10.0/benchmarks/benchmark_modify.py", line 445, in validate_args
if not args.dataset and not args.dataset_path:
AttributeError: 'Namespace' object has no attribute 'dataset'
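The traceback's line numbers (647 / 464 / 445) do not match the listing above, so the failing file is presumably a longer variant, but the cause of the `AttributeError` looks like the same mismatch addressed in the revision: `validate_args` reads `args.dataset` while the parser registers the flag under the destination `dataset_name`, so the parsed `Namespace` never gains a `dataset` attribute. A minimal, self-contained sketch of the failure and the guard; the flag names mirror the revised parser and are an assumption about the original file:
```
import argparse

# Hypothetical minimal parser: the flag's destination is "dataset_name",
# so the resulting Namespace has no "dataset" attribute at all.
parser = argparse.ArgumentParser()
parser.add_argument("--dataset-name", type=str, default="random")
parser.add_argument("--dataset-path", type=str, default=None)
args = parser.parse_args([])

# args.dataset would raise AttributeError here. Read the attribute the parser
# actually sets, or fall back with getattr to tolerate either spelling.
dataset_name = getattr(args, "dataset_name", None) or getattr(args, "dataset", None)
if not dataset_name and not args.dataset_path:
    dataset_name = "random"
print(dataset_name)  # -> "random"
```
An equivalent fix is to keep a `--dataset` flag but pass `dest="dataset_name"` to `add_argument`, so the parser and `validate_args` agree on a single attribute name.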