环境:
2张A6000的GPU
vllm:0.5.0
修改vllm 0.5.0的cli_args.py内容:
路径:minconda3/envs/python31013new/lib/python3.10/site-packages/vllm/entrypoints/openai/cli_args.py
修改前代码:
def __call__(self, parser, namespace, values, option_string=None):
    # argparse.Action hook: parse each "name=path" item into a LoRAModulePath
    # and store the resulting list on the namespace.
    # NOTE(review): this is the original (pre-fix) version kept for reference.
    # Two known defects, addressed by the patched version below:
    #   1. lora_list is rebuilt from scratch on every invocation, so a
    #      repeated --lora-modules flag OVERWRITES earlier values instead of
    #      accumulating them.
    #   2. a malformed item (no '=', or more than one '=') raises an
    #      unhandled ValueError from the tuple unpacking.
    lora_list = []
    for item in values:
        name, path = item.split('=')
        lora_list.append(LoRAModulePath(name, path))
    setattr(namespace, self.dest, lora_list)
修改后代码:
def __call__(self, parser, namespace, values, option_string=None):
    """Accumulate ``name=path`` LoRA module specs onto ``namespace.<dest>``.

    Unlike the stock vLLM 0.5.0 action, this appends to any list already
    stored on the namespace, so a repeated ``--lora-modules`` flag ADDS
    adapters instead of overwriting the ones parsed earlier.

    Raises:
        argparse.ArgumentError: if an item contains no ``'='`` separator.
    """
    # Create the destination list once; keep entries from earlier flags.
    if getattr(namespace, self.dest, None) is None:
        setattr(namespace, self.dest, [])
    lora_list = getattr(namespace, self.dest)
    for item in values:
        try:
            # Split on the FIRST '=' only, so adapter paths that themselves
            # contain '=' are still accepted (name is everything before it).
            name, path = item.split('=', 1)
        except ValueError as e:
            raise argparse.ArgumentError(
                self, f"Invalid format for {option_string}: {e}")
        lora_list.append(LoRAModulePath(name, path))
    setattr(namespace, self.dest, lora_list)
发布/启动脚本代码如下:
# Base model weights; override via the MODEL_PATH environment variable.
MODEL_PATH = os.environ.get('MODEL_PATH', '/app/models/glm-4-9b-chat')
#MODEL_PATH = '/work/finetune_demo/output/checkpoint-3000/'
TOKENIZER_DIR = os.environ.get('TOKENIZER_DIR', '/app/models/glm-4-9b-chat')
# Multi-LoRA configuration: each entry is a "name=path" spec.
LORA_List = [
    # "leader speech" LoRA adapter
    "ldjh=/data/dms/LLaMA-Factory-0.9.1/saves/GLM-4-9B-Chat/lora/ldjh_20241216",
    # summarization LoRA adapter
    "zy=/data/dms/LLaMA-Factory-0.9.1/saves/GLM-4-9B-Chat/lora/zy_train_20241217",
]
# Space-joined form of the specs (kept for compatibility; the cmd below
# passes each spec individually, this variable is not used there).
LORA_MODEL_PATH = ' '.join(LORA_List)
# KV-cache token block size; vLLM accepts 8, 16 or 32.
BLOCK_SIZE = 32
# Model name exposed by the OpenAI-compatible API.
MODEL_NAME = "glm-4"
cmd = [
    "python", "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL_PATH,
    "--served-model-name", MODEL_NAME,
    "--trust-remote-code",
    "--enforce-eager",
    # Enable serving of LoRA adapters.
    "--enable-lora",
    # Emit one "--lora-modules name=path" pair per adapter. This relies on
    # the patched cli_args.py action above so that repeated flags accumulate
    # instead of overwriting each other.
    *[arg for spec in LORA_List for arg in ("--lora-modules", spec)],
    "--max-model-len", "65528",
    # Set to the number of GPUs available (2x A6000 here).
    "--tensor-parallel-size", "2",
    # Fraction of GPU memory to use (default 0.9). Example: to use 24G of an
    # 80G card, set 24/80 = 0.3.
    "--gpu-memory-utilization", "0.8",
    "--dtype", "float16",
    # Use the constant instead of a duplicated magic "32".
    "--block-size", str(BLOCK_SIZE),
    "--port", "8001",
    "--host", str(API_SERVER["host"]),
]
# Launch the vLLM OpenAI-compatible API server as a child process.
process = subprocess.Popen(cmd)
# Block until the server process exits (optional).
process.wait()