一、Create .mar Files
Generate the .mar file:
# Notes:
#   --model-file is only needed for an eager-mode model; a TorchScript (traced/scripted) .pt does not require it.
#   --handler can be a built-in handler name or the path to a custom handler file (the one below).
#   setup_config.json is read by the handler, so package it together with index_to_name.json.
torch-model-archiver \
    --model-name <your_model_name> \
    --version 1.0 \
    --model-file <your_model_file>.py \
    --serialized-file <your_model_name>.pt \
    --handler <your_handler>.py \
    --extra-files "./index_to_name.json,./setup_config.json"

mkdir model_store
mv <your_model_name>.mar model_store/
Dependency 1: save the model as a TorchScript .pt file
# Convert the model to TorchScript; this file is passed to --serialized-file above.
model_jit_script = torch.jit.script(model)
model_jit_script.save('distil_bert.pt')
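As an alternative to torch.jit.script, the model can also be traced. The following is a minimal sketch (not the original code of this post), assuming the hfl/chinese-roberta-wwm-ext checkpoint and the max_length of 32 from setup_config.json below; the thin wrapper returns bare logits so the handler's softmax call works unchanged:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_NAME = "hfl/chinese-roberta-wwm-ext"  # assumption: same checkpoint as setup_config.json
MAX_LENGTH = 32                             # assumption: same max_length as setup_config.json

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# torchscript=True makes the model return plain tuples, which tracing requires.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, torchscript=True
)
model.eval()

class LogitsOnly(torch.nn.Module):
    """Wrap the HF model so the traced module returns only the logits tensor."""
    def __init__(self, hf_model):
        super().__init__()
        self.hf_model = hf_model

    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.hf_model(input_ids, attention_mask, token_type_ids)[0]

# One dummy sentence pair, padded to the length the handler will use.
dummy = tokenizer(
    "今天天气很好", "适合出去散步",
    max_length=MAX_LENGTH, padding="max_length", truncation=True, return_tensors="pt",
)

traced = torch.jit.trace(
    LogitsOnly(model).eval(),
    (dummy["input_ids"], dummy["attention_mask"], dummy["token_type_ids"]),
)
traced.save("distil_bert.pt")  # the file passed to --serialized-file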
Dependency 2: the handler file
from abc import ABC
import json
import time
import logging
import os

import torch
import transformers
from transformers import AutoTokenizer

from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
logger.info("Transformers version %s", transformers.__version__)
class RobertaHandler(BaseHandler, ABC):
    """
    Transformers handler class for sequence classification with a RoBERTa-style model.
    """

    def __init__(self):
        super(RobertaHandler, self).__init__()
        self.initialized = False
    def initialize(self, ctx):
        """Load the serialized model, the tokenizer and setup_config.json.
        Args:
            ctx (context): a JSON object containing information
                pertaining to the model artefacts and their parameters.
        """
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        model_pt_path = None
        if "serializedFile" in self.manifest["model"]:
            serialized_file = self.manifest["model"]["serializedFile"]
            model_pt_path = os.path.join(model_dir, serialized_file)
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() and properties.get("gpu_id") is not None
            else "cpu"
        )

        # Read the mode, model_name, etc. from setup_config.json (packaged via --extra-files).
        setup_config_path = os.path.join(model_dir, "setup_config.json")
        if os.path.isfile(setup_config_path):
            with open(setup_config_path) as setup_config_file:
                self.setup_config = json.load(setup_config_file)
        else:
            logger.warning("Missing the setup_config.json file.")

        # Load the model: eager mode if a model file was archived, otherwise TorchScript.
        model_file = self.manifest["model"].get("modelFile", "")
        if model_file:
            logger.debug("Loading eager model")
            self.model = self._load_pickled_model(model_dir, model_file, model_pt_path)
            self.model.to(self.device)
        else:
            logger.debug("Loading torchscript model")
            if not os.path.isfile(model_pt_path):
                raise RuntimeError("Missing the model.pt file")
            self.model = self._load_torchscript_model(model_pt_path)
            if self.setup_config["model_parallel"]:
                self.model.parallelize()
            else:
                self.model.to(self.device)

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.setup_config["model_name"], do_lower_case=self.setup_config["do_lower_case"]
        )
        self.max_length = int(self.setup_config["max_length"])
        self.model.eval()
        logger.info("Transformer model from path %s loaded successfully", model_pt_path)

        # Read the mapping file, index to label name.
        mapping_file_path = os.path.join(model_dir, "index_to_name.json")
        if os.path.isfile(mapping_file_path):
            with open(mapping_file_path) as f:
                self.mapping = json.load(f)
        else:
            logger.warning("Missing the index_to_name.json file.")

        self.initialized = True
    def preprocess(self, requests):
        """Basic text preprocessing, based on the user's choice of application mode.
        TorchServe may group several requests into one call (see batchSize in
        config.properties), so `requests` is a list.
        Args:
            requests (list): each item carries the request body under "data" or "body";
                the body contains one tab-separated sentence pair per line.
        Returns:
            tuple: (input_ids, attention_mask, token_type_ids) batch tensors.
        """
        input_ids_batch = None
        attention_mask_batch = None
        token_type_ids_batch = None
        for idx, data in enumerate(requests):
            input_text = data.get("data")
            if input_text is None:
                input_text = data.get("body")
            if isinstance(input_text, (bytes, bytearray)):
                input_text = input_text.decode("utf-8")
            # One tab-separated sentence pair per non-empty line.
            input_text = [sentence.split('\t')[:2] for sentence in input_text.strip().split('\n') if sentence]
            inputs = self.tokenizer.batch_encode_plus(
                input_text,
                max_length=self.max_length,
                pad_to_max_length=True,  # padding="max_length" in newer transformers versions
                add_special_tokens=True,
                return_tensors="pt",
            )
            input_ids = inputs["input_ids"].to(self.device)
            attention_mask = inputs["attention_mask"].to(self.device)
            token_type_ids = inputs["token_type_ids"].to(self.device)
            # Concatenate the received requests into one batch; the attention mask
            # covers the padded positions.
            if input_ids.shape is not None:
                if input_ids_batch is None:
                    input_ids_batch = input_ids
                    attention_mask_batch = attention_mask
                    token_type_ids_batch = token_type_ids
                else:
                    input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
                    attention_mask_batch = torch.cat(
                        (attention_mask_batch, attention_mask), 0
                    )
                    token_type_ids_batch = torch.cat(
                        (token_type_ids_batch, token_type_ids), 0
                    )
        return (input_ids_batch, attention_mask_batch, token_type_ids_batch)
    def inference(self, input_batch):
        """Predict the class (or classes) of the received text using the
        serialized transformers checkpoint.
        Args:
            input_batch (tuple): tensors produced by the preprocess function.
        Returns:
            list: predicted values for the input text.
        """
        input_ids_batch, attention_mask_batch, token_type_ids_batch = input_batch
        inferences = []
        # Handle inference for sequence_classification.
        if self.setup_config["mode"] == "sequence_classification":
            inferences_dict = [{"probabilities": []}]
            with torch.no_grad():
                # The model returns logits of shape [batch_size, num_labels].
                predictions = self.model(
                    input_ids=input_ids_batch,
                    attention_mask=attention_mask_batch,
                    token_type_ids=token_type_ids_batch,
                )
                # Keep the probability of the last (positive) class for each example.
                probabilities = torch.nn.functional.softmax(predictions, dim=-1)[:, -1]
                inferences_dict[0]["probabilities"].extend(probabilities.tolist())
            inferences.append(inferences_dict)
        # Note: TorchServe expects one response entry per request in the batch; this
        # handler returns a single entry, so it assumes one request per handler call.
        return inferences
    def postprocess(self, inference_output):
        """Convert the predicted response into a TorchServe-readable format.
        Args:
            inference_output (list): the predicted response for the input text.
        Returns:
            list: the predictions.
        """
        return inference_output
    def handle(self, data, context):
        """
        Invoked by TorchServe for a prediction request.
        Pre-processes the data, runs the model, and post-processes the prediction output.
        :param data: input data for prediction
        :param context: initial context containing model server system properties
        :return: prediction output
        """
        start_time = time.time()
        model_input = self.preprocess(data)
        # All timings below are cumulative milliseconds measured from start_time.
        logger.info("preprocess_time: %s", 1000 * (time.time() - start_time))
        if not self._is_explain():
            inference_output = self.inference(model_input)
            logger.info("inference_time: %s", 1000 * (time.time() - start_time))
            inference_output = self.postprocess(inference_output)
            logger.info("postprocess: %s", 1000 * (time.time() - start_time))
            logger.info("inference_output: %d", len(inference_output))
            logger.info("inference_output: %s", inference_output)
        else:
            inference_output = self.explain_handle(model_input, data)
        return inference_output
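The preprocess above splits each request body on newlines and then on tabs, so a request carries one tab-separated sentence pair per line. A small sketch (with made-up sentences) that writes such a payload to the sample.txt used for the prediction request in section 三:

# Write an example request payload: one tab-separated sentence pair per line.
pairs = [
    ("今天天气很好", "适合出去散步"),
    ("这家餐厅的服务", "态度非常糟糕"),
]
with open("sample.txt", "w", encoding="utf-8") as f:
    f.write("\n".join("\t".join(p) for p in pairs))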
Dependency 3: other extra files
# index_to_name.json
{
    "0": "0",
    "1": "1"
}
# setup_config.json
{
    "model_name": "hfl/chinese-roberta-wwm-ext",
    "mode": "sequence_classification",
    "do_lower_case": true,
    "num_labels": "2",
    "max_length": "32",
    "model_parallel": false,
    "threshold_value": 0.5
}
二、Start TorchServe
torchserve --start --ncs --model-store model_store --ts-config config.properties
# Or register a model at startup:
# torchserve --start --ncs --model-store model_store --models wj-first-model=wj-first-model.mar --ts-config config.properties
Dependency 1: config.properties
inference_address=http://172.20.145.107:8080
management_address=http://172.20.145.107:8081
metrics_address=http://172.20.145.107:8082
NUM_WORKERS=1
number_of_gpu=0
number_of_netty_threads=0
job_queue_size=1000
model_store=/home/model-server/shared/model-store
enable_envvars_config=true
models={ \
"wj-first-model": { \
"1.0": { \
"defaultVersion": true, \
"marName": "wj-first-model.mar", \
"minWorkers": 1, \
"maxWorkers": 1, \
"batchSize": 4, \
"maxBatchDelay": 100, \
"responseTimeout": 120 \
} \
}, \
"wj-distill-model": { \
"1.0": { \
"defaultVersion": true, \
"marName": "wj-distill-model.mar", \
"minWorkers": 32, \
"maxWorkers": 32, \
"batchSize": 50, \
"maxBatchDelay": 100, \
"responseTimeout": 120 \
} \
} \
}
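Once TorchServe is up with this configuration, the inference port can be sanity-checked with the built-in ping endpoint. A minimal sketch, assuming the inference_address above and that the Python requests package is installed:

import requests

# Health check against the TorchServe inference API.
resp = requests.get("http://172.20.145.107:8080/ping")
print(resp.json())  # {"status": "Healthy"} once the workers are up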
三、Register and Manage Models
<!-- Register a model -->
curl -X POST "http://172.20.145.107:8081/models?url=wj-first-model.mar"
<!-- List the registered models -->
curl "http://localhost:8081/models"
<!-- Scale the number of workers for a model -->
curl -v -X PUT "http://172.20.145.107:8081/models/wj-first-model?min_worker=1&max_worker=1&synchronous=true"
<!-- Describe a model -->
curl "http://localhost:8081/models/wj-first-model"
<!-- Send a prediction request (a Python equivalent is sketched below) -->
curl -X POST http://127.0.0.1:8080/predictions/wj-first-model -T sample.txt
<!-- Unregister (delete) a model -->
curl -X DELETE http://127.0.0.1:8081/models/wj-first-model/
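A Python equivalent of the prediction request above; a minimal sketch assuming the inference address from config.properties, the wj-first-model registration name, and the sample.txt written earlier:

import requests  # assumption: the requests package is installed

URL = "http://172.20.145.107:8080/predictions/wj-first-model"

# sample.txt contains one tab-separated sentence pair per line.
with open("sample.txt", "rb") as f:
    resp = requests.post(URL, data=f.read())

# With the handler above, the response body looks roughly like:
#   [{"probabilities": [0.93, 0.07]}]
print(resp.status_code, resp.json())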
Environment dependencies: torchserve, torch-model-archiver, JDK 11
1、https://medium.com/analytics-vidhya/deploy-huggingface-s-bert-to-production-with-pytorch-serve-27b068026d18
2、https://github.com/pytorch/serve/tree/master/examples/Huggingface_Transformers
3、https://pytorch.org/serve/management_api.html#register-a-model
4、https://zhuanlan.zhihu.com/p/361782496