一、Create .mar Files
Generate the .mar file:
# Notes:
#   --model-file is only needed for an eager-mode model; a TorchScript (traced/scripted) .pt does not require it.
#   --handler can be a built-in handler name or the path to a custom handler file (the one below).
#   setup_config.json is read by the handler, so package it together with index_to_name.json.
torch-model-archiver \
    --model-name <your_model_name> \
    --version 1.0 \
    --model-file <your_model_file>.py \
    --serialized-file <your_model_name>.pt \
    --handler <your_handler>.py \
    --extra-files "./index_to_name.json,./setup_config.json"

mkdir model_store
mv <your_model_name>.mar model_store/
Dependency 1: save the model as a TorchScript .pt file
# Convert the model to TorchScript; this file is passed to --serialized-file above.
model_jit_script = torch.jit.script(model)
model_jit_script.save('distil_bert.pt')
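As an alternative to torch.jit.script, the model can also be traced. The following is a minimal sketch (not the original code of this post), assuming the hfl/chinese-roberta-wwm-ext checkpoint and the max_length of 32 from setup_config.json below; the thin wrapper returns bare logits so the handler's softmax call works unchanged:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_NAME = "hfl/chinese-roberta-wwm-ext"  # assumption: same checkpoint as setup_config.json
MAX_LENGTH = 32                             # assumption: same max_length as setup_config.json

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# torchscript=True makes the model return plain tuples, which tracing requires.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, torchscript=True
)
model.eval()

class LogitsOnly(torch.nn.Module):
    """Wrap the HF model so the traced module returns only the logits tensor."""
    def __init__(self, hf_model):
        super().__init__()
        self.hf_model = hf_model

    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.hf_model(input_ids, attention_mask, token_type_ids)[0]

# One dummy sentence pair, padded to the length the handler will use.
dummy = tokenizer(
    "今天天气很好", "适合出去散步",
    max_length=MAX_LENGTH, padding="max_length", truncation=True, return_tensors="pt",
)

traced = torch.jit.trace(
    LogitsOnly(model).eval(),
    (dummy["input_ids"], dummy["attention_mask"], dummy["token_type_ids"]),
)
traced.save("distil_bert.pt")  # the file passed to --serialized-file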
Dependency 2: the handler file
from abc import ABC
import json
import time
import logging
import os

import torch
import transformers
from transformers import AutoTokenizer

from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
logger.info("Transformers version %s", transformers.__version__)
class RobertaHandler(BaseHandler, ABC):
    """
    Transformers handler class for sequence classification with a RoBERTa-style model.
    """

    def __init__(self):
        super(RobertaHandler, self).__init__()
        self.initialized = False
    def initialize(self, ctx):
        """Load the serialized model, the tokenizer and setup_config.json.
        Args:
            ctx (context): a JSON object containing information
                pertaining to the model artefacts and their parameters.
        """
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        model_pt_path = None
        if "serializedFile" in self.manifest["model"]:
            serialized_file = self.manifest["model"]["serializedFile"]
            model_pt_path = os.path.join(model_dir, serialized_file)
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() and properties.get("gpu_id") is not None
            else "cpu"
        )

        # Read the mode, model_name, etc. from setup_config.json (packaged via --extra-files).
        setup_config_path = os.path.join(model_dir, "setup_config.json")
        if os.path.isfile(setup_config_path):
            with open(setup_config_path) as setup_config_file:
                self.setup_config = json.load(setup_config_file)
        else:
            logger.warning("Missing the setup_config.json file.")

        # Load the model: eager mode if a model file was archived, otherwise TorchScript.
        model_file = self.manifest["model"].get("modelFile", "")
        if model_file:
            logger.debug("Loading eager model")
            self.model = self._load_pickled_model(model_dir, model_file, model_pt_path)
            self.model.to(self.device)
        else:
            logger.debug("Loading torchscript model")
            if not os.path.isfile(model_pt_path):
                raise RuntimeError("Missing the model.pt file")
            self.model = self._load_torchscript_model(model_pt_path)
            if self.setup_config["model_parallel"]:
                self.model.parallelize()
            else:
                self.model.to(self.device)

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.setup_config["model_name"], do_lower_case=self.setup_config["do_lower_case"]
        )
        self.max_length = int(self.setup_config["max_length"])
        self.model.eval()
        logger.info("Transformer model from path %s loaded successfully", model_pt_path)

        # Read the mapping file, index to label name.
        mapping_file_path = os.path.join(model_dir, "index_to_name.json")
        if os.path.isfile(mapping_file_path):
            with open(mapping_file_path) as f:
                self.mapping = json.load(f)
        else:
            logger.warning("Missing the index_to_name.json file.")

        self.initialized = True
    def preprocess(self, requests):
        """Basic text preprocessing, based on the user's choice of application mode.
        TorchServe may group several requests into one call (see batchSize in
        config.properties), so `requests` is a list.
        Args:
            requests (list): each item carries the request body under "data" or "body";
                the body contains one tab-separated sentence pair per line.
        Returns:
            tuple: (input_ids, attention_mask, token_type_ids) batch tensors.
        """
        input_ids_batch = None
        attention_mask_batch = None
        token_type_ids_batch = None
        for idx, data in enumerate(requests):
            input_text = data.get("data")
            if input_text is None:
                input_text = data.get("body")
            if isinstance(input_text, (bytes, bytearray)):
                input_text = input_text.decode("utf-8")
            # One tab-separated sentence pair per non-empty line.
            input_text = [sentence.split('\t')[:2] for sentence in input_text.strip().split('\n') if sentence]
            inputs = self.tokenizer.batch_encode_plus(
                input_text,
                max_length=self.max_length,
                pad_to_max_length=True,  # padding="max_length" in newer transformers versions
                add_special_tokens=True,
                return_tensors="pt",
            )
            input_ids = inputs["input_ids"].to(self.device)
            attention_mask = inputs["attention_mask"].to(self.device)
            token_type_ids = inputs["token_type_ids"].to(self.device)
            # Concatenate the received requests into one batch; the attention mask
            # covers the padded positions.
            if input_ids.shape is not None:
                if input_ids_batch is None:
                    input_ids_batch = input_ids
                    attention_mask_batch = attention_mask
                    token_type_ids_batch = token_type_ids
                else:
                    input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
                    attention_mask_batch = torch.cat(
                        (attention_mask_batch, attention_mask), 0
                    )
                    token_type_ids_batch = torch.cat(
                        (token_type_ids_batch, token_type_ids), 0
                    )
        return (input_ids_batch, attention_mask_batch, token_type_ids_batch)
    def inference(self, input_batch):
        """Predict the class (or classes) of the received text using the
        serialized transformers checkpoint.
        Args:
            input_batch (tuple): tensors produced by the preprocess function.
        Returns:
            list: predicted values for the input text.
        """
        input_ids_batch, attention_mask_batch, token_type_ids_batch = input_batch
        inferences = []
        # Handle inference for sequence_classification.
        if self.setup_config["mode"] == "sequence_classification":
            inferences_dict = [{"probabilities": []}]
            with torch.no_grad():
                # The model returns logits of shape [batch_size, num_labels].
                predictions = self.model(
                    input_ids=input_ids_batch,
                    attention_mask=attention_mask_batch,
                    token_type_ids=token_type_ids_batch,
                )
                # Keep the probability of the last (positive) class for each example.
                probabilities = torch.nn.functional.softmax(predictions, dim=-1)[:, -1]
                inferences_dict[0]["probabilities"].extend(probabilities.tolist())
            inferences.append(inferences_dict)
        # Note: TorchServe expects one response entry per request in the batch; this
        # handler returns a single entry, so it assumes one request per handler call.
        return inferences
    def postprocess(self, inference_output):
        """Convert the predicted response into a TorchServe-readable format.
        Args:
            inference_output (list): the predicted response for the input text.
        Returns:
            list: the predictions.
        """
        return inference_output
    def handle(self, data, context):
        """
        Invoked by TorchServe for a prediction request.
        Pre-processes the data, runs the model, and post-processes the prediction output.
        :param data: input data for prediction
        :param context: initial context containing model server system properties
        :return: prediction output
        """
        start_time = time.time()
        model_input = self.preprocess(data)
        # All timings below are cumulative milliseconds measured from start_time.
        logger.info("preprocess_time: %s", 1000 * (time.time() - start_time))
        if not self._is_explain():
            inference_output = self.inference(model_input)
            logger.info("inference_time: %s", 1000 * (time.time() - start_time))
            inference_output = self.postprocess(inference_output)
            logger.info("postprocess: %s", 1000 * (time.time() - start_time))
            logger.info("inference_output: %d", len(inference_output))
            logger.info("inference_output: %s", inference_output)
        else:
            inference_output = self.explain_handle(model_input, data)
        return inference_output
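The preprocess above splits each request body on newlines and then on tabs, so a request carries one tab-separated sentence pair per line. A small sketch (with made-up sentences) that writes such a payload to the sample.txt used for the prediction request in section 三:

# Write an example request payload: one tab-separated sentence pair per line.
pairs = [
    ("今天天气很好", "适合出去散步"),
    ("这家餐厅的服务", "态度非常糟糕"),
]
with open("sample.txt", "w", encoding="utf-8") as f:
    f.write("\n".join("\t".join(p) for p in pairs))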
Dependency 3: other extra files
# index_to_name.json
{
    "0": "0",
    "1": "1"
}
# setup_config.json
{
    "model_name": "hfl/chinese-roberta-wwm-ext",
    "mode": "sequence_classification",
    "do_lower_case": true,
    "num_labels": "2",
    "max_length": "32",
    "model_parallel": false,
    "threshold_value": 0.5
}
二、Start TorchServe
torchserve --start --ncs --model-store model_store --ts-config config.properties
# Or register a model at startup:
# torchserve --start --ncs --model-store model_store --models wj-first-model=wj-first-model.mar --ts-config config.properties
Dependency 1: config.properties
inference_address=http://172.20.145.107:8080
management_address=http://172.20.145.107:8081
metrics_address=http://172.20.145.107:8082
NUM_WORKERS=1
number_of_gpu=0
number_of_netty_threads=0
job_queue_size=1000
model_store=/home/model-server/shared/model-store
enable_envvars_config=true
models={ \
"wj-first-model": { \
"1.0": { \
"defaultVersion": true, \
"marName": "wj-first-model.mar", \
"minWorkers": 1, \
"maxWorkers": 1, \
"batchSize": 4, \
"maxBatchDelay": 100, \
"responseTimeout": 120 \
} \
}, \
"wj-distill-model": { \
"1.0": { \
"defaultVersion": true, \
"marName": "wj-distill-model.mar", \
"minWorkers": 32, \
"maxWorkers": 32, \
"batchSize": 50, \
"maxBatchDelay": 100, \
"responseTimeout": 120 \
} \
} \
}
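Once TorchServe is up with this configuration, the inference port can be sanity-checked with the built-in ping endpoint. A minimal sketch, assuming the inference_address above and that the Python requests package is installed:

import requests

# Health check against the TorchServe inference API.
resp = requests.get("http://172.20.145.107:8080/ping")
print(resp.json())  # {"status": "Healthy"} once the workers are up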
三、Register and Manage Models
<!-- Register a model -->
curl -X POST "http://172.20.145.107:8081/models?url=wj-first-model.mar"
<!-- List the registered models -->
curl "http://localhost:8081/models"
<!-- Scale the number of workers for a model -->
curl -v -X PUT "http://172.20.145.107:8081/models/wj-first-model?min_worker=1&max_worker=1&synchronous=true"
<!-- Describe a model -->
curl "http://localhost:8081/models/wj-first-model"
<!-- Send a prediction request (a Python equivalent is sketched below) -->
curl -X POST http://127.0.0.1:8080/predictions/wj-first-model -T sample.txt
<!-- Unregister (delete) a model -->
curl -X DELETE http://127.0.0.1:8081/models/wj-first-model/
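A Python equivalent of the prediction request above; a minimal sketch assuming the inference address from config.properties, the wj-first-model registration name, and the sample.txt written earlier:

import requests  # assumption: the requests package is installed

URL = "http://172.20.145.107:8080/predictions/wj-first-model"

# sample.txt contains one tab-separated sentence pair per line.
with open("sample.txt", "rb") as f:
    resp = requests.post(URL, data=f.read())

# With the handler above, the response body looks roughly like:
#   [{"probabilities": [0.93, 0.07]}]
print(resp.status_code, resp.json())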
Environment dependencies: torchserve, torch-model-archiver, JDK 11
1、https://medium.com/analytics-vidhya/deploy-huggingface-s-bert-to-production-with-pytorch-serve-27b068026d18
2、https://github.com/pytorch/serve/tree/master/examples/Huggingface_Transformers
3、https://pytorch.org/serve/management_api.html#register-a-model
4、https://zhuanlan.zhihu.com/p/361782496