Kaggle 笔记：对推理函数 inference(df, model, tokenizer, device) 的分析

原创

于 2024-11-26 18:09:43 发布

· 947 阅读

11 ·

版权

文章标签：

#笔记 #人工智能 #语言模型

代码：

def inference(df, model, tokenizer, device):
    batch_size = 64
    max_length = 512
    sentences = list(df['query_text'].values) # 问题列表

    all_embeddings = []
    # 对句子列表按长度降序排序，以优化内存使用和处理速度。
    length_sorted_idx = np.argsort([-len(sen) for sen in sentences])
    sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
    for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=False):
        sentences_batch = sentences_sorted[start_index: start_index + batch_size]
        features = tokenizer(sentences_batch, max_length=max_length, padding=True, truncation=True,
                             return_tensors="pt")
        features = batch_to_device(features, device)
        with torch.no_grad(): # 在不计算梯度的情况下运行模型，以节省内存和计算资源。
        # 使用双星号 ** 来传递参数是一种常见的做法，它允许你将字典中的键值对作为关键字参数传递给函数
            outputs = model.model