基于Transformer的图像-文本检索模型训练与评估 - CSDN博客

本文链接：https://blog.csdn.net/weixin_43405535/article/details/134577819

训练集一张图片五个描述文本
在这里插入图片描述
测试集

创建数据集retrieval dataset

# Build the train / val / test datasets for the retrieval ('re') task from config.
train_dataset, val_dataset, test_dataset = create_dataset('re', config)

跳到这

# Retrieval ('re') branch of the dataset factory (the preceding branches are
# not part of this excerpt).
elif dataset=='re':          
    # Training split: built by re_train_dataset with the training transform.
    train_dataset = re_train_dataset(config['train_file'], train_transform, config['image_root'])
    # Validation and test splits: both built by re_eval_dataset with the test transform.
    val_dataset = re_eval_dataset(config['val_file'], test_transform, config['image_root'])  
    test_dataset = re_eval_dataset(config['test_file'], test_transform, config['image_root'])                
    return train_dataset, val_dataset, test_dataset

可以看出，训练集使用re_train_dataset，验证集和测试集使用re_eval_dataset。
ann_file = config['train_file']
首先下载数据
创建一个空列表self.ann = []，用来存储包含'caption'和'image'字段的字典

self.ann =
[
    {
		"image": "flickr30k-images/1000092795.jpg",
		"caption": "Two young guys with shaggy hair look at their hands while hanging out in the yard.",
		"image_id": 0
	},
	{
		"image": "flickr30k-images/1000092795.jpg",
		"caption": "Two young, White males are outside near many bushes.",
		"image_id": 0
	},
]

遍历ann_file
创建img_ids = {} 字典
接下来为每个唯一的图像ID创建一个新的索引，并将这个索引存储在一个名为self.img_ids的字典中。
遍历self.ann的列表。

# Assign each unique image_id a new consecutive index, stored in self.img_ids.
self.img_ids = {}
n = 0  # next new index to hand out
for ann in self.ann:
    img_id = ann['image_id']
    # Only the first occurrence of an image_id receives an index; later
    # entries with the same image_id (one per caption) reuse it.
    if img_id not in self.img_ids:
        self.img_ids[img_id] = n
        n += 1

re_eval

# Running counter that gives every caption a unique text id.
txt_id = 0
# Walk the annotations; the enumerate position is used as the image id (img_id).
for img_id, ann in enumerate(self.ann):
     self.image.append(ann['image'])
     # Start an empty list of text ids for this image.
     self.img2txt[img_id] = []
     # ann['caption'] is iterated here, so it is presumably a list of caption
     # strings for this image -- verify against the annotation file.
     for i, caption in enumerate(ann['caption']):
           self.text.append(pre_caption(caption,self.max_words))
           # Record the new txt_id under the current image id.
           self.img2txt[img_id].append(txt_id)
           # Reverse lookup: text id -> image id.
           self.txt2img[txt_id] = img_id
           txt_id += 1

所以img2txt的结构为 {img_id: [txt_id, ...]}，txt2img的结构为 {txt_id: img_id}
img2txt是一个字典，其键是图像的ID，值是一个列表。列表中的每个元素是与该图像相关联的文本ID。
例如，由于img_id和txt_id都从0开始编号，假设图像ID为0的图像有两个注释，那么img2txt字典可能如下所示：
{0: [0, 1]}
这意味着图像ID为0的图像与文本ID为0和1的注释相关联。同时，txt2img字典可能如下所示：
{0: 0, 1: 0}
这意味着文本ID为0和1都与图像ID为0的图像相关联。

create_loader()创建数据加载器，接受数据集、采样器、批量大小和其他参数作为输入，并返回训练、验证和测试的数据加载器

建立模型
如果提供了checkpoint，则加载其中的预训练权重（state_dict），并特别处理其中的位置嵌入（pos_embed）。
位置嵌入通常用于Transformer模型，以捕获输入序列中的位置信息。

# Reshape the positional embeddings to accommodate an image resolution change.
# The checkpoint's 'visual_encoder.pos_embed' is interpolated against the
# current model's visual encoder and written back into state_dict.
pos_embed_reshaped = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)         
state_dict['visual_encoder.pos_embed'] = pos_embed_reshaped
# Same treatment for the second encoder, visual_encoder_m
# (presumably the momentum copy -- confirm against the model definition).
m_pos_embed_reshaped = interpolate_pos_embed(state_dict['visual_encoder_m.pos_embed'],model.visual_encoder_m)   
state_dict['visual_encoder_m.pos_embed'] = m_pos_embed_reshaped

对state_dict中的visual_encoder.pos_embed（位置嵌入）按照模型的visual_encoder进行插值处理，即调整位置嵌入的大小以适应图像分辨率的变化，结果存储在pos_embed_reshaped中。
随后将其重新赋值给state_dict['visual_encoder.pos_embed']，用调整过大小的位置嵌入替换原来的嵌入；visual_encoder_m的位置嵌入同理。

开始训练
数据加载、模型前向传播、损失计算、反向传播、优化步骤

# Iterate over data_loader, wrapped in metric_logger.log_every for progress logging.
for i,(image, text, idx) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
    image = image.to(device,non_blocking=True)
    idx = idx.to(device,non_blocking=True)   
    # NOTE(review): max_length=30 is passed without truncation=True here --
    # confirm that captions are already length-limited before tokenization.
    text_input = tokenizer(text, padding='longest', max_length=30, return_tensors="pt").to(device)  
    # alpha is passed to the model below. After the first epoch, or when
    # warm-up is disabled, the configured value is used as-is.
    if epoch>0 or not config['warm_up']:
        alpha = config['alpha']
    else:
        # First epoch with warm-up: scale alpha linearly with progress through
        # the epoch (i / len(data_loader)), capped at the configured value.
        alpha = config['alpha']*min(1,i/len(data_loader))
    # Forward pass; the model returns two losses, which are summed.
    loss_ita, loss_itm = model(image, text_input,alpha=alpha, idx=idx)                  
    loss = loss_ita + loss_itm
   
    optimizer.zero_grad()  # reset the gradients of the model parameters
    loss.backward() # backpropagate the loss to compute gradients
    optimizer.step() # update the model parameters
    # Update the metric logger with both losses and the current learning rate.
    metric_logger.update(loss_itm=loss_itm.item())
    metric_logger.update(loss_ita=loss_ita.item())
    metric_logger.update(lr=optimizer.param_groups[0]["lr"])
    # Only in epoch 0: every step_size iterations, up to warmup_iterations,
    # advance the scheduler to step index i//step_size.
    if epoch==0 and i%step_size==0 and i<=warmup_iterations: 
        scheduler.step(i//step_size)

evaluation
首先处理文本数据

 # Encode all texts in batches of text_bs and collect their features.
 for i in range(0, num_text, text_bs):
     text = texts[i: min(num_text, i+text_bs)] # current batch of texts taken from the texts list
     # Pad/truncate every text to exactly 30 tokens so batches can be concatenated later.
     text_input = tokenizer(text, padding='max_length', truncation=True, max_length=30, return_tensors="pt").to(device) 
     text_output = model.text_encoder(text_input.input_ids, attention_mask = text_input.attention_mask, mode='text')  
     text_feat = text_output.last_hidden_state # hidden states returned by the text encoder
     # Project the feature at position 0 (presumably the [CLS] token -- confirm)
     # with model.text_proj, then normalize it with F.normalize.
     text_embed = F.normalize(model.text_proj(text_feat[:,0,:]))
     text_embeds.append(text_embed)   
     text_feats.append(text_feat)
     text_atts.append(text_input.attention_mask)
 # Concatenate the per-batch results along the first dimension.
 text_embeds = torch.cat(text_embeds,dim=0)
 text_feats = torch.cat(text_feats,dim=0)
 text_atts = torch.cat(text_atts,dim=0)

再提取图像

计算图像和文本之间的相似度矩阵

# Pairwise similarity between every image embedding and every text embedding:
# rows follow image_embeds, columns follow text_embeds.
sims_matrix = image_embeds @ text_embeds.t()

图像和文本的相似性计算
使用预训练的模型来计算图像和文本之间的相似度，并将这些相似度值存储在矩阵 score_matrix_i2t 中，用于图像和文本的匹配或检索任务。

  # Rows [start, end) of sims_matrix are the slice handled by this rank.
  start = rank*step
  end = min(sims_matrix.size(0),start+step)
  # Iterate over that slice one row at a time; `sims` is row start+i of
  # sims_matrix (progress is logged every 50 rows via metric_logger.log_every).
  for i,sims in enumerate(metric_logger.log_every(sims_matrix[start:end], 50, header)): 
        # Pick the k_test largest similarities in this row. topk_sim holds the
        # values and topk_idx their positions. topk_idx is used below to index
        # text_feats / text_atts, so these are the top-k candidate texts.
        topk_sim, topk_idx = sims.topk(k=config['k_test'], dim=0)
        # Repeat the features of image start+i k_test times along a new leading
        # dimension (one copy per candidate text), and build an all-ones
        # attention mask shaped like encoder_output without its last dimension.
        encoder_output = image_feats[start+i].repeat(config['k_test'],1,1)
        encoder_att = torch.ones(encoder_output.size()[:-1],dtype=torch.long).to(device)

        # Run the text encoder in 'fusion' mode on the candidate text features,
        # with the repeated image features as encoder_hidden_states. The
        # position-0 output is passed to model.itm_head, and column 1 of its
        # output (presumably the "match" logit -- confirm) is the score. The
        # scores are written to row start+i of score_matrix_i2t at the
        # candidate columns.
        output = model.text_encoder(encoder_embeds = text_feats[topk_idx], 
                                    attention_mask = text_atts[topk_idx],
                                    encoder_hidden_states = encoder_output,
                                    encoder_attention_mask = encoder_att,                             
                                    return_dict = True,
                                    mode = 'fusion'
                                   )
        score = model.itm_head(output.last_hidden_state[:,0,:])[:,1]
        score_matrix_i2t[start+i,topk_idx] = score

    # Text -> image direction. NOTE(review): this loop indexes text_feats by the
    # row (start+i) and image_feats by topk_idx, so sims_matrix is presumably the
    # transposed (text x image) matrix here, with start/end recomputed for it;
    # that step is not shown in this excerpt -- confirm against the full source.
    for i,sims in enumerate(metric_logger.log_every(sims_matrix[start:end], 50, header)): 
        
        # The k_test largest similarities in this row; topk_idx indexes the
        # candidate images.
        topk_sim, topk_idx = sims.topk(k=config['k_test'], dim=0)
        # topk_sim, topk_idx = torch.topk(sims, k=config['k_test'], dim=0)

        # Features of the candidate images, plus an all-ones attention mask.
        encoder_output = image_feats[topk_idx]
        encoder_att = torch.ones(encoder_output.size()[:-1],dtype=torch.long).to(device)
        # Repeat the features and attention mask of text start+i k_test times
        # (one copy per candidate image) and fuse them with the image features.
        output = model.text_encoder(encoder_embeds = text_feats[start+i].repeat(config['k_test'],1,1), 
                                    attention_mask = text_atts[start+i].repeat(config['k_test'],1),
                                    encoder_hidden_states = encoder_output,
                                    encoder_attention_mask = encoder_att,                             
                                    return_dict = True,
                                    mode = 'fusion'
                                   )
        # Column 1 of itm_head's output is the score; it is stored in row
        # start+i of score_matrix_t2i at the candidate columns.
        score = model.itm_head(output.last_hidden_state[:,0,:])[:,1]
        score_matrix_t2i[start+i,topk_idx] = score

itm_eval
计算每个查询（图像→文本方向为图像，文本→图像方向为文本）的正确匹配在其得分排序中的排名，并将这些排名存储在ranks数组中
根据排名计算两个检索方向的指标，即Top-1、Top-5和Top-10命中率（R@1、R@5、R@10）

 # Images -> Text: each row of scores_i2t holds one image's scores against
 # the texts; record the best rank reached by any of its ground-truth texts.
 ranks = np.zeros(scores_i2t.shape[0])
 for index,score in enumerate(scores_i2t):
     # Indices sorted by score, highest first.
     inds = np.argsort(score)[::-1]
     # Score
     rank = 1e20
     # img2txt[index] lists the text ids that belong to image `index`.
     # np.where(inds == i)[0][0] is the 0-based position of text i in the
     # sorted list; keep the smallest position over all ground-truth texts.
     for i in img2txt[index]:
         tmp = np.where(inds == i)[0][0]
         if tmp < rank:
             rank = tmp
     ranks[index] = rank

 # Compute metrics
 # Ranks are 0-based, so `ranks < k` means a ground-truth text is in the top k.
 tr1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
 tr5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
 tr10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)

 # Text -> Images: each row of scores_t2i holds one text's scores against
 # the images; txt2img[index] is the single ground-truth image of that text.
 ranks = np.zeros(scores_t2i.shape[0])
 for index,score in enumerate(scores_t2i):
     inds = np.argsort(score)[::-1]
     ranks[index] = np.where(inds == txt2img[index])[0][0]

 # Compute metrics
 ir1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
 ir5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
 ir10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)

 # Averages: per direction over R@1/5/10, then over both directions.
 tr_mean = (tr1 + tr5 + tr10) / 3
 ir_mean = (ir1 + ir5 + ir10) / 3
 r_mean = (tr_mean + ir_mean) / 2