I. Overview
Given the image below, the question is: what are the people in the picture holding?
The answer might be umbrellas, or it might be national flags. We want a model that can answer this question, so we feed both the image and the question into a trained model and let it predict the answer.
Strictly speaking, the model cannot truly answer open questions. Instead, we train it on already-labeled image + question + answer triples and reduce the task to classification.
In this project we take the 1000 most frequent answers, keep only the samples whose answers fall into that set, and use those questions and images to train the model.
The approach: process the image and the text separately, concatenate the two representations, and feed the result into a fully connected layer, which does the classification.
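A minimal sketch of this fusion idea (the dimensions here are illustrative; the actual values are derived in Section II):

import torch
import torch.nn as nn

# Illustrative shapes: a 400-d text vector (e.g. a BiLSTM output) and a
# 4096-d image vector (a VGG16 fc feature), fused by concatenation.
text_feat = torch.randn(8, 400)
img_feat = torch.randn(8, 4096)
fused = torch.cat([text_feat, img_feat], dim=1)  # (8, 4496)
classifier = nn.Linear(4496, 1000)               # 1000 candidate answers
logits = classifier(fused)
print(logits.shape)  # torch.Size([8, 1000])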
II. Details
1. Building the VGG16 model
VGG16 itself has been explained clearly in many places, so I will not describe it again here. In short, it is a structurally simple but fairly deep model.
This network is not trained here. Once the main model is trained, VGG16 with pretrained parameters is used to turn each new input image into its 4096-dimensional VGG16 feature. During training itself we directly use hundreds of thousands of images for which the 4096-dimensional VGG16 features have already been computed by others, and feed those features into the model.
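If you want to compute such features yourself, a sketch using torchvision's pretrained VGG16-BN might look like the following (the API for loading weights varies slightly between torchvision versions; also note that the custom class below uses its own layer names, so loading the released checkpoint with strict=False will silently skip weights whose names do not match):

import torch
import torchvision

# Load pretrained VGG16-BN (newer torchvision uses weights=... instead of pretrained=True)
model = torchvision.models.vgg16_bn(pretrained=True)
# Drop the final 1000-way layer so the network outputs the 4096-d
# activations of the last hidden fc layer.
model.classifier = torch.nn.Sequential(*list(model.classifier.children())[:-1])
model.eval()

with torch.no_grad():
    img = torch.randn(1, 3, 224, 224)  # placeholder image tensor
    features = model(img)
print(features.shape)  # torch.Size([1, 4096])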
The code:
import torch
import torch.nn as nn


class VGG16(nn.Module):
    def __init__(self, n_class=1000):
        super(VGG16, self).__init__()
        self.n_class = n_class
        # Each block ends with an activation layer and pooling.
        # Block 1
        self.block1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        # Block 2
        self.block2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        # Block 3
        self.block3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        # Block 4
        self.block4 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        # Block 5
        self.block5 = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fclayer1 = nn.Linear(7 * 7 * 512, 4096)
        self.fclayer2 = nn.Linear(4096, 4096)
        self.outputlayer = nn.Linear(4096, self.n_class)

    def forward(self, x):
        # Input images are (3, 224, 224); five 2x2 poolings give (512, 7, 7)
        out = self.block1(x)
        out = self.block2(out)
        out = self.block3(out)
        out = self.block4(out)
        out = self.block5(out)
        # The 4096-d output of the first fc layer is the "VGG16 feature";
        # compute it once and reuse it rather than running fclayer1 twice.
        vgg16_features = self.fclayer1(out.view(out.size(0), -1))
        out = self.fclayer2(vgg16_features)
        logits = self.outputlayer(out)
        return vgg16_features, logits


if __name__ == '__main__':
    pics = torch.randn((1, 3, 224, 224))
    model = VGG16()
    # strict=False skips checkpoint keys that do not match this class's layer names
    model.load_state_dict(torch.load('./data/vgg16_bn-6c64b313.pth'), strict=False)
    vgg16_feature, logits = model(pics)
    print(logits)
2. The main model
We need to concatenate the image result with the text result and feed it into a fully connected layer. So, having obtained the 4096-dimensional features for hundreds of thousands of images, we can use them directly as input data, feed them into the main model together with the text data, and attach the fully connected classifier at the end.
Here I simply take a BiLSTM model and modify it. The main change is in the forward method: the input x is unpacked into two parts, where x[0] is the text data and x[1] is the image data.
After the BiLSTM output is obtained (the hidden size is 200, so the bidirectional output is 400-dimensional), it is concatenated with the 4096-dimensional image features, giving a (B, 4496) tensor that goes through the fully connected layer to complete the classification.
import torch
import torch.nn as nn


class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_class=1000):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        # Image feature matrix: (215519, 4096)
        self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.LSTM = nn.LSTM(self.embedding_dim, self.hidden_dim, batch_first=True, bidirectional=True)
        # 2 * 200 (BiLSTM) + 4096 (image feature) = 4496
        self.fclayer = nn.Linear(4496, num_class)

    def forward(self, x):
        # x is a pair: x[0] is the text data (B, 22), x[1] is the image data (B, 4096)
        emb = self.embedding_layer(x[0])
        output, (h_n, c_n) = self.LSTM(emb)
        # h_n: (2, B, 200); concatenate the final forward and backward hidden states
        out = torch.cat([h_n[-1, ...], h_n[-2, ...]], dim=-1)
        # out: (B, 400)
        concat = torch.cat((out, x[1]), dim=1)  # concat: (B, 4496)
        logits = self.fclayer(concat)           # (B, 1000)
        return logits
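A quick shape check of the class above with random inputs (the vocabulary size here is a placeholder):

import torch

if __name__ == '__main__':
    model = BiLSTM(embedding_dim=300, hidden_dim=200, vocab_size=5000)
    questions = torch.randint(0, 5000, (4, 22))  # (B, padded question length)
    img_feats = torch.randn(4, 4096)             # (B, VGG16 feature dim)
    logits = model((questions, img_feats))
    print(logits.shape)  # torch.Size([4, 1000])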
3. Training the model
The commented-out lines below are what the data preprocessing needs. The images are so numerous that they made my machine crawl, so to get the model running end to end I saved the processed data in csv format and read back only the first 3000 rows for training. The rest is a standard training loop; the one thing to note is that the model takes two inputs, the text data and the image data, so the batches have to be unpacked accordingly.
After training, validation accuracy is 77%.
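Roughly, the one-off caching step looks like this (a sketch reconstructed from the commented-out calls; the file names match the training script below):

import pandas as pd
from utils import get_samples, get_word_dicts, sentence2ids, get_features

ques_path = './data/questions_train2014.txt'
ans_path = './data/answers_train2014_modal.txt'
img_path = './data/images_train2014.txt'
pre_trained_feature = './data/vgg_feats.mat'
img_map = './data/coco_vgg_IDMap.txt'

new_question_train, new_answer_train, new_image_train = get_samples(ques_path, ans_path, img_path)
word2id, id2word = get_word_dicts(new_question_train)
padded_sentences = sentence2ids(new_question_train, word2id)
image_matrix, labels = get_features(pre_trained_feature, img_map, new_answer_train, new_image_train)

# DataFrame.to_csv writes an index column, which is why the training script
# reads these files back with index_col=0.
pd.DataFrame(padded_sentences).to_csv('./data/processed_data.csv')
pd.DataFrame(labels).to_csv('./data/processed_labels.csv')
pd.DataFrame(image_matrix).to_csv('./data/processed_imgs.csv')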
import torch.cuda
import torch.nn as nn
import pandas as pd
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
from utils import get_samples, get_word_dicts, sentence2ids, train_test_split, get_features, get_questions
from LSTM_model import BiLSTM

ques_path = './data/questions_train2014.txt'
ans_path = './data/answers_train2014_modal.txt'
# img_path = './data/images_train2014.txt'
# pre_trained_feature = './data/vgg_feats.mat'
# img_map = './data/coco_vgg_IDMap.txt'
# new_question_train, new_answer_train, new_image_train = get_samples(ques_path, ans_path, img_path)
new_question_train, new_answer_train = get_questions(ques_path, ans_path)
word2id, id2word = get_word_dicts(new_question_train)
# padded_sentences = sentence2ids(new_question_train, word2id)
# Build the image feature matrix and the labels
# image_matrix, labels = get_features(pre_trained_feature, img_map, new_answer_train, new_image_train)
padded_sentences = pd.read_csv('./data/processed_data.csv', index_col=0, nrows=3000).to_numpy()
labels = pd.read_csv('./data/processed_labels.csv', index_col=0, nrows=3000).to_numpy()
image_matrix = pd.read_csv('./data/processed_imgs.csv', index_col=0, nrows=3000).to_numpy()

train_data, train_labels, train_imgs, val_data, val_labels, val_imgs = train_test_split(padded_sentences, labels, image_matrix)
train_dt = torch.tensor(train_data, dtype=torch.long)
train_lbdt = torch.tensor(train_labels, dtype=torch.long)
train_imgdt = torch.tensor(train_imgs, dtype=torch.float)
val_dt = torch.tensor(val_data, dtype=torch.long)
val_lbdt = torch.tensor(val_labels, dtype=torch.long)
val_imgdt = torch.tensor(val_imgs, dtype=torch.float)
train_dataset = TensorDataset(train_dt, train_imgdt, train_lbdt)
val_dataset = TensorDataset(val_dt, val_imgdt, val_lbdt)

batch_size = 5
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, pin_memory=True, drop_last=True)

emb_dim = 300
hidden_dim = 200
vocab_size = len(word2id)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BiLSTM(embedding_dim=emb_dim, hidden_dim=hidden_dim, vocab_size=vocab_size)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 20
best_accuracy = 0.0
for epoch in tqdm(range(epochs), desc='Epoch'):
    model.train()
    train_loss = 0.0  # reset each epoch, otherwise the loss accumulates across epochs
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        data, img, y = batch[0], batch[1], batch[2]
        optimizer.zero_grad()
        logits = model((data, img))
        loss = criterion(logits, y.squeeze())
        train_loss += loss.item() * batch_size
        loss.backward()
        optimizer.step()
    train_loss /= len(train_dataloader)
    print()
    print('train loss: {}'.format(train_loss))

    model.eval()
    correct = 0
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            data, img, y = batch[0], batch[1], batch[2]
            logits = model((data, img))
            loss = criterion(logits, y.squeeze())
            val_loss += loss.item() * batch_size
            y_pred = torch.argmax(logits, dim=-1)
            # y is (B, 1); squeeze it so eq() compares element-wise instead of broadcasting to (B, B)
            correct += y_pred.eq(y.squeeze()).sum().item()
    current_accuracy = correct / (len(val_dataloader) * batch_size)
    print()
    print('val loss: {} accuracy: {}'.format(val_loss / len(val_dataloader), current_accuracy))
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        torch.save(model.state_dict(), 'best_model.bin')
        print('Save the best model %s, accuracy=%.2f' % ('best_model.bin', best_accuracy))
4. Utility functions
The get_features method here is not great: it fits a LabelEncoder on the 1000 answers internally, so at test time, after getting the predicted label, there is no way to inverse-transform it back into an answer string. Keep this in mind; a possible fix is sketched after the listing. What the other functions do can be seen from the training procedure above.
import random
from collections import Counter
from string import punctuation

import numpy as np
from scipy import io
from sklearn.preprocessing import LabelEncoder


def get_samples(ques_path, ans_path, img_path):
    questions_train = open(ques_path, 'r').read().splitlines()
    answers_train = open(ans_path, 'r').read().splitlines()
    images_train = open(img_path, 'r').read().splitlines()
    # Keep only the samples whose answers are among the 1000 most frequent
    answer_count = Counter(answers_train)
    top_1000 = sorted(answer_count.items(), key=lambda x: x[1], reverse=True)[:1000]
    top_answer_set = {item[0] for item in top_1000}  # a set makes the membership test O(1)
    new_question_train, new_answer_train, new_image_train = [], [], []
    for question, answer, image in zip(questions_train, answers_train, images_train):
        if answer in top_answer_set:
            new_question_train.append(question)
            new_answer_train.append(answer)
            new_image_train.append(image)
    return new_question_train, new_answer_train, new_image_train


def get_questions(ques_path, ans_path):
    questions_train = open(ques_path, 'r').read().splitlines()
    answers_train = open(ans_path, 'r').read().splitlines()
    # Keep only the samples whose answers are among the 1000 most frequent
    answer_count = Counter(answers_train)
    top_1000 = sorted(answer_count.items(), key=lambda x: x[1], reverse=True)[:1000]
    top_answer_set = {item[0] for item in top_1000}
    new_question_train, new_answer_train = [], []
    for question, answer in zip(questions_train, answers_train):
        if answer in top_answer_set:
            new_question_train.append(question)
            new_answer_train.append(answer)
    return new_question_train, new_answer_train


def get_features(pre_train_feature, img_map, new_answer_train, new_image_train):
    '''Return the image feature matrix and the labels'''
    LE = LabelEncoder()
    LE = LE.fit(new_answer_train)
    # y.shape: (215519,), numpy array
    y = LE.transform(new_answer_train)
    data = io.loadmat(pre_train_feature)
    feature = data['feats']
    images_ids = open(img_map).read().splitlines()
    image_id_map = {pair.split()[0]: int(pair.split()[1]) for pair in images_ids}
    # 215519 samples
    nb_samples = len(new_image_train)
    # 4096 dimensions; the .mat feature matrix is (4096, n_images)
    nb_dimensions = feature.shape[0]
    image_matrix = np.zeros((nb_samples, nb_dimensions))
    for j in range(len(new_image_train)):
        # row j, all columns
        image_matrix[j, :] = feature[:, image_id_map[new_image_train[j]]]
    return image_matrix, y


def get_word_dicts(all_questions):
    '''Strips punctuation and lower-cases before building the vocabulary'''
    question_no_punc = []
    for sentence in all_questions:
        sentence = ''.join([char for char in sentence if char not in punctuation])
        question_no_punc.append(sentence.lower())
    all_words = []
    for sentence in question_no_punc:
        all_words.extend(sentence.split())
    # sorted() makes the id assignment deterministic across runs, which matters
    # because the test script rebuilds word2id from scratch
    unique_words = sorted(set(all_words))
    unique_words.insert(0, "<unk>")
    word2id = {word: int(idx) for idx, word in enumerate(unique_words)}
    id2word = {idx: word for word, idx in word2id.items()}
    return word2id, id2word


def sentence2ids(all_questions, word2id):
    '''Strips punctuation and lower-cases, then converts words to ids'''
    question_no_punc = []
    for sentence in all_questions:
        sentence = ''.join([char for char in sentence if char not in punctuation])
        question_no_punc.append(sentence.lower())
    lengths = [len(sentence.split()) for sentence in question_no_punc]
    max_length = max(lengths)
    # Sentences to ids; unknown words map to 0, i.e. <unk>
    question_list = [sentence.split() for sentence in question_no_punc]
    questions_ids = []
    for sentence in question_list:
        sentence_ids = []
        for word in sentence:
            sentence_ids.append(word2id.get(word, 0))
        questions_ids.append(sentence_ids)
    # Pad every sentence to the length of the longest one
    padding = 0
    for sentence in questions_ids:
        if len(sentence) < max_length:
            padding_length = max_length - len(sentence)
            sentence += [padding] * padding_length
    return np.array(questions_ids)


def train_test_split(padded_sentences, labels, image_matrix):
    questions_idx = list(range(len(padded_sentences)))
    random.shuffle(questions_idx)
    train_nums = int(0.75 * len(questions_idx))
    rest_nums = len(questions_idx) - train_nums
    val_nums = int(rest_nums / 2)
    test_nums = rest_nums - val_nums
    train_data = padded_sentences[questions_idx[:train_nums]]
    train_labels = labels[questions_idx[:train_nums]]
    train_imgs = image_matrix[questions_idx[:train_nums]]
    val_data = padded_sentences[questions_idx[train_nums:train_nums + val_nums]]
    val_labels = labels[questions_idx[train_nums:train_nums + val_nums]]
    val_imgs = image_matrix[questions_idx[train_nums:train_nums + val_nums]]
    test_data = padded_sentences[questions_idx[train_nums + val_nums:]]
    test_labels = labels[questions_idx[train_nums + val_nums:]]
    test_imgs = image_matrix[questions_idx[train_nums + val_nums:]]
    return train_data, train_labels, train_imgs, val_data, val_labels, val_imgs
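As noted above, get_features keeps the fitted LabelEncoder to itself. One possible fix (a sketch; it assumes joblib is installed, though any pickling mechanism would do) is to fit the encoder once, persist it, and load it again at test time:

import joblib
from sklearn.preprocessing import LabelEncoder

def fit_and_save_encoder(answers, path='./data/label_encoder.pkl'):
    '''Fit the encoder on the answer strings and persist it, so the test
    script can later call inverse_transform on predicted label ids.'''
    LE = LabelEncoder().fit(answers)
    joblib.dump(LE, path)
    return LE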
5. Testing the model
After training we saved the parameters, and we also downloaded pretrained parameters for the VGG16 model. So we can reshape an image into the required form, pass it through VGG16 to get the 4096-dimensional feature, then feed that feature together with the question text into the model to get a 1000-way prediction. Because of the label issue described above, there is no inverse transform here, so only the predicted label id is obtained.
import numpy as np
import torch
import matplotlib.image as imgplt
from LSTM_model import BiLSTM
from utils import get_questions, get_word_dicts, sentence2ids
from VGG_model import VGG16

ques_path = './data/questions_train2014.txt'
ans_path = './data/answers_train2014_modal.txt'
img = './data/test_img1.png'
# The image must be RGB and 224x224 to match the VGG16 input shape
img_data = imgplt.imread(img, format='PNG')
img_data = torch.tensor(img_data, dtype=torch.float).permute((2, 0, 1)).unsqueeze(dim=0)

new_question_train, new_answer_train = get_questions(ques_path, ans_path)
word2id, id2word = get_word_dicts(new_question_train)
emb_dim = 300
hidden_dim = 200
vocab_size = len(word2id)
device = "cuda" if torch.cuda.is_available() else "cpu"

LSTM_MODEL = BiLSTM(embedding_dim=emb_dim, hidden_dim=hidden_dim, vocab_size=vocab_size)
LSTM_MODEL.load_state_dict(torch.load('best_model.bin'), strict=False)
LSTM_MODEL.to(device)
LSTM_MODEL.eval()  # switch off training-mode behaviour for inference
VGG_MODEL = VGG16()
VGG_MODEL.load_state_dict(torch.load('./data/vgg16_bn-6c64b313.pth'), strict=False)
VGG_MODEL.to(device)
VGG_MODEL.eval()   # batch norm must use running statistics for a single image

with torch.no_grad():
    while True:
        print('===============')
        img_data = img_data.to(device)
        vgg16_features, _ = VGG_MODEL(img_data)  # (1, 4096)
        # Read the question
        question = input('Enter a question in English (type quit to exit): ')
        if question == 'quit':
            break
        else:
            sentence = [question]
            out = list(sentence2ids(sentence, word2id)[0])
            max_length = 22
            if len(out) < max_length:
                padding_length = max_length - len(out)
                out.extend([0] * padding_length)
            out = np.array(out)
            # (1, 22)
            question_input = torch.tensor(out, dtype=torch.long).unsqueeze(0).to(device)
            logits = LSTM_MODEL((question_input, vgg16_features))
            y_pred = torch.argmax(torch.softmax(logits, dim=-1))
            print(y_pred)
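If the encoder was persisted as sketched in Section 4 (the path here is hypothetical), the predicted id could then be mapped back to an answer string:

import joblib

LE = joblib.load('./data/label_encoder.pkl')
answer = LE.inverse_transform([y_pred.item()])[0]
print(answer)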