2021SC@SDUSC
Back Translations
To save time, we preprocess all of the back-translated data in advance and save it to pickle files.
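The script that produces those pickle files is not part of this excerpt, but as a rough illustration, files like de_1.pkl could be generated offline with fairseq's pretrained WMT19 translation models. Everything below (the model choice, the sampling temperature, the placeholder corpus, and the dict-keyed-by-index file layout) is an assumption for illustration, not the project's actual script.

import pickle
import torch

# Assumption: fairseq's pretrained WMT19 models loaded via torch.hub
en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model',
                       tokenizer='moses', bpe='fastbpe')
de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en.single_model',
                       tokenizer='moses', bpe='fastbpe')

def back_translate(text, temperature=0.9):
    # Sampling instead of beam search gives more diverse paraphrases
    german = en2de.translate(text, sampling=True, temperature=temperature)
    return de2en.translate(german, sampling=True, temperature=temperature)

# Assumed layout: a dict keyed by each example's row index in train.csv,
# matching the self.de[idx] lookups in the Translator class below
texts = ["the quick brown fox jumps over the lazy dog"]  # placeholder corpus
paraphrases = {i: back_translate(t) for i, t in enumerate(texts)}
with open('de_1.pkl', 'wb') as f:
    pickle.dump(paraphrases, f)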
Preprocess the German data:
def __init__(self, path, transform_type='BackTranslation'):
    # Load the pre-computed German back translations
    with open(path + 'de_1.pkl', 'rb') as f:
        self.de = pickle.load(f)
Preprocess the Russian data:
    # Load the pre-computed Russian back translations
    with open(path + 'ru_1.pkl', 'rb') as f:
        self.ru = pickle.load(f)
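Putting the two snippets together, the surrounding class presumably looks something like the sketch below. The __call__ method, which fetches both augmentations for a given training index, is an assumption inferred from how Translator(data_path) is used in get_data later in this post.

import pickle

class Translator:
    """Back translation; all translated data is precomputed and pickled."""

    def __init__(self, path, transform_type='BackTranslation'):
        # Pre-computed German back translations
        with open(path + 'de_1.pkl', 'rb') as f:
            self.de = pickle.load(f)
        # Pre-computed Russian back translations
        with open(path + 'ru_1.pkl', 'rb') as f:
            self.ru = pickle.load(f)

    def __call__(self, ori, idx):
        # Assumption: return the German and Russian paraphrases for this
        # index, plus the original text
        return self.de[idx], self.ru[idx], ori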
Read the data, split the dataset, and build datasets for the data loaders:
def get_data(data_path, n_labeled_per_class, unlabeled_per_class=5000, max_seq_len=256, model='bert-base-uncased', train_aug=False):
Arguments:
    data_path {str} -- path to the dataset folder, containing train.csv and test.csv
    n_labeled_per_class {int} -- number of labeled examples per class

Keyword arguments:
    unlabeled_per_class {int} -- number of unlabeled examples per class (default: {5000})
    max_seq_len {int} -- maximum sequence length (default: {256})
    model {str} -- model name (default: {'bert-base-uncased'})
    train_aug {bool} -- whether to augment the labeled training set (default: {False})
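A hypothetical invocation, with the folder path and label count made up for illustration (the returned values are taken up again at the end of the function below):

# e.g. 10 labeled examples per class, defaults for everything else
splits = get_data('./data/yahoo_answers_csv/', n_labeled_per_class=10)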
Load the tokenizer for BERT:
tokenizer = BertTokenizer.from_pretrained(model)
train_df = pd.read_csv(data_path+'train.csv', header=None)
test_df = pd.read_csv(data_path+'test.csv', header=None)
We only use the bodies for classification; the titles are dropped.
# Column 0 holds the class label (shifted from 1-based to 0-based here),
# column 2 holds the body text; column 1, the title, is unused
train_labels = np.array([v - 1 for v in train_df[0]])
train_text = np.array([v for v in train_df[2]])
test_labels = np.array([u - 1 for u in test_df[0]])
test_text = np.array([v for v in test_df[2]])
Split into the labeled training set, the unlabeled training set, and the validation set:
train_labeled_idxs, train_unlabeled_idxs, val_idxs = train_val_split(
    train_labels, n_labeled_per_class, unlabeled_per_class, n_labels)
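train_val_split is defined elsewhere in the project; a minimal sketch of the per-class splitting it has to perform is shown below. The shuffling, the fixed seed, and the validation-set size of 500 per class are assumptions for illustration.

import numpy as np

def train_val_split(labels, n_labeled_per_class, unlabeled_per_class, n_labels, seed=0):
    """Per-class split into labeled / unlabeled / validation indices (sketch)."""
    rng = np.random.RandomState(seed)
    labels = np.array(labels)
    train_labeled_idxs, train_unlabeled_idxs, val_idxs = [], [], []
    for c in range(n_labels):
        idxs = np.where(labels == c)[0]
        rng.shuffle(idxs)
        # First n_labeled_per_class examples become the labeled set,
        # the next unlabeled_per_class the unlabeled set, and
        # (assumption) a fixed tail slice the validation set
        train_labeled_idxs.extend(idxs[:n_labeled_per_class])
        train_unlabeled_idxs.extend(
            idxs[n_labeled_per_class:n_labeled_per_class + unlabeled_per_class])
        val_idxs.extend(idxs[-500:])
    return train_labeled_idxs, train_unlabeled_idxs, val_idxs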
Build a dataset object for each split:
train_labeled_dataset = loader_labeled(
    train_text[train_labeled_idxs], train_labels[train_labeled_idxs], tokenizer, max_seq_len, train_aug)
train_unlabeled_dataset = loader_unlabeled(
    train_text[train_unlabeled_idxs], train_unlabeled_idxs, tokenizer, max_seq_len, Translator(data_path))
val_dataset = loader_labeled(
    train_text[val_idxs], train_labels[val_idxs], tokenizer, max_seq_len)
test_dataset = loader_labeled(
    test_text, test_labels, tokenizer, max_seq_len)
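The excerpt stops before the function's return statement; presumably get_data returns these four datasets (plus the label count), which are then wrapped in ordinary PyTorch DataLoaders downstream, along these lines (the batch sizes are made up for illustration):

from torch.utils.data import DataLoader

# Assumed tail of get_data:
#     return train_labeled_dataset, train_unlabeled_dataset, val_dataset, test_dataset, n_labels

labeled_loader = DataLoader(train_labeled_dataset, batch_size=16, shuffle=True)
unlabeled_loader = DataLoader(train_unlabeled_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)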
The data loader (Dataset class) for the labeled data:
def __init__(self, dataset_text, dataset_label, tokenizer, max_seq_len, aug=False):
    self.tokenizer = tokenizer          # BERT tokenizer built in get_data
    self.text = dataset_text            # raw input strings
    self.labels = dataset_label         # 0-based class labels
    self.max_seq_len = max_seq_len      # truncation length for tokenization
    self.aug = aug                      # whether to augment the labeled data
    self.trans_dist = {}                # cache for augmented (translated) examples
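The rest of loader_labeled is not shown in this excerpt. As a Dataset it also needs __len__ and __getitem__, which presumably tokenize, truncate, and pad each example to max_seq_len, roughly like the sketch below (padding with token id 0 is an assumption):

import torch

def __len__(self):
    return len(self.labels)

def __getitem__(self, idx):
    # Tokenize and truncate to max_seq_len
    tokens = self.tokenizer.tokenize(self.text[idx])[:self.max_seq_len]
    length = len(tokens)
    ids = self.tokenizer.convert_tokens_to_ids(tokens)
    # Pad with zeros up to the fixed length (assumption: 0 is the pad id)
    ids += [0] * (self.max_seq_len - len(ids))
    return torch.tensor(ids), self.labels[idx], length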