最详细NER实战讲解-bilstm+crf（6）数据增强

最新推荐文章于 2025-04-17 21:55:50 发布

Lyttonkeepgoing

最新推荐文章于 2025-04-17 21:55:50 发布

阅读量1.2k

点赞数 2

分类专栏： NER实战系列文章标签：深度学习 python 人工智能 lstm nlp

本文链接：https://blog.youkuaiyun.com/m0_53292725/article/details/123796486

版权

NER实战系列专栏收录该内容

7 篇文章

订阅专栏

为什么要做数据增强？提高模型学习各种长短句子的能力

import math
import os
import pickle
import random

import numpy as np
import pandas as pd
from tqdm import tqdm




# 先把所有的数据 拼接 融合在一起 将所有的字全部换成对应的下标
def get_data_with_windows(name='train'):
    """Load every prepared CSV of one split and convert its sentences to ids.

    Reads the pickled lookup tables from ``data/prepare/dict.pkl`` and then
    every file under ``data/prepare/<name>``.  Rows whose ``word`` column is
    ``'sep'`` act as sentence separators; each sentence becomes a list with
    one entry per CSV column, holding the ids of that column's values.

    This snippet only builds the per-file ``result`` list; in the article the
    body of the function continues in the snippets that follow it.

    Args:
        name: Sub-directory of ``data/prepare`` to read (default ``'train'``).
    """
    with open('data/prepare/dict.pkl', 'rb') as f:
        map_dict = pickle.load(f)

    def item2id(data, w2i):
        """Map each item to its id, falling back to the id stored for 'UNK'."""
        return [w2i[x] if x in w2i else w2i['UNK'] for x in data]

    results = []
    root = os.path.join('data/prepare', name)
    files = os.listdir(root)

    for file in tqdm(files):
        result = []
        path = os.path.join(root, file)
        samples = pd.read_csv(path, sep=',')
        num_samples = len(samples)
        # Row indices of the separator rows, framed by -1 and num_samples so
        # that the first and the last sentence also get a (start, end) pair.
        sep_index = [-1] + samples[samples['word'] == 'sep'].index.tolist() + [num_samples]
        # Turn each column into a list once per file instead of once per sentence.
        columns = {feature: list(samples[feature]) for feature in samples.columns}

        # Get the sentences and convert every one of them to ids.
        # Pairing neighbouring boundaries yields len(sep_index) - 1 segments,
        # so the last boundary is never used as a start.
        for prev_sep, next_sep in zip(sep_index, sep_index[1:]):
            start = prev_sep + 1
            end = next_sep
            if start >= end:
                # Adjacent or trailing separators delimit an empty sentence.
                continue
            # One id list per column.  Index 1 of each map_dict entry is used
            # as the item-to-id table -- presumably (id2item, item2id); confirm
            # against the code that wrote dict.pkl.
            data = [
                item2id(columns[feature][start:end], map_dict[feature][1])
                for feature in samples.columns
            ]
            result.append(data)   # one sentence

怎么做数据增强？要怎么拼接？

两个拼接

# Data augmentation
# Join every pair of adjacent sentences, concatenating them feature by feature.
# n sentences yield n - 1 pairs.
two = [
    [left + right for left, right in zip(first, second)]
    for first, second in zip(result, result[1:])
]

三个拼接道理一样

# Join every run of three adjacent sentences, concatenating them feature by
# feature.  n sentences yield n - 2 triples.
three = [
    [a + b + c for a, b, c in zip(first, second, third)]
    for first, second, third in zip(result, result[1:], result[2:])
]

然后把two three result 全部extend到results里面

相当于把长短句全部放到results里面

    # Add the single sentences together with their two- and three-sentence
    # concatenations to the overall list.
    # NOTE(review): `result` is reset for every file, so this presumably runs
    # once per file inside the file loop -- confirm the intended indentation.
    results.extend(result+two+three)
# NOTE(review): this line lost its indentation in the paste; it presumably ends
# get_data_with_windows after the loop over files -- confirm.
return results

results里面就是我们所有拼接的句子

这里返回之后，可以直接把结果写到文件里面去，这样可以避免每次运行都要加载很久的情况，之后直接读文件就可以了。

就改成

# Persist the converted sentences so later runs can load the pickle directly.
output_path = f'data/prepare/' + name + '.pkl'
with open(output_path, 'wb') as out_file:
    pickle.dump(results, out_file)

然后测试一下

if __name__ == '__main__':
    # Quick manual check: build the windows for the training split and show them.
    train_windows = get_data_with_windows('train')
    print(train_windows)

然后定义batch_manager

class BatchManager(object):
    """Serve padded mini-batches built from a pickled list of sentences.

    Each sentence is a list of six equally long feature lists.  Sentences are
    sorted by length, cut into batches of ``batch_size`` and padded with 0 up
    to the longest sentence of their batch.
    """

    def __init__(self, batch_size, name='train'):
        """Load ``data/prepare/<name>.pkl`` and pre-compute all batches.

        Args:
            batch_size: Number of sentences per batch.
            name: Stem of the pickle file to load (default ``'train'``).
        """
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # pickles produced by this project.
        with open(f'data/prepare/{name}.pkl', 'rb') as f:
            data = pickle.load(f)
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size):
        """Sort sentences by length and return them as a list of padded batches.

        Args:
            data: List of sentences; ``len(sentence[0])`` is the sentence length.
            batch_size: Number of sentences per batch.

        Returns:
            A list with ``ceil(len(data) / batch_size)`` entries, each one the
            result of ``pad_data`` for that slice of the sorted sentences.
        """
        num_batch = int(math.ceil(len(data) / batch_size))  # total number of batches
        # Sorting by length keeps similarly sized sentences together, so each
        # batch needs little padding.
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        size = int(batch_size)
        return [
            self.pad_data(sorted_data[i * size:(i + 1) * size])
            for i in range(num_batch)
        ]

    @staticmethod
    def pad_data(data):
        """Pad every feature of every sentence in one batch to a common length.

        Args:
            data: Non-empty list of sentences, each a list of six feature lists.

        Returns:
            ``[chars, bounds, flags, radicals, pinyins, targets]``; each entry
            holds one zero-padded list per sentence.

        Raises:
            ValueError: If ``data`` is empty (``max`` of an empty sequence).
        """
        chars = []
        bounds = []
        flags = []
        radicals = []
        pinyins = []
        targets = []
        max_length = max(len(sentence[0]) for sentence in data)
        for line in data:
            # NOTE(review): this unpacking order must match the order in which
            # the features were stored (the CSV column order) -- confirm.
            char, bound, flag, target, radical, pinyin = line
            padding = [0] * (max_length - len(char))
            chars.append(char + padding)
            bounds.append(bound + padding)
            flags.append(flag + padding)
            targets.append(target + padding)
            radicals.append(radical + padding)
            pinyins.append(pinyin + padding)
        return [chars, bounds, flags, radicals, pinyins, targets]

    def iter_batch(self, shuffle=False):
        """Yield the pre-computed batches one at a time.

        Args:
            shuffle: When true, shuffle the stored batch order in place first.
        """
        if shuffle:
            random.shuffle(self.batch_data)
        yield from self.batch_data   # one batch per iteration

if __name__ == '__main__':
    # Step 1: generate the windowed data for the training split.
    split = 'train'
    get_data_with_windows(split)
    # Step 2 (left disabled in the original): load it in batches, e.g.
    # BatchManager(10, 'train').

生成 train.pkl 之后，再用 BatchManager 读取。

然后对测试集也要做同样的处理，直接把 'train' 改成 'test' 就行了。到此所有的数据准备就完成了，之后只需要构建模型然后训练就行了。