


Seq2Seq in Practice: Chinese-English Translation

1. Reading the Raw Data
from tqdm import tqdm
import pandas as pd
import os
import joblib
import jieba
import opencc
data = pd.read_csv(filepath_or_buffer="data.txt", sep="\t", header=None)
"""
src: source, the input of the first sequence (the English side)
tgt: target, the output of the second sequence (the Chinese side)
"""
data.columns = ["src", "tgt"]
# Read a single row
data.loc[0, :]
# Read a single column
data.loc[:, "tgt"]
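As a quick sanity check, you can look at the shape and the first few rows of the loaded DataFrame. This is a minimal sketch; it assumes data.txt is a tab-separated file with one English sentence and its Chinese translation per line, as implied by the src/tgt columns above.

# Number of sentence pairs and number of columns
print(data.shape)
# Preview the first few (src, tgt) pairs
print(data.head())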
2. Building the Tokenizer
class Tokenizer(object):
    """
    A custom tokenizer that implements the basic features:
    - 1. Build the vocabularies from the input corpus
    - 2. Convert an src sentence into its token ids
    - 3. Convert a tgt sentence into its token ids
    - 4. Convert tgt token ids back into a tgt sentence
    """

    def __init__(self, data_file):
        """
        Initialize the tokenizer
        - By default, build the vocabularies from the input corpus
        """
        self.data_file = data_file
        # Input side: src --> source
        self.src_token2idx = None
        self.src_idx2token = None
        # Output side: tgt --> target
        self.tgt_token2idx = None
        self.tgt_idx2token = None
        # Build the vocabularies
        self._build_dict()

    def _build_dict(self):
        """
        Build the vocabularies
        """
        if self.src_token2idx:
            print("The vocabularies have already been built")
            return
        elif os.path.exists(os.path.join(".cache", "dicts.csblog")):
            print("Loading the vocabularies from cache")
            self.src_token2idx, self.src_idx2token, self.tgt_token2idx, self.tgt_idx2token = joblib.load(filename=os.path.join(".cache", "dicts.csblog"))
            return
        # Build the vocabularies from scratch
        data = pd.read_csv(filepath_or_buffer=self.data_file, sep="\t", header=None)
        data.columns = ["src", "tgt"]
        rows, cols = data.shape
        # Collect tokens, starting from the special tokens
        # (<UNK> unknown, <PAD> padding, <SOS> start-of-sequence, <EOS> end-of-sequence)
        src_tokens = {"<UNK>", "<PAD>", "<SOS>", "<EOS>"}
        tgt_tokens = {"<UNK>", "<PAD>", "<SOS>", "<EOS>"}
        for row_idx in tqdm(range(rows)):
            src, tgt = data.loc[row_idx, :]
            src_tokens.update(set(self.split_english_sentence(src)))
            tgt_tokens.update(set(self.split_chinese_sentence(tgt)))
        # Build the src vocabularies
        self.src_token2idx = {token: idx for idx, token in enumerate(src_tokens)}
        self.src_idx2token = {idx: token for token, idx in self.src_token2idx.items()}
        # Build the tgt vocabularies
        self.tgt_token2idx = {token: idx for idx, token in enumerate(tgt_tokens)}
        self.tgt_idx2token = {idx: token for token, idx in self.tgt_token2idx.items()}
        # Save to cache (make sure the cache directory exists first)
        os.makedirs(".cache", exist_ok=True)
        dicts = [self.src_token2idx, self.src_idx2token, self.tgt_token2idx, self.tgt_idx2token]
        joblib.dump(value=dicts, filename=os.path.join(".cache", "dicts.csblog"))

    def split_english_sentence(self, sentence):
        """
        Split an English sentence into tokens
        """
        sentence = sentence.strip()
        # Lowercase, then tokenize
        tokens = [token for token in jieba.lcut(sentence.lower()) if token not in ("", " ", "'")]
        return tokens

    def split_chinese_sentence(self, sentence):
        """
        Split a Chinese sentence into tokens
        """
        # Instantiate a traditional-to-simplified Chinese converter
        converter = opencc.OpenCC(config="t2s")
        sentence = converter.convert(text=sentence)
        # Tokenize with jieba
        tokens = [token for token in jieba.lcut(sentence) if token not in ["", " "]]
        return tokens

    def __str__(self):
        """
        Return the essential information for printing
        """
        if self.src_token2idx:
            out = f"Tokenizer: [src: {len(self.src_token2idx)}, tgt: {len(self.tgt_token2idx)}]"
        else:
            out = "No vocabularies have been built yet"
        return out
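The class docstring lists sentence-to-id and id-to-sentence conversion (features 2 through 4), but those methods are not shown in this section. Below is a minimal sketch of what they could look like, assuming unknown tokens fall back to <UNK> and the tgt side is wrapped with <SOS>/<EOS>; the method names encode_src, encode_tgt and decode_tgt are illustrative, not from the original code, and would live inside the Tokenizer class.

    # Illustrative methods (hypothetical names), to be placed inside the Tokenizer class
    def encode_src(self, sentence):
        """
        Turn an src (English) sentence into a list of token ids
        """
        tokens = self.split_english_sentence(sentence)
        unk_idx = self.src_token2idx["<UNK>"]
        return [self.src_token2idx.get(token, unk_idx) for token in tokens]

    def encode_tgt(self, sentence):
        """
        Turn a tgt (Chinese) sentence into a list of token ids, wrapped with <SOS>/<EOS>
        """
        tokens = ["<SOS>"] + self.split_chinese_sentence(sentence) + ["<EOS>"]
        unk_idx = self.tgt_token2idx["<UNK>"]
        return [self.tgt_token2idx.get(token, unk_idx) for token in tokens]

    def decode_tgt(self, idxs):
        """
        Turn a list of tgt token ids back into a Chinese sentence, dropping special tokens
        """
        tokens = [self.tgt_idx2token.get(idx, "<UNK>") for idx in idxs]
        tokens = [token for token in tokens if token not in ("<SOS>", "<EOS>", "<PAD>")]
        return "".join(tokens)

With the vocabularies built, the tokenizer can be instantiated and inspected directly:

tokenizer = Tokenizer(data_file="data.txt")
print(tokenizer)  # Tokenizer: [src: ..., tgt: ...]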