0. Preface
Reference: this blogger. I'm writing my own post so it's easier to review the material later.
1. The Movie Review Dataset
Dataset download: https://pan.baidu.com/s/1zultY2ODRFaW3XiQFS-36w
Extraction code: mgh2. The archive contains four files; place the extracted folder in the project directory.
The training split is too large, so I use the test split for training instead.
2. Loading the Data
import pandas as pd
# load the data (each line: label <tab> review)
train_data = pd.read_csv('./Dataset/test.txt', names=['label', 'review'], sep='\t')
train_labels = train_data['label']
train_reviews = train_data['review']
The training data contains 369 reviews in total.
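A quick sanity check that the file loaded as expected (a minimal sketch using the DataFrame built above):
print(train_data.shape)   # expect (369, 2): 369 rows, columns label and review
print(train_data.head())  # peek at the first few samples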
# word count per review (reviews are pre-tokenized, with spaces between words)
comments_len = train_data.iloc[:, 1].apply(lambda x: len(x.split(' ')))
print(comments_len)
train_data['comments_len'] = comments_len
print(train_data['comments_len'].describe(percentiles=[.5, .95]))
train_data.iloc[:, 1].apply(lambda x: len(x.split(' '))) applies the lambda to every entry in the second column of train_data (index 1, i.e. review) and returns the number of words in each review.
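For a single (hypothetical) pre-tokenized review, the word count is simply the number of space-separated tokens:
len('这部 电影 真 不错'.split(' '))  # -> 4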
The output shows that 95% of the reviews contain at most 63 words, so we cap each review at max_sent = 63 words: longer reviews are truncated, and shorter ones are padded.
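Rather than hard-coding the cutoff, max_sent can equally be derived from the 95th percentile computed above (a small sketch; Series.quantile is standard pandas):
# the 95th percentile of the word counts, 63 on this dataset
max_sent = int(train_data['comments_len'].quantile(0.95))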
3. Data Preprocessing
from collections import Counter

max_sent = 63  # maximum words per review, the 95th-percentile cutoff from section 2

def text_process(review):
    """
    Data preprocessing.
    :param review: the review data train_reviews
    :return: vocabulary words, word-to-id dict word2id,
             id-to-word dict id2word, padded id sentences pad_sentencesid
    """
    words = []
    for i in range(len(review)):
        words += review[i].split(' ')
    # select the higher-frequency words and save them to word_freq.txt
    with open('./Dataset/word_freq.txt', 'w', encoding='utf-8') as f:
        # Counter(words).most_common() lists words by descending frequency
        # (with no argument it returns every word together with its count)
        for word, freq in Counter(words).most_common():
            if freq > 1:  # keep only words that appear more than once
                f.write(word + '\n')
    # read the words back out
    with open('./Dataset/word_freq.txt', encoding='utf-8') as f:
        words = [i.strip() for i in f]
    # deduplicate (this is the vocabulary)
    words = list(set(words))
    # word-to-id dict word2id
    word2id = {j: i for i, j in enumerate(words)}
    # id-to-word dict id2word
    id2word = {i: j for i, j in enumerate(words)}
    pad_id = word2id['把']  # id of a neutral word, used for padding
    sentences = [i.split(' ') for i in review]
    # all sentences after padding/truncation, each word replaced by its id
    pad_sentencesid = []
    for i in sentences:
        # words missing from the vocabulary fall back to pad_id;
        # words in the vocabulary map to their own id
        temp = [word2id.get(j, pad_id) for j in i]
        # if the sentence has more than max_sent words, truncate the tail
        if len(i) > max_sent:
            temp = temp[:max_sent]
        else:  # if it has fewer than max_sent words, pad with pad_id
            for j in range(max_sent - len(i)):
                temp.append(pad_id)
        pad_sentencesid.append(temp)
    return words, word2id, id2word, pad_sentencesid
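A minimal usage sketch (the variable names follow the code above; decoding the first sentence back into words is just a round-trip check):
words, word2id, id2word, pad_sentencesid = text_process(train_reviews)
print(len(words))               # vocabulary size
print(len(pad_sentencesid[0]))  # every sentence now holds exactly max_sent ids
# decode the first padded sentence back into words
print(' '.join(id2word[i] for i in pad_sentencesid[0]))
Note that reusing a real word ('把') both as padding and as the out-of-vocabulary fallback is a shortcut; a common alternative is to reserve dedicated <PAD> and <UNK> ids so the model can distinguish padding from actual content.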