一、15.py相邻的两个情感词之间的词语已经进行判断,不需要重复判断for改为while
1、由于已经对相邻的两个情感词中间的词语进行了判断,所以不需要重复判断
2、for循环会循规蹈矩地重复判断,所以我们使用while循环
3、问题:分数在不停地叠加,负值较小、正值较大时就会发生抵消,导致结果最终是正值,怎么避免抵消?
def score_sentiment(sen_word, not_word, degree_word, seg_result):
    """Score a segmented sentence with a sentiment lexicon.

    Args:
        sen_word: dict {position: sentiment score (str/float)} — sentiment words.
        not_word: dict {position: polarity (str/float)} — negation words.
        degree_word: dict {position: multiplier (str/float)} — degree adverbs.
        seg_result: the segmented word list; positions index into it.

    Returns:
        The accumulated sentiment score (positive ⇒ positive sentiment).
    """
    # Running weight applied to the NEXT sentiment word.
    w = 1
    score = 0
    sentiment_index = -1
    # Positions of sentiment words, in sentence order (dict preserves insertion).
    sentiment_index_list = list(sen_word.keys())
    # Apply negation/degree words that appear BEFORE the first sentiment word.
    # Fix: the original guard was `> 1`, which silently skipped this step when
    # the sentence contained exactly one sentiment word.
    if len(sentiment_index_list) > 0:
        for j in range(sentiment_index_list[0]):
            if j in not_word:
                w *= float(not_word[j])  # negation polarity is -1 by convention
            elif j in degree_word:
                w *= float(degree_word[j])
    i = 0
    while i < len(seg_result):
        if i in sen_word:
            # abs(): polarity is carried entirely by w (negations), not by the
            # lexicon value itself.
            score += w * abs(float(sen_word[i]))
            # Reset: each weight applies only to its own sentiment word.
            w = 1.0
            sentiment_index += 1
            if sentiment_index < len(sentiment_index_list) - 1:
                # Scan the words between this sentiment word and the next one.
                for j in range(sentiment_index_list[sentiment_index], sentiment_index_list[sentiment_index + 1]):
                    if j in not_word:
                        # Fix: a negation must flip the NEXT sentiment word's
                        # weight. The original multiplied the accumulated score
                        # instead, flipping the sign of everything scored so far
                        # (the cancellation problem described in the notes).
                        w *= float(not_word[j])
                    elif j in degree_word:
                        w *= float(degree_word[j])
                # Jump straight to the next sentiment word — the words in
                # between were just handled, no need to revisit them.
                i = sentiment_index_list[sentiment_index + 1]
            else:
                i += 1
        elif i in not_word:
            # Negation after the last sentiment word: flips the total.
            score *= float(not_word[i])
            i += 1
        elif i in degree_word:
            score *= float(degree_word[i])
            i += 1
        else:
            i += 1
    return score
二、16.py扩展否定词列表,先判断否定词,再判断情感词
1、很多负面的句子被辨别为正面,是因为没有识别到否定词,所以尝试对否定词列表进行扩展,查看最终的输出效果
2、由于先判断情感词,再判断否定词,导致"太少"属于情感词,而不计入否定词,于是调换顺序,先判断否定词
3、由于最开始找到否定词时w为负,而情感词的分值本身也为负,负负得正,导致输出的分数为正,于是修改情感词的分值处理(取绝对值)
import csv
from collections import defaultdict
import jieba
def classify_words(word_list):
    """Classify each token of word_list as sentiment / negation / degree word.

    Checks the negation lexicon FIRST (the point of this 16.py revision), so a
    word present in both lexicons counts as a negation, not a sentiment word.

    Args:
        word_list: segmented token list.

    Returns:
        (sen_word, not_word, degree_word): three dicts keyed by token position,
        valued with the raw lexicon string for that word.
    """
    # Sentiment lexicon: "word score" per line; skip malformed lines.
    sen_dict = defaultdict()
    with open('BosonNLP_sentiment_score.txt', 'r', encoding='utf-8') as sen_file:
        for line in sen_file:
            parts = line.split(' ')
            if len(parts) == 2:
                sen_dict[parts[0]] = parts[1]
    # Negation lexicon: "word,polarity" per line (polarity is -1 by convention).
    not_word_dict = defaultdict()
    with open('否定词_degree.txt', 'r', encoding='utf-8') as not_word_file:
        for line in not_word_file:
            parts = line.split(',')
            not_word_dict[parts[0]] = parts[1]
    # Degree-adverb lexicon: "word,multiplier" per line.
    degree_dict = defaultdict()
    with open('程度副词.txt', 'r', encoding='utf-8') as degree_file:
        for line in degree_file:
            parts = line.split(',')
            degree_dict[parts[0]] = parts[1]
    sen_word = dict()
    not_word = dict()
    degree_word = dict()
    # Classify each position; a word matching none of the lexicons is ignored.
    for i, word in enumerate(word_list):
        if word in not_word_dict:
            # Negation takes priority over the other two categories.
            not_word[i] = not_word_dict[word]
        elif word in sen_dict and word not in degree_dict:
            # Sentiment word that is neither a negation nor a degree adverb.
            sen_word[i] = sen_dict[word]
        elif word in degree_dict:
            degree_word[i] = degree_dict[word]
    print('否定词:', not_word)
    return sen_word, not_word, degree_word
三、17.py结合15,16对整个训练集进行测试
import csv
from collections import defaultdict
import jieba
def seg_word(sentence):
    """Segment sentence with jieba and drop stopwords.

    Args:
        sentence: raw Chinese text.

    Returns:
        List of tokens with stopwords removed (also printed, as before).
    """
    # Materialize jieba's generator into a token list.
    seg_result = list(jieba.cut(sentence))
    # Stopword set is re-read per call; acceptable for this experiment script.
    stopwords = set()
    with open('stopwords.txt', 'r', encoding='utf-8') as fr:
        for line in fr:
            stopwords.add(line.strip())
    # Fix: the original built the filtered list twice (once to print, once to
    # return); compute it a single time.
    filtered = [tok for tok in seg_result if tok not in stopwords]
    print(filtered)
    return filtered
def classify_words(word_list):
    """Classify each token of word_list as sentiment / negation / degree word.

    Negation lexicon is checked first, so overlapping words count as negations.

    Args:
        word_list: segmented token list.

    Returns:
        (sen_word, not_word, degree_word): dicts keyed by token position,
        valued with the raw lexicon string for that word.
    """
    # Sentiment lexicon: "word score" per line; skip malformed lines.
    sen_dict = defaultdict()
    with open('BosonNLP_sentiment_score.txt', 'r', encoding='utf-8') as sen_file:
        for line in sen_file:
            parts = line.split(' ')
            if len(parts) == 2:
                sen_dict[parts[0]] = parts[1]
    # Negation lexicon: "word,polarity" per line.
    not_word_dict = defaultdict()
    with open('否定词_degree.txt', 'r', encoding='utf-8') as not_word_file:
        for line in not_word_file:
            parts = line.split(',')
            not_word_dict[parts[0]] = parts[1]
    # Degree-adverb lexicon: "word,multiplier" per line.
    degree_dict = defaultdict()
    with open('程度副词.txt', 'r', encoding='utf-8') as degree_file:
        for line in degree_file:
            parts = line.split(',')
            degree_dict[parts[0]] = parts[1]
    sen_word = dict()
    not_word = dict()
    degree_word = dict()
    # Classify each position; unknown words are simply skipped.
    for i, word in enumerate(word_list):
        if word in not_word_dict:
            not_word[i] = not_word_dict[word]
        elif word in sen_dict and word not in degree_dict:
            sen_word[i] = sen_dict[word]
        elif word in degree_dict:
            degree_word[i] = degree_dict[word]
    # Returns (sentiment words, negation words, degree adverbs) by position.
    # print('情感词:', sen_word)
    # print('否定词:', not_word)
    # print('程度词:', degree_word)
    return sen_word, not_word, degree_word
def score_sentiment(sen_word, not_word, degree_word, seg_result):
    """Accumulate a sentiment score over a segmented sentence.

    sen_word / not_word / degree_word map token positions to lexicon strings;
    seg_result is the token list those positions index. Negations found between
    two sentiment words multiply the running total directly (this is the 17.py
    behavior whose metrics are recorded below), while degree adverbs scale the
    weight of the next sentiment word.
    """
    weight = 1
    total = 0
    cursor = -1
    # Sentiment-word positions in sentence order.
    positions = list(sen_word.keys())
    # Pre-first-sentiment modifiers are only applied when there is more than
    # one sentiment word (kept as-is from the original).
    if len(positions) > 1:
        for idx in range(positions[0]):
            if idx in not_word:
                weight *= float(not_word[idx])
            elif idx in degree_word:
                weight *= float(degree_word[idx])
    pos = 0
    limit = len(seg_result)
    while pos < limit:
        if pos in sen_word:
            # Polarity comes from the modifiers; the lexicon value is abs'd.
            total += weight * abs(float(sen_word[pos]))
            weight = 1.0
            cursor += 1
            if cursor < len(positions) - 1:
                # Scan modifiers up to the next sentiment word.
                for idx in range(positions[cursor], positions[cursor + 1]):
                    if idx in not_word:
                        # Negation multiplies the accumulated total here.
                        total *= float(not_word[idx])
                    elif idx in degree_word:
                        weight *= float(degree_word[idx])
                pos = positions[cursor + 1]
            else:
                pos += 1
        elif pos in not_word:
            total *= float(not_word[pos])
            pos += 1
        elif pos in degree_word:
            total *= float(degree_word[pos])
            pos += 1
        else:
            pos += 1
    return total
# Pipeline entry point: compute the sentiment score of one short text.
def sentiment_score(sentence):
    """Segment the sentence, classify its tokens, and return the score."""
    tokens = seg_word(sentence)
    sen, neg, deg = classify_words(tokens)
    return score_sentiment(sen, neg, deg, tokens)
# # 1、读取原文件
# file_origin = r'测试短篇章读取.csv'
# # 2、创建文件对象
# file_newcreate = r'newtest_测试短篇章读取.csv'
# Computes recall, precision and F1 of the scorer over a labeled file.
def caculate_accuracy(file_origin, file_newcreate):
    """Predict labels for file_origin, write them to file_newcreate, and
    compute recall / precision / F1 against the gold labels.

    Args:
        file_origin: csv/tsv with columns 'text_a' and 'label'.
        file_newcreate: output csv path for text/label/prediction rows.

    Returns:
        (recall, precision, f1) — also printed.

    Raises:
        ZeroDivisionError: if the file contains no positive gold labels or no
        positive predictions (unchanged from the original behavior).
    """
    # Fix: newline='' is required by the csv module when writing, otherwise
    # blank rows appear on Windows; `with` closes the handle on any exception.
    with open(file_newcreate, 'w', encoding='utf-8', newline='') as f:
        csv_write = csv.writer(f)
        csv_write.writerow(['text_a', 'label', 'test_label'])
        with open(file_origin, 'r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                text_a = row['text_a']
                label = row['label']
                # Binarize the raw score: positive score ⇒ label 1.
                test_label = 1 if sentiment_score(text_a) > 0 else 0
                csv_write.writerow([text_a, label, test_label])
    # Confusion-matrix counts: gold label vs predicted label.
    pos_pos = 0  # gold positive, predicted positive
    neg_neg = 0  # gold negative, predicted negative
    pos_neg = 0  # gold positive, predicted negative
    neg_pos = 0  # gold negative, predicted positive
    with open(file_newcreate, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            label = row['label']
            test_label = row['test_label']
            if label == '1' and test_label == '1':
                pos_pos += 1
            elif label == '0' and test_label == '0':
                neg_neg += 1
            elif label == '1' and test_label == '0':
                pos_neg += 1
            else:
                neg_pos += 1
    recall = pos_pos / (pos_pos + pos_neg)
    precision = pos_pos / (pos_pos + neg_pos)
    f1 = 2 * (precision * recall) / (precision + recall)
    print("召回率:", recall)
    print("精确率:", precision)
    print("F1:", f1)
    return recall, precision, f1
if __name__ == "__main__":
    # Evaluate the lexicon scorer on the labeled training set and print
    # the (recall, precision, f1) tuple.
    metrics = caculate_accuracy(r'chnsenticorp_train.tsv', r'new_17chnsenticorp_train.tsv')
    print(metrics)
输出结果
# 召回率: 0.7062159214830971
# 精确率: 0.5359152598477326
# F1: 0.6093911734261787
四、18.py在情感词计算分数中,变换权重值
def score_sentiment(sen_word, not_word, degree_word, seg_result):
    """18.py experiment: like 15.py, but a negation between two sentiment
    words scales the next word's weight by polarity * 100 to keep large
    negative contributions from being cancelled by accumulated positives.

    sen_word / not_word / degree_word map token positions to lexicon strings;
    seg_result is the token list those positions index. Returns the score.
    """
    # Weight applied to the next sentiment word; starts neutral.
    w = 1
    score = 0
    # Index into the list of sentiment-word positions.
    sentiment_index = -1
    # Sentiment-word positions in sentence order, e.g. [1, 3].
    sentiment_index_list = list(sen_word.keys())
    # Apply modifiers in front of the first sentiment word (only when there is
    # more than one sentiment word — NOTE(review): a single-sentiment-word
    # sentence skips this; see the >1 guard discussion in the 15.py notes).
    if len(sentiment_index_list) > 1:
        for j in range(0, sentiment_index_list[0]):
            if j in not_word.keys():
                w = w * float(not_word[j])
            elif j in degree_word.keys():
                w = w * float(degree_word[j])
    i = 0
    while i < len(seg_result):
        # Sentiment word: add its (absolute) value scaled by the weight.
        if i in sen_word.keys():
            score = score + w * abs(float(sen_word[i]))
            w = 1.0
            # Advance to the next sentiment-word position.
            sentiment_index += 1
            if sentiment_index < len(sentiment_index_list) - 1:
                # Check for negations / degree adverbs between this sentiment
                # word and the next one.
                for j in range(sentiment_index_list[sentiment_index], sentiment_index_list[sentiment_index + 1]):
                    if j in not_word.keys():
                        # Experimental: amplify the negated weight 100x so the
                        # negative contribution dominates (anti-cancellation).
                        w = w * float(not_word[j]) * 100
                    elif j in degree_word.keys():
                        w = w * float(degree_word[j])
                # Jump directly to the next sentiment word.
                i = sentiment_index_list[sentiment_index + 1]
            else:
                i += 1
        elif i in not_word:
            # Trailing negation multiplies the accumulated total.
            score = score * (float(not_word[i]))
            i += 1
        elif i in degree_word:
            score = score * (float(degree_word[i]))
            i += 1
        else:
            i += 1
    return score
五、19.py根据测试集demo,进行预估评判每个短文本的情感倾向
核心代码:
def create_newfile(filename, new_filename):
    """Predict a 0/1 sentiment label for every row of the test set and write
    qid/text/prediction rows to new_filename.

    Args:
        filename: input csv with 'qid' and 'text_a' columns.
        new_filename: output csv path.
    """
    # Fix: newline='' avoids the blank rows csv.writer produces on Windows;
    # `with` guarantees the handle is closed even if a row raises.
    with open(new_filename, 'w', encoding='utf-8', newline='') as f:
        csv_write = csv.writer(f)
        # Column header of the output file.
        csv_write.writerow(['qid', 'text_a', 'test_label'])
        with open(filename, 'r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                print(row)
                qid = row['qid']
                text_a = row['text_a']
                # Binarize: positive raw score ⇒ label 1, otherwise 0.
                test_label = 1 if sentiment_score(text_a) > 0 else 0
                csv_write.writerow([qid, text_a, test_label])
if __name__ == "__main__":
    # Label the raw test file and write the predictions to a new csv.
    tmp = create_newfile(r'testdata.csv', r'testdata_new.csv')
整体代码:
# 根据目前精确率最高的14.py进行对测试集的情感预测,输出 序号,文本,情感值
import csv
from collections import defaultdict
import jieba
def seg_word(sentence):
    """Cut sentence into tokens with jieba and filter out stopwords."""
    # jieba.cut returns a generator; materialize it.
    tokens = [tok for tok in jieba.cut(sentence)]
    # Load the stopword set (one word per line).
    with open('stopwords.txt', 'r', encoding='utf-8') as handle:
        stopwords = {line.strip() for line in handle}
    # print('情感列表:', [tok for tok in tokens if tok not in stopwords])
    return [tok for tok in tokens if tok not in stopwords]
def classify_words(word_list):
    """Classify each token of word_list as sentiment / negation / degree word.

    NOTE: unlike 16.py/17.py, this 19.py variant checks the SENTIMENT lexicon
    first; a word in both the sentiment and negation lexicons is excluded from
    sentiment and falls through to negation.

    Args:
        word_list: segmented token list.

    Returns:
        (sen_word, not_word, degree_word): dicts keyed by token position,
        valued with the raw lexicon string for that word.
    """
    # Sentiment lexicon: "word score" per line; skip malformed lines.
    sen_dict = defaultdict()
    with open('BosonNLP_sentiment_score.txt', 'r', encoding='utf-8') as sen_file:
        for line in sen_file:
            parts = line.split(' ')
            if len(parts) == 2:
                sen_dict[parts[0]] = parts[1]
    # Negation lexicon: "word,polarity" per line.
    not_word_dict = defaultdict()
    with open('否定词_degree.txt', 'r', encoding='utf-8') as not_word_file:
        for line in not_word_file:
            parts = line.split(',')
            not_word_dict[parts[0]] = parts[1]
    # Degree-adverb lexicon: "word,multiplier" per line.
    degree_dict = defaultdict()
    with open('程度副词.txt', 'r', encoding='utf-8') as degree_file:
        for line in degree_file:
            parts = line.split(',')
            degree_dict[parts[0]] = parts[1]
    sen_word = dict()     # sentiment words by position
    not_word = dict()     # negation words by position
    degree_word = dict()  # degree adverbs by position
    for i, word in enumerate(word_list):
        # Sentiment only when the word is in neither of the other lexicons.
        if word in sen_dict and word not in not_word_dict and word not in degree_dict:
            sen_word[i] = sen_dict[word]
        elif word in not_word_dict and word not in degree_dict:
            not_word[i] = not_word_dict[word]
        elif word in degree_dict:
            degree_word[i] = degree_dict[word]
    # print('情感词:', sen_word)
    # print('否定词:', not_word)
    # print('程度词:', degree_word)
    return sen_word, not_word, degree_word
def score_sentiment(sen_word, not_word, degree_word, seg_result):
    """Score a segmented sentence (19.py variant, signed lexicon values).

    Args:
        sen_word: dict {position: sentiment score (str/float)}.
        not_word: dict {position: polarity (str/float)}.
        degree_word: dict {position: multiplier (str/float)}.
        seg_result: segmented token list indexed by the positions above.

    Returns:
        The accumulated sentiment score.
    """
    # Weight applied to the next sentiment word.
    w = 1
    score = 0
    sentiment_index = -1
    # Sentiment-word positions in sentence order.
    sentiment_index_list = list(sen_word.keys())
    # Apply negation/degree words in front of the first sentiment word.
    # Fix: guard was `> 1`, so a sentence with exactly one sentiment word
    # ignored any modifiers preceding it.
    if len(sentiment_index_list) > 0:
        for j in range(sentiment_index_list[0]):
            if j in not_word:
                w *= float(not_word[j])  # negation polarity is -1 by convention
            elif j in degree_word:
                w *= float(degree_word[j])
    for i in range(len(seg_result)):
        if i in sen_word:
            # Signed lexicon value here (no abs, unlike 15.py/18.py).
            score += w * float(sen_word[i])
            w = 1.0
            sentiment_index += 1
            if sentiment_index < len(sentiment_index_list) - 1:
                # Accumulate modifiers between this sentiment word and the next.
                for j in range(sentiment_index_list[sentiment_index], sentiment_index_list[sentiment_index + 1]):
                    if j in not_word:
                        w *= float(not_word[j])
                    elif j in degree_word:
                        w *= float(degree_word[j])
                # Fix: the original then did `i = sentiment_index_list[...]`,
                # which is a no-op — the for statement rebinds i on every
                # iteration. Removed as dead code; skipped positions have no
                # other handling, so behavior is unchanged. The duplicated
                # bounds check was merged into the single `if` above.
    return score
# Full pipeline for a single piece of text.
def sentiment_score(sentence):
    """Return the lexicon-based sentiment score of sentence."""
    tokens = seg_word(sentence)
    print(tokens)
    sen, neg, deg = classify_words(tokens)
    return score_sentiment(sen, neg, deg, tokens)
def create_newfile(filename, new_filename):
    """Predict a 0/1 sentiment label for each test row and write the results.

    Args:
        filename: input csv with 'qid' and 'text_a' columns.
        new_filename: output csv path receiving qid/text/prediction rows.
    """
    # Fix: newline='' is required when writing with csv.writer (blank rows on
    # Windows otherwise); `with` closes the file even when a row raises.
    with open(new_filename, 'w', encoding='utf-8', newline='') as f:
        csv_write = csv.writer(f)
        # Column header of the output file.
        csv_write.writerow(['qid', 'text_a', 'test_label'])
        with open(filename, 'r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                print(row)
                qid = row['qid']
                text_a = row['text_a']
                # Binarize: positive raw score ⇒ label 1, otherwise 0.
                test_label = 1 if sentiment_score(text_a) > 0 else 0
                csv_write.writerow([qid, text_a, test_label])
if __name__ == "__main__":
    # Predict labels for the raw test set and store them in a new file.
    tmp = create_newfile(r'testdata.csv', r'testdata_new.csv')
后续工作:验证18.py的准确率,召回率,F1等信息