import jieba
import pandas as pd
# 1. Load the labelled comment data (columns used later: '内容 ' and '评价').
data = pd.read_csv(r'data.csv', engine='python')
# print(data.info())
# 2. Load the stop-word list, one word per line.
with open(r'stopwords.txt', 'r', encoding='utf-8') as fp:
    stop_words = fp.readlines()
# 3. Normalize stop words: strip surrounding whitespace and deduplicate.
#    Kept as a set (not a list) so the per-token membership test in the
#    filtering loop below is O(1) instead of O(n).
stop_words_list = {word.strip() for word in stop_words}
# 4. Segment each comment with jieba and drop stop words.
comment_list = data['内容 ']
labels = data['评价']
# Space-joined segmented text for every comment. The tokens MUST be joined
# with spaces (not concatenated directly): CountVectorizer tokenizes on
# word boundaries, and gluing the words back together would collapse each
# comment into one unsegmented run of characters it cannot split.
comment_cut_list = []
for comment in comment_list:
    # Precise mode: no overlapping segments.
    seg_list = jieba.cut(comment, cut_all=False)
    # Keep only tokens that are not stop words.
    kept_tokens = [seg for seg in seg_list if seg not in stop_words_list]
    comment_cut_list.append(' '.join(kept_tokens))
# 5. Turn the cleaned comments into a bag-of-words count matrix.
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vocabulary and count term frequencies in one pass.
vectorizer = CountVectorizer()
sparse_counts = vectorizer.fit_transform(comment_cut_list)
print(sparse_counts)
# Densify for the slicing-based split below.
X = sparse_counts.toarray()
print(X)
# 6. Encode labels: '好评' (positive review) -> 1, anything else -> 0.
# Series.map applies the function elementwise directly; Series.transform
# first tries to call the function on the whole Series — which raises on
# the ambiguous `if Series` truth test — before falling back, so map is
# the correct tool here and produces the identical result.
y = labels.map(lambda label: 1 if label == '好评' else 0).values
print(y)
# 7. Split into train/test sets: first TRAIN_SIZE rows train, the rest test.
TRAIN_SIZE = 10
X_train, X_test = X[:TRAIN_SIZE], X[TRAIN_SIZE:]   # feature rows
y_train, y_test = y[:TRAIN_SIZE], y[TRAIN_SIZE:]   # matching labels
# 8. Naive Bayes — the multinomial variant suits word-count features.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
# Learn per-class word-frequency distributions from the training rows.
clf.fit(X_train, y_train)
# Show predicted labels for the held-out rows.
print(clf.predict(X_test))