# 朴素贝叶斯
# 使用朴素贝叶斯算法对句子进行分类;每个句子是对一件商品的评价(好评或差评)。
import pandas as pd
import numpy as np
import jieba
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Load the review dataset from a GBK-encoded CSV file.
data = pd.read_csv("文件路径.csv", encoding="gbk")

# Basic preprocessing: keep the review-text column for segmentation later.
content = data["内容"]

# Add a numeric label column "评价标号": 1 where the rating is "好评",
# 0 where it is "差评". Rows with any other rating are not assigned here.
for label_text, label_id in (("好评", 1), ("差评", 0)):
    data.loc[data["评价"] == label_text, "评价标号"] = label_id

# NOTE: the classification targets are the raw string ratings from the
# "评价" column, not the numeric "评价标号" column created above.
good_or_bad = data["评价"].values
# Load the stop-word list: one entry per line, surrounding whitespace removed.
with open('停用词路径.txt', 'r', encoding='utf-8') as f:
    # Iterate the file lazily instead of materialising it with readlines().
    # The set comprehension de-duplicates while reading, and blank lines are
    # skipped so that an empty string never ends up in the stop-word list.
    unique_stopwords = {word for word in (raw.strip() for raw in f) if word}

# Keep the result as a list, matching the type the original script produced.
stopwords = list(unique_stopwords)
# Normalise the review texts: segment each one with jieba (cut_all=False)
# and join the resulting tokens with commas, giving one string per review.
comment_list = [
    ','.join(jieba.cut(text, cut_all=False))
    for text in content
]
# Count word occurrences: CountVectorizer turns the segmented reviews into a
# document-term frequency matrix, ignoring the configured stop words.
con = CountVectorizer(stop_words=stopwords)
x = con.fit_transform(comment_list)

# Retrieve the vocabulary (one entry per matrix column).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old versions.
# .tolist() keeps `name` a plain list of str, as the old API returned.
if hasattr(con, "get_feature_names_out"):
    name = con.get_feature_names_out().tolist()
else:
    name = con.get_feature_names()

print(x.toarray())
print(name)
# Number of leading rows used for training; the remainder is the test set.
train_size = 10

# Densify the sparse count matrix once and slice it for both subsets,
# instead of calling toarray() separately for each slice.
x_dense = x.toarray()

# Training set: the first `train_size` reviews and their labels.
x_train = x_dense[:train_size, :]
y_train = good_or_bad[:train_size]

# Test set: every review after the first `train_size`.
x_test = x_dense[train_size:, :]
y_test = good_or_bad[train_size:]
# Model training: build a multinomial Naive Bayes classifier (alpha=1).
mb = MultinomialNB(alpha=1)

# Fit on the training split.
mb.fit(x_train, y_train)

# Predict labels for the test split.
y_predict = mb.predict(x_test)

# Show predictions next to the true labels.
print('预测值为:', y_predict)
print('真实值:', y_test)

# Model evaluation: the bare `mb.score(...)` expression discarded its result
# when run as a script, so print the score explicitly.
print('准确率:', mb.score(x_test, y_test))