1. Download the data
The data comes from Sohu News. Download link:
Link: https://pan.baidu.com/s/1St_s7CRPeghF2Z8MuVG56w
Extraction code: yd1l
2. A quick look at the data
import numpy as np
import pandas as pd
train_data = pd.read_csv('sohu_train.txt', sep='\t', header=None, dtype=np.str_, encoding='utf8', names=[u'频道', u'文章'])
train_data.head()
The first five rows of the training set are as follows:
频道 文章
0 娱乐 《青蛇》造型师默认新《红楼梦》额妆抄袭(图) 凡是看过电影《青蛇》的人,都不会忘记青白二蛇的...
1 娱乐 6.16日剧榜 <最后的朋友> 亮最后杀招成功登顶 《最后的朋友》本周的电视剧排行榜单依然只...
2 娱乐 超乎想象的好看《纳尼亚传奇2:凯斯宾王子》 现时资讯如此发达,搜狐电影评审团几乎人人在没有看...
3 娱乐 吴宇森:赤壁大战不会出现在上集 “希望《赤壁》能给你们不一样的感觉。”对于自己刚刚拍完的影片...
4 娱乐 组图:《多情女人痴情男》陈浩民现场耍宝 陈浩民:外面的朋友大家好,现在是搜狐现场直播,欢迎《...
Count the number of articles per channel:
train_data.groupby(u'频道')[u'频道'].count()
Per-class statistics: 12 classes in total, 24,000 articles.
频道
体育 2000
健康 2000
女人 2000
娱乐 2000
房地产 2000
教育 2000
文化 2000
新闻 2000
旅游 2000
汽车 2000
科技 2000
财经 2000
Name: 频道, dtype: int64
Check the shortest and longest article length (in characters) per channel:
train_data[u'文章长度'] = train_data[u'文章'].apply(len)
train_data.groupby(u'频道')[u'文章长度'].agg([np.min, np.max])
The output is as follows:
amin amax
频道
体育 1619 88185
健康 655 8267
女人 1141 24348
娱乐 1290 20826
房地产 2923 36993
教育 890 20017
文化 96 12368
新闻 3095 42501
旅游 685 10557
汽车 975 17735
科技 1130 29527
财经 2917 76901
Load the test data:
test_data = pd.read_csv('data/sohu_test.txt', sep='\t', header=None, dtype=np.str_, encoding='utf8', names=[u'频道', u'文章'])
test_data.head()
The output is as follows:
频道 文章
0 娱乐 组图:黄健翔拍时装大片 承认口无遮拦 2006年之前,他只是最好的体育节目主持人之一。200...
1 娱乐 奥运明星写真集锦曝光 展现健康时尚(图) 来源:人民网奥运明星奥运明星大满贯――属于最强者的...
2 娱乐 内地票房榜:《功夫熊猫》获全胜 带动内地影市 《功夫熊猫》首映周末逼4000万2006年1月...
3 娱乐 编者按: 昨天,央视紧急停播动画片《虹猫蓝兔七侠传》事件经报道后,引发了数十万网民的热烈大辩...
4 娱乐 第十一届上海国际电影节 金爵奖评委名单 [点击图片进入下一页]金爵奖评委陈冲陈冲(美籍华裔女...
Count the number of articles per channel:
test_data.groupby(u'频道')[u'频道'].count()
The output is as follows:
频道
体育 1000
健康 1000
女人 1000
娱乐 1000
房地产 1000
教育 1000
文化 1000
新闻 1000
旅游 1000
汽车 1000
科技 1000
财经 1000
Name: 频道, dtype: int64
Check the shortest and longest article length per channel:
test_data[u'文章长度'] = test_data[u'文章'].apply(len)
test_data.groupby(u'频道')[u'文章长度'].agg([np.min, np.max])
The output is as follows:
amin amax
频道
体育 2923 36993
健康 685 10557
女人 1620 88185
娱乐 1290 20826
房地产 2917 67608
教育 685 10557
文化 1619 20075
新闻 1130 29527
旅游 2917 76901
汽车 1130 14255
科技 2923 31332
财经 1290 19824
3. Note: this post builds on the previous article https://blog.youkuaiyun.com/m0_38088359/article/details/83004972, wrapping all of its functions into classes, adjusting the parameters, and applying them to classify this Sohu news dataset.
The encapsulated code is as follows:
Step 1: imports
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import tensorflow.contrib.keras as kr
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import pandas as pd
from sklearn.metrics import confusion_matrix,precision_recall_fscore_support
import random
Step 2: define the network hyperparameters
class TextConfig():
    def __init__(self, vocab_size=5000, seq_length=600, embedding_dim=64, num_filters=256, kernel_size=5,
                 hidden_dim=128, keep_prob=0.5, learning_rate=1e-3, batch_size=32, num_iteration=5000):
        self.vocab_size = vocab_size
        self.seq_length = seq_length
        self.embedding_dim = embedding_dim
        self.num_filters = num_filters
        self.kernel_size = kernel_size
        self.hidden_dim = hidden_dim
        self.keep_prob = keep_prob
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_iteration = num_iteration
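Since every parameter has a default, constructing a config is a one-liner. As a minimal illustration (the override values here are arbitrary, not tuned):

config = TextConfig(seq_length=300, num_iteration=2000)  # override two hyperparameters
print(config.vocab_size, config.seq_length)              # -> 5000 300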
Step 3: build the network class, including training and evaluation methods.
class textClassification():
    def __init__(self, textConfig, content, label=None, train_list=None, train_label=None, test_list=None, test_label=None):
        self.textConfig = textConfig
        if train_list is not None and train_label is not None and test_list is not None and test_label is not None:
            self.train_list = train_list
            self.train_label = train_label
            self.test_list = test_list
            self.test_label = test_label
            self.content_list = content
            self.label_list = label if label is not None else self.train_label + self.test_label
        elif content is not None and label is not None:
            self.train_list, self.test_list, self.train_label, self.test_label = train_test_split(content, label)
            self.content_list = content
            self.label_list = label
        else:
            raise ValueError('Invalid input: pass either (content, label) or explicit train/test lists and labels.')
    def Config(self):
        self.vocab_size = self.textConfig.vocab_size
        self.seq_length = self.textConfig.seq_length
        self.embedding_dim = self.textConfig.embedding_dim
        self.num_filters = self.textConfig.num_filters
        self.kernel_size = self.textConfig.kernel_size
        self.hidden_dim = self.textConfig.hidden_dim
        self.keep_prob = self.textConfig.keep_prob
        self.learning_rate = self.textConfig.learning_rate
        self.batch_size = self.textConfig.batch_size
        self.num_iteration = self.textConfig.num_iteration
        self.num_classes = np.unique(self.label_list).shape[0]
    def getData(self):
        id_train = self.content2Vector(self.train_list)
        id_test = self.content2Vector(self.test_list)
        # pad/truncate every sequence to the same length
        self.train_X = kr.preprocessing.sequence.pad_sequences(id_train, self.seq_length)
        self.test_X = kr.preprocessing.sequence.pad_sequences(id_test, self.seq_length)
        self.label = LabelEncoder()
        self.train_Y = kr.utils.to_categorical(self.label.fit_transform(self.train_label), num_classes=self.num_classes)
        # reuse the encoder fitted on the training labels
        self.test_Y = kr.utils.to_categorical(self.label.transform(self.test_label), num_classes=self.num_classes)
    def getVocabularyText(self):
        size = self.vocab_size - 1
        # merge all articles in the content list into one long string
        allContent = ''.join(self.content_list)
        # count character frequencies over the whole corpus
        counter = Counter(allContent)
        vocabulary = ['<PAD>']  # reserve id 0 for the padding token
        for char, _ in counter.most_common(size):
            vocabulary.append(char)
        return vocabulary
    def content2Vector(self, content_list):
        vocabulary_list = self.getVocabularyText()
        word2id_dict = dict((b, a) for a, b in enumerate(vocabulary_list))
        content_vector_list = []
        for content in content_list:
            content_vector = []
            for word in content:
                if word in word2id_dict:
                    content_vector.append(word2id_dict[word])
                else:
                    # characters outside the vocabulary fall back to the <PAD> id
                    content_vector.append(word2id_dict['<PAD>'])
            content_vector_list.append(content_vector)
        return content_vector_list
    def setModel(self):
        tf.reset_default_graph()
        self.X_holder = tf.placeholder(tf.int32, [None, self.seq_length])
        self.Y_holder = tf.placeholder(tf.float32, [None, self.num_classes])
        # embedding matrix of shape vocab_size x embedding_dim (5000 x 64 with the defaults)
        embedding = tf.get_variable('embedding', [self.vocab_size, self.embedding_dim])
        # embedding_inputs has shape batch_size x seq_length x embedding_dim (32 x 600 x 64 with the defaults)
        embedding_inputs = tf.nn.embedding_lookup(embedding, self.X_holder)
        conv1 = tf.layers.conv1d(inputs=embedding_inputs, filters=self.num_filters, kernel_size=self.kernel_size)
        # global max pooling over the time dimension
        max_pool = tf.reduce_max(conv1, reduction_indices=[1])
        full_connect = tf.layers.dense(max_pool, self.hidden_dim)
        # note: tf.contrib.layers.dropout defaults to is_training=True, so dropout also runs at inference
        full_connect_dropout = tf.contrib.layers.dropout(full_connect, keep_prob=self.keep_prob)
        full_connect_activate = tf.nn.relu(full_connect_dropout)
        full_connect_last = tf.layers.dense(full_connect_activate, self.num_classes)
        self.predict_y = tf.argmax(tf.nn.softmax(full_connect_last), 1)
        cross_entry = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.Y_holder, logits=full_connect_last)
        self.loss = tf.reduce_mean(cross_entry)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train = optimizer.minimize(self.loss)
        isCorrect = tf.equal(tf.argmax(self.Y_holder, 1), self.predict_y)
        self.accuracy = tf.reduce_mean(tf.cast(isCorrect, tf.float32))
    def Train(self):
        self.Config()
        self.getData()
        self.setModel()
        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)
        for i in range(self.num_iteration):
            # sample a random mini-batch from the training set
            train_index = random.sample(list(range(len(self.train_Y))), k=self.batch_size)
            X = self.train_X[train_index]
            Y = self.train_Y[train_index]
            self.sess.run(self.train, feed_dict={self.X_holder: X, self.Y_holder: Y})
            step = i + 1
            if step % 100 == 0 or step == 1:
                # every 100 steps, evaluate on a random test batch
                test_index = random.sample(list(range(len(self.test_Y))), k=self.batch_size)
                x = self.test_X[test_index]
                y = self.test_Y[test_index]
                loss_value, accuracy_value = self.sess.run([self.loss, self.accuracy], {self.X_holder: x, self.Y_holder: y})
                print('step:%d loss:%.4f accuracy:%.4f' % (step, loss_value, accuracy_value))
    def predict(self, cm=True, el=True, new_test=None):
        if new_test is None:
            predict_labels = []
            # predict in batches of 100 to limit memory use
            for i in range(0, len(self.test_X), 100):
                X = self.test_X[i: i + 100]
                predict_value = self.sess.run(self.predict_y, {self.X_holder: X})
                predict_labels.append(predict_value)
            # flatten the per-batch predictions into one 1-D array
            Y = np.hstack(predict_labels)
            self.predict_label = self.label.inverse_transform(Y)
            if cm:
                # confusion matrix
                df = pd.DataFrame(confusion_matrix(self.test_label, self.predict_label),
                                  columns=self.label.classes_,
                                  index=self.label.classes_)
                print(df)
            if el:
                # per-class precision, recall, F1 and support
                p, r, f1, s = precision_recall_fscore_support(self.test_label, self.predict_label)
                # support-weighted averages over all classes
                tot_p = np.average(p, weights=s)
                tot_r = np.average(r, weights=s)
                tot_f1 = np.average(f1, weights=s)
                tot_s = np.sum(s)
                res1 = pd.DataFrame({
                    u'Label': self.label.classes_,
                    u'Precision': p,
                    u'Recall': r,
                    u'F1': f1,
                    u'Support': s
                })
                res2 = pd.DataFrame({
                    u'Label': ['总体'],
                    u'Precision': [tot_p],
                    u'Recall': [tot_r],
                    u'F1': [tot_f1],
                    u'Support': [tot_s]
                })
                res2.index = [999]
                res = pd.concat([res1, res2])
                print(res[['Label', 'Precision', 'Recall', 'F1', 'Support']])
        else:
            new_id_test = self.content2Vector(new_test)
            # pad/truncate the new articles to seq_length
            new_test_X = kr.preprocessing.sequence.pad_sequences(new_id_test, self.seq_length)
            predict_value = self.sess.run(self.predict_y, {self.X_holder: new_test_X})
            predict_label = self.label.inverse_transform(predict_value)
            print(predict_label)
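To make the character-level encoding concrete, the following standalone sketch mirrors what getVocabularyText and content2Vector do on a toy corpus (the two strings are made-up examples, not from the dataset):

from collections import Counter

toy_corpus = [u'足球比赛', u'股票比价']
counter = Counter(''.join(toy_corpus))                          # character frequencies over the whole corpus
vocabulary = ['<PAD>'] + [c for c, _ in counter.most_common()]  # id 0 is reserved for <PAD>
word2id = {c: i for i, c in enumerate(vocabulary)}
print([[word2id.get(c, 0) for c in doc] for doc in toy_corpus])
# '比' occurs in both strings, so both encoded articles map it to the same id;
# ids of equal-frequency characters depend on their order of first appearance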
4. Train the network and output the test results.
import pandas as pd
train = pd.read_csv('./sohudata/sohu_train.txt',sep='\t',header=None)
test = pd.read_csv('./sohudata/sohu_test.txt',sep='\t',header=None)
train_list = [x for x in train[1]]
test_list = [x for x in test[1]]
content = train_list+test_list
train_label = [x for x in train[0]]
test_label = [x for x in test[0]]
textConfig = TextConfig()
classification = textClassification(textConfig, content=content, train_list=train_list, train_label=train_label,
                                    test_list=test_list, test_label=test_label)
classification.Train()
A sample of the training log:
step:3000 loss:0.1762 accuracy:0.9688
step:3100 loss:0.0268 accuracy:1.0000
step:3200 loss:0.7533 accuracy:0.8438
step:3300 loss:0.3369 accuracy:0.9062
step:3400 loss:0.0014 accuracy:1.0000
step:3500 loss:0.3314 accuracy:0.9375
step:3600 loss:0.2525 accuracy:0.9375
step:3700 loss:0.5601 accuracy:0.8438
step:3800 loss:0.3356 accuracy:0.9375
step:3900 loss:0.3003 accuracy:0.9375
step:4000 loss:0.0435 accuracy:1.0000
Confusion matrix and per-class Precision, Recall, F1, Support:
classification.predict()
The results are as follows:
体育 健康 女人 娱乐 房地产 教育 文化 新闻 旅游 汽车 科技 财经
体育 989 0 2 1 0 4 1 0 0 1 0 2
健康 0 983 3 0 0 1 2 5 2 1 1 2
女人 1 4 967 5 0 5 7 4 3 2 2 0
娱乐 2 3 3 970 0 2 19 1 0 0 0 0
房地产 4 2 2 2 909 2 15 35 9 4 1 15
教育 1 0 4 2 0 979 3 6 2 1 0 2
文化 0 0 2 3 6 5 940 22 16 4 2 0
新闻 7 4 1 0 14 21 30 891 8 3 9 12
旅游 0 0 8 1 6 3 12 8 957 1 1 3
汽车 6 0 2 0 1 3 0 4 4 977 0 3
科技 3 4 2 5 6 6 7 12 2 1 942 10
财经 1 1 0 0 9 4 1 20 8 3 8 945
Label Precision Recall F1 Support
0 体育 0.975345 0.989000 0.982125 1000
1 健康 0.982018 0.983000 0.982509 1000
2 女人 0.970884 0.967000 0.968938 1000
3 娱乐 0.980789 0.970000 0.975365 1000
4 房地产 0.955836 0.909000 0.931830 1000
5 教育 0.945894 0.979000 0.962162 1000
6 文化 0.906461 0.940000 0.922926 1000
7 新闻 0.883929 0.891000 0.887450 1000
8 旅游 0.946588 0.957000 0.951765 1000
9 汽车 0.978958 0.977000 0.977978 1000
10 科技 0.975155 0.942000 0.958291 1000
11 财经 0.950704 0.945000 0.947844 1000
999 总体 0.954380 0.954083 0.954099 12000
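Finally, the new_test branch of predict can classify previously unseen articles once Train has run. A minimal usage sketch (the article below is an arbitrary placeholder; the printed label depends on the trained model):

new_articles = [u'北京车展今日开幕,多款新能源车型集中亮相。']
classification.predict(new_test=new_articles)
# prints the predicted channel for each article, e.g. ['汽车']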