第一篇博文,算是正式开始学习编程。
介绍下自己的本科毕设。
题目:网络新闻标签与民众情绪关联性分析及其软件设计开发
使用的是python语言,对在datatang获得的新闻语料进行处理分析。
首先是新闻语料的读取,一共49000条新闻及对应的新闻情绪投票:
具体做法是用python的文件流依次读取txt文件,通过字符串处理去掉空格之类的字符,再保存到txt,代码如下:
# -*- coding: utf-8 -*-
"""txt to txt module.

Read the numbered news files, strip the ``#title#`` / ``#emotion#`` /
``#emotion_sum#`` markup and write every item that received more than
``MIN_VOTES`` votes to ``b.txt``.
"""
import io
import string
import sys

__author__ = 'Lust '

# Text is decoded and encoded explicitly through io.open below, so the
# ``reload(sys); sys.setdefaultencoding('utf-8')`` hack is not needed here.

# An item is kept only when its vote total is strictly greater than this.
MIN_VOTES = 100

count = 0


def _strip_tag(line, tag):
    """Return ``line`` without spaces and without the ``#tag#``/``#/tag#`` markers."""
    line = line.replace(u' ', u'')
    line = line.replace(u'#%s#' % tag, u'')
    return line.replace(u'#/%s#' % tag, u'')


def parse_record(lines):
    """Extract the cleaned title, emotion and vote-total text of one news file.

    Args:
        lines: the unicode lines of the file, newlines included.  The title
            is taken from the first line, the emotion votes from the
            second-to-last line and the vote total from the last line.

    Returns:
        tuple: ``(title, emotion, votes)`` as unicode strings.  Newlines that
        were present in the input lines are preserved.

    Raises:
        ValueError: if ``lines`` has fewer than two entries.
    """
    if len(lines) < 2:
        raise ValueError('expected at least 2 lines, got %d' % len(lines))
    title = _strip_tag(lines[0], u'title')
    emotion = _strip_tag(lines[-2], u'emotion')
    votes = _strip_tag(lines[-1], u'emotion_sum')
    # The total is wrapped as "共有 N 投票"; keep only the number.
    votes = votes.replace(u'共有', u'').replace(u'投票', u'')
    return title, emotion, votes


if __name__ == '__main__':
    with io.open("b.txt", 'w', encoding='utf-8') as wr:
        # for x in range(1, 49001):   # the complete corpus
        for x in range(1, 5000):
            path = r'C:\Users\Administrator\data\%s.txt' % x
            with io.open(path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
            try:
                title, emotion, votes = parse_record(lines)
                vote_total = int(votes)
            except ValueError as err:
                # One malformed file should not abort the whole batch.
                sys.stderr.write('skipping %s: %s\n' % (path, err))
                continue
            if vote_total > MIN_VOTES:
                wr.write(title)
                wr.write(votes)
                wr.write(u'\n')
                wr.write(emotion)
                count = count + 1
                print('count %s' % count)
另外还把提取的信息存到了excel里,但是并没有什么卵用,excel存的东西越多执行起来越慢,用的是xlrd和xlutils两个包。
下一步,对新闻进行分词,提取关键字,直接用的jieba分词包,代码如下
# coding=utf-8
"""Extract keywords from each line of topic.txt with jieba and pickle them."""
import jieba.analyse
import sys
import pickle

__author__ = 'Lust'

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# At most this many lines of topic.txt are processed.
MAX_TOPICS = 5495

with open('topic.txt', 'r') as f:
    lines = f.readlines()

# Slicing (rather than indexing 0..MAX_TOPICS-1) keeps the script working
# when topic.txt holds fewer lines than MAX_TOPICS.
keywords = [jieba.analyse.extract_tags(line) for line in lines[:MAX_TOPICS]]

# Result: one keyword list per processed line, in file order.
with open('keywords.pkl', 'w') as f:
    pickle.dump(keywords, f)
自然语言处理里面nltk据说也很好用,因为教程基本是对英文,就试了一下放弃了。
这个过程中还把每条新闻对应关键字出现情况的矩阵转换成了MATLAB中的.mat文件,也没什么卵用,就是用下scipy.io
新闻内容处理里面还用了点正则表达式,就是python中的re
# Capture the text enclosed by the #comment# ... #/comment# markers.
temp = re.compile(r'#comment#(.+)#/comment#')
matches = temp.findall(txt)
# Use the first match; when there is none, fall back to the title.
comment_temp = matches[0] if matches else title_temp
大概就是这么个意思。
根据新闻关键词的共现关系建立网络,并对网络进行分析和可视化。复杂网络分析及可视化用的是python中networkx,发现网络中的社团结构并可视化的代码如下:
import sys
import cPickle as pickle
import matplotlib.pyplot as plt
import networkx as nx
import jieba.analyse
import random
__author__ = 'Lust '
reload(sys)
sys.setdefaultencoding('utf-8')
def b():
    """Build the keyword co-occurrence network, draw it and return the result.

    Keywords that appear together in a selected news item are linked in
    ``G``.  Each edge weight then accumulates the sum of the two keyword
    values over the first ``sample`` items that contain both keywords.
    Edges whose weight reaches ``threshold`` are copied into a second graph,
    which is drawn with the members of its k-clique communities (k=3)
    highlighted in red.

    Returns:
        tuple: ``(G, communities, count)`` -- the full co-occurrence graph,
        the list of communities found in the thresholded graph, and the
        number of items for which ``label_emotion[x][8] == False``.
    """
    # NOTE: the dumps are opened in text mode exactly as before; switch to
    # 'rb' only after confirming how they were written.
    with open(
            r'C:\Users\Administrator\Desktop\python\graduation_project\network\keywords_idf_dict.pkl', 'r') as f:
        keywords_idf_dict = pickle.load(f)
    with open(
            r'C:\Users\Administrator\Desktop\python\graduation_project\network\keywords_value_idf_content.pkl', 'r') as f:
        keywords_value_idf_content = pickle.load(f)
    with open(
            r'C:\Users\Administrator\Desktop\python\graduation_project\network\label_emotion.pkl', 'r') as f:
        label_emotion = pickle.load(f)

    G = nx.Graph()
    H = nx.Graph()
    # threshold = 1.3
    threshold = 0.6
    sample = 5494
    count = 0

    # Pass 1: link every pair of keywords sharing a selected news item.
    # NOTE(review): the source lost its indentation; nesting the pair loops
    # under the label check is a reconstruction -- confirm against the
    # original file.
    for x in xrange(sample):
        if label_emotion[x][8] == False:  # noqa: E712 -- keep == semantics
            count += 1
            doc = keywords_value_idf_content[x]
            for i, first in enumerate(doc):
                for j, second in enumerate(doc):
                    if i != j:
                        G.add_edge(first[0], second[0], weight=0)

    # Pass 2: accumulate the weights.  The edge list is snapshotted into a
    # set once: pass 2 adds no edges, so membership is unchanged, but each
    # test is O(1) instead of a scan of the whole edge list.  Like a list
    # of edge tuples, the set matches a pair only in the orientation in
    # which edges() reports it.
    known_edges = set(G.edges())
    for x in xrange(sample):
        doc = keywords_value_idf_content[x]
        for first in doc:
            for second in doc:
                if first != second and (first[0], second[0]) in known_edges:
                    G[first[0]][second[0]]['weight'] += first[1] + second[1]
        print(x)

    # Keep only the edges that reach the threshold.
    for (u, v, d) in G.edges(data=True):
        if d['weight'] >= threshold:
            H.add_node(u, IDF=keywords_idf_dict[u])
            H.add_node(v, IDF=keywords_idf_dict[v])
            H.add_edge(u, v, weight=d['weight'])

    # Widths are read back from H so that they line up with the order in
    # which draw_networkx_edges walks H's edges.
    edgewidth = [round((d['weight'] - threshold) * 10, 2)
                 for (_u, _v, d) in H.edges(data=True)]

    pos = nx.spring_layout(H, iterations=20)

    idf = dict((node, attrs['IDF']) for (node, attrs) in H.nodes(data=True))
    nodesize = [idf[node] * 100 for node in H]

    communities = list(nx.k_clique_communities(H, 3))
    members = [node for community in communities for node in community]
    nodesize_2 = [idf[node] * 100 for node in members]

    nx.draw_networkx_edges(H, pos, alpha=0.3, width=edgewidth, edge_color='m')
    nx.draw_networkx_edges(H, pos, alpha=0.4, width=1, edge_color='k')
    nx.draw_networkx_nodes(H, pos, alpha=0.4, node_size=nodesize, node_color='w')
    nx.draw_networkx_nodes(H, pos, alpha=0.4, nodelist=members, node_size=nodesize_2, node_color='r')
    # The keyword is font_size; ``fontsize`` was not a recognised parameter.
    nx.draw_networkx_labels(H, pos, font_size=14)
    # plt.axis('off')
    plt.show()

    for node in members:
        print(node)
    return G, communities, count
搞出来的图就是这样的:
最后把关键词作为特征训练分类器,对新闻情绪进行分类,用的是python中的sklearn:
from sklearn import svm
from sklearn import cross_validation
from sklearn import decomposition
def c():
    """Train a linear SVM on PCA-reduced features and print its test accuracy.

    Loads the pickled label and feature arrays, flattens the labels, projects
    the features onto 200 principal components, splits the samples 50/50
    with a fixed random seed, fits the classifier on one half and prints the
    score obtained on the other half.  The array shapes are printed before
    and after the labels are flattened.

    Returns:
        tuple: ``(data, label)`` -- the PCA-transformed feature matrix and
        the flattened label vector.
    """
    # Raw strings: the original literals only worked because none of their
    # backslash sequences happen to be escapes in Python 2 byte strings.
    with open(r'C:\Users\Administrator\Desktop\python\graduation_project\wrapper\label.pkl', 'r') as f:
        label = pickle.load(f)
    with open(r'C:\Users\Administrator\Desktop\python\graduation_project\wrapper\data.pkl', 'r') as f:
        data = pickle.load(f)

    print('%s %s' % (data.shape, label.shape))
    label = label.reshape(-1)
    print('%s %s' % (data.shape, label.shape))

    # NOTE: PCA is fitted on the full data set before the split, so the
    # held-out half also influences the projection.
    pca = decomposition.PCA(n_components=200)
    pca.fit(data)
    data = pca.transform(data)

    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        data, label, test_size=0.5, random_state=0)
    clf = svm.SVC(kernel='linear')
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print(score)
    return data, label
对特征矩阵进行了主成分分析PCA;sklearn里面的交叉验证cross_validation我只用来划分训练集和测试集;分类器用的是svm,试了各种各样的核。结果用matplotlib画图。
最后的最后,用pyqt做了个交互界面
具体pyqt的用法全是学的这个http://www.cnblogs.com/rrxc/p/4462890.html
这个gui的代码如下:
# -*- coding: utf-8 -*-
import sys
from PyQt4 import QtCore, QtGui, uic
import cPickle as pickle
import random
import matplotlib.pyplot as plt
import networkx as nx
from matplotlib.backends import qt4_compat
from matplotlib.backends.backend_qt4agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.backends.backend_qt4agg import NavigationToolbar2QTAgg as NavigationToolbar
from matplotlib.figure import Figure
from sklearn import svm
from sklearn import cross_validation
from sklearn import decomposition
# Path of the .ui file describing the main window.  Enter file here.
# Raw string so that no backslash in the Windows path is read as an escape.
qtCreatorFile = r"C:\Users\Administrator\Downloads\gui.ui"
# The two classes returned here are used as the base classes of MyApp.
Ui_MainWindow, QtBaseClass = uic.loadUiType(qtCreatorFile)
class MyApp(QtGui.QMainWindow, Ui_MainWindow):
    """Main window of the tag-combination / emotion-prediction demo.

    The widgets used below (``compute_button``, ``textbox``, ``slider``,
    ``main_frame`` and so on) are not created in this class; they are
    expected to be provided by the form loaded into ``Ui_MainWindow``.
    """

    @staticmethod
    def _load_pickle(path):
        """Return the object unpickled from ``path``; the file is always closed."""
        # NOTE: opened in text mode exactly as before; switch to 'rb' only
        # after confirming how the dumps were written.  pickle must only be
        # used on trusted files.
        with open(path, 'r') as f:
            return pickle.load(f)

    def __init__(self):
        """Load the pickled data sets, build the UI and connect the signals."""
        # The data sets live in module-level globals because the slot
        # methods below read them under these names.
        global two_tuple, keywords_idf_dict, keywords_value_idf_content
        global label_emotion, label_2, data_2
        load = MyApp._load_pickle
        two_tuple = load(
            r'C:\Users\Administrator\Desktop\python\graduation_project\statistics\data\two_tuple.pkl')
        keywords_idf_dict = load(
            r'C:\Users\Administrator\Desktop\python\graduation_project\network\keywords_idf_dict.pkl')
        keywords_value_idf_content = load(
            r'C:\Users\Administrator\Desktop\python\graduation_project\network\keywords_value_idf_content.pkl')
        label_emotion = load(
            r'C:\Users\Administrator\Desktop\python\graduation_project\network\label_emotion.pkl')
        # Raw strings: the original non-raw literals only worked because
        # none of their backslash sequences are escapes in Python 2.
        label_2 = load(
            r'C:\Users\Administrator\Desktop\python\graduation_project\wrapper\label.pkl')
        label_2 = label_2.reshape(-1)
        data_2 = load(
            r'C:\Users\Administrator\Desktop\python\graduation_project\wrapper\data.pkl')

        QtGui.QMainWindow.__init__(self)
        Ui_MainWindow.__init__(self)
        self.setupUi(self)
        self.compute_button.clicked.connect(self.Compute)
        self.plt_button.clicked.connect(self.Plt)
        self.two_tuple_button.clicked.connect(self.Two_tuple)
        self.pretreatment_button.clicked.connect(self.Pretreatment)
        self.predict_button.clicked.connect(self.Predict)
        # self.setWindowTitle('Demo: PyQt with matplotlib')
        self.setWindowTitle(u'标签组合提取与情绪预测')
        self.create_menu()
        self.create_main_frame()
        self.create_status_bar()
        self.textbox.setText('1 2 3')
        self.on_draw()

    def create_main_frame(self):
        """Create the matplotlib canvas and configure the plot controls."""
        # Figure of 9x5 inches at 80 dots-per-inch.
        self.dpi = 80
        self.fig = Figure((9.0, 5.0), dpi=self.dpi)
        self.canvas = FigureCanvas(self.fig)
        self.canvas.setParent(self.main_frame)
        # Since we have only one plot, we can use add_axes instead of
        # add_subplot, but then the subplot configuration tool in the
        # navigation toolbar wouldn't work.
        self.axes = self.fig.add_axes([0.1, 0.1, 0.8, 0.8])
        # Bind the 'pick' event for clicking on one of the bars.
        self.canvas.mpl_connect('pick_event', self.on_pick)

        # Every control below triggers a redraw when it changes.
        self.textbox.setMinimumWidth(200)
        self.connect(self.textbox, QtCore.SIGNAL('editingFinished ()'), self.on_draw)
        self.connect(self.draw_button, QtCore.SIGNAL('clicked()'), self.on_draw)
        self.grid_cb.setChecked(False)
        self.connect(self.grid_cb, QtCore.SIGNAL('stateChanged(int)'), self.on_draw)
        self.slider.setRange(1, 100)
        self.slider.setValue(20)
        self.slider.setTracking(True)
        self.slider.setTickPosition(QtGui.QSlider.TicksBothSides)
        self.connect(self.slider, QtCore.SIGNAL('valueChanged(int)'), self.on_draw)

    def on_draw(self):
        """Redraw the bar chart from the integers typed into the text box."""
        text = unicode(self.textbox.text())
        try:
            values = [int(token) for token in text.split()]
        except ValueError:
            # Free-form user input: keep the current plot instead of
            # raising inside the slot.
            self.statusBar().showMessage(
                u'Invalid input: enter integers separated by spaces', 3000)
            return
        self.data = values
        x = range(len(self.data))
        # Clear the axes and redraw the plot anew.
        self.axes.clear()
        self.axes.grid(self.grid_cb.isChecked())
        self.axes.bar(
            left=x,
            height=self.data,
            width=self.slider.value() / 100.0,
            align='center',
            alpha=0.44,
            picker=5)
        self.canvas.draw()

    def on_pick(self, event):
        """Show the bounding-box coordinates of the bar that was clicked.

        ``event`` is the matplotlib pick event; only its artist is used.
        """
        box_points = event.artist.get_bbox().get_points()
        msg = "You've clicked on a bar with coords:\n %s" % box_points
        QtGui.QMessageBox.information(self, "Click!", msg)

    def create_menu(self):
        """Create the File menu (Quit) and the Help menu (About)."""
        self.file_menu = self.menuBar().addMenu("&File")
        quit_action = self.create_action("&Quit", slot=self.close,
                                         shortcut="Ctrl+Q", tip="Close the application")
        # A None entry becomes a separator (see add_actions).
        self.add_actions(self.file_menu,
                         (None, quit_action))
        self.help_menu = self.menuBar().addMenu("&Help")
        about_action = self.create_action("&About",
                                          shortcut='F1', slot=self.on_about,
                                          tip='About the demo')
        self.add_actions(self.help_menu, (about_action,))

    def create_action(self, text, slot=None, shortcut=None,
                      icon=None, tip=None, checkable=False,
                      signal="triggered()"):
        """Build a QAction owned by this window and return it.

        Args:
            text: caption of the action.
            slot: callable connected to ``signal``; nothing is connected
                when None.
            shortcut: keyboard shortcut, if any.
            icon: base name of a ``:/<icon>.png`` resource, if any.
            tip: text used for both the tool tip and the status tip.
            checkable: make the action checkable when true.
            signal: signature of the signal connected to ``slot``.
        """
        action = QtGui.QAction(text, self)
        if icon is not None:
            action.setIcon(QtGui.QIcon(":/%s.png" % icon))
        if shortcut is not None:
            action.setShortcut(shortcut)
        if tip is not None:
            action.setToolTip(tip)
            action.setStatusTip(tip)
        if slot is not None:
            self.connect(action, QtCore.SIGNAL(signal), slot)
        if checkable:
            action.setCheckable(True)
        return action

    def add_actions(self, target, actions):
        """Add ``actions`` to ``target``; a None entry adds a separator."""
        for action in actions:
            if action is None:
                target.addSeparator()
            else:
                target.addAction(action)

    def on_about(self):
        """Show the About dialog."""
        msg = u"\n".join([
            u"李~欣~桐~的demo程序:",
            u"* 可以画图!",
            u"* Add values to the text box and press Enter (or click \"Draw\")",
            u"* Show or hide the grid",
            u"* Drag the slider to modify the width of the bars",
            u"* Save the plot to a file using the File menu",
            u"* Click on a bar to receive an informative message",
        ])
        QtGui.QMessageBox.about(self, "About the demo", msg)

    def create_status_bar(self):
        """Put a text label into the status bar."""
        self.status_text = QtGui.QLabel(u"这是李~欣~桐~的程序状态信息")
        self.statusBar().addWidget(self.status_text, 1)

    def Compute(self):
        """Show fields of a randomly chosen title entry plus the topic text."""
        title = MyApp._load_pickle(
            r'C:\Users\Administrator\Desktop\python\graduation_project\network\title.pkl')
        # Picks one of entries 1..20; entry 0 is never chosen.
        a = title[random.randint(1, 20)]
        b = self.topic.toPlainText()
        total_price_string = "The total price with tax is: " + a[2] + '\n' + a[1] + b
        self.accuracy.setText(total_price_string)
        self.textbox.setText('4 5 6')

    def Plt(self):
        """Draw the keyword co-occurrence network of the first ``sample`` items.

        Members of the k-clique communities (k=3) are highlighted in red.
        """
        G = nx.Graph()
        # threshold = 1.3
        threshold = 1.5
        sample = 5

        # Pass 1: link keyword pairs of the selected items whose values sum
        # to more than zero.
        # NOTE(review): the source lost its indentation; nesting the pair
        # loops under the label check is a reconstruction -- confirm against
        # the original file.
        for x in xrange(sample):
            if label_emotion[x][8] == False:  # noqa: E712 -- keep == semantics
                doc = keywords_value_idf_content[x]
                for i, first in enumerate(doc):
                    for j, second in enumerate(doc):
                        if i != j and first[1] + second[1] > 0:
                            G.add_node(first[0], IDF=keywords_idf_dict[first[0]])
                            G.add_node(second[0], IDF=keywords_idf_dict[second[0]])
                            G.add_edge(first[0], second[0], weight=0)

        # Pass 2: accumulate the weights.  The edge list is snapshotted into
        # a set once: pass 2 adds no edges, so membership is unchanged, but
        # each test is O(1) instead of a scan of the whole edge list.
        known_edges = set(G.edges())
        for x in xrange(sample):
            doc = keywords_value_idf_content[x]
            for first in doc:
                for second in doc:
                    if first != second and (first[0], second[0]) in known_edges:
                        G[first[0]][second[0]]['weight'] += first[1] + second[1]

        # No threshold filter is applied here, so an edge whose weight does
        # not exceed ``threshold`` gets a non-positive width.
        edgewidth = [round((d['weight'] - threshold) * 10, 2)
                     for (_u, _v, d) in G.edges(data=True)]

        pos = nx.spring_layout(G, iterations=20)

        idf = dict((node, attrs['IDF']) for (node, attrs) in G.nodes(data=True))
        nodesize = [idf[node] * 100 for node in G]

        communities = list(nx.k_clique_communities(G, 3))
        members = [node for community in communities for node in community]
        nodesize_2 = [idf[node] * 100 for node in members]

        nx.draw_networkx_edges(G, pos, alpha=0.3, width=edgewidth, edge_color='m')
        nx.draw_networkx_edges(G, pos, alpha=0.4, width=1, edge_color='k')
        nx.draw_networkx_nodes(G, pos, alpha=0.4, node_size=nodesize, node_color='w')
        nx.draw_networkx_nodes(G, pos, alpha=0.4, nodelist=members, node_size=nodesize_2, node_color='r')
        # The keyword is font_size; ``fontsize`` was not a recognised parameter.
        nx.draw_networkx_labels(G, pos, font_size=14)
        # plt.axis('off')
        plt.show()

    def Two_tuple(self):
        """Show a randomly chosen entry of the two-tuple data set."""
        # Picks one of entries 1..20; entry 0 is never chosen.
        a = two_tuple[random.randint(1, 20)]
        total_price_string = a[0] + '\n' + a[1]
        self.two_tuple_txt.setText(total_price_string)
        self.textbox.setText('4 5 6')

    def Pretreatment(self):
        """Echo the address text and put the sample count into the text box."""
        a = self.address.toPlainText()
        total_price_string = a
        self.accuracy.setText(total_price_string)
        s = '%s 5 6' % len(data_2)
        self.textbox.setText(s)

    def Predict(self):
        """Train a linear SVM on PCA-reduced features and show its accuracy."""
        data_3 = data_2
        label_3 = label_2
        # NOTE: PCA is fitted on the full data set before the split, so the
        # held-out half also influences the projection.
        pca = decomposition.PCA(n_components=200)
        pca.fit(data_3)
        data_3 = pca.transform(data_3)
        x_train, x_test, y_train, y_test = cross_validation.train_test_split(
            data_3, label_3, test_size=0.5, random_state=0)
        clf = svm.SVC(kernel='linear')
        clf.fit(x_train, y_train)
        score = clf.score(x_test, y_test)
        self.predict_accuracy.setText(u'%s' % score)
if __name__ == "__main__":
    # The application object is created before the window is constructed.
    qt_app = QtGui.QApplication(sys.argv)
    main_window = MyApp()
    main_window.show()
    # Run the event loop and hand its return value to the interpreter.
    exit_code = qt_app.exec_()
    sys.exit(exit_code)
PS:拖延症啊拖延症,拖的我都忘了毕设怎么做的了·····潦草的写了下,见谅