# newstraceGui.py
from tkinter import *
import codecs
import getdata
import allobject
class newstraceGui:
    """Tkinter window for viewing and refreshing tracked news records.

    The window offers a menu that displays the saved record file of a
    source, an "update" button that calls ``getdata.get`` for every source
    returned by ``allobject.obj()``, and a "clear" button that empties the
    text area.
    """

    def __init__(self):
        """Build all widgets and run the Tk event loop.

        This call blocks until the window is closed, because it ends by
        entering ``window.mainloop()``.
        """
        window = Tk()
        window.title('新闻动态追踪')

        # Menu bar: a history cascade (one entry per source) and an exit cascade.
        menubar = Menu(window)
        history_menu = Menu(menubar, tearoff=0)
        menubar.add_cascade(label="历史记录", menu=history_menu)
        history_menu.add_command(
            label="保监会-新闻动态",
            command=lambda: self.historynote('保监会'),
        )
        history_menu.add_command(
            label="银监会-新闻动态",
            command=lambda: self.historynote('银监会'),
        )
        exit_menu = Menu(menubar, tearoff=0)
        menubar.add_cascade(label="退出", menu=exit_menu)
        exit_menu.add_command(label="退出", command=window.quit)
        window.config(menu=menubar)

        # Text area with a vertical scrollbar next to it.
        text_frame = Frame(window)
        text_frame.pack()
        self.text = Text(text_frame, width=40, height=10)
        self.text.grid(row=1, column=1)
        scrollbar = Scrollbar(text_frame)
        # Stretch the scrollbar vertically so it spans the text widget's
        # height instead of staying at its small requested size.
        scrollbar.grid(row=1, column=2, sticky='ns')
        self.text.config(yscrollcommand=scrollbar.set)
        scrollbar.config(command=self.text.yview)

        # Action buttons.
        button_frame = Frame(window)
        button_frame.pack()
        update_button = Button(button_frame, text="更新", command=self.updatenote)
        update_button.config(width=20, bg="blue", fg="white")
        clear_button = Button(button_frame, text="清除", command=self.cleartext)
        clear_button.config(width=20, bg="#F0F8FF", fg="red")
        update_button.grid(row=1, column=1)
        clear_button.grid(row=1, column=2)

        window.mainloop()

    def historynote(self, targetstr):
        """Append the saved record file of ``targetstr`` to the text area.

        Reads ``<targetstr>-更新记录.txt`` as UTF-8. When the file cannot be
        opened or decoded, a "cannot open" message is shown instead. The
        text that was displayed is also stored in ``self.note``.

        Args:
            targetstr: Source name used as the record file's name prefix.
        """
        try:
            with codecs.open(targetstr + '-更新记录.txt', 'r', 'utf-8') as f:
                saved = f.read()
        except (OSError, UnicodeError):
            # Only I/O and decoding problems mean "cannot open"; anything
            # else is a programming error and should surface normally.
            self.note = '无法打开:' + targetstr + '-历史记录:\n'
        else:
            self.note = targetstr + '-历史记录:' + '\n' + saved + '\n'
        self.text.insert(END, '\n' + self.note)

    def updatenote(self):
        """Fetch the latest record for every configured source and show it.

        Each source is handled independently: if fetching one of them
        fails, the failure is written to the text area and the remaining
        sources are still processed.
        """
        self.text.insert(END, '---正在更新记录---\n')
        # allobject.obj() returns parallel lists; zip turns them into one
        # tuple per source, so every configured source is visited.
        for name, url, title_class, title_re, date_re in zip(*allobject.obj()):
            self.text.insert(END, '\n' + name)
            try:
                result = getdata.get(name, url, title_class, title_re, date_re)
            except Exception as exc:
                # GUI callback boundary: report the problem to the user and
                # keep going instead of abandoning the other sources.
                result = '-更新失败: ' + str(exc)
            self.text.insert(END, result)
            self.text.insert(END, '\n')

    def cleartext(self):
        """Remove everything from the text area."""
        self.text.delete('1.0', END)
# Open the window only when this file is run as a script; constructing the
# class blocks in the Tk event loop, which must not happen on import.
if __name__ == "__main__":
    newstraceGui()
# getdata.py
import urllib.request
from bs4 import BeautifulSoup
import re
import codecs
import allobject
def _pick_date(cells, date_pattern):
    """Return the date token found in ``cells``, or None if no cell has text.

    The first whitespace-separated token of each cell is considered. The
    last token matching ``date_pattern`` wins; if none matches, the last
    non-empty token is returned so pages with an unexpected date format
    still yield a value.
    """
    matched = None
    fallback = None
    for cell in cells:
        text = cell.string
        if text is None:
            continue
        tokens = text.split()
        if not tokens:
            continue
        fallback = tokens[0]
        if date_pattern.search(tokens[0]):
            matched = tokens[0]
    return matched if matched is not None else fallback


def _read_saved_record(path):
    """Return ``(date, title)`` stored in the record file, or None if unusable."""
    try:
        with codecs.open(path, 'r', 'utf-8') as f:
            lines = f.readlines()
        # Each line looks like "<label>: <value>"; keep everything after the label.
        saved_date = lines[0].split(None, 1)[1].strip()
        saved_title = lines[1].split(None, 1)[1].strip()
    except (OSError, UnicodeError, IndexError):
        return None
    return saved_date, saved_title


def get(c_name, c_news_url, c_title_class, c_title_re, c_date_re, timeout=30):
    """Fetch the newest headline of a source and update its record file.

    The page at ``c_news_url`` is downloaded and decoded (UTF-8 first, GBK
    as fallback). The first tag whose ``title`` attribute matches
    ``c_title_re`` (and whose class matches ``c_title_class``) is taken as
    the headline, and the date is read from the ``<td>`` cells of that
    tag's grandparent element. The result is compared with
    ``<c_name>-更新记录.txt``.

    Args:
        c_name: Source name; also the prefix of the record file name.
        c_news_url: URL of the news listing page.
        c_title_class: Regex string for the headline tag's class, or None;
            None is handed to BeautifulSoup unchanged.
        c_title_re: Regex string matched against the tag's ``title`` attribute.
        c_date_re: Regex string identifying the cell that holds the date.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``c_name + '-无更新记录'`` when the saved record already holds the same
        date and title; otherwise the newly written record text.

    Raises:
        ValueError: If no headline tag or no date cell is found on the page.
        urllib.error.URLError: If the page cannot be downloaded.
        OSError: If the record file cannot be written.
    """
    class_title = re.compile(c_title_class) if c_title_class is not None else None

    headers = {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    req = urllib.request.Request(url=c_news_url, headers=headers)
    # The context manager closes the connection; the timeout keeps a dead
    # server from hanging the caller forever.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        html = response.read()
    try:
        page = html.decode('utf-8')
    except UnicodeDecodeError:
        page = html.decode('gbk')
    print('--------网页解码完成-----------')

    soup = BeautifulSoup(page, 'lxml')
    title_tag = soup.find(class_=class_title, title=re.compile(c_title_re))
    if title_tag is None:
        raise ValueError('no headline matching %r found at %s' % (c_title_re, c_news_url))

    # The date is looked up among the cells of the headline's grandparent.
    row = title_tag.parent.parent
    getdate = _pick_date(row.find_all('td'), re.compile(c_date_re))
    if getdate is None:
        raise ValueError('no date cell found next to the headline at %s' % c_news_url)

    raw_title = title_tag.string
    if raw_title is None:
        # .string is None when the tag has more than one child node.
        raw_title = title_tag.get_text()
    # Spaces are replaced so the title stays a single token in the record file.
    gettext = raw_title.replace(' ', ',')

    record_path = c_name + '-更新记录.txt'
    if _read_saved_record(record_path) == (getdate, gettext.strip()):
        return c_name + '-无更新记录'

    record = '更新日期: ' + getdate + '\n' + '更新内容: ' + gettext
    with codecs.open(record_path, 'w', 'utf-8') as f:
        f.write(record)
    return record
# allobject.py
'''
c_name:机构名称
c_news_url:机构新闻网址
c_title_class:机构新闻标题class
c_title_re:机构最新新闻标题正则匹配
c_date_re:机构新闻最新发布时间
'''
def obj():
    """Return the parallel configuration lists for the tracked sources.

    Returns:
        A tuple ``(c_name, c_news_url, c_title_class, c_title_re,
        c_date_re)`` of equally long lists. Index ``i`` of every list
        describes the same source: its name, the URL of its news page, the
        regex for the headline tag's class (or None), the regex for the
        headline's ``title`` attribute, and the regex for the publish date.
    """
    c_name = ['保监会', '银监会']
    c_news_url = [
        "http://www.circ.gov.cn/web/site0/tab5207/",
        "http://www.cbrc.gov.cn/chinese/home/docViewPage/110010.html",
    ]
    c_title_class = [None, '.*']
    # The backslash before "w" is escaped explicitly: a bare "\w" in a normal
    # string literal is an invalid escape sequence. The \uXXXX escapes stay
    # as they were, so the resulting pattern strings are unchanged.
    c_title_re = ['[\\w\u4e00-\u9fa5].*', '[\\w\u4e00-\u9fa5].*']
    # Raw string for the same reason ("\d" is not a valid string escape).
    c_date_re = ['(.*)', r'\d\d\d\d-\d\d-\d\d']
    return c_name, c_news_url, c_title_class, c_title_re, c_date_re
# 效果图: