from re import findall, sub, S
from urllib.request import urlopen
from urllib.parse import urljoin
from collections import Counter
from jieba import cut
import matplotlib.pyplot as plt
import os
import numpy as np
# Crawl the news chain of www.sdtbu.edu.cn (newest article first, following the
# "previous article" links), save every article to "<title>.txt", and plot how
# often the most frequent words occur in each month.

HOME_URL = r'https://www.sdtbu.edu.cn/'
# The crawl stops after the first page whose HTML contains this date string.
STOP_MARKER = r'2021年01月01日'
MAX_PAGES = 1000  # hard upper bound on the number of article pages fetched
TOP_N = 10  # number of most frequent words to plot

FIRST_LINK_PATTERN = r'<UL class="news-list">.*?<li><a href="(.+?)"'
TITLE_PATTERN = r'<h1 align="center" class="content-title">(.+?)</h1>'
# The capture group is the month number of the article's date stamp.
DATE_PATTERN = r'日期:\d+年(\d+)月\d+日'
PARAGRAPH_PATTERN = r'<p class="MsoNormal".*?>(.+?)</p>'
NEXT_LINK_PATTERN = r'下一条:<a href="(.+?)"'

# (color, linestyle, marker, line width) for each plotted word, in rank order.
LINE_STYLES = [
    ('r', '--', '*', 1),
    ('b', ':', '*', 2),
    ('g', '-.', 'v', 3),
    ('c', '-.', '>', 4),
    ('m', '-.', '<', 5),
    ('y', '-.', 's', 4),
    ('k', '-.', 'p', 3),
    ('coral', '-.', '*', 2),
    ('darkseagreen', '-.', '*', 2),
    ('cyan', '-.', '*', 1),
]


def fetch(url):
    """Return the body of *url*, decoded with the default (UTF-8) codec."""
    with urlopen(url) as fp:
        return fp.read().decode()


def extract_month(content):
    """Return the month of the first date stamp in *content* as an int.

    Returns ``None`` when the page carries no date stamp.
    """
    months = findall(DATE_PATTERN, content)
    return int(months[0]) if months else None


def clean_title(raw):
    """Strip tags, spaces, quotes and slashes so *raw* can serve as a file name."""
    return sub(r'<.*?>|&nbsp;|[ \xa0"/]', '', raw)


def extract_paragraphs(content):
    """Return the non-empty ``MsoNormal`` paragraphs of *content* as plain text."""
    cleaned = (
        sub(r'<.*?>|&nbsp;|[ \xa0]', '', raw).strip()
        for raw in findall(PARAGRAPH_PATTERN, content, S)
    )
    return [para for para in cleaned if para]


def save_article(title, paragraphs):
    """Write *paragraphs*, one per line, to ``<title>.txt``.

    An existing non-empty file is left untouched so articles are not written
    twice. The check must happen before ``open(..., 'w')``, because opening for
    writing creates (and truncates) the file. Empty files are rewritten.
    """
    filename = title + '.txt'
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        return
    with open(filename, 'w', encoding='utf8') as fp:
        fp.writelines(para + '\n' for para in paragraphs)


def crawl(start_url, stop_marker=STOP_MARKER, max_pages=MAX_PAGES,
          fetch_page=None):
    """Follow the article chain from *start_url* and collect its paragraphs.

    Each article is saved with :func:`save_article`. The walk ends after the
    first page containing *stop_marker*, when a page has no "next" link, or
    after *max_pages* pages. *fetch_page* maps a URL to its HTML and defaults
    to :func:`fetch`.

    Returns a dict mapping month number (int) to the list of paragraphs
    published in that month. NOTE: articles from different years are pooled
    under the same month number.
    """
    if fetch_page is None:
        fetch_page = fetch
    text_by_month = {}
    url = start_url
    for _ in range(max_pages):
        content = fetch_page(url)
        month = extract_month(content)
        titles = findall(TITLE_PATTERN, content)
        if month is None or not titles:
            # Not a regular article page: nothing to record, but keep walking.
            print('skipped (no title/date found):', url)
        else:
            paragraphs = extract_paragraphs(content)
            save_article(clean_title(titles[0]), paragraphs)
            if paragraphs:
                text_by_month.setdefault(month, []).extend(paragraphs)
        if stop_marker in content:
            break
        next_links = findall(NEXT_LINK_PATTERN, content)
        if not next_links:
            # Without a next link the same page would be fetched (and counted)
            # over and over, so the chain ends here.
            break
        url = urljoin(url, next_links[0])
    return text_by_month


def top_words(text_by_month, top_n=TOP_N):
    """Return the *top_n* most common words longer than one character.

    The result is a list of ``(word, count)`` pairs. Paragraphs are joined with
    newlines so that the segmenter cannot build a word across two paragraphs.
    """
    corpus = '\n'.join(
        '\n'.join(paragraphs) for paragraphs in text_by_month.values()
    )
    long_words = (word for word in cut(corpus) if len(word) > 1)
    return Counter(long_words).most_common(top_n)


def monthly_counts(text_by_month, words):
    """Count the occurrences of every word in every month.

    Returns ``(months, data)`` where *months* is the sorted list of month
    numbers and ``data[i][j]`` is how often ``words[i]`` occurs in the text of
    ``months[j]``.
    """
    months = sorted(text_by_month)
    joined = {month: '\n'.join(text_by_month[month]) for month in months}
    data = [[joined[month].count(word) for month in months] for word in words]
    return months, data


def plot_trends(months, labels, data):
    """Draw one line per word showing its count in each month."""
    plt.rcParams['font.sans-serif'] = 'simHei'  # font that can render Chinese
    x = [str(month) for month in months]
    for idx, (label, counts) in enumerate(zip(labels, data)):
        # Reuse the styles cyclically if there are more words than styles.
        color, linestyle, marker, width = LINE_STYLES[idx % len(LINE_STYLES)]
        plt.plot(x, counts, color=color, linestyle=linestyle, marker=marker,
                 lw=width, ms=5, label=label)
    plt.legend(loc="best")
    plt.xlabel("月份")
    plt.ylabel("出现次数")
    plt.show()


def main():
    """Crawl the site, print the word statistics and show the trend plot."""
    first_links = findall(FIRST_LINK_PATTERN, fetch(HOME_URL))
    if not first_links:
        raise SystemExit('no news link found on ' + HOME_URL)
    text_by_month = crawl(urljoin(HOME_URL, first_links[0]))

    words = top_words(text_by_month)
    print(words)
    labels = [word for word, _ in words]
    months, data = monthly_counts(text_by_month, labels)
    print(data)
    print(labels)
    plot_trends(months, labels, data)


if __name__ == '__main__':
    main()
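# Note: a program written while learning web scraping, kept here as a record.
# (Blog page residue: "latest recommended article published 2025-06-04 16:29:08".)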