记录一下学习爬虫编的一个程序

最新推荐文章于 2025-06-04 16:29:08 发布

sheldonggz

最新推荐文章于 2025-06-04 16:29:08 发布

阅读量123

点赞数

文章标签： python

本文链接：https://blog.youkuaiyun.com/sheldonggz/article/details/120419140

版权

from re import findall, sub, S
from urllib.request import urlopen
from urllib.parse import  urljoin
from collections import Counter
from jieba import cut
import matplotlib.pyplot as plt
import os
import numpy as np

# Select the SimHei font so that Chinese text in the chart can be displayed.
plt.rcParams['font.sans-serif'] = 'simHei'

# Download the site's home page and decode it to text.
url = r'https://www.sdtbu.edu.cn/'
with urlopen(url) as response:
    home_html = response.read().decode()

# Pull the href of the first entry in the "news-list" block.
first_link_pattern = r'<UL class="news-list">.*?<li><a href="(.+?)"'
article_links = findall(first_link_pattern, home_html)

# Resolve the (possibly relative) href against the home page address; `url`
# now points at the first article and is the crawl's starting point.
url = urljoin(url, article_links[0])

# Crawl the news articles one by one, starting at `url` and following each
# page's "next item" link.  For every article:
#   * its paragraphs are saved to "<title>.txt" (only if that file does not
#     exist yet, so an earlier download is never overwritten);
#   * its paragraphs are appended to `stext` (all text, for word frequency);
#   * its paragraphs are appended to `text[month]`, where `month` is the
#     month number as a string without a leading zero (e.g. '5', '12').
# The crawl stops after the page that mentions 2021-01-01, when a page has
# no "next item" link, or after MAX_PAGES pages, whichever comes first.
text = {}
stext = []

MAX_PAGES = 1000
title_pattern = r'<h1 align="center" class="content-title">(.+?)</h1>'
# Capture only the month digits; this also works for months 10-12.
date_pattern = r'日期：\d+年(\d+)月\d+日'
stop_pattern = r'2021年01月01日'
para_pattern = r'<p class="MsoNormal".*?>(.+?)</p>'
next_pattern = r'下一条：<a href="(.+?)"'

for _ in range(MAX_PAGES):
    with urlopen(url) as fp:
        content = fp.read().decode()

    months = findall(date_pattern, content)
    titles = findall(title_pattern, content)
    if months and titles:
        month = str(int(months[0]))  # '05' -> '5'
        # Strip tags/entities and characters that cannot be in a file name.
        title = sub(r'<.*?>|&nbsp;|"|/', '', titles[0])

        # Clean every paragraph and drop the ones that end up empty.
        paragraphs = []
        for para in findall(para_pattern, content, S):
            para = sub(r'<.*?>|&nbsp;', '', para).strip()
            if para:
                paragraphs.append(para)

        # The existence check must happen BEFORE opening the file: opening
        # with 'w' creates/truncates it, which would make the check useless.
        filename = title + '.txt'
        if not os.path.exists(filename):
            with open(filename, 'w', encoding='utf8') as fp:
                fp.writelines(para + '\n' for para in paragraphs)

        if paragraphs:
            stext.extend(paragraphs)                         # overall word frequency
            text.setdefault(month, []).extend(paragraphs)    # per-month text
    else:
        # Not an article page in the expected layout; skip it but keep
        # following the chain of links.
        print('skipped page without title/date:', url)

    # Stop once the page dated 2021-01-01 has been processed.
    if findall(stop_pattern, content):
        break

    # Without a "next item" link there is nothing more to crawl; stopping
    # here avoids re-downloading and re-counting the same page.
    result = findall(next_pattern, content)
    if not result:
        break
    url = urljoin(url, result[0])
# Word-frequency analysis.
# Concatenate all collected paragraphs (no separator), segment the result
# with jieba, keep only words longer than one character and take the ten
# most frequent ones as (word, count) pairs.
stext = ''.join(stext)
words = filter(lambda word: len(word) > 1, cut(stext))
words = Counter(words).most_common(10)
print(words)

# Join each month's paragraphs once, up front (this does not depend on the
# word being counted).  The months are then put in the reverse of the order
# in which they were first collected into `text`.
month_texts = [''.join(paragraphs) for paragraphs in text.values()]
month_texts.reverse()

# labels[i] is the i-th most frequent word; data[i] holds how many times
# that word occurs in each month's text, in the month order built above.
labels = [word for word, _ in words]
data = [[month_text.count(word) for month_text in month_texts]
        for word in labels]
print(data)
print(labels)
# Plot, for each high-frequency word, its number of occurrences per month.
#
# The x axis is built from the month keys actually present in `text`, in
# the reverse of their insertion order, which is the same order the rows
# of `data` use.  This keeps x and every data row the same length no
# matter how many months were crawled.
x = list(text)[::-1]

# One (color, linestyle, marker, line width) entry per plotted word.
series_styles = [
    ('r', '--', '*', 1),
    ('b', ':', '*', 2),
    ('g', '-.', 'v', 3),
    ('c', '-.', '>', 4),
    ('m', '-.', '<', 5),
    ('y', '-.', 's', 4),
    ('k', '-.', 'p', 3),
    ('coral', '-.', '*', 2),
    ('darkseagreen', '-.', '*', 2),
    ('cyan', '-.', '*', 1),
]

# zip() stops at the shortest input, so fewer than ten words is fine, and
# each series is always paired with its own label.
for counts, label, (color, linestyle, marker, lw) in zip(data, labels, series_styles):
    plt.plot(x, counts, color=color, linestyle=linestyle, marker=marker,
             lw=lw, ms=5, label=label)

plt.legend(loc="best")   # legend
plt.xlabel("月份")        # x-axis title ("month")
plt.ylabel("出现次数")     # y-axis title ("occurrences")
plt.show()