使用Python语言,运用正则表达式编写代码。任务:爬取古诗文网站页面信息。要求:1、网址:https://www.gushiwen.cn/ ;2、爬取内容为网站前五页,内容包括:诗文标题、作者名、年代、诗文正文。

import requests
import re

# Request headers sent with every page fetch (browser-like UA plus a session cookie).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/109',
    'Cookie': 'Hm_lvt_9007fab6814e892d3020a64454da5a55=1731131797; HMACCOUNT=57F58C282E0BC1DF; login=flase; Hm_lpvt_9007fab6814e892d3020a64454da5a55=1731139237'
}

# Listing pages 1 through 5 of the poem index.
urls = [
    'https://www.gushiwen.cn/shiwens/default.aspx?page={}&tstr=&astr=&cstr=&xstr='.format(page)
    for page in range(1, 6)
]

# Accumulates one dict per scraped poem across all fetched pages.
gushici = []

# Patterns are compiled once instead of on every page.
TITLE_RE = re.compile('<div class="cont".*?<b>(.*?)</b>', re.DOTALL)
DYNASTY_RE = re.compile('<p class="source">.*?<a.*?><a.*?>(.*?)</a>', re.DOTALL)
AUTHOR_RE = re.compile('<p class="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
POEM_RE = re.compile('<div class="contson".*?>(.*?)</div>', re.DOTALL)
TAG_RE = re.compile('<.*?>')


def clean_poem(raw):
    """Return poem text with tags, newlines and ideographic spaces removed."""
    text = TAG_RE.sub("", raw)
    text = text.replace('\n', "").replace('\u3000', "")
    return text.strip()


def extract_fields(content):
    """Extract parallel lists (titles, dynasties, authors, poem texts) from page HTML."""
    titles = TITLE_RE.findall(content)
    dynasties = DYNASTY_RE.findall(content)
    authors = AUTHOR_RE.findall(content)
    bodies = [clean_poem(raw) for raw in POEM_RE.findall(content)]
    return titles, dynasties, authors, bodies


def build_records(titles, dynasties, authors, bodies):
    """Combine the parallel field lists into one dict per poem.

    Each record carries its own cleaned poem text. ``zip`` stops at the
    shortest list, so a field missing on the page shortens the result
    instead of raising.
    """
    return [
        {
            "诗文标题": title,
            "朝代": dynasty,
            "作者": author,
            "诗文正文": body,
        }
        for title, dynasty, author, body in zip(titles, dynasties, authors, bodies)
    ]


def parse_page(content):
    """Parse one listing page's HTML into a list of poem records."""
    return build_records(*extract_fields(content))


def fetch_page(url):
    """Download ``url`` and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on timeout, connection failure or an
            HTTP error status, so an error page is never parsed as poems.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.content.decode('utf-8')


def main():
    """Scrape every URL in ``urls`` and append the poem records to ``gushici``."""
    print(urls)
    for page_no, url in enumerate(urls, start=1):
        fields = extract_fields(fetch_page(url))
        print(*fields)
        gushici.extend(build_records(*fields))
        print("第{}页爬取成功".format(page_no))


if __name__ == "__main__":
    main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值