使用Python语言，运用正则表达式编写代码。任务：爬取古诗文网站页面信息。要求：1、网址：https://www.gushiwen.cn/ ；2、爬取内容为网站前五页，内容包括：诗文标题、作者名、年代、诗文正文。

原创已于 2024-11-30 13:09:46 修改 · 487 阅读

CC 4.0 BY-SA版权

文章标签：

于 2024-11-10 10:43:13 首次发布

import requests
import re

# Request headers sent with every page fetch: a desktop Chrome user-agent
# string and a fixed Cookie string. The long values are split across
# adjacent literals purely for readability; the resulting strings are
# unchanged.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/109.0.0.0 Safari/537.36 '
        'SLBrowser/9.0.3.5211 SLBChan/109'
    ),
    'Cookie': (
        'Hm_lvt_9007fab6814e892d3020a64454da5a55=1731131797; '
        'HMACCOUNT=57F58C282E0BC1DF; '
        'login=flase; '
        'Hm_lpvt_9007fab6814e892d3020a64454da5a55=1731139237'
    ),
}

# Listing-page URLs for pages 1 through 5, built from one query-string
# template and echoed once so the targets are visible before crawling.
urls = [
    'https://www.gushiwen.cn/shiwens/default.aspx?page={}&tstr=&astr=&cstr=&xstr='.format(page)
    for page in range(1, 6)
]
print(urls)
# Crawl every listing page in `urls` and collect one record per poem in
# `gushici`. Each record is a dict with the keys "诗文标题", "朝代", "作者"
# and "诗文正文".
gushici = []


def _clean_poem(raw_html):
    """Strip markup and layout whitespace from one captured poem body.

    Args:
        raw_html: Inner HTML captured from a ``<div class="contson">`` element.

    Returns:
        The text with HTML tags, newlines and U+3000 ideographic spaces
        removed, and leading/trailing whitespace trimmed.
    """
    text = re.sub(r'<.*?>', "", raw_html)
    text = text.replace('\n', "").replace('\u3000', "")
    return text.strip()


for page_no, url in enumerate(urls, start=1):
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error rather than parsing an error page and
    # then reporting the page as crawled successfully.
    response.raise_for_status()
    content = response.content.decode('utf-8')

    titles = re.findall(r'<div class="cont".*?<b>(.*?)</b>', content, re.DOTALL)
    # Inside each <p class="source">, the text of the first <a> is taken as
    # the author and the text of the <a> right after it as the dynasty.
    dynasties = re.findall(r'<p class="source">.*?<a.*?><a.*?>(.*?)</a>', content, re.DOTALL)
    authors = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', content, re.DOTALL)
    poems = re.findall(r'<div class="contson".*?>(.*?)</div>', content, re.DOTALL)
    new_poems = [_clean_poem(poem) for poem in poems]

    print(titles, dynasties, authors, new_poems)

    # Pair each title with its OWN cleaned text; the previous code stored the
    # entire page's poem list in every record.
    # NOTE(review): zip stops at the shortest list, so if the four patterns
    # match different numbers of items on a page, records are dropped or
    # misaligned -- confirm against the live markup.
    for title, dynasty, author, poem_text in zip(titles, dynasties, authors, new_poems):
        gushici.append({
            "诗文标题": title,
            "朝代": dynasty,
            "作者": author,
            "诗文正文": poem_text,
        })
    print("第{}页爬取成功".format(page_no))