import requests
import re
# HTTP request headers passed to requests.get() for each page fetch:
# a desktop-browser User-Agent string and a fixed Cookie string.
# NOTE(review): the cookie values (including the 'login=flase' spelling) are
# reproduced verbatim; presumably they were copied from a browser session --
# confirm before "correcting" or refreshing them.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/109.0.0.0 Safari/537.36 '
        'SLBrowser/9.0.3.5211 SLBChan/109'
    ),
    # Individual cookies are joined with "; " to form one Cookie header value.
    'Cookie': '; '.join([
        'Hm_lvt_9007fab6814e892d3020a64454da5a55=1731131797',
        'HMACCOUNT=57F58C282E0BC1DF',
        'login=flase',
        'Hm_lpvt_9007fab6814e892d3020a64454da5a55=1731139237',
    ]),
}
# Listing-page URL template; "{}" is filled with the 1-based page number.
PAGE_URL_TEMPLATE = 'https://www.gushiwen.cn/shiwens/default.aspx?page={}&tstr=&astr=&cstr=&xstr='
# Number of listing pages to crawl, counting from page 1.
PAGE_COUNT = 5

# One URL per listing page, in ascending page order.
urls = [PAGE_URL_TEMPLATE.format(page) for page in range(1, PAGE_COUNT + 1)]
print(urls)
def _clean_poem(raw_html):
    """Return the poem text with markup and layout whitespace removed.

    Substrings matching the non-greedy pattern ``<.*?>`` (HTML tags that do
    not span a line break) are dropped, newlines and the ideographic space
    U+3000 are removed, and the result is stripped of surrounding whitespace.
    """
    text = re.sub('<.*?>', "", raw_html)
    text = text.replace('\n', "").replace('\u3000', "")
    return text.strip()


# Accumulates one dict per poem across all crawled pages.
gushici = []
for page_no, url in enumerate(urls, start=1):
    try:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Do not parse (and report success for) an HTTP error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        print("第{}页爬取失败: {}".format(page_no, exc))
        continue

    content = response.content.decode('utf-8')

    # Each pattern yields one list per page; entries are paired up by position.
    titles = re.findall('<div class="cont".*?<b>(.*?)</b>', content, re.DOTALL)
    dynasties = re.findall('<p class="source">.*?<a.*?><a.*?>(.*?)</a>', content, re.DOTALL)
    authors = re.findall('<p class="source">.*?<a.*?>(.*?)</a>', content, re.DOTALL)
    raw_poems = re.findall('<div class="contson".*?>(.*?)</div>', content, re.DOTALL)

    new_poems = [_clean_poem(raw) for raw in raw_poems]
    print(titles, dynasties, authors, new_poems)

    # Pair each title with its own cleaned poem text. zip() stops at the
    # shortest list, so a page where the patterns match different numbers of
    # items yields fewer records rather than raising.
    for title, dynasty, author, poem in zip(titles, dynasties, authors, new_poems):
        gushici.append({
            "诗文标题": title,
            "朝代": dynasty,
            "作者": author,
            "诗文正文": poem,
        })

    print("第{}页爬取成功".format(page_no))
使用Python语言,运用正则表达式编写代码。任务:爬取古诗文网站页面信息。要求:1、网址:https://www.gushiwen.cn/ 2、爬取内容为网站前五页,内容包括:诗文标题、作者名、年代、
于 2024-11-10 10:43:13 首次发布