这次用正则表达式进行爬取
链接地址:https://www.gushiwen.org/default_1.aspx
同样是爬取前7页的数据
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
def parse_page(url, timeout=10):
    """Download one listing page and print every poem found on it.

    The page is requested with a mobile-browser User-Agent, the poem fields
    are extracted with regular expressions, and each poem is printed as four
    lines (title, dynasty, author, content) followed by a line of 40 '='.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the whole crawl indefinitely.

    Returns:
        None. The poems are written to standard output.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection failure, timeout, ...).
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3610.2 Mobile Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)

    for poem in _extract_poems(response.text):
        print(poem['title'])
        print(poem['dynasty'])
        print(poem['author'])
        print(poem['content'])
        print('=' * 40)


def _extract_poems(text):
    """Parse the page HTML in *text* into a list of poem dicts.

    Each dict has the keys 'title', 'dynasty', 'author' and 'content'.
    """
    # re.DOTALL lets '.' match newlines, so one pattern can span several
    # lines of markup.
    # Title: the first <b>...</b> following each <div class="cont">.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Dynasty / author: the first and second <a> following each
    # <p class="source">.
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Content: the inner markup of each <div class="contson" ...>.
    content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)

    # Drop the remaining inline tags and surrounding whitespace so that
    # only the poem text is kept.
    contents = [re.sub(r'<.*?>', "", fragment).strip() for fragment in content_tags]

    # The four lists are matched independently and paired by position;
    # zip() stops at the shortest one, so unmatched trailing items are dropped.
    return [
        {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content,
        }
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]
def main():
    """Crawl listing pages 1 through 7, printing the poems of each page."""
    url_template = 'https://www.gushiwen.org/default_{}.aspx'
    # Lazily build each page address right before it is fetched.
    page_urls = (url_template.format(page_no) for page_no in range(1, 8))
    for page_url in page_urls:
        parse_page(page_url)
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
对代码中用到的知识点进行复习:
re.sub用于替换字符串中的匹配项,
参考:https://blog.csdn.net/geekleee/article/details/75309433
python里使用正则表达式的DOTALL标志,
参考:https://blog.csdn.net/caimouse/article/details/78395297
zip函数,
参考:https://www.cnblogs.com/waltsmith/p/8029539.html
https://www.cnblogs.com/wushuaishuai/p/7766470.html