题目:爬取诗词名句网的全本三国演义。
"""Download the full text of "Romance of the Three Kingdoms" from
shicimingju.com, saving each chapter as a numbered .txt file under 三国演义/."""
import os

import requests
from bs4 import BeautifulSoup

# Output directory for the downloaded chapters.
if not os.path.exists("三国演义"):
    os.mkdir("三国演义")

url = "https://www.shicimingju.com/book/sanguoyanyi.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
                  '/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 '
                  'Safari/537.36 Edg/87.0.664.75'
}

response = requests.get(url=url, headers=headers)
# Force UTF-8: requests may mis-guess the charset and garble the
# Chinese chapter titles otherwise.
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')

# Each <a> in the table of contents links to one chapter page.
chapters = soup.select('.book-mulu > ul > li > a')
for num, each_chapter in enumerate(chapters, start=1):
    chapter_name = each_chapter.text
    # href already begins with '/', so join without a trailing slash
    # (the original produced '.com//book/...').
    chapter_url = "https://www.shicimingju.com" + each_chapter['href']
    response = requests.get(url=chapter_url, headers=headers)
    response.encoding = 'utf-8'
    chapter_soup = BeautifulSoup(response.text, 'lxml')
    chapter_content = chapter_soup.find('div', class_='chapter_content')
    filename = '三国演义/' + str(num) + '.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(chapter_name + '\n' + chapter_content.text)
    print(chapter_name + "下载完成")
题目:爬取彼岸图网的图片数据
"""Download 4K landscape wallpapers from pic.netbian.com into ./pictures."""
import os

import requests
from lxml import etree

# Output directory for the downloaded images.
if not os.path.exists('pictures'):
    os.mkdir('pictures')

url = "http://pic.netbian.com/4kfengjing/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
                  '/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 '
                  'Safari/536.36 Edg/87.0.664.75'
}

response = requests.get(url=url, headers=headers)
# The site serves GBK-encoded pages; declare it once here so the
# Chinese alt texts decode correctly, instead of the original's
# per-name encode('iso-8859-1').decode('gbk') round-trip.
response.encoding = 'gbk'
tree = etree.HTML(response.text)

a_list = tree.xpath('//ul[@class="clearfix"]/li/a')
for each_a in a_list:
    # NOTE(review): the original used ./@href here, which points at the
    # HTML detail page and would save a web page with a .jpg extension;
    # the actual image URL is the <img>'s src attribute.
    img_src = 'http://pic.netbian.com' + each_a.xpath('./img/@src')[0]
    img_name = each_a.xpath('./img/@alt')[0] + '.jpg'
    img = requests.get(url=img_src, headers=headers).content
    path = 'pictures/' + img_name
    with open(path, 'wb') as f:
        f.write(img)
    print(img_name, "下载成功")












386

被折叠的评论
为什么被折叠?



