题目:爬取诗词名句网的全本三国演义。
"""Download the full text of "Romance of the Three Kingdoms" from
shicimingju.com, saving each chapter as a numbered .txt file under 三国演义/."""
import os

import requests
from bs4 import BeautifulSoup

# Output directory for the downloaded chapters.
if not os.path.exists("三国演义"):
    os.mkdir("三国演义")

url = "https://www.shicimingju.com/book/sanguoyanyi.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
                  '/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 '
                  'Safari/537.36 Edg/87.0.664.75'
}

response = requests.get(url=url, headers=headers)
# Force UTF-8: requests may mis-guess the charset and garble the
# Chinese chapter titles otherwise.
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')

# Each <a> in the table of contents links to one chapter page.
chapters = soup.select('.book-mulu > ul > li > a')
for num, each_chapter in enumerate(chapters, start=1):
    chapter_name = each_chapter.text
    # href already begins with '/', so join without a trailing slash
    # (the original produced '.com//book/...').
    chapter_url = "https://www.shicimingju.com" + each_chapter['href']
    response = requests.get(url=chapter_url, headers=headers)
    response.encoding = 'utf-8'
    chapter_soup = BeautifulSoup(response.text, 'lxml')
    chapter_content = chapter_soup.find('div', class_='chapter_content')
    filename = '三国演义/' + str(num) + '.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(chapter_name + '\n' + chapter_content.text)
    print(chapter_name + "下载完成")
题目:爬取彼岸图网的图片数据
"""Download 4K landscape wallpapers from pic.netbian.com into ./pictures."""
import os

import requests
from lxml import etree

# Output directory for the downloaded images.
if not os.path.exists('pictures'):
    os.mkdir('pictures')

url = "http://pic.netbian.com/4kfengjing/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
                  '/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 '
                  'Safari/536.36 Edg/87.0.664.75'
}

response = requests.get(url=url, headers=headers)
# The site serves GBK-encoded pages; declare it once here so the
# Chinese alt texts decode correctly, instead of the original's
# per-name encode('iso-8859-1').decode('gbk') round-trip.
response.encoding = 'gbk'
tree = etree.HTML(response.text)

a_list = tree.xpath('//ul[@class="clearfix"]/li/a')
for each_a in a_list:
    # NOTE(review): the original used ./@href here, which points at the
    # HTML detail page and would save a web page with a .jpg extension;
    # the actual image URL is the <img>'s src attribute.
    img_src = 'http://pic.netbian.com' + each_a.xpath('./img/@src')[0]
    img_name = each_a.xpath('./img/@alt')[0] + '.jpg'
    img = requests.get(url=img_src, headers=headers).content
    path = 'pictures/' + img_name
    with open(path, 'wb') as f:
        f.write(img)
    print(img_name, "下载成功")












386

被折叠的评论
为什么被折叠?



