# 本人小白,刚学爬虫,代码应该有很多可优化的地方,欢迎各位大大指点,交流。感谢!!!
# coding=utf-8
from bs4 import BeautifulSoup
from urllib import request
import requests
import os
import time
# Scrape every chapter of one novel from biqukan.com and append each
# chapter's text to its own .txt file under SAVE_DIR.

BASE_URL = 'http://www.biqukan.com/'
# Index page listing all chapters of book 1_1094.
BOOK_URL = BASE_URL + '1_1094/'
SAVE_DIR = os.path.join('F:' + os.sep, 'xiaoshuo')
# The index page duplicates the newest chapters at the top of the list;
# the original code skipped them with all_a[15:].
SKIP_LINKS = 15


def fetch_html(url):
    """Download *url* and return the raw response body (bytes).

    BeautifulSoup sniffs the encoding itself, so no manual decode is done here.
    """
    response = request.urlopen(request.Request(url))
    return response.read()


def get_chapter_links(index_html):
    """Parse the book index page and return its chapter ``<a>`` tags.

    Parses the <div class="listmain"> node once and pulls the anchors
    directly (the original re-parsed str()-ified soup three times).
    """
    soup = BeautifulSoup(index_html, 'lxml')
    listmain = soup.find('div', class_='listmain')
    # Slice off the duplicated "latest chapters" block at the top.
    return listmain.find_all('a')[SKIP_LINKS:]


def download_chapter(section_name, section_url, path):
    """Fetch one chapter page, extract its body text and append it to *path*."""
    soup = BeautifulSoup(fetch_html(section_url), 'lxml')
    content = soup.find('div', id='content', class_='showtxt')
    # The site pads paragraph starts with 8 non-breaking spaces; turn each
    # run into a newline, exactly as the original replace() did.
    section_text = content.text.replace('\xa0' * 8, '\n')
    with open(path, 'a', encoding='utf-8') as file:
        file.write(section_name + '\n')
        file.write(section_text)


def main():
    """Crawl the whole book, one chapter per file."""
    # `os` was imported but unused before; without this the open() calls
    # fail whenever the target directory does not already exist.
    os.makedirs(SAVE_DIR, exist_ok=True)
    index_html = fetch_html(BOOK_URL)
    for a in get_chapter_links(index_html):
        section_name = a.text
        # hrefs on the index page are site-absolute ("/1_1094/xxx.html");
        # keep the original plain concatenation for identical URLs.
        section_full_url = BASE_URL + a['href']
        print(section_name + " " + section_full_url)
        path = os.path.join(SAVE_DIR, section_name + '.txt')
        download_chapter(section_name, section_full_url, path)
        # Be polite to the server between chapter requests.
        time.sleep(1)
        print('\033[1;32m 爬取完成*********************** \033[0m')


if __name__ == '__main__':
    main()