# 本人小白,刚学爬虫,代码应该有很多可优化的地方,欢迎各位大大指点,交流。感谢!!!
# coding=utf-8
from bs4 import BeautifulSoup
from urllib import request
import requests
import os
import time
# Scrape every chapter of one novel from biqukan.com and append each
# chapter's text to its own .txt file under SAVE_DIR.

BASE_URL = 'http://www.biqukan.com/'
# Index page listing all chapters of book 1_1094.
BOOK_URL = BASE_URL + '1_1094/'
SAVE_DIR = os.path.join('F:' + os.sep, 'xiaoshuo')
# The index page duplicates the newest chapters at the top of the list;
# the original code skipped them with all_a[15:].
SKIP_LINKS = 15


def fetch_html(url):
    """Download *url* and return the raw response body (bytes).

    BeautifulSoup sniffs the encoding itself, so no manual decode is done here.
    """
    response = request.urlopen(request.Request(url))
    return response.read()


def get_chapter_links(index_html):
    """Parse the book index page and return its chapter ``<a>`` tags.

    Parses the <div class="listmain"> node once and pulls the anchors
    directly (the original re-parsed str()-ified soup three times).
    """
    soup = BeautifulSoup(index_html, 'lxml')
    listmain = soup.find('div', class_='listmain')
    # Slice off the duplicated "latest chapters" block at the top.
    return listmain.find_all('a')[SKIP_LINKS:]


def download_chapter(section_name, section_url, path):
    """Fetch one chapter page, extract its body text and append it to *path*."""
    soup = BeautifulSoup(fetch_html(section_url), 'lxml')
    content = soup.find('div', id='content', class_='showtxt')
    # The site pads paragraph starts with 8 non-breaking spaces; turn each
    # run into a newline, exactly as the original replace() did.
    section_text = content.text.replace('\xa0' * 8, '\n')
    with open(path, 'a', encoding='utf-8') as file:
        file.write(section_name + '\n')
        file.write(section_text)


def main():
    """Crawl the whole book, one chapter per file."""
    # `os` was imported but unused before; without this the open() calls
    # fail whenever the target directory does not already exist.
    os.makedirs(SAVE_DIR, exist_ok=True)
    index_html = fetch_html(BOOK_URL)
    for a in get_chapter_links(index_html):
        section_name = a.text
        # hrefs on the index page are site-absolute ("/1_1094/xxx.html");
        # keep the original plain concatenation for identical URLs.
        section_full_url = BASE_URL + a['href']
        print(section_name + " " + section_full_url)
        path = os.path.join(SAVE_DIR, section_name + '.txt')
        download_chapter(section_name, section_full_url, path)
        # Be polite to the server between chapter requests.
        time.sleep(1)
        print('\033[1;32m 爬取完成*********************** \033[0m')


if __name__ == '__main__':
    main()