import requests
from lxml import etree
import re
import time
n = int(input("Enter the number of chapters of the novel to scrape: "))
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
}
# Create the output file if it does not already exist
# (the with statement closes the handle, so no explicit close() is needed).
with open('novel_contents.txt', 'a+', encoding='utf-8'):
    pass
for i in range(n):
    url = f"https://www.3bqg.cc/book/10376/{i + 1}.html"
    response = requests.get(url, headers=headers)
    tree = etree.HTML(response.text)
    print(f'Fetching chapter {i + 1}...')
    text = tree.xpath('//div[@class="Readarea ReadAjax_content"]//text()')

    # Strip all whitespace from each extracted text node.
    novel_content = []
    for line in text:
        cleaned_line = re.sub(r'\s+', '', line)
        novel_content.append(cleaned_line)

    # Append this chapter to the file: indent the first line by two
    # characters and separate chapters with a divider line.
    with open('novel_contents.txt', 'a', encoding='utf-8') as f:
        if i > 0:
            f.write("===============\n")
        if len(novel_content) > 0:
            # Two-character indent on the chapter's first line.
            f.write("  " + novel_content[0] + '\n')
            for line in novel_content[1:]:
                f.write(line + '\n')
        else:
            f.write('\n')

    # Pause between requests to avoid hammering the server.
    time.sleep(3)
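The loop above assumes every request succeeds and that requests guesses the page encoding correctly; if either assumption fails, a blank or garbled chapter is written silently. Below is a minimal sketch of a more defensive fetch step, relying on the requests and lxml imports already at the top of the script. The helper name fetch_chapter_text is illustrative and not part of the original script; it could replace the requests.get / etree.HTML / xpath lines inside the loop without changing the file-writing logic.

def fetch_chapter_text(url, headers):
    # Fetch one chapter page and return its raw text nodes.
    # Uses the same XPath as the loop above; raises on HTTP errors
    # instead of silently producing an empty chapter.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()                     # fail loudly on 4xx/5xx
    response.encoding = response.apparent_encoding  # guard against mojibake
    tree = etree.HTML(response.text)
    return tree.xpath('//div[@class="Readarea ReadAjax_content"]//text()')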