1. Import the required libraries
import requests
from lxml import etree
import time
import random
2. Function that builds the next-page link
def next_url(next_url_element):
    # The "next page" button gives a relative href; splice its file name
    # onto the book's base URL to get an absolute link.
    nxturl = 'http://www.365kk.cc/255/255036/'
    index = next_url_element.rfind('/') + 1
    nxturl += next_url_element[index:]
    return nxturl
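A quick sanity check of the splice (the chapter file name here is made up for illustration):

print(next_url('/255/255036/4147600.html'))
# -> http://www.365kk.cc/255/255036/4147600.html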
3. Data-cleaning function
def clean_data(filename, info):
    print("\n==== data cleaning started ====")
    new_filename = 'new' + filename
    f_old = open(filename, 'r', encoding='utf-8')
    f_new = open(new_filename, 'w', encoding='utf-8')
    f_new.write('== 《' + info[0] + '》\r\n')  # title
    f_new.write('== ' + info[1] + '\r\n')      # author
    f_new.write('== ' + info[2] + '\r\n')      # last-updated time
    f_new.write("=" * 10)
    f_new.write('\r\n')
    f_new.write('== ' + info[3] + '\r\n')      # introduction
    f_new.write("=" * 10)
    f_new.write('\r\n')
    lines = f_old.readlines()  # read the raw file line by line
    empty_cnt = 0              # counts consecutive empty lines
4. Iterate over every line of the raw file
    for line in lines:
        if line == '\n':        # an empty line
            empty_cnt += 1
            if empty_cnt >= 2:  # collapse runs of blank lines to at most one
                continue
        else:
            empty_cnt = 0
        if line.startswith("\u3000\u3000"):
            # drop the two full-width spaces that indent each paragraph
            line = line[2:]
            f_new.write(line)
        elif line.startswith("第"):
            # a chapter heading: set it off with a separator line
            f_new.write("\r\n")
            f_new.write("-" * 20)
            f_new.write("\r\n")
            f_new.write(line)
        else:
            f_new.write(line)
    f_old.close()
    f_new.close()
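Called on its own, the function reads filename and writes a cleaned copy prefixed with 'new'. A minimal usage sketch (the file name and info values are placeholders, not from the original run):

# assumes 'demo.txt' already exists in the working directory
clean_data('demo.txt', ['Some Book', 'Author: Someone', 'Updated: 2023-08-01', 'A one-line introduction'])
# produces 'newdemo.txt' with the header block prepended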
5. Request headers (fill in the values from your own browser)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188',
    'Cookie': 'ASP.NET_SessionId=bo0bt32byauazdbtr25knqv4; fontFamily=null; fontColor=null; fontSize=null; bg=null',
    'Host': 'www.365kk.cc',
    'Connection': 'keep-alive'
}
Concretely: open one of the captured requests in your browser's developer tools and you will find the User-Agent and Cookie values there.
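Before starting the crawl, it is worth confirming that the site accepts these headers. A minimal check (not part of the original script):

test_resp = requests.get('http://www.365kk.cc/255/255036/', headers=headers)
print(test_resp.status_code)  # expect 200 if the headers are accepted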
6. Fetch the index page and extract the book info
main_url = "http://www.365kk.cc/255/255036/"
main_resp = requests.get(main_url, headers=headers)
main_text = main_resp.content.decode('utf-8')
main_html = etree.HTML(main_text)
bookTitle = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]     # book title
author = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]  # author
update = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]  # last-updated time
introduction = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[2]/text()')[0]     # introduction
maxPages = 6     # reserved; not used in this version of the script
cnt = 0          # number of pages crawled so far
lastTitle = ''   # title of the previous page, used to detect chapter changes
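Absolute XPath expressions like the ones above break easily if the page layout changes, so it is worth printing the extracted metadata before crawling (a quick check, not part of the original script):

print(bookTitle, author, update)
print(introduction[:50])  # first 50 characters of the introduction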
7. Crawl (from the starting chapter to the end)
url = 'http://www.365kk.cc/255/255036/4147599.html'     # first page to crawl
endurl = 'http://www.365kk.cc/255/255036/4148385.html'  # the crawl stops when this page is reached (it is not fetched itself)
while url != endurl:
    cnt += 1  # count of pages crawled so far
    resp = requests.get(url, headers=headers)  # headers must be passed as a keyword argument
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//*[@class="title"]/text()')[0]
    contents = html.xpath('//*[@id="content"]/text()')
    print("cnt: {}, title = {}, url = {}".format(cnt, title, url))
    print(contents)
    with open(bookTitle + '.txt', 'a', encoding='utf-8') as f_new:
        if title != lastTitle:  # the chapter title has changed
            f_new.write(title)  # write the new chapter title
            f_new.write('\n')   # newline so the title sits on its own line for the cleaner
            lastTitle = title   # remember it
        for content in contents:
            f_new.write(content)
            f_new.write('\n\n')
    # the with block closes the file; no explicit f_new.close() is needed
    # get the link behind the "next page" button
    next_url_element = html.xpath('//*[@class="section-opt m-bottom-opt"]/a[3]/@href')[0]
    # pass it to next_url() to build the next page's URL
    url = next_url(next_url_element)
    sleepTime = random.randint(2, 5)  # wait 2-5 seconds between requests to be polite
    time.sleep(sleepTime)
clean_data(bookTitle + '.txt', [bookTitle, author, update, introduction])
print("complete!")
8. Results
Full code
import requests
from lxml import etree
import time
import random
def next_url(next_url_element):
    nxturl = 'http://www.365kk.cc/255/255036/'
    index = next_url_element.rfind('/') + 1
    nxturl += next_url_element[index:]
    return nxturl
def clean_data(filename, info):
    print("\n==== data cleaning started ====")
    new_filename = 'new' + filename
    f_old = open(filename, 'r', encoding='utf-8')
    f_new = open(new_filename, 'w', encoding='utf-8')
    f_new.write('== 《' + info[0] + '》\r\n')
    f_new.write('== ' + info[1] + '\r\n')
    f_new.write('== ' + info[2] + '\r\n')
    f_new.write("=" * 10)
    f_new.write('\r\n')
    f_new.write('== ' + info[3] + '\r\n')
    f_new.write("=" * 10)
    f_new.write('\r\n')
    lines = f_old.readlines()
    empty_cnt = 0
    for line in lines:
        if line == '\n':
            empty_cnt += 1
            if empty_cnt >= 2:
                continue
        else:
            empty_cnt = 0
        if line.startswith("\u3000\u3000"):
            line = line[2:]
            f_new.write(line)
        elif line.startswith("第"):
            f_new.write("\r\n")
            f_new.write("-" * 20)
            f_new.write("\r\n")
            f_new.write(line)
        else:
            f_new.write(line)
    f_old.close()
    f_new.close()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188',
    'Cookie': 'ASP.NET_SessionId=bo0bt32byauazdbtr25knqv4; fontFamily=null; fontColor=null; fontSize=null; bg=null',
    'Host': 'www.365kk.cc',
    'Connection': 'keep-alive'
}
main_url = "http://www.365kk.cc/255/255036/"
main_resp = requests.get(main_url, headers=headers)
main_text = main_resp.content.decode('utf-8')
main_html = etree.HTML(main_text)
bookTitle = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]
author = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]
update = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]
introduction = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[2]/text()')[0]
maxPages = 6
cnt = 0
lastTitle = ''
url = 'http://www.365kk.cc/255/255036/4147599.html'
endurl = 'http://www.365kk.cc/255/255036/4148385.html'
while url != endurl:
    cnt += 1
    resp = requests.get(url, headers=headers)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//*[@class="title"]/text()')[0]
    contents = html.xpath('//*[@id="content"]/text()')
    print("cnt: {}, title = {}, url = {}".format(cnt, title, url))
    print(contents)
    with open(bookTitle + '.txt', 'a', encoding='utf-8') as f_new:
        if title != lastTitle:
            f_new.write(title)
            f_new.write('\n')
            lastTitle = title
        for content in contents:
            f_new.write(content)
            f_new.write('\n\n')
    next_url_element = html.xpath('//*[@class="section-opt m-bottom-opt"]/a[3]/@href')[0]
    url = next_url(next_url_element)
    sleepTime = random.randint(2, 5)
    time.sleep(sleepTime)
clean_data(bookTitle + '.txt', [bookTitle, author, update, introduction])
print("complete!")