1. Import the required libraries
import requests
from lxml import etree
import time
import random
2. Function that builds the next-page link
def next_url(next_url_element):
    # The "next page" button gives a relative href; splice its file name
    # onto the book's base URL to get an absolute link.
    nxturl = 'http://www.365kk.cc/255/255036/'
    index = next_url_element.rfind('/') + 1
    nxturl += next_url_element[index:]
    return nxturl
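A quick sanity check of the splice (the chapter file name here is made up for illustration):

print(next_url('/255/255036/4147600.html'))
# -> http://www.365kk.cc/255/255036/4147600.html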
3. Data-cleaning function
def clean_data(filename, info):
    print("\n==== data cleaning started ====")
    new_filename = 'new' + filename
    f_old = open(filename, 'r', encoding='utf-8')
    f_new = open(new_filename, 'w', encoding='utf-8')
    f_new.write('== 《' + info[0] + '》\r\n')  # title
    f_new.write('== ' + info[1] + '\r\n')      # author
    f_new.write('== ' + info[2] + '\r\n')      # last-updated time
    f_new.write("=" * 10)
    f_new.write('\r\n')
    f_new.write('== ' + info[3] + '\r\n')      # introduction
    f_new.write("=" * 10)
    f_new.write('\r\n')
    lines = f_old.readlines()  # read the raw file line by line
    empty_cnt = 0              # counts consecutive empty lines
4. Iterate over every line of the raw file
    for line in lines:
        if line == '\n':        # an empty line
            empty_cnt += 1
            if empty_cnt >= 2:  # collapse runs of blank lines to at most one
                continue
        else:
            empty_cnt = 0
        if line.startswith("\u3000\u3000"):
            # drop the two full-width spaces that indent each paragraph
            line = line[2:]
            f_new.write(line)
        elif line.startswith("第"):
            # a chapter heading: set it off with a separator line
            f_new.write("\r\n")
            f_new.write("-" * 20)
            f_new.write("\r\n")
            f_new.write(line)
        else:
            f_new.write(line)
    f_old.close()
    f_new.close()
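Called on its own, the function reads filename and writes a cleaned copy prefixed with 'new'. A minimal usage sketch (the file name and info values are placeholders, not from the original run):

# assumes 'demo.txt' already exists in the working directory
clean_data('demo.txt', ['Some Book', 'Author: Someone', 'Updated: 2023-08-01', 'A one-line introduction'])
# produces 'newdemo.txt' with the header block prepended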
5. Request headers (fill in the values from your own browser)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188',
    'Cookie': 'ASP.NET_SessionId=bo0bt32byauazdbtr25knqv4; fontFamily=null; fontColor=null; fontSize=null; bg=null',
    'Host': 'www.365kk.cc',
    'Connection': 'keep-alive'
}
Concretely: open one of the captured requests in your browser's developer tools and you will find the User-Agent and Cookie values there.
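Before starting the crawl, it is worth confirming that the site accepts these headers. A minimal check (not part of the original script):

test_resp = requests.get('http://www.365kk.cc/255/255036/', headers=headers)
print(test_resp.status_code)  # expect 200 if the headers are accepted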
6. Fetch the index page and extract the book info
main_url = "http://www.365kk.cc/255/255036/"
main_resp = requests.get(main_url, headers=headers)
main_text = main_resp.content.decode('utf-8')
main_html = etree.HTML(main_text)
bookTitle = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]     # book title
author = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]  # author
update = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]  # last-updated time
introduction = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[2]/text()')[0]     # introduction
maxPages = 6     # reserved; not used in this version of the script
cnt = 0          # number of pages crawled so far
lastTitle = ''   # title of the previous page, used to detect chapter changes
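Absolute XPath expressions like the ones above break easily if the page layout changes, so it is worth printing the extracted metadata before crawling (a quick check, not part of the original script):

print(bookTitle, author, update)
print(introduction[:50])  # first 50 characters of the introduction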
7. Crawl (from the starting chapter to the end)
url = 'http://www.365kk.cc/255/255036/4147599.html'     # first page to crawl
endurl = 'http://www.365kk.cc/255/255036/4148385.html'  # the crawl stops when this page is reached (it is not fetched itself)
while url != endurl:
    cnt += 1  # count of pages crawled so far
    resp = requests.get(url, headers=headers)  # headers must be passed as a keyword argument
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//*[@class="title"]/text()')[0]
    contents = html.xpath('//*[@id="content"]/text()')
    print("cnt: {}, title = {}, url = {}".format(cnt, title, url))
    print(contents)
    with open(bookTitle + '.txt', 'a', encoding='utf-8') as f_new:
        if title != lastTitle:  # the chapter title has changed
            f_new.write(title)  # write the new chapter title
            f_new.write('\n')   # newline so the title sits on its own line for the cleaner
            lastTitle = title   # remember it
        for content in contents:
            f_new.write(content)
            f_new.write('\n\n')
    # the with block closes the file; no explicit f_new.close() is needed
    # get the link behind the "next page" button
    next_url_element = html.xpath('//*[@class="section-opt m-bottom-opt"]/a[3]/@href')[0]
    # pass it to next_url() to build the next page's URL
    url = next_url(next_url_element)
    sleepTime = random.randint(2, 5)  # wait 2-5 seconds between requests to be polite
    time.sleep(sleepTime)
clean_data(bookTitle + '.txt', [bookTitle, author, update, introduction])
print("complete!")
8. Results
Full code
import requests
from lxml import etree
import time
import random
def next_url(next_url_element):
    nxturl = 'http://www.365kk.cc/255/255036/'
    index = next_url_element.rfind('/') + 1
    nxturl += next_url_element[index:]
    return nxturl
def clean_data(filename, info):
    print("\n==== data cleaning started ====")
    new_filename = 'new' + filename
    f_old = open(filename, 'r', encoding='utf-8')
    f_new = open(new_filename, 'w', encoding='utf-8')
    f_new.write('== 《' + info[0] + '》\r\n')
    f_new.write('== ' + info[1] + '\r\n')
    f_new.write('== ' + info[2] + '\r\n')
    f_new.write("=" * 10)
    f_new.write('\r\n')
    f_new.write('== ' + info[3] + '\r\n')
    f_new.write("=" * 10)
    f_new.write('\r\n')
    lines = f_old.readlines()
    empty_cnt = 0
    for line in lines:
        if line == '\n':
            empty_cnt += 1
            if empty_cnt >= 2:
                continue
        else:
            empty_cnt = 0
        if line.startswith("\u3000\u3000"):
            line = line[2:]
            f_new.write(line)
        elif line.startswith("第"):
            f_new.write("\r\n")
            f_new.write("-" * 20)
            f_new.write("\r\n")
            f_new.write(line)
        else:
            f_new.write(line)
    f_old.close()
    f_new.close()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188',
    'Cookie': 'ASP.NET_SessionId=bo0bt32byauazdbtr25knqv4; fontFamily=null; fontColor=null; fontSize=null; bg=null',
    'Host': 'www.365kk.cc',
    'Connection': 'keep-alive'
}
main_url = "http://www.365kk.cc/255/255036/"
main_resp = requests.get(main_url, headers=headers)
main_text = main_resp.content.decode('utf-8')
main_html = etree.HTML(main_text)
bookTitle = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]
author = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]
update = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]
introduction = main_html.xpath('/html/body/div[4]/div[1]/div/div/div[2]/div[2]/text()')[0]
maxPages = 6
cnt = 0
lastTitle = ''
url = 'http://www.365kk.cc/255/255036/4147599.html'
endurl = 'http://www.365kk.cc/255/255036/4148385.html'
while url != endurl:
    cnt += 1
    resp = requests.get(url, headers=headers)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//*[@class="title"]/text()')[0]
    contents = html.xpath('//*[@id="content"]/text()')
    print("cnt: {}, title = {}, url = {}".format(cnt, title, url))
    print(contents)
    with open(bookTitle + '.txt', 'a', encoding='utf-8') as f_new:
        if title != lastTitle:
            f_new.write(title)
            f_new.write('\n')
            lastTitle = title
        for content in contents:
            f_new.write(content)
            f_new.write('\n\n')
    next_url_element = html.xpath('//*[@class="section-opt m-bottom-opt"]/a[3]/@href')[0]
    url = next_url(next_url_element)
    sleepTime = random.randint(2, 5)
    time.sleep(sleepTime)
clean_data(bookTitle + '.txt', [bookTitle, author, update, introduction])
print("complete!")