python爬虫 小说下载

笔趣阁的网页结构比较简单,但也有点乱,需要注意细节。

  • 待办:需要增加运行日志(代码中的 runlog() 目前只是空函数)
#-*- coding:utf8 -*-
#从https://www.xbiquge.cc/网站下载小说
#https://www.xbiquge.cc/book/9860/
#https://www.xbiquge.cc/book/9860/7063460.html
#catalog目录,chapter章节
#r'[\u4e00-\u9fa5]+' 1到任意多个汉字
#r'\d{1,10}' 章节链接编号,章节链接在类名为box_con的第2个div中
#r'[\u4e00-\u9fa5]+\d{1,4}[\u4e00-\u9fa5]+ [\u4e00-\u9fa5]+' 小说章节名
import requests
import json
import re
import time
import os
import sys
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Cm
# Browser-like User-Agent header sent with every HTTP request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
}
# Catalogue URL of the book to download. Input shorter than 24 characters is
# replaced by a default address, which is convenient while testing.
# Alternative default used earlier: 'https://www.xbiquge.cc/book/9860/'
url = input('please input url:')
url = url if len(url) >= 24 else 'https://www.xbiquge.cc/book/14779/'
# Template of the per-book output folder; '{}' is filled with the book name.
rootPath = r'C:\Users\QQ\Desktop\ls\py\{}'
# Either hard-code ``name`` here or enable the saveCatalog() call inside
# getCatalog(); one of the two has to stay commented out.
# Alternative value used earlier: '我的微信连三界 狼烟新书'
name = '一世兵王 我本疯狂新书'
def getCatalog():
    """Extract chapter links and chapter names from the saved catalogue page.

    Two helpers are defined here. ``saveCatalog()`` downloads the catalogue
    page and stores it on disk; its call is left commented out because it
    only has to run once per book. ``findAllChapter()`` is always executed:
    it parses the stored page, fills the module globals ``cul`` (chapter
    URL list) and ``cnl`` (chapter name list) and, when both lists have the
    same length, writes them to the '章节链接' file.

    NOTE(review): all files are opened with the platform default encoding,
    exactly as before -- confirm this is intended on the target machine.
    """

    def saveCatalog():
        """Download the catalogue page, derive ``name`` from its title, save the HTML."""
        # A timeout keeps the script from hanging forever on a dead connection.
        rep = requests.get(url, headers = headers, timeout = 30)
        print(rep.text[:10])
        rep.encoding = 'gbk'
        soup = BeautifulSoup(rep.text, 'lxml')
        title = soup.title.contents[0]
        print(title)
        global name
        # Book name: the text before the first space plus the text between
        # the first pair of underscores in the page title.
        name = (re.findall('(.+?) ', title))[0] + ' ' + (re.findall('_(.+?)_', title))[0]
        print(name)
        # Create the folder that will hold every file written for this book.
        mkDir(path = rootPath.format(name))
        f1 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,'目录')
        with open(f1, 'w') as f:
            f.write(rep.text)
    #saveCatalog()  # only needs to run once per book

    def findAllChapter():
        """Parse the saved catalogue and populate the globals ``cul`` and ``cnl``."""
        global cul,cnl
        f1 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,'目录')
        f2 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,'章节链接')
        with open(f1, 'r') as f:
            soup = BeautifulSoup(f.read(), 'lxml')
        # Only the element with id="list" is searched for chapter links.
        s = str(soup.find(id='list'))
        ss = BeautifulSoup(s, 'lxml').findAll('a')
        # ChapterUrlList: 7-8 digits followed by a literal ".html".
        cul = re.findall(r'\d{7,8}\.html', s)
        # ChapterNameList. Earlier, stricter patterns missed titles such as
        # "第373章 B级任务", "010 章 ..." and a chapter 137 without a title,
        # hence this permissive form.
        cnl = re.findall(r'>(第?\d{1,4} ?章? ?.*?)<', s)
        print(len(ss),len(cul),len(cnl))
        print(cul,cnl)
        print('len(cul):',len(cul),'len(cnl):',len(cnl))

        # Regex sanity check: compare every anchor with the regex results and
        # report the first disagreement. zip() bounds the loop by the shortest
        # list, so short books no longer raise IndexError.
        for anchor,u,n in zip(ss,cul,cnl):
            c = str(anchor)
            found = re.search(r'\d{7,8}\.html', c)
            cu = found.group() if found else ''
            # The name starts 7 characters after '.html' (skipping '.html">')
            # and ends before the closing '</a>'.
            cn = c[c.index('.html')+7:-4] if '.html' in c else ''
            if cu != u or cn != n:
                print(cu,u,cu==u,cn,n,cn==n)
                break

        # Save the pairs only when both lists line up; mismatched lengths mean
        # the regular expressions need adjusting for this page.
        if len(cul) == len(cnl):
            with open(f2, 'w') as f:
                f.writelines(u + n + '\n' for u,n in zip(cul,cnl))
            print('All url and name of chapters from source have been saved in this file:{}'.format(f2))
        else:
            print('Rules require changes the regular expression')
    findAllChapter()


def mkDir(path):
    """Create ``path`` (including parent folders) unless it already exists."""
    if os.path.exists(path):
        return
    os.makedirs(path)

def missingChapter(chapter_names=None):
    """Collect chapter numbers from the chapter names and report numbering gaps.

    Args:
        chapter_names: optional list of chapter-name strings. When omitted,
            the module-level ``cnl`` list is used.

    Returns:
        list: ``[0]`` followed by the first 1-4 digit number found in each
        name (names without any digit are skipped).

    Side effects:
        Prints the list of missing chapter numbers when at least one gap
        between consecutive numbers is found.
    """
    if chapter_names is None:
        chapter_names = cnl
    nl = [0]  # chapter number list; the leading 0 anchors the first gap check
    for chapter_name in chapter_names:
        found = re.search(r'\d{1,4}', chapter_name)
        if found:
            nl.append(int(found.group()))
    ml = []  # missing chapter number list
    # Compare each number with its direct predecessor so that the gap before
    # the newest chapter is detected as well.
    for previous,current in zip(nl, nl[1:]):
        ml.extend(range(previous + 1, current))
    if ml:
        print("missing chapters' number:{}!!!".format(ml))
    return nl
def saveChapter():
    """Download chapters one by one and append their text to the book file.

    The start position comes from ``modify()``. For each remaining chapter
    the page is fetched, decoded as GBK, and every line containing four
    ``&nbsp;`` entities followed by text and a ``<`` contributes one
    paragraph. A chapter is written only when its extracted text is longer
    than 1200 characters; otherwise an empty line is written, the problem is
    reported and the download loop stops.

    Reads the module globals ``name``, ``url``, ``headers``, ``cul`` and
    ``cnl``.
    """
    f3 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,name)
    with open(f3, 'a') as f:
        # modify() re-reads this same file, so it has to run after the
        # append-mode open above has created it. It is evaluated once here
        # instead of once per slice: each call rescans the whole file,
        # prints statistics and rewrites the 'modify' file.
        start = modify()
        for cu,cn in zip(cul[start:],cnl[start:]):
            # A timeout keeps one stalled request from blocking the whole run.
            rep = requests.get(url + cu, headers = headers, timeout = 30)
            rep.encoding = 'gbk'
            paragraphs = []
            for line in rep.text.splitlines():
                found = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.+)<', line)
                if found:
                    paragraphs.append(found[0] + '\n')
            content = ''.join(paragraphs)
            if len(content)>1200:
                f.write(content)
                f.write('\n')
                print('contents has been writen to file which from : {} {}'.format(cu,cn))
            else:
                # Too little text: leave an empty line as a marker and stop.
                f.write('\n')
                print(content)
                print('There are problems in this chapter : {} {} !!!'.format(cu,cn))
                break
def runlog():
    """Placeholder for run logging; not implemented yet.

    Planned content: duration and time of each run, chapters already saved,
    missing chapters, newly added chapters and similar information.
    """
    return None
def modify():
    """Copy the book file to a 'modify' file, inserting missing chapter titles.

    The book file is first passed to ``cc()`` for character statistics, then
    copied line by line. Every empty line that is not the last line of the
    file is counted; when the line after it contains neither '第' nor '章',
    the matching entry of the global ``cnl`` is written as a title line.

    Purpose noted by the author: check the file for advertisements, stray
    characters and empty chapters, and refine ``saveChapter()`` accordingly.

    Returns:
        int: the number of counted empty lines. ``saveChapter()`` uses it as
        the index of the chapter to continue from.
    """
    f3 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,name)
    f4 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,'modify')
    with open(f3, 'r') as f, open(f4, 'w') as fs:
        cc(f)  # prints statistics and rewinds the file to its start
        c=0
        li = f.readlines()
        last = len(li) - 1
        for n,i in enumerate(li):
            fs.write(i)
            if i == '\n' and n < last:
                c+=1
                following = li[n+1]
                # The bounds check avoids IndexError when the file holds more
                # separators than there are catalogue entries.
                if '第' not in following and '章' not in following and c < len(cnl):
                    fs.write(cnl[c] + '\n')

    # cnl[c] does not exist once every catalogue entry is accounted for
    # (or when cnl is empty); report None in that case instead of crashing.
    print('c :',c,'cnl[c] :', cnl[c] if c < len(cnl) else None)
    return c
def cc(file):
    """Count character categories in an open text file and print a summary.

    The whole file is read, each character is classified, and characters
    that fit no category are written to the 'other characters' file of the
    current book (the author expects none; leftovers may indicate mojibake).
    Afterwards the file position is reset to the start so the caller can
    read the file again.

    Args:
        file: an open text file object supporting ``read()`` and ``seek()``.
    """
    f00 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,'other characters')
    # Punctuation sets, keyed like the counters below.
    # NOTE(review): set 3 is labelled Chinese punctuation but contains ASCII
    # marks such as "!", "@" and "?"; it is tested before set 4, so those are
    # counted as category 3 -- confirm whether full-width forms were intended.
    # The backslash in set 4 is written as an escaped backslash; the runtime
    # string is unchanged, but the invalid "\;" escape warning is gone.
    hs0 = {
        3 : '·、【】!@¥—~……();‘’:“”《》,。?、',
        4 : ''' `~!@#$%^&*()_+-={}|:%"<>?[]\\;',./×'''
        }
    hs = {
        1 : 0,  # Chinese characters
        2 : 0,  # English letters
        3 : 0,  # Chinese punctuation
        4 : 0,  # English punctuation marks
        5 : 0,  # digits
        6 : 0,  # newline characters (line count)
        7 : 0,  # share of Chinese characters among all characters
        }
    string = file.read()
    others = []
    for ch in string:
        code = ord(ch)
        if 19968 <= code <= 40869:
            hs[1]+=1
        elif 65 <= code <= 90 or 97 <= code <= 122:
            hs[2]+=1
        elif ch in hs0[3]:
            hs[3]+=1
        elif ch in hs0[4]:
            hs[4]+=1
        elif 48 <= code <= 57:
            hs[5]+=1
        elif ch == '\n':
            hs[6]+=1
        else:
            others.append(ch)
    # The file is always (re)created, even when nothing was left over.
    with open(f00, 'w') as f:
        f.write(''.join(others))
    # Exact ratio; an empty file yields 0 instead of dividing by zero.
    hs[7] = hs[1]/len(string) if string else 0
    file.seek(0)
    l = ['中文', 'english letter', '中文标点符号', 'english punctuation marks', '数字', '行数', '中文字数占总字符数的比例']
    for key,label in enumerate(l, start=1):
        if key == 7:
            print('{} : {:.2%}'.format(label, hs[key]))
        else:
            # Counts are shown in units of ten thousand.
            print('{} : {:.2f}万'.format(label, hs[key]/10000))
def main():
    """Run every pipeline step in order and print the elapsed time."""
    started = time.perf_counter()
    # Catalogue parsing, gap check, chapter download, then post-processing.
    for step in (getCatalog, missingChapter, saveChapter, modify):
        step()
    elapsed = time.perf_counter() - started
    minutes, seconds = divmod(elapsed, 60)

    print('total time consuming : ', minutes, 'minutes', seconds, 'seconds')
main()

需要改进的地方:

  1. 逐一访问各章节非常耗时
  2. 没有完全避开广告信息
  3. 笔趣阁网页内容经常缺失,正则表达式未完全适应所有情况
  4. 章节序号为中文数字(如“第一百章”)时,正则表达式中的 \d 无法匹配
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值