笔趣阁的网页结构比较简单,但也有些混乱,需要注意细节。
- 待办:需要增加运行日志
#-*- coding:utf8 -*-
#从https://www.xbiquge.cc/网站下载小说
#https://www.xbiquge.cc/book/9860/
#https://www.xbiquge.cc/book/9860/7063460.html
#catalog目录,chapter章节
#r'[\u4e00-\u9fa5]+' 1到任意多个汉字
#r'\d{1,10}' 章节链接编号,章节链接在类名为box_con的第2个div中
#r'[\u4e00-\u9fa5]+\d{1,4}[\u4e00-\u9fa5]+ [\u4e00-\u9fa5]+' 小说章节名
import requests
import json
import re
import time
import os
import sys
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Cm
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
url = input('please input url:')
if len(url) < 24:
#url = 'https://www.xbiquge.cc/book/9860/'#为了测试方便,设置默认地址
url = 'https://www.xbiquge.cc/book/14779/'
rootPath = r'C:\Users\QQ\Desktop\ls\py\{}'
#name = '我的微信连三界 狼烟新书'#name和saveCatalog()必须要注释掉一个
name = '一世兵王 我本疯狂新书'
def getCatalog():
    """Fetch and parse the novel catalog.

    ``saveCatalog`` (run once per novel) caches the raw catalog HTML to disk;
    ``findAllChapter`` re-parses that cache and extracts every chapter URL and
    chapter name.  Side effects: may update the globals ``name``, ``cul`` and
    ``cnl``; writes files under ``rootPath``.
    """

    def saveCatalog():
        # Download the catalog page once and cache the raw HTML locally so
        # later runs can re-parse it without hitting the network.
        rep = requests.get(url, headers=headers)
        print(rep.text[:10])
        rep.encoding = 'gbk'  # the site serves gbk-encoded pages
        soup = BeautifulSoup(rep.text, 'lxml')  # parse
        title = soup.title.contents[0]
        print(title)
        global name
        # Novel name = "<text before first space> <text between underscores>".
        name = re.findall('(.+?) ', title)[0] + ' ' + re.findall('_(.+?)_', title)[0]
        print(name)
        mkDir(path=rootPath.format(name))  # folder for every file saved later
        f1 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, '目录')
        with open(f1, 'w') as f:
            f.write(rep.text)
    #saveCatalog()  # only needs to run once per novel

    def findAllChapter():
        f1 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, '目录')
        f2 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, '章节链接')
        with open(f1, 'r') as f:
            rep = f.read()
        soup = BeautifulSoup(rep, 'lxml')
        # Chapter links live in the element with id="list".
        s = str(soup.find(id='list'))
        soup = BeautifulSoup(s, 'lxml')
        ss = soup.findAll('a')[:]
        global cul, cnl
        # ChapterUrlList: links look like "1234567.html".  The dot is escaped;
        # the original pattern's bare "." matched any character.
        cul = re.findall(r'\d{7,8}\.html', s)
        # ChapterNameList: "第", the space and "章" are all optional because the
        # site's chapter titles are inconsistently formatted (see revision notes).
        cnl = re.findall(r'>(第?\d{1,4} ?章? ?.*?)<', s)
        print(len(ss), len(cul), len(cnl))
        print(cul, cnl)
        print('len(cul):', len(cul), 'len(cnl):', len(cnl))
        # Sanity-check the regex extraction against the raw <a> tags.  Bounded
        # by the shortest list instead of a hard-coded 1588, so it cannot
        # raise IndexError on shorter novels.
        for i in range(min(len(ss), len(cul), len(cnl))):
            c = str(ss[i])
            cu = re.search(r'\d{7,8}\.html', c).group()
            cn = c[c.index('.html') + 7:-4]
            if cu != cul[i] or cn != cnl[i]:
                print(cu, cul[i], cu == cul[i], cn, cnl[i], cn == cnl[i])
                break
        if len(cul) == len(cnl):
            # Persist "<url><name>" per line only when the two lists agree.
            # (The original also wrote an unconditional, possibly truncated
            # copy beforehand — that duplicate write is removed.)
            with open(f2, 'w') as f:
                for u, n in zip(cul, cnl):
                    f.write(u + n + '\n')
            print('All url and name of chapters from source have been saved in this file:{}'.format(f2))
        else:
            print('Rules require changes the regular expression')  # regex must be adapted to page changes

    # If the catalog cache is missing, run saveCatalog() first; otherwise go
    # straight to extracting the chapter information.
    findAllChapter()
def mkDir(path):
    """Create *path* (including parents) if it does not already exist.

    Uses ``exist_ok=True`` instead of the original check-then-create pair,
    which was racy (the directory could appear between the check and the
    ``makedirs`` call).
    """
    os.makedirs(path, exist_ok=True)
def missingChapter(chapters=None):
    """Return ``[0]`` followed by the number parsed from each chapter title.

    While scanning, collect in ``ml`` the numbers of chapters missing from
    the sequence (gaps between consecutive chapter numbers).

    *chapters* defaults to the global ``cnl`` so the existing zero-argument
    call sites keep working; passing an explicit list makes the function
    usable (and testable) without module state.

    The original compared ``nl[i]`` with ``nl[i-1]`` *after* appending, so the
    indices lagged one element behind: i == 0 compared against the freshly
    appended last element, and the gap before the newest chapter was never
    checked.  Comparing the last two appended entries fixes both.
    """
    if chapters is None:
        chapters = cnl
    nl = [0]  # chapter number list, seeded with 0 as a sentinel
    ml = []   # missing chapter numbers
    for title in chapters:
        nl.append(int(re.search(r'\d{1,4}', title).group()))
        gap = nl[-1] - nl[-2] - 1
        while gap > 0:
            ml.append(nl[-1] - gap)
            #print("missing chapters' number:{}!!!".format(ml[-1]), gap)
            gap -= 1
    return nl
def saveChapter():
    """Download every remaining chapter and append its text to the novel file.

    The resume index comes from ``modify()``; the original evaluated
    ``modify()`` twice — once per slice bound — repeating its file rewrite
    and console output.  It is now called exactly once.

    Stops the whole run at the first chapter whose extracted text is shorter
    than 1200 characters, so the problem page can be inspected manually.
    """
    f3 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name, name)
    #print(list(zip(cul[1900:],cnl[1900:])))
    start = modify()  # resume position; adjust to the actual situation
    with open(f3, 'a') as f:
        for cu, cn in zip(cul[start:], cnl[start:]):
            rep = requests.get(url + cu, headers=headers)
            rep.encoding = 'gbk'
            content = ''
            for line in rep.text.splitlines():
                # Grab text between a leading space and the closing tag.
                found = re.findall(r' (.+)<', line)
                if found:
                    content += found[0] + '\n'
            if len(content) > 1200:  # chapters under 1200 chars are not written
                f.write(content)
                f.write('\n')
                print('contents has been written to file which from : {} {}'.format(cu, cn))
            else:
                f.write('\n')
                print(content)
                print('There are problems in this chapter : {} {} !!!'.format(cu, cn))
                break  # halt so the suspicious chapter can be checked
def runlog():
    # TODO: record per-run information — run duration, run timestamp,
    # chapters already saved, missing chapters, newly added chapters, etc.
    pass
def modify():
    """Scan the saved novel file and rebuild it with chapter headings restored.

    Copies f3 line-by-line into f4, inserting a title from the global ``cnl``
    wherever a blank separator line is not followed by a heading line.
    Returns ``c``, the count of blank separator lines seen — used by
    saveChapter() as the resume index.
    """
    # Check the file for ads, stray characters and empty chapters; refine
    # saveChapter() based on what is found here.
    f3 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,name)
    f4 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,'modify')
    with open(f3, 'r') as f, open(f4, 'w') as fs:
        cc(f)  # print character statistics; cc() rewinds the file before returning
        c=0
        li = f.readlines()
        #print(type(li),len(li))
        for n,i in enumerate(li):
            fs.write(i)
            # A bare newline (except at EOF) marks a chapter boundary.
            if i == '\n' and n < len(li)-1:
                c+=1
                # NOTE(review): `and` means a line containing either 第 or 章
                # counts as a heading — confirm `or` was not intended here.
                if '第' not in li[n+1] and '章' not in li[n+1]:
                    #print(cnl[c])
                    fs.write(cnl[c] + '\n')
                    pass
    print('c :',c,'cnl[c] :', cnl[c])
    return c
def cc(file):
    """Count the characters in *file* and print a small statistics report.

    Tallies CJK characters, ASCII letters, Chinese punctuation, English
    punctuation, digits and newlines.  Any character matching none of these
    classes is appended to an "other characters" file so mojibake can be
    spotted.  Rewinds *file* before returning so the caller can re-read it.
    """
    f00 = r'C:\Users\QQ\Desktop\ls\py\{}\{}.txt'.format(name,'other characters')
    punct = {
        3 : '·、【】!@¥—~……();‘’:“”《》,。?、',
        4 : ''' `~!@#$%^&*()_+-={}|:%"<>?[]\;',./×'''
    }
    hs = {
        1 : 0,  # CJK characters
        2 : 0,  # English letters
        3 : 0,  # Chinese punctuation
        4 : 0,  # English punctuation
        5 : 0,  # digits
        6 : 0,  # line count
        7 : 0,  # share of CJK characters among all characters
    }
    string = file.read()
    with open(f00, 'w') as f:
        for ch in string:
            code = ord(ch)
            if 19968 <= code <= 40869:
                hs[1] += 1
            elif 65 <= code <= 90 or 97 <= code <= 122:
                hs[2] += 1
            elif ch in punct[3]:
                hs[3] += 1
            elif ch in punct[4]:
                hs[4] += 1
            elif 48 <= code <= 57:
                hs[5] += 1
            elif ch == '\n':
                hs[6] += 1
            else:
                # Anything else is suspect — probably mojibake; dump for review.
                f.write(ch)
    # The original divided by len(string) + 1 to dodge ZeroDivisionError,
    # which skewed the ratio; compute it exactly and special-case empty input.
    hs[7] = hs[1] / len(string) if string else 0.0
    file.seek(0)
    labels = ['中文', 'english letter', '中文标点符号', 'english punctuation marks', '数字', '行数', '中文字数占总字符数的比例']
    for i in range(7):
        if i == 6:
            print('{} : {:.2%}'.format(labels[i], hs[i+1]))
        else:
            print('{} : {:.2f}万'.format(labels[i], hs[i+1]/10000))
def main():
    """Run the full pipeline — catalog, gap check, download, post-process —
    and report the elapsed wall-clock time."""
    t0 = time.perf_counter()
    getCatalog()
    missingChapter()
    saveChapter()
    modify()
    elapsed = time.perf_counter() - t0
    minutes, seconds = divmod(elapsed, 60)
    print('total time consuming : ', minutes, 'minutes', seconds, 'seconds')
# Guard the entry point so importing this module does not start a download.
if __name__ == '__main__':
    main()
需要改进的地方:
- 逐一顺序访问各章节非常耗时(可考虑并发下载)
- 没有完全避开广告信息
- 笔趣阁网页内容经常缺失,正则表达式尚未完全适应所有情况
- 章节序号为中文数字时,现有正则无法匹配