import os
import re
import urllib.request
import urllib.error
from multiprocessing import Process
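
# Multiprocess scraper for www.x23qb.com: each worker downloads a contiguous
# range of book IDs, saving one text file per novel under base_path and a
# per-process log that is used to resume interrupted runs.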
interval = 1000          # how many book IDs each worker process handles
process_start = 1        # first worker number
process_end = 30         # last worker number (inclusive)
base_path = r'f:\novel'  # output directory for novels and logs


def get_novel(args, inr):
    """Download every book whose ID falls in worker number args's range."""
    ite = (args - 1) * inr
    process = str(args)
    log_path = base_path + r'\log' + str(args) + '.txt'
    base_url = 'https://www.x23qb.com'
    chapter_base_url = base_url + r'/book/'
    # Patterns for the chapter list block and the book title on the index page.
    urls_complete_reg = compile_reg(r'<ul class="chaw_c" id="chapterList">(.*?)<div class="chaptername">',
                                    re.S)
    urls_reg = compile_reg(r'<li><a href="(.*?)">(第.*?)</a></li>', 0)
    title_reg = compile_reg(r'<title>(.*?)全.*?</title>', 0)
    # Chapter body patterns: one for pages followed by a continuation page,
    # one for the final (or only) page of a chapter.
    chapter_reg_has_next = compile_reg(r'<dt class="rd"><script>chapter.*?;</script></dt>(.*?)<p style=',
                                       re.S)
    chapter_reg_no_next = compile_reg(r'<dt class="rd"><script>chapter.*?;</script></dt>(.*?)<p id="p1nt"',
                                      re.S)
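    # Resume from the last book ID recorded in this process's log, so an
    # interrupted run picks up where it left off.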
    if os.path.isfile(log_path):
        with open(log_path, 'r') as f1:
            log_list = f1.readlines()
            ite = int(log_list[-1])
            ite = ite - 1
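    # Main loop: book IDs (args - 1) * inr + 1 through args * inr.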
    while ite < args * inr:
        ite = ite + 1
        print('\033[1;34mProcess: ' + process + ' \033[0m preparing to download book: ' + str(ite))
        with open(log_path, 'a') as k:
            k.write(str(ite) + '\n')
        html = get_html(chapter_base_url + str(ite) + '/', process)
        urls_html = re.findall(urls_complete_reg, html)
        title = re.findall(title_reg, html)
        # Skip IDs whose page lacks a title or chapter list (no such book).
        if not (title and urls_html):
            continue
        urls = re.findall(urls_reg, urls_html[0])
        # '?' is illegal in Windows file names.
        title[0] = title[0].replace('?', '')
        novel_path = os.path.join(base_path, str(ite) + ' ' + title[0] + '.txt')
        # Remove any partial file from an earlier run before appending.
        if os.path.exists(novel_path):
            os.remove(novel_path)
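        # Fetch each chapter; urls holds (href, title) pairs from urls_reg.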
        for url in urls:
            count = 1
            chapter_url = base_url + url[0]
            chapter_title = url[1]
            chapter_html = get_html(chapter_url, process)
            chapter_content = re.findall(chapter_reg_has_next, chapter_html)
            if chapter_content:
                # Trim the trailing markup fragment left by the match.
                chapter_content[0] = chapter_content[0][:-4]
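                # The "has next" pattern matched, so the chapter continues on
                # pages named <chapter>_2.html, <chapter>_3.html, and so on;
                # fetch and append each one until the final page is reached.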
                while True:
                    count = count + 1
                    chapter_next_url = chapter_url.replace('.html', '') + '_' + str(count) + '.html'
                    chapter_next_html = get_html(chapter_next_url, process)
                    chapter_next_content = re.findall(chapter_reg_has_next, chapter_next_html)
                    if chapter_next_content:
                        # Strip the fixed-width tag fragments around the text.
                        chapter_next_content[0] = chapter_next_content[0][8:-10]
                        chapter_content[0] = chapter_content[0] + chapter_next_content[0]
                    else:
                        chapter_next_content = re.findall(chapter_reg_no_next, chapter_next_html)
                        # Guard against pages where neither pattern matches,
                        # which would otherwise raise an IndexError.
                        if chapter_next_content:
                            chapter_next_content[0] = chapter_next_content[0][8:-10]
                            chapter_content[0] = chapter_content[0] + chapter_next_content[0]
                        break
            else:
                # No continuation marker: the whole chapter is on one page.
                chapter_content = re.findall(chapter_reg_no_next, chapter_html)
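            # Convert the extracted HTML to plain text and append to the file.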
            for content in chapter_content:
                content = chapter_title + content
                content = content.replace('</p>', '\n')
                content = content.replace('<p>', '')
                with open(novel_path, 'a') as f:
                    f.write(content)
                print('\033[1;34mProcess: ' + process + ' \033[0m' + str(ite) + ' '
                      + title[0] + ' ' + chapter_title + ' downloaded successfully')


def get_html(url, process):
    """Fetch url with unlimited retries and decode the GBK response."""
    count = 0
    while True:
        count = count + 1
        try:
            html = urllib.request.urlopen(url, timeout=10).read()
            if html:
                break
        except Exception:
            print('\033[1;34mProcess: ' + process + '\033[0m \033[1;31mnetwork problem, retrying request, attempt ' + str(count) + '\033[0m')
    html = html.decode('gbk', 'ignore')
    return html


def compile_reg(reg, flags):
    return re.compile(reg, flags)


if __name__ == "__main__":
    # Spawn one worker per ID range; the parent does not wait for them.
    for i in range(process_start, process_end + 1):
        p = Process(target=get_novel, args=(i, interval))
        p.start()
        print('Process ' + str(i) + ' started successfully')
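
# If the parent should instead block until every worker finishes (for
# example, to report overall completion), one option is to keep the Process
# handles and join them. A minimal sketch, using the same names as above:
#
#     procs = [Process(target=get_novel, args=(i, interval))
#              for i in range(process_start, process_end + 1)]
#     for p in procs:
#         p.start()
#     for p in procs:
#         p.join()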