Learning Python: a web-novel scraper

# coding:utf8
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing
import requests, os, codecs, time
from lxml import etree

url = 'https://www.biquge5200.cc/79_79883/'  # chapter list page of the novel to download


def getsource(url):
    # Fetch a page and return its decoded HTML, or None if the request fails.
    try:
        s = requests.get(url, timeout=10)
    except requests.RequestException:
        print('Request failed, skipping!')
        return None
    else:
        s.encoding = 'gbk'  # the site serves GBK-encoded pages
        return s.text
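Hard-coding s.encoding = 'gbk' will garble the text if the site ever serves a different charset. A small optional variant (a sketch only; the function name, retry count and sleep are arbitrary) lets requests detect the encoding and retries a few times before giving up:

import time
import requests

def getsource_safe(url, retries=3):
    # Hypothetical alternative to getsource(): retry on failure and fall back
    # to the encoding requests detects from the response body.
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            r.encoding = r.apparent_encoding  # detect instead of assuming GBK
            return r.text
        except requests.RequestException:
            time.sleep(1)  # brief pause before the next attempt
    return None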


def getlist(url):
    # Parse the chapter list page: collect every chapter URL and title, plus
    # the book name and author used for the save path and merged file header.
    global txtname, txtzz
    html = getsource(url)
    ehtml = etree.HTML(html)
    u = ehtml.xpath('//*[@id="list"]/dl/dd/a/@href')
    t = ehtml.xpath('//*[@id="list"]/dl/dd/a/text()')
    # Strip characters that are not allowed in Windows file names.
    txtname = ehtml.xpath('//*[@id="info"]/h1/text()')[0]
    for ch in '\\/:*?"<>|':
        txtname = txtname.replace(ch, '')
    txtzz = ehtml.xpath('//*[@id="info"]/p[1]/text()')[0].replace('\xa0', '')
    num = 0
    # The first 9 <dd> entries are the "latest chapters" block, so skip them.
    for i in range(9, len(u)):
        urllist.append(u[i] + '|' + t[i] + '|' + str(num))
        print(u[i] + '|' + t[i] + '|' + str(num))
        num += 1
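To sanity-check the XPath expressions without hitting the site, they can be run against a stand-alone snippet. The markup below is only an assumption modelled on the selectors above (#info for book metadata, #list for chapter links), not the site's real page:

from lxml import etree

sample = '''
<div id="info"><h1>Some Book</h1><p>Author: Someone</p></div>
<div id="list"><dl>
  <dd><a href="https://example.com/1.html">Chapter 1</a></dd>
  <dd><a href="https://example.com/2.html">Chapter 2</a></dd>
</dl></div>
'''
doc = etree.HTML(sample)
print(doc.xpath('//*[@id="info"]/h1/text()'))      # ['Some Book']
print(doc.xpath('//*[@id="list"]/dl/dd/a/@href'))  # chapter URLs
print(doc.xpath('//*[@id="list"]/dl/dd/a/text()')) # chapter titles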


def downtxt(url):
    # Download a single chapter. `url` is a "chapter_url|title|index" string
    # built by getlist(); the index keeps the merge step in reading order.
    global downcount
    u, t, num = url.split('|')
    content = ''
    # Retry until the chapter body is non-empty (the site occasionally
    # returns an empty or broken page).
    while len(content) == 0:
        html = getsource(u)
        if html is None:
            time.sleep(1)
            continue
        ehtml = etree.HTML(html)
        # Turn the site's paragraph spacing into line breaks and drop
        # stray characters and ad markers.
        content = ehtml.xpath('string(//*[@id="content"])').replace(' ', '\r\n').replace('  ', '\r\n').replace(
            '\xa0', '').replace('\ufffd', '').replace('\u266a', '').replace('readx;', '')
    if os.path.exists(savepath + num + '.txt'):
        print(num + '.txt already exists!')
    else:
        with codecs.open(savepath + num + '.txt', 'a', encoding='utf-8') as f:
            f.write('\r\n' + t + '\r\n' + content)
        print(t + ' downloaded!')
        downcount += 1
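Because downtxt() runs in several worker threads at once, the bare downcount += 1 is not strictly atomic; CPython's GIL usually hides this, but a lock makes the intent explicit. A minimal sketch (the lock and helper names are illustrative, not part of the original script):

import threading

downcount = 0
downcount_lock = threading.Lock()

def add_downloaded():
    # Increment the shared counter under a lock so concurrent workers
    # cannot lose an update.
    global downcount
    with downcount_lock:
        downcount += 1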


time_start = time.time()
downcount = 0
urllist = []
getlist(url)
savepath = os.path.join(os.getcwd(), txtname) + os.sep
if not os.path.exists(savepath):
    os.makedirs(savepath)

# Download chapters concurrently, one worker thread per CPU core.
pool = ThreadPool(multiprocessing.cpu_count())
pool.map(downtxt, urllist)
pool.close()
pool.join()

print('Merging chapter files...')
# Concatenate the numbered chapter files, in order, into a single book file,
# deleting each piece after it has been appended.
with codecs.open(savepath + txtname + '.txt', 'a', encoding='utf-8') as f:
    f.write(txtname)
    f.write('\r\n')
    f.write(txtzz)
    f.write('\r\n')
    for i in range(0, len(urllist)):
        with open(savepath + str(i) + '.txt', 'r', encoding='utf-8') as fr:
            txt = fr.read()
        f.write(txt)
        f.write('===========================')
        os.remove(savepath + str(i) + '.txt')
print('Novel merged!')

print('')
print('*' * 15 + ' Task finished, summary: ' + '*' * 15)
print('')
print('<' + txtname + '> downloaded; chapter pages fetched: ' + str(downcount))
print('')
print('Elapsed: ' + str(time.time() - time_start) + ' s')
print('')
print('*' * 51)

Reposted from: https://www.cnblogs.com/hfct/p/10977974.html
