年纪不小了,又来学编程,压力不小
网上找了不少的资料,断断续续的,学了有大半个月了,到现在,只学会了爬虫爬小说的部分
单进程爬取速度太慢,今天在网上搜集了很多的多进程资料,自己摸索多进程的爬取。
学习过程中,遇到了不少的困难,也走了不少的弯路,代码写的不够严谨和优美,欢迎老师们批评指正。
---
废话不多说,贴代码:
from multiprocessing import Pool
import time
import os
import requests
from bs4 import BeautifulSoup
# Fetch a page with requests.get and return its decoded HTML text.
def getHtml(url):
    """Download *url* and return the page HTML as a str.

    Retries up to 4 times on any request error (timeout, connection
    failure, ...). Raises RuntimeError when every attempt fails, so the
    caller decides how to handle it instead of the process being killed.
    """
    attempts = 0
    while attempts <= 3:
        try:
            resp = requests.get(url, timeout=1)
            # Decode the raw bytes with the detected encoding ourselves;
            # the pages are not guaranteed to be UTF-8.
            return resp.content.decode(resp.apparent_encoding)
        except requests.RequestException:
            # Only request-level errors trigger a retry; a bare except
            # here would also swallow KeyboardInterrupt/SystemExit.
            attempts += 1
            print(attempts)
    raise RuntimeError("failed to fetch {} after 4 attempts".format(url))
# Fetch one chapter and save it as "<title>.txt" in the current directory.
def getText(url):
    """Download the chapter at the site-relative *url* and write it to disk.

    The output file is named after the chapter title and created in the
    current working directory (run() relies on getList() having chdir'd
    into the novel's folder first).
    """
    full_url = "http://www.jingcaiyuedu.com" + url
    soup = BeautifulSoup(getHtml(full_url), 'html.parser')
    title = soup.find('h1', attrs={'class': 'readTitle'}).get_text()
    content = soup.find('div', attrs={'id': 'htmlContent'}).get_text()
    # Write explicitly as UTF-8: the platform's default locale codec may
    # not be able to encode the Chinese text.
    with open('{}.txt'.format(title), 'w', encoding='utf-8') as f:
        f.write(content)
    print("%s 保存完成..." % title)
# Fetch the novel's index page and return its chapter links.
def getList():
    """Scrape the book's index page, prepare the output directory,
    and return the chapter link tags.

    Side effects: creates a directory named after the novel (if missing)
    and chdirs into it, so subsequent getText() calls write there.
    Returns the list of <a> Tag objects found in the chapter panel.
    """
    url = "http://www.jingcaiyuedu.com/book/43206.html"
    soup = BeautifulSoup(getHtml(url), 'html.parser')
    title = soup.find('meta', attrs={'property': 'og:title'})['content']
    author = soup.find('meta', attrs={'property': 'og:novel:author'})['content']
    # One folder per novel, named by its title.
    if not os.path.exists(title):
        os.mkdir(title)
    # getText() writes relative to the cwd, so switch into the folder.
    os.chdir(title)
    print(title + " " + author)
    # NOTE(review): this grabs every <a> inside the panel — if the panel
    # ever contains non-chapter links they would be downloaded too.
    chapters = soup.find('div', attrs={'class': 'panel panel-default hidden-xs'}).find_all('a')
    return chapters
def run():
    """Download every chapter concurrently and report the elapsed time.

    Fans the chapter URLs out to a pool of 6 worker processes via
    apply_async, then waits for all of them to finish.
    """
    start = time.time()
    chapters = getList()
    # maxtasksperchild=1 recycles each worker after a single chapter,
    # keeping per-process memory from accumulating.
    pool = Pool(processes=6, maxtasksperchild=1)
    for link in chapters:
        pool.apply_async(getText, args=(link['href'],))
    pool.close()
    pool.join()
    print("小说抓取完毕,共计耗时%d秒。" % (time.time() - start))
# Entry-point guard — required so that multiprocessing workers (which
# re-import this module on Windows/spawn) do not re-run the scrape.
if __name__ == "__main__":
    run()