There is a desktop wallpaper site that countless people have scraped, and one category section in particular is a favorite of the LSP crowd: all kinds of 小姐姐 (pretty-girl) wallpaper collections, endless variety, a haul guaranteed to fill your folders. Yours truly, a humble country blogger, can barely keep up nutritionally!

Hence the article below: an asynchronous crawler. A Python async crawler case study for beauty wallpapers, 小姐姐 edition: I want them all!

Single-threaded, asynchronous, and multithreaded versions are all copy-pasted below for you. Reply "小姐姐" in the official account backend to get the source code; it works with a straight copy-paste. Don't ask me about docs or theory, I know none of it, just Ctrl+C, Ctrl+V and go!



Target site: 彼岸桌面 (netbian)
URL: http://www.netbian.com/meinv/

I only crawled the first three pages; if you're interested, feel free to adjust that yourself.
Of course you can also switch to a different category, no problem at all (see the usage sketch after the single-thread source below).
[Screenshot: single-thread run output]

[Screenshot: single-thread crawl result]

[Screenshot: single-thread crawl time]

[Screenshot: async + multithreaded image download time]

Async core source code:
async def get_content(self, url):
    async with aiohttp.ClientSession() as session:
        response = await session.get(url, headers=self.headers, timeout=5)
        content = await response.read()
        return content

async def get_parse_urls(self, url):
    content = await self.get_content(url)
    html = content.decode('gbk')
    req = etree.HTML(html)
    hrefs = req.xpath('//div[@class="list"]/ul/li/a/@href')
    print(len(hrefs))
    self.pare_urls.extend(hrefs)
def list_run(self):
    urls_tasks = []
    loop = asyncio.get_event_loop()
    for i in range(1, self.page_max + 1):
        print(f'>> Crawling list links on page {i}..')
        if i == 1:
            url = f'{self.url}{self.category}/'
        else:
            url = f'{self.url}{self.category}/index_{i}.htm'
        c = self.get_parse_urls(url)
        # Wrap the returned coroutine object into a task object
        urls_task = asyncio.ensure_future(c)
        urls_tasks.append(urls_task)
    loop.run_until_complete(asyncio.wait(urls_tasks))
    print(len(self.pare_urls))
async def get_parse(self, url):
    content = await self.get_content(url)
    html = content.decode('gbk')
    req = etree.HTML(html)
    img_url = req.xpath('//div[@class="pic"]/p/a/img/@src')[0]
    img_name = req.xpath('//div[@class="pic"]/p/a/img/@alt')[0]
    print(img_url, img_name)
    data = img_url, img_name
    self.datas.append(data)
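The fragment above only defines the coroutines; the full demo presumably also needs a driver that runs get_parse over every collected detail link. A minimal sketch of what such a driver could look like, modelled on list_run above (the method name detail_run and the URL-joining detail are my own assumptions, not the author's code):

def detail_run(self):
    detail_tasks = []
    loop = asyncio.get_event_loop()
    for href in self.pare_urls:
        # Assumes self.url ends with a slash (as list_run suggests) and
        # the list-page hrefs are site-relative paths like /desk/xxxx.htm
        detail_url = f'{self.url}{href.lstrip("/")}'
        task = asyncio.ensure_future(self.get_parse(detail_url))
        detail_tasks.append(task)
    loop.run_until_complete(asyncio.wait(detail_tasks))
    print(f'Collected {len(self.datas)} image links in total')

After this step, self.datas holds the (img_url, img_name) pairs that down_run below hands to the thread pool.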
[Screenshot: multithreaded image collection time]

Multithreading core source code (for reference):
def down_run(self):
    print('>> Starting multithreaded image download..')
    try:
        # ThreadPool() with no argument defaults to the number of CPU cores
        pool = ThreadPool()
        results = pool.map(self.down, self.datas)
        pool.close()
        pool.join()
        print("All images downloaded!")
    except:
        print("Error: unable to start thread")

def get_urllist(self, url):
    html = requests.get(url, headers=self.headers).content.decode('gbk')
    req = etree.HTML(html)
    hrefs = req.xpath('//div[@class="list"]/ul/li/a/@href')
    print(len(hrefs))
    threadings = []
    for href in hrefs:
        href = f'{self.url}{href}'
        print(f'>> Crawling {href}..')
        t = threading.Thread(target=self.parse, args=(href,))
        threadings.append(t)
        t.start()
    for x in threadings:
        x.join()
    print("Finished multithreaded collection of this page's images!")
Single-thread crawl source code (for reference):
# -*- coding=utf-8 -*-
# 彼岸桌面 (netbian) image crawler
# @author WeChat: huguo00289
# @WeChat official account: 二爷记
import requests, os, time
from lxml import etree
from fake_useragent import UserAgent


class Net(object):
    def __init__(self, category="meinv", page_max=3):
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random}
        self.url = "http://www.netbian.com"
        self.category = category
        os.makedirs(f'{self.category}/', exist_ok=True)
        self.page_max = page_max

    def get_urllist(self, url):
        html = requests.get(url, headers=self.headers).content.decode('gbk')
        req = etree.HTML(html)
        hrefs = req.xpath('//div[@class="list"]/ul/li/a/@href')
        print(len(hrefs))
        for href in hrefs:
            href = f'{self.url}{href}'
            print(f'>> Crawling {href}..')
            self.parse(href)

    def parse(self, url):
        html = requests.get(url, headers=self.headers).content.decode('gbk')
        req = etree.HTML(html)
        img_url = req.xpath('//div[@class="pic"]/p/a/img/@src')[0]
        img_name = req.xpath('//div[@class="pic"]/p/a/img/@alt')[0]
        print(img_url, img_name)
        self.down(img_url, img_name)

    def down(self, img_url, img_name):
        r = requests.get(img_url, headers=self.headers, timeout=5)
        suffix = os.path.splitext(img_url)[-1]
        with open(f'{self.category}/{img_name}{suffix}', 'wb') as f:
            f.write(r.content)
        print(f'Downloaded image {img_name}{suffix} successfully!')

    def main(self):
        for page in range(1, int(self.page_max) + 1):
            if page == 1:
                url = f'{self.url}/{self.category}/'
            else:
                url = f'{self.url}/{self.category}/index_{page}.htm'
            print(f'>> Crawling page {page}..')
            self.get_urllist(url)


if __name__ == '__main__':
    start = time.time()  # record start timestamp
    spider = Net()
    spider.main()
    end = time.time()  # record end timestamp
    print('Total run time: {} seconds'.format(end - start))  # program elapsed time
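To crawl more pages or a different category, there is no need to touch main(); the constructor already takes both as parameters. For example (page_max=5 is only an illustration, and any other category slug from the site's URL path can replace "meinv"):

if __name__ == '__main__':
    start = time.time()
    # first 5 pages of the "meinv" category; swap in another category slug as needed
    spider = Net(category="meinv", page_max=5)
    spider.main()
    print('Total run time: {} seconds'.format(time.time() - start))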
Having seen the run times above, you should now have a clearer picture of how single-threaded, multithreaded, and asynchronous crawling compare!
I have already put together a demo of the async and multithreaded crawlers; it should be useful if you are just starting out. Veterans can skip it. The code is rough, so please bear with me!

To get the complete project:
follow the WeChat official account 二爷记
and reply with the keyword "小姐姐" in the backend.