A Python Web Scraper with Threads and Coroutines (with an Execution-Time Comparison)

This post compares two implementations of a web scraper: one based on threads, one based on coroutines. Crawling the chapters of the same novel site, the multithreaded version finished in 13.42 s while the coroutine version took 22.84 s. The code first defines a function that fetches and parses a single chapter, then runs those tasks either with ThreadPoolExecutor (threads) or with asyncio (coroutines), and finally saves each chapter to a local .txt file.


Multithreaded version

# -*- coding: utf-8 -*-
import os
import time

import requests
from lxml import etree

############################# Multithreaded version: 13.422082662582397 s
### Follow one chapter URL and extract the body text (retry until the request succeeds)
def parse_single_txt(name, url):
	single_txt_data = None
	while True:
		try:
			single_txt_data = requests.get(url, timeout=10)
		except requests.RequestException:
			print("Failed to fetch chapter {}, retrying".format(name))
		else:
			# success is silent; only failures are printed
			break
	# the site serves GBK-encoded pages, so decode explicitly
	temp = etree.HTML(single_txt_data.content.decode("gbk"))
	data = temp.xpath('//*[@id="htmlContent"]/text()')
	save_single_txt(name, data)

### Save one chapter's text to its own .txt file
def save_single_txt(name, data):
	with open('./txt1/{}.txt'.format(name), 'w', encoding='utf-8') as fp:
		for i in data:
			fp.write(str(i) + "\n")

if __name__ == '__main__':
	book_url = "https://www.92qb.com/xiaoshuo/8/8233/"
	os.makedirs('./txt1', exist_ok=True)  # the worker threads write here; create it up front
	resp = None
	while True:
		try:
			resp = requests.get(book_url, timeout=10)
		except requests.RequestException:
			print("Failed to fetch the table of contents, retrying")
		else:
			print("Table of contents fetched!")
			break
	html = etree.HTML(resp.content.decode("gbk"))
	lis = html.xpath('//*[@id="header"]/div[3]/div[3]/ul/li')

	from concurrent.futures import ThreadPoolExecutor
	t1 = time.time()
	with ThreadPoolExecutor(50) as t:  # pool of 50 worker threads
		for index, single_txt in enumerate(lis):
			name = str(single_txt.xpath("./a/text()")[0])
			# "?" is illegal in Windows file names; swap in the full-width "?"
			if "?" in name:
				name = name.replace("?", "?")
			url = book_url + str(single_txt.xpath("./a/@href")[0])
			t.submit(parse_single_txt, name=str(index) + name, url=url)
	t2 = time.time()
	print(t2-t1)
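
One caveat with the fire-and-forget t.submit(...) calls above: if parse_single_txt raises (say, on a bad GBK byte), the exception is stored on the Future and never surfaces. Below is a minimal sketch of keeping the Future handles and reporting failures as they complete; the futures dict is my own addition, not part of the original post:

from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor(50) as t:
	futures = {}
	for index, single_txt in enumerate(lis):
		name = str(index) + str(single_txt.xpath("./a/text()")[0])
		url = book_url + str(single_txt.xpath("./a/@href")[0])
		# keep a handle on each Future so failures can be inspected afterwards
		futures[t.submit(parse_single_txt, name=name, url=url)] = name
	for fut in as_completed(futures):
		if fut.exception() is not None:
			print("Chapter {} failed: {}".format(futures[fut], fut.exception()))

This only adds reporting, not extra synchronization: the with block already joins the pool on exit.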

Coroutine version

# -*- coding: utf-8 -*-
import asyncio
import os
import time

import aiofiles
import aiohttp
import requests
from lxml import etree


############################# Coroutine version: 22.837141275405884 s
### Fetch one chapter, extract its text, and write it to disk, all asynchronously
async def parse_save(name, url):
	async with aiohttp.ClientSession() as session:
		async with session.get(url) as resp:
			# decode explicitly: the site serves GBK, which aiohttp may not detect
			temp = etree.HTML(await resp.text(encoding="gbk"))
			data = temp.xpath('//*[@id="htmlContent"]/text()')
			async with aiofiles.open(r'./txt/{}.txt'.format(name), mode="w", encoding="utf-8") as f:
				for i in data:
					await f.write(str(i) + "\n")


### Fetch the table of contents and fan out one coroutine per chapter
async def get_name_url(url):
	resp = None
	while True:
		try:
			# a single blocking requests call is tolerable here, before any tasks run
			resp = requests.get(url, timeout=10)
		except requests.RequestException:
			print("Failed to fetch the table of contents, retrying")
		else:
			print("Table of contents fetched!")
			break
	html = etree.HTML(resp.content.decode("gbk"))
	lis = html.xpath('//*[@id="header"]/div[3]/div[3]/ul/li')
	tasks = []

	for index, single_txt in enumerate(lis):
		name = str(single_txt.xpath("./a/text()")[0])
		# "?" is illegal in Windows file names; swap in the full-width "?"
		if "?" in name:
			name = name.replace("?", "?")
		chapter_url = url + str(single_txt.xpath("./a/@href")[0])
		tasks.append(parse_save(str(index) + name, chapter_url))
	# gather accepts bare coroutines; asyncio.wait() stopped doing so in Python 3.11
	await asyncio.gather(*tasks)



if __name__ == '__main__':
	book_url = "https://www.92qb.com/xiaoshuo/8/8233/"
	os.makedirs('./txt', exist_ok=True)  # output directory for the async writers
	t1 = time.time()
	asyncio.run(get_name_url(book_url))
	t2 = time.time()
	print(t2 - t1)
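
One plausible contributor to the coroutine version losing the race (22.8 s vs 13.4 s): parse_save opens a brand-new aiohttp.ClientSession, and with it a fresh connection pool, for every single chapter, so each request pays setup overhead and no connections are reused across chapters. Below is a minimal sketch of sharing one session across all tasks, with a semaphore to cap concurrency; fetch_chapter, crawl, and sem are illustrative names, not from the original code:

import asyncio
import aiohttp

async def fetch_chapter(session, sem, name, url):
	# the semaphore caps in-flight requests, mirroring ThreadPoolExecutor(50)
	async with sem:
		async with session.get(url) as resp:
			return name, await resp.text(encoding="gbk")

async def crawl(chapters):
	sem = asyncio.Semaphore(50)
	# one session (and one connection pool) for the whole crawl
	async with aiohttp.ClientSession() as session:
		return await asyncio.gather(
			*(fetch_chapter(session, sem, name, url) for name, url in chapters))

Whether this closes the gap depends on the site; per-task session setup and concurrency limits are the usual places where the two versions diverge.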

