The key point here is how aiohttp is used. A response body can be read in more than one way, depending on what you want to save. For text content such as an HTML page, you read it like this:
async with aiohttp.ClientSession() as session:
    async with session.get(href) as rep:
        content = await rep.text()
        tree = etree.HTML(content)
        data = tree.xpath('//div[@class="content"]/p/text()')
        datas = "".join(data).strip("'\u3000\u3000")
When the body is HTML, reading it with aiohttp works exactly the same as with requests. For non-text (binary) content, use instead:
content = await rep.content.read()
This returns the raw bytes of the response, which is what you need when saving non-text data such as images for later use.
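As a minimal sketch of that idea (the save_binary helper, URL, and file name below are illustrative placeholders, not part of the crawler example), saving a binary response with aiofiles could look like this:

import asyncio
import aiohttp
import aiofiles

async def save_binary(url, path):
    # url and path are placeholders for illustration
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as rep:
            content = await rep.content.read()  # raw bytes, not decoded text
    # open in binary mode ("wb"), so no encoding argument
    async with aiofiles.open(path, mode="wb") as f:
        await f.write(content)

# asyncio.run(save_binary("https://example.com/cover.jpg", "cover.jpg"))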
Below is a complete coroutine-based crawling example. It fetches the chapter list of a novel from zongheng.com once with requests, then downloads every chapter concurrently with aiohttp and appends the text to a single .txt file:
import requests
import aiohttp
import aiofiles
import asyncio
from lxml import etree

def get_page_href(rep):
    # Parse the chapter index: collect every chapter link plus the book title
    print("Collecting chapter hrefs")
    tree = etree.HTML(rep)
    hrefs = tree.xpath('//ul[@class="chapter-list clearfix"]/li/a/@href')
    name = tree.xpath('//div[@class="book-meta"]/h1/text()')[0]
    print("Done")
    return hrefs, name

def get_page_source(url):
    # Fetch the chapter index page synchronously (this only happens once)
    print("Fetching page source")
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52"}
    rep = requests.get(url, headers=headers)
    rep.encoding = "utf-8"
    print("Done")
    return rep.text

async def down_load(href, name):
    # Keep retrying until the chapter has been downloaded and written successfully
    while True:
        try:
            print("Saving chapter")
            async with aiohttp.ClientSession() as session:
                async with session.get(href) as rep:
                    content = await rep.text()
                    tree = etree.HTML(content)
                    data = tree.xpath('//div[@class="content"]/p/text()')
                    datas = "".join(data).strip("'\u3000\u3000")
                    # Append the chapter text to one shared file asynchronously
                    async with aiofiles.open(f"{name}.txt", mode="a", encoding="utf-8") as f:
                        await f.write(datas)
                    print(f"Saved to {name}.txt")
                    break
        except Exception as e:
            print("Download failed", e, href)

async def main():
    url = "https://book.zongheng.com/showchapter/1147826.html"
    rep = get_page_source(url)
    hrefs, name = get_page_href(rep)
    # Schedule one download task per chapter and wait for all of them
    tasks = []
    for href in hrefs:
        task = asyncio.create_task(down_load(href, name))
        tasks.append(task)
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())