# ---- Multithreaded version (多线程版本) ----
import requests
from lxml import etree
import time
"""
with open('./song.txt','w',encoding='utf-8') as fp:
fp.write(resp.text)
"""
def parse_single_txt(name, url):
    """Fetch one chapter page, extract its text, and save it to disk.

    Retries the download until it succeeds, pausing between attempts so a
    dead link cannot spin the CPU.

    Args:
        name: chapter title, used as the output file name stem.
        url: absolute URL of the chapter page.
    """
    while True:
        try:
            single_txt_data = requests.get(url)
            single_txt_data.raise_for_status()  # HTTP 4xx/5xx counts as failure too
        except requests.RequestException:  # narrow: only network/HTTP errors retry
            print("章节{}信息获取失败,正在重新获取".format(name))
            time.sleep(1)  # avoid a tight busy retry loop
        else:
            break
    # The site serves GBK-encoded pages; 'ignore' guards against stray bytes
    # that would otherwise abort the whole chapter.
    temp = etree.HTML(single_txt_data.content.decode("gbk", errors="ignore"))
    data = temp.xpath('//*[@id="htmlContent"]/text()')
    save_single_txt(name, data)
def save_single_txt(name, data):
    """Write the extracted chapter lines to ./txt1/<name>.txt (UTF-8).

    Creates the output directory on first use so the script works on a
    fresh checkout (the original crashed if ./txt1 was missing).

    Args:
        name: file name stem (chapter title).
        data: iterable of text fragments, one written per line.
    """
    import os
    os.makedirs('./txt1', exist_ok=True)
    with open('./txt1/{}.txt'.format(name), 'w', encoding='utf-8') as fp:
        fp.writelines(str(i) + "\n" for i in data)
if __name__ == '__main__':
    book_url = "https://www.92qb.com/xiaoshuo/8/8233/"
    # Fetch the table-of-contents page, retrying until it succeeds.
    while True:
        try:
            resp = requests.get(book_url)
            resp.raise_for_status()  # HTTP 4xx/5xx counts as failure too
        except requests.RequestException:  # narrow: only network/HTTP errors retry
            print("目录信息获取失败,正在重新获取")
            time.sleep(1)  # avoid a tight busy retry loop
        else:
            print("目录信息获取成功!")
            break
    # TOC page is GBK-encoded; 'ignore' guards against stray bytes.
    html = etree.HTML(resp.content.decode("gbk", errors="ignore"))
    lis = html.xpath('//*[@id="header"]/div[3]/div[3]/ul/li')
    from concurrent.futures import ThreadPoolExecutor
    t1 = time.time()
    # 50 workers: chapter downloads are I/O bound, so threads overlap the waits.
    with ThreadPoolExecutor(50) as t:
        for index, single_txt in enumerate(lis):
            name = str(single_txt.xpath("./a/text()")[0])
            # '?' is illegal in Windows file names; swap in the full-width form.
            # (replace() is already a no-op when absent — no count() guard needed.)
            name = name.replace("?", "?")
            url = book_url + str(single_txt.xpath("./a/@href")[0])
            t.submit(parse_single_txt, name=str(index) + name, url=url)
    t2 = time.time()
    print(t2 - t1)
# ---- Coroutine (asyncio) version (协程版本) ----
import requests
from lxml import etree
import time
import asyncio
import aiohttp
import aiofiles
"""
with open('./song.txt','w',encoding='utf-8') as fp:
fp.write(resp.text)
"""
async def parse_save(name, url):
    """Download one chapter page, extract its text, and save it asynchronously.

    Args:
        name: output file name stem (chapter title).
        url: absolute URL of the chapter page.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # The site serves GBK pages (the TOC is decoded as gbk elsewhere);
            # without an explicit encoding aiohttp guesses and may garble text.
            page = await resp.text(encoding="gbk", errors="ignore")
    temp = etree.HTML(page)
    data = temp.xpath('//*[@id="htmlContent"]/text()')
    import os
    os.makedirs('./txt', exist_ok=True)  # original crashed if ./txt was missing
    async with aiofiles.open(r'./txt/{}.txt'.format(name), mode="w", encoding="utf-8") as f:
        for i in data:
            await f.write(str(i) + "\n")
async def get_name_url(url):
    """Fetch the table of contents, then download every chapter concurrently.

    Args:
        url: URL of the book's table-of-contents page.
    """
    # One blocking request before the event loop fans out is acceptable here.
    # Retry until it succeeds.
    while True:
        try:
            # Bug fix: the original ignored the parameter and read the global
            # book_url, so calling this with any other URL silently failed.
            resp = requests.get(url)
            resp.raise_for_status()  # HTTP 4xx/5xx counts as failure too
        except requests.RequestException:  # narrow: only network/HTTP errors retry
            print("目录信息获取失败,正在重新获取")
        else:
            print("目录信息获取成功!")
            break
    # TOC page is GBK-encoded; 'ignore' guards against stray bytes.
    html = etree.HTML(resp.content.decode("gbk", errors="ignore"))
    lis = html.xpath('//*[@id="header"]/div[3]/div[3]/ul/li')
    tasks = []
    for index, single_txt in enumerate(lis):
        name = str(single_txt.xpath("./a/text()")[0])
        # '?' is illegal in Windows file names; swap in the full-width form.
        name = name.replace("?", "?")
        chapter_url = url + str(single_txt.xpath("./a/@href")[0])
        tasks.append(parse_save(str(index) + name, chapter_url))
    # asyncio.wait() no longer accepts bare coroutines (removed in 3.11);
    # gather also propagates exceptions instead of silently collecting them.
    await asyncio.gather(*tasks)
if __name__ == '__main__':
    # Entry point: time the full asynchronous crawl of the book.
    book_url = "https://www.92qb.com/xiaoshuo/8/8233/"
    start = time.time()
    asyncio.run(get_name_url(book_url))
    elapsed = time.time() - start
    print(elapsed)