# 作业:用线程池获取数据,利用队列在别的线程写入csv文件,模拟线程出错并再丢入pool,模拟某页如果出错3次则放弃获取数据
import requests
import csv
from time import sleep
from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from random import randint
# Browser-like User-Agent header so the bilibili API serves the request
# as if it came from a regular desktop Chrome client.
headers = {
    'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
def get_resp(url):
    """Fetch *url*, simulating a random failure roughly 1 time in 5.

    On failure the page is re-submitted to the module-level thread
    ``pool`` for another attempt.  Failures per URL are counted in the
    module-level ``loser`` dict; after the 3rd failure the page is
    abandoned (the original gave up only after the 4th error, contrary
    to the stated "give up after 3 errors" requirement on line 1).

    Returns the ``requests.Response`` on success, ``None`` on failure
    (callers must check for ``None`` before using the response).
    """
    global future  # the main thread polls this future to decide when to stop
    try:
        # Simulated flaky network: ~20% of calls raise before the request.
        if randint(1, 5) == 1:
            print('出错啦出错啦')
            raise KeyError
        resp = requests.get(url=url, headers=headers)
        return resp
    except Exception:
        # Count this failure; retry until the URL has failed 3 times.
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate.
        loser[url] = loser.get(url, 0) + 1
        if loser[url] < 3:
            future = pool.submit(bilibili_main, url)
        else:
            print(f'获取{url}的数据失败啦')
def extract_info(text):
    """Pull the fields of interest out of one API response page and
    enqueue them (as a list of rows) for the CSV writer thread via the
    module-level queue ``q``.
    """
    rows = [
        [item['season_id'], item['title'], item['link'],
         item['order'], item['index_show']]
        for item in text['data']['list']
    ]
    q.put(rows)
def bilibili_main(url):
    """Worker task: fetch one result page and push its rows to the queue.

    ``get_resp`` returns ``None`` when the request failed (any retry has
    already been re-submitted to the pool by ``get_resp`` itself), so we
    must skip extraction in that case — the original unconditionally
    called ``resp.json()`` and raised ``AttributeError`` inside the pool,
    where the exception was silently swallowed by the future.
    """
    resp = get_resp(url)
    if resp is not None:
        extract_info(resp.json())
def write_data():
    """Writer thread: drain extracted rows from ``q`` into 番剧.csv.

    Writes a header row, then loops forever writing each batch of rows
    it receives, until the sentinel string ``'end'`` arrives.  The file
    is opened in a ``with`` block so it is flushed and closed when the
    thread exits (the original leaked the file handle by passing a bare
    ``open(...)`` to ``csv.writer``).
    """
    with open('番剧.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['番剧ID', '番名', '链接', '追番人数', '总集数'])
        while True:
            info = q.get()
            if info == 'end':
                return
            writer.writerows(info)
if __name__ == '__main__':
    # Queue carrying extracted page rows from worker threads to the CSV
    # writer thread.
    q = Queue()
    # Per-URL failure counter consulted by get_resp's retry logic.
    loser = {}
    pool = ThreadPoolExecutor(10)
    # Submit one fetch task per result page (pages 1..163).
    for i in range(1, 164):
        url = f'https://api.bilibili.com/pgc/season/index/result?season_version=-1&spoken_language_type=-1&area=-1&is_finish=-1©right=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page={i}&season_type=1&pagesize=20&type=1'
        future = pool.submit(bilibili_main, url)
    # Dedicated thread that drains the queue into the CSV file.
    t = Thread(target=write_data)
    t.start()
    # NOTE(review): this only polls the *last* assigned global ``future``
    # (the final page submitted, or the most recent retry re-submitted by
    # get_resp) — earlier pages may still be in flight when the 'end'
    # sentinel is queued. Confirm this looser completion check is intended.
    while not future.done():
        sleep(1)
    # Sentinel telling write_data to stop and let the writer thread exit.
    q.put('end')
本文介绍了一个使用线程池从B站API抓取番剧数据的Python代码,包括错误处理、数据队列和3次失败后放弃机制。通过CSV文件记录数据,同时演示了如何模拟线程错误和恢复策略。
3434

被折叠的评论
为什么被折叠?



