将所有待下载图片的 URL 和文件名保存在一个 DataFrame 中(列名分别为 url 和 name),设置好图片的保存路径,然后运行如下代码:
import requests
import shutil
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
def save_image(session, img_url, file_name, timeout=30):
    """
    Download one image and write it to ``file_name``.

    :param session: object exposing a requests-style ``get`` method,
        e.g. a ``requests.Session()``
    :param img_url: URL of the image
    :param file_name: path of the file to write
    :param timeout: timeout value forwarded to ``session.get`` so a stalled
        server cannot block the calling thread indefinitely
    :return: True if the response status was 200 and the body was written,
        False otherwise (nothing is written in that case)
    """
    # Using the response as a context manager closes it on every path,
    # including a non-200 status or an error while copying the body.
    with session.get(str(img_url), stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            return False
        # Ask the raw stream to decode any content-encoding (gzip/deflate)
        # so the bytes written to disk are the actual image bytes.
        r.raw.decode_content = True
        with open(file_name, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return True
def download_img(img_df, target_path):
    """
    Download every image listed in ``img_df`` using a thread pool.

    :param img_df: DataFrame with a ``url`` and a ``name`` column, one row
        per image to download
    :param target_path: directory the images are saved into; it is created
        if it does not exist yet
    :return: list of ``(url, exception)`` tuples, one for each download task
        that raised an exception (empty when no task raised)
    """
    img_dir = Path(target_path)
    # The worker threads open files inside this directory, so it has to
    # exist before any task runs.
    img_dir.mkdir(parents=True, exist_ok=True)
    tasks = []
    # Context managers are exited right-to-left. The executor is listed last
    # so that it shuts down (waiting for all downloads) BEFORE the progress
    # bar and the shared session are closed.
    with requests.Session() as sess, \
            tqdm(total=len(img_df)) as bar, \
            ThreadPoolExecutor(100) as executor:
        # One task is submitted per row, so the bar total is the row count.
        for _, row in img_df.iterrows():
            task = executor.submit(save_image, sess, row['url'], img_dir / f"{row['name']}")
            task.add_done_callback(lambda fut: bar.update(1))
            tasks.append((row['url'], task))
    # Every future is finished once the executor has exited; surface the
    # exceptions that would otherwise stay hidden inside the futures.
    failures = []
    for url, task in tasks:
        exc = task.exception()
        if exc is not None:
            failures.append((url, exc))
    return failures
ThreadPoolExecutor(100) 表示创建一个最多包含 100 个工作线程的线程池,即最多同时下载 100 张图片。