import concurrent.futures
import time
from pathlib import Path

import requests
from lxml import etree
# Shared request headers: a Referer from the site itself plus a desktop
# browser User-Agent, so douban serves the normal HTML page instead of
# rejecting the scraper as a bot.
headers = {
"Referer": "https://movie.douban.com/top250?start=225&filter=",
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
}
def download_image(url, name):
    """
    Download an image and save it as ``img1/<name>.jpg``.

    Parameters
    ----------
    url : str
        Direct URL of the image file.
    name : str
        Base filename (without extension) for the saved image.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    requests.Timeout
        If the request takes longer than the timeout.
    """
    # timeout: fail fast instead of hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Don't silently write an HTML error page into a .jpg file.
    response.raise_for_status()
    # Create the output directory on first use so the script works
    # from a fresh checkout.
    out_dir = Path('img1')
    out_dir.mkdir(exist_ok=True)
    (out_dir / f'{name}.jpg').write_bytes(response.content)
def crawl_page(url):
    """
    Fetch one Top-250 listing page and download every movie's poster.

    Parameters
    ----------
    url : str
        URL of a douban Top-250 page (up to 25 movies per page).

    Raises
    ------
    requests.HTTPError
        If the listing page request fails with a 4xx/5xx status.
    """
    # timeout: fail fast instead of hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    tree = etree.HTML(response.text)
    # Iterate the <li> items directly instead of probing fixed absolute
    # indices 1..25 — handles pages with fewer entries and halves the
    # number of XPath evaluations.
    for item in tree.xpath('//*[@id="content"]/div/div[1]/ol/li'):
        src = item.xpath('./div/div[1]/a/img/@src')
        title = item.xpath('./div/div[2]/div[1]/a/span[1]/text()')
        # Skip malformed entries that lack either an image or a title.
        if src and title:
            print(title[0])
            download_image(str(src[0]), title[0])
if __name__ == '__main__':
    start_time = time.time()
    # One URL per page of 25 movies: start=0, 25, ..., 225.
    urls = [f'https://movie.douban.com/top250?start={i}&filter=' for i in range(0, 226, 25)]
    # Threads suit this I/O-bound workload; cap at 4 concurrent requests
    # to stay polite to the server.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Consume the iterator so exceptions raised inside workers
        # propagate here instead of being silently discarded by map().
        list(executor.map(crawl_page, urls))
    end_time = time.time()
    print("总耗时:", end_time - start_time)
# Observed total run time: 13.306219339370728 s