import os
import time

import requests
from bs4 import BeautifulSoup
# Collect the picture-list URL for every car brand from the left-hand menu page.
def get_car_brand_url(base_url):
    car_brand = 'https://car.autohome.com.cn'
    car_brand_list = []
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(base_url, headers=headers)
    # Autohome serves GB2312-encoded HTML, so decode it explicitly before parsing.
    soup = BeautifulSoup(response.content.decode('gb2312', 'ignore'), 'html.parser')
    url_all = soup.find_all('a')
    for item in url_all:
        href = item.get('href')
        if href:  # skip anchors without an href attribute
            car_brand_list.append(car_brand + href)
    return car_brand_list
# Collect the picture-list URL of every series (model line) under each brand.
def get_car_brand_class_url(car_url_list):
    car_class_base = 'https://car.autohome.com.cn'
    car_class_list = []
    headers = {'User-Agent': 'Mozilla/5.0'}
    for item in car_url_list:
        response = requests.get(item, headers=headers)
        soup = BeautifulSoup(response.content.decode('gb2312', 'ignore'), 'html.parser')
        # Each brand page lists its series inside the 'uibox-con carpic-list02' block.
        container = soup.find('div', {'class': 'uibox-con carpic-list02'})
        if container is None:  # skip pages without a series list
            continue
        for a in container.find_all('a'):
            car_class_list.append(car_class_base + a.get('href'))
    return car_class_list
# For every series, follow the first link in the picture-category list
# ('search-pic-sortul') to reach the page that holds the image grid.
def get_brand_class_image_url(car_class_list):
    car_image_url = []
    car_base = 'https://car.autohome.com.cn'
    headers = {'User-Agent': 'Mozilla/5.0'}
    for item in car_class_list:
        response = requests.get(item, headers=headers)
        soup = BeautifulSoup(response.content.decode('gb2312', 'ignore'), 'html.parser')
        sort_list = soup.find('ul', {'class': 'search-pic-sortul'})
        if sort_list is None:  # skip series pages without a picture-category list
            continue
        car_image_url.append(car_base + sort_list.find('a').get('href'))
    return car_image_url
# Download every image on each picture page into folder_path.
def download_image(car_image_url, folder_path):
    if not os.path.exists(folder_path):  # create the target folder if it does not exist yet
        os.makedirs(folder_path)
    headers = {'User-Agent': 'Mozilla/5.0'}
    index = 0  # running counter across all pages, so files from different series do not overwrite each other
    for item in car_image_url:
        response = requests.get(item, headers=headers)
        soup = BeautifulSoup(response.content.decode('gb2312', 'ignore'), 'html.parser')
        gallery = soup.find('div', {'class': 'uibox-con carpic-list03 border-b-solid'})
        if gallery is None:  # skip pages without an image grid
            continue
        for img in gallery.find_all('img'):
            image_url = 'http:' + img.get('src')  # src values are protocol-relative ('//...')
            img_name = os.path.join(folder_path, '{}.jpg'.format(index))
            image = requests.get(image_url)
            with open(img_name, 'wb') as file:  # write the image bytes to disk
                file.write(image.content)
            print('Downloaded image %d' % index)
            index += 1
# Entry point: the As_LeftListNew page lists every brand in the left-hand menu.
base_url = 'http://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=2%20&brandId=0%20&fctId=0%20&seriesId=0'
image_url_list = get_car_brand_url(base_url)
car_class_list = get_car_brand_class_url(image_url_list)
# For a quick test, crawl a single series instead of every brand:
# car_class_list = get_car_brand_class_url(['https://car.autohome.com.cn/pic/series/4482.html#pvareaid=2042214'])
car_image_url = get_brand_class_image_url(car_class_list)
folder_path = './car_images'
download_image(car_image_url, folder_path)
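# Note: the script above fires requests back to back and assumes every page parses
# cleanly; Autohome may throttle or block such traffic. The helper below is a minimal
# sketch of a politer fetch with a timeout, a retry, and a short delay: the name
# fetch_soup, the retry count, and the delay value are assumptions of this sketch,
# not part of the original script. Each requests.get / BeautifulSoup pair above could
# be replaced by a fetch_soup(url) call, with a None check before using the result.
def fetch_soup(url, retries=3, delay=1.0):
    headers = {'User-Agent': 'Mozilla/5.0'}
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            # Autohome pages are GB2312-encoded, matching the decoding used above.
            return BeautifulSoup(response.content.decode('gb2312', 'ignore'), 'html.parser')
        except requests.RequestException:
            time.sleep(delay)  # back off briefly before retrying
    return None  # callers should skip pages that keep failing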