# 从百度图片搜索中爬取所需的图片。
# -*- coding: utf-8 -*-
import re, os
import requests
from urllib.request import urlretrieve
def download1(url, filename, filepath):
    """Download ``url`` with ``requests`` and store it as ``filepath/filename``.

    The download is best-effort: every failure is reported on stdout and the
    function returns ``None`` without raising. A file that already exists is
    left untouched and no request is made.

    Args:
        url: Address of the image to fetch.
        filename: Name of the file to create inside ``filepath``.
        filepath: Directory that receives the file; it must already exist.
    """
    full_name = os.path.join(filepath, filename)
    if os.path.exists(full_name):
        print("【消息】文件已经存在:", full_name)
        # Skip the download instead of fetching and overwriting the file again.
        return
    try:
        pic = requests.get(url, timeout=5)
        # Treat HTTP errors (404, 403, ...) as failures so that an error page
        # is never written to disk under an image file name.
        pic.raise_for_status()
    except Exception as exc:
        # Best-effort boundary: any download problem skips this image, but
        # KeyboardInterrupt/SystemExit still propagate so a crawl can be
        # aborted (a bare ``except:`` would swallow them).
        print('【错误】当前图片无法下载', exc)
        return
    try:
        with open(full_name, 'wb') as wf:
            wf.write(pic.content)
    except OSError as exc:
        print("【错误】写入失败", exc)
def download2(url, filename, filepath):
    """Download ``url`` with ``urlretrieve`` and store it as ``filepath/filename``.

    The download is best-effort: a failure is reported on stdout and the
    function returns ``None`` without raising. A file that already exists is
    left untouched and no request is made.

    Args:
        url: Address of the image to fetch.
        filename: Name of the file to create inside ``filepath``.
        filepath: Directory that receives the file; it must already exist.
    """
    full_name = os.path.join(filepath, filename)
    if os.path.exists(full_name):
        print("【消息】文件已经存在:", full_name)
        # Skip the download instead of fetching and overwriting the file again.
        return
    try:
        urlretrieve(url, full_name)
    except Exception as exc:
        # Best-effort boundary: any download problem skips this image, but
        # KeyboardInterrupt/SystemExit still propagate so a crawl can be
        # aborted (a bare ``except:`` would swallow them).
        print('【错误】当前图片无法下载', exc)
def search(word, local_path="./data/down/", page=None, keep_original_name=True):
    """Fetch one Baidu image result page for ``word`` and download its images.

    Every ``objURL`` entry found in the returned HTML is a download candidate.
    Only candidates whose file name has the form ``<stem>.<ext>`` with ``ext``
    being ``jpg``, ``JPG``, ``png`` or ``PNG`` are handed to ``download1``;
    the others are reported on stdout and skipped.

    Args:
        word: Search keyword; also the name of the sub-directory created
            below ``local_path`` to hold the images.
        local_path: Parent directory for the downloads.
        page: Result page number; the request uses ``pn = 20 * page``.
            ``None`` is treated as page 0.
        keep_original_name: When false, ``-<page>-<index>`` is inserted
            before the extension, where ``index`` is the 1-based position of
            the candidate on the page.

    Raises:
        requests.RequestException: If the result page itself cannot be
            fetched (individual image failures are handled by ``download1``).
    """
    if page is None:
        # The declared default used to crash at ``20 * page``.
        page = 0
    # os.path.join keeps the keyword a sub-directory even when ``local_path``
    # is passed without a trailing separator.
    local_path = os.path.join(local_path, word)
    os.makedirs(local_path, exist_ok=True)
    url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={word}&pn={pn}&gsm={gsm:x}&ct=&ic=0&lm=-1&width=0&height=0'.format(
        word=word, pn=20 * page, gsm=40 + 20 * page)
    print("HHHC:0====>page=%d,url=\"%s\"" % (page, url))
    # A timeout keeps one stalled connection from hanging the whole crawl.
    html = requests.get(url, timeout=10).text
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # The index counts every candidate, including the ones skipped below.
    for i, pic_url in enumerate(pic_urls, start=1):
        print(pic_url)
        # Last path component of the image URL without its query string.
        filename = os.path.split(pic_url)[1].split('?')[0]
        parts = filename.split('.')
        if len(parts) != 2:
            print("【错误】文件名异常:" + filename)
            continue
        stem, ext = parts
        if ext not in ('jpg', 'JPG', 'png', 'PNG'):
            print("【错误】类型异常:" + filename)
            continue
        if not keep_original_name:
            filename = "{}-{}-{}.{}".format(stem.strip(), page, i, ext.strip())
        download1(pic_url, filename, local_path)
def search_50_page(word, local_path="./data/down/"):
    """Run ``search`` for 50 consecutive result pages of ``word``.

    Args:
        word: Search keyword forwarded to ``search``.
        local_path: Parent download directory forwarded to ``search``.
    """
    # ``search`` requests ``pn = 20 * page``, so page 0 yields offset 0
    # (presumably the first result page). ``range(1, 50)`` skipped it and
    # covered only 49 pages.
    for page in range(50):
        search(word, local_path, page)
def search_list_test():
    """Run ``search_50_page`` once for each keyword of a fixed fruit list.

    Every keyword is crawled with ``./data/fruit_photos/`` as the parent
    download directory.
    """
    target_dir = "./data/fruit_photos/"
    keywords = (
        "苹果", "香蕉", "桔子", "橙子", "桃子",
        "樱桃", "龙眼", "荔枝", "雪梨", "草莓",
        "葡萄", "猕猴桃", "菠萝", "番石榴", "青梅",
    )
    # Alternative keyword set (flowers), kept from the original as an example:
    # "菊花", "蒲公英", "玫瑰", "向日葵", "郁金香"
    for keyword in keywords:
        search_50_page(keyword, target_dir)
# Script entry point: start the demo crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    search_list_test()
# 使用说明:修改 search_list_test() 中 obj_list 列表的内容,即可更换要爬取的关键词。