A face dataset is the key ingredient for any face-related processing task. This post walks through how to crawl one from Baidu.
1. Get the artist names
① Build the full URL
Searching Baidu for “中国艺人” (Chinese artists) brings up a results page listing the artists.

Inspecting the requests behind that page shows that the full API URL is:
"https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?resource_id=28266&from_mid=500&format=json&ie=utf-8&oe=utf-8&query=%E4%B8%AD%E5%9B%BD%E8%89%BA%E4%BA%BA&sort_key=&sort_type=1&stat0=&stat1=&stat2=&stat3=&pn="+pn+"&rn=100&_=1580457480665"
where pn is the paging offset: each request returns rn=100 records, so pn advances by 100 per page.
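As a side note, the same URL can be assembled with urllib.parse.urlencode instead of string concatenation. The sketch below uses a hypothetical helper name, build_name_api_url, and simply re-packs the parameters visible in the URL above:

    from urllib.parse import urlencode

    def build_name_api_url(pn, rn=100):
        # Parameter values copied from the captured URL; pn is the paging offset.
        params = {
            "resource_id": 28266, "from_mid": 500, "format": "json",
            "ie": "utf-8", "oe": "utf-8", "query": "中国艺人",
            "sort_key": "", "sort_type": 1,
            "stat0": "", "stat1": "", "stat2": "", "stat3": "",
            "pn": pn, "rn": rn, "_": 1580457480665,
        }
        return "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?" + urlencode(params)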
② Parse the full list of artist names
Fetch the URL above with requests, load the returned JSON, and extract each artist's name:
import json
import requests

def get_person_name():
    """Page through the Baidu list API (100 names per request) and collect artist names."""
    person_list = []
    pn_i = 0
    while True:
        pn = str(pn_i)
        pn_i += 100
        url = "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?resource_id=28266&from_mid=500&format=json&ie=utf-8&oe=utf-8&query=%E4%B8%AD%E5%9B%BD%E8%89%BA%E4%BA%BA&sort_key=&sort_type=1&stat0=&stat1=&stat2=&stat3=&pn=" + pn + "&rn=100&_=1580457480665"
        res = requests.get(url)
        try:
            json_str = json.loads(res.text)
        except ValueError:
            # Response is not valid JSON: treat it as the end of the list.
            break
        data = json_str.get('data')
        if not data or not data[0].get('result'):
            # Empty page: we have reached the last page.
            break
        figs = data[0]['result']
        for i in figs:
            name = i['ename']
            print(name)
            person_list.append(name)
    return person_list
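A minimal way to run the function and keep the result around for the next step (the file name person_names.txt is just an example):

    if __name__ == "__main__":
        names = get_person_name()
        print("collected %d names" % len(names))
        # One name per line, so the image crawler can read them back later.
        with open("person_names.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(names))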
2. Crawl photos of each artist
① Collect the image URLs
The snippet below comes from the crawler class (self.headers, self.time_sleep and self.__amount are attributes of that class, word_quote is the URL-encoded artist name, and the module imports time, socket, urllib.request and urllib.error). For each results page it requests the Baidu image search, scans the raw response for every objURL field, and appends the deduplicated image URLs to image_url_list:
while pn < self.__amount:
    url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1639129009987_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&dyTabStr=MCwzLDEsNiwyLDQsNSw3LDgsOQ%3D%3D&ie=utf-8&sid=&word=" + word_quote
    page = None
    try:
        time.sleep(self.time_sleep)          # be polite: pause between requests
        req = urllib.request.Request(url=url, headers=self.headers)
        page = urllib.request.urlopen(req)
        rsp = str(page.read())
        # Every image entry in the raw response starts with "objURL":"...";
        # index_list holds the start position of each occurrence.
        index_list = find_all_sub("objURL", rsp)
        img_root = []
        for i in range(0, len(index_list)):
            if i == len(index_list) - 1:
                temp = rsp[index_list[i]:len(rsp)]
            else:
                temp = rsp[index_list[i]:index_list[i + 1]]
            img_root.append(temp)
        for img_root_path in img_root:
            temp_url = img_root_path[9:]     # strip the leading 'objURL":"'
            end = temp_url.find('"')
            image_temp_url = temp_url[:end]
            if not find_in_list(image_url_list, image_temp_url):
                image_url_list.append(image_temp_url)
    except UnicodeDecodeError as e:
        print(e)
        print('-----UnicodeDecodeError url:', url)
    except urllib.error.URLError as e:
        print(e)
        print("-----URLError url:", url)
    except socket.timeout as e:
        print(e)
        print("-----socket timeout:", url)
    else:
        # Move on to the next results page.
        print("Downloading next page")
        pn += 60
    finally:
        if page is not None:
            page.close()
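The snippet calls two helpers, find_all_sub and find_in_list, that are not shown in the post; a plausible sketch of them, inferred only from how they are called above, is:

    def find_all_sub(sub, s):
        # Return the start index of every occurrence of sub in s.
        indexes, start = [], s.find(sub)
        while start != -1:
            indexes.append(start)
            start = s.find(sub, start + 1)
        return indexes

    def find_in_list(lst, item):
        # True if item is already in lst (used here for de-duplication).
        return item in lst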
② Download the images
With the URL list collected, each image is saved into a per-artist directory (word is the artist name, number a running counter), and a download that times out is retried up to three times:
image_root_path = "./" + word
if not os.path.exists(image_root_path):
    os.mkdir(image_root_path)
for img_url in image_url_list:
    number += 1
    filepath = image_root_path + "/" + str(word) + "_" + str(number) + ".jpg"
    print(filepath)
    count = 1
    try:
        urllib.request.urlretrieve(img_url, filepath)
    except socket.timeout:
        # Retry a timed-out download up to three times.
        while count <= 3:
            try:
                urllib.request.urlretrieve(img_url, filepath)
                break
            except socket.timeout:
                count += 1
    finally:
        # Display the raw URL of the image.
        print('\t%d\t%s' % (number, img_url))
    if count > 3:
        print('\t%d\t%s failed' % (number, img_url))
print("Download finished")





