$ python crawling.py -h
usage: crawling.py [-h] -w WORD -tp TOTAL_PAGE -sp START_PAGE
                   [-pp [{10,20,30,40,50,60,70,80,90,100}]] [-d DELAY]

optional arguments:
  -h, --help            show this help message and exit
  -w WORD, --word WORD  keyword to crawl for
  -tp TOTAL_PAGE, --total_page TOTAL_PAGE
                        total number of pages to crawl
  -sp START_PAGE, --start_page START_PAGE
                        starting page number
  -pp [{10,20,30,40,50,60,70,80,90,100}], --per_page [{10,20,30,40,50,60,70,80,90,100}]
                        results per page
  -d DELAY, --delay DELAY
                        delay (interval) between requests
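For example, the following invocation fetches 5 pages of 30 results for the keyword "cat", starting from page 1, with a 0.1-second delay between requests (the keyword and numbers are illustrative values, not script defaults):

$ python crawling.py -w cat -tp 5 -sp 1 -pp 30 -d 0.1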
import argparse
import json
import os
import re
import socket
import sys
import time
import urllib.error
import urllib.parse
import urllib.request

timeout = 5
socket.setdefaulttimeout(timeout)

class Crawler:
    # Class-level defaults for the delay, paging window, and file counter.
    __time_sleep = 0.1
    __amount = 0
    __start_amount = 0
    __counter = 0
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
        'Cookie': ''
    }
    __per_page = 30

    def __init__(self, t=0.1):
        # Delay (seconds) between consecutive requests.
        self.time_sleep = t

    @staticmethod
    def get_suffix(name):
        # Return the file extension of a URL, falling back to '.jpeg'.
        m = re.search(r'\.[^\.]*$', name)
        if m and len(m.group(0)) <= 5:
            return m.group(0)
        else:
            return '.jpeg'
    @staticmethod
    def handle_baidu_cookie(original_cookie, cookies):
        """
        Merge the Set-Cookie headers returned by Baidu into the existing cookie string.
        :param string original_cookie:
        :param list cookies:
        :return string:
        """
        if not cookies:
            return original_cookie
        result = original_cookie
        for cookie in cookies:
            result += cookie.split(';')[0] + ';'
        return result.rstrip(';')
    def save_image(self, rsp_data, word):
        # Download every image listed in one result page into ./<word>/.
        if not os.path.exists("./" + word):
            os.mkdir("./" + word)
        # Continue numbering after any files already present in the directory.
        self.__counter = len(os.listdir('./' + word)) + 1
        for image_info in rsp_data['data']:
            try:
                if 'replaceUrl' not in image_info or len(
                        image_info['replaceUrl']) < 1:
                    continue
                obj_url = image_info['replaceUrl'][0]['ObjUrl']
                thumb_url = image_info['thumbURL']
                url = 'https://image.baidu.com/search/down?tn=download&ipn=dwnl&word=download&ie=utf8&fr=result&url=%s&thumburl=%s' % (
                    urllib.parse.quote(obj_url), urllib.parse.quote(thumb_url))
                time.sleep(self.time_sleep)
                suffix = self.get_suffix(obj_url)
                # Install a global opener so urlretrieve sends a browser User-Agent.
                opener = urllib.request.build_opener()
                opener.addheaders = [
                    ('User-agent',
                     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
                     ),
                ]
                urllib.request.install_opener(opener)
                filepath = './%s/%s' % (word,
                                        str(self.__counter) + str(suffix))
                urllib.request.urlretrieve(url, filepath)
                # Treat tiny files as failed downloads and delete them.
                if os.path.getsize(filepath) < 5:
                    print("Downloaded an empty file, skipping!")
                    os.unlink(filepath)
                    continue
            except urllib.error.HTTPError as urllib_err:
                print(urllib_err)
                continue
            except Exception as err:
                time.sleep(1)
                print(err)
                print("Unknown error, giving up on this image")
                continue
            else:
                print("+1 image, " + str(self.__counter) + " images saved so far")
                self.__counter += 1
        return
    def get_images(self, word):
        search = urllib.parse.quote(word)
        # pn is the offset of the first result on the current page.
        pn = self.__start_amount
        while pn < self.__amount:
            url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%s&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word=%s&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=%s&rn=%d&gsm=1e&1594447993172=' % (
                search, search, str(pn), self.__per_page)
            try:
                time.sleep(self.time_sleep)
                req = urllib.request.Request(url=url, headers=self.headers)
                page = urllib.request.urlopen(req)
                # Keep the cookies Baidu sets; later requests need them.
                self.headers['Cookie'] = self.handle_baidu_cookie(
                    self.headers['Cookie'],
                    page.info().get_all('Set-Cookie'))
                rsp = page.read()
                page.close()
            except UnicodeDecodeError as e:
                print(e)
                print('-----UnicodeDecodeError url:', url)
            except urllib.error.URLError as e:
                print(e)
                print("-----URLError url:", url)
            except socket.timeout as e:
                print(e)
                print("-----socket timeout:", url)
            else:
                rsp_data = json.loads(rsp, strict=False)
                if 'data' not in rsp_data:
                    print("Anti-crawler mechanism triggered, retrying automatically!")
                else:
                    self.save_image(rsp_data, word)
                    print("Downloading the next page")
                    # Only advance to the next page after a successful fetch.
                    pn += self.__per_page
        print("Download task finished")
        return
    def start(self, word, total_page=1, start_page=1, per_page=30):
        """
        Crawler entry point.
        :param word: keyword to crawl for
        :param total_page: number of pages to crawl; total images fetched = total_page x per_page
        :param start_page: starting page number
        :param per_page: number of results per page
        :return:
        """
        self.__per_page = per_page
        self.__start_amount = (start_page - 1) * self.__per_page
        self.__amount = total_page * self.__per_page + self.__start_amount
        self.get_images(word)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        parser = argparse.ArgumentParser()
        parser.add_argument("-w",
                            "--word",
                            type=str,
                            help="keyword to crawl for",
                            required=True)
        parser.add_argument("-tp",
                            "--total_page",
                            type=int,
                            help="total number of pages to crawl",
                            required=True)
        parser.add_argument("-sp",
                            "--start_page",
                            type=int,
                            help="starting page number",
                            required=True)
        parser.add_argument("-pp",
                            "--per_page",
                            type=int,
                            help="results per page",
                            choices=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                            default=30,
                            nargs='?')
        parser.add_argument("-d",
                            "--delay",
                            type=float,
                            help="delay (interval) between requests",
                            default=0.05)
        args = parser.parse_args()
        crawler = Crawler(args.delay)
        crawler.start(args.word, args.total_page, args.start_page,
                      args.per_page)
    else:
        # No command-line arguments: fall back to a hard-coded example run.
        crawler = Crawler(0.05)
        # Keyword '美女' ("beautiful women"), 10 pages starting from page 2, 30 results per page.
        crawler.start('美女', 10, 2, 30)
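The Crawler class can also be driven from another script instead of the command line. The sketch below assumes the listing above is saved as crawling.py next to the caller; the keyword, page counts, and delay are illustrative values only:

from crawling import Crawler

# Crawl 3 pages of 30 results starting from page 1, pausing 0.2 s between
# requests; images are written to ./landscape/ under the working directory.
crawler = Crawler(0.2)
crawler.start('landscape', total_page=3, start_page=1, per_page=30)

Importing the module is safe because the argparse demo run is guarded by the `if __name__ == '__main__'` check.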