# 请求返回的是json格式很容易提取,都不用解密,之前看到过的有些大佬写的好像还要加个解密算法,没有当然更好
# -*- coding:utf-8 -*-
import os
import re
import sys
import time
import urllib
import urllib2
from Queue import Queue
from Queue import Empty
from threading import Thread
PARSE_EXIT = False
COLLECT_EXIT = False
DOWNLOAD_EXIT = False
class Hcollect(Thread):
    """Worker thread that fetches Baidu image-search result pages.

    Pulls 1-based page numbers off ``pageQueue``, requests the JSON
    search results for that page, and pushes the raw response body onto
    ``dataQueue``.  The thread exits once the module-level COLLECT_EXIT
    flag is set to True by main().
    """

    def __init__(self, threadName, pageQueue, dataQueue, keyword):
        super(Hcollect, self).__init__()
        self.threadName = threadName  # label, useful when debugging
        self.pageQueue = pageQueue    # Queue of page numbers to fetch
        self.dataQueue = dataQueue    # Queue receiving raw JSON bodies
        self.keyword = keyword        # already URL-encoded search term
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1"
        }

    def run(self):
        while not COLLECT_EXIT:
            try:
                # Non-blocking get raises Empty when no pages remain.
                # NOTE(review): pn advances by 30 per page while rn=60
                # requests 60 results per call, so consecutive pages
                # overlap — confirm whether that duplication is intended.
                pn = (self.pageQueue.get(False) - 1) * 30
            except Empty:
                # The original bare ``except: pass`` turned this into a
                # hot busy-spin; sleep briefly while idle instead.
                time.sleep(0.1)
                continue
            fullurl = "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord="+self.keyword+"&cl=2&lm=-1&ie=utf-8&oe=utf-8&st=-1&ic=0&word="+self.keyword+"&face=0&istype=2&nc=1&pn="+str(pn)+"&rn=60"
            try:
                request = urllib2.Request(fullurl, headers=self.headers)
                response = urllib2.urlopen(request)
                self.dataQueue.put(response.read())
            except IOError:
                # Best effort: a page that fails to download is skipped
                # (URLError subclasses IOError in Python 2).
                pass
class Hparse(Thread):
    """Worker thread that extracts image URLs from raw JSON pages.

    Takes response bodies from ``dataQueue``, pulls every "ObjURL" value
    out with a regex, and feeds the cleaned URLs into ``linkQueue``.
    Exits when the module-level PARSE_EXIT flag becomes True.
    """

    # Compiled once instead of on every loop iteration (the original
    # recompiled the pattern for each response).
    _OBJURL_RE = re.compile(r'"ObjURL":"(.*?)"')

    def __init__(self, threadName, dataQueue, linkQueue):
        super(Hparse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue  # raw JSON bodies from Hcollect
        self.linkQueue = linkQueue  # extracted image URLs for HdownLoad

    def run(self):
        while not PARSE_EXIT:
            try:
                content = self.dataQueue.get(False)
            except Empty:
                # Was a bare ``except: pass`` hot busy-wait.
                time.sleep(0.1)
                continue
            for url in self._OBJURL_RE.findall(content):
                # The JSON payload escapes "/" as "\/": restore slashes.
                self.linkQueue.put(url.replace("\/", "/"))
class HdownLoad(Thread):
    """Worker thread that downloads images from ``linkQueue``.

    Each image is written into ``dirpath`` under a sequential number
    taken from ``nameQueue`` plus an extension derived from the URL.
    Exits when the module-level DOWNLOAD_EXIT flag becomes True.
    """

    def __init__(self, threadName, linkQueue, nameQueue, dirpath):
        super(HdownLoad, self).__init__()
        self.threadName = threadName  # bugfix: original dropped this argument
        self.linkQueue = linkQueue    # image URLs produced by Hparse
        self.nameQueue = nameQueue    # pool of sequential file numbers
        self.dirpath = dirpath        # destination directory
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1"
        }

    def run(self):
        while not DOWNLOAD_EXIT:
            try:
                url = self.linkQueue.get(False)
            except Empty:
                # Was a bare ``except: pass`` hot busy-wait.
                time.sleep(0.1)
                continue
            try:
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(request)
                data = response.read()
            except IOError:
                continue  # unreachable image: skip it, keep the worker alive
            # Extensions seen in practice: jpeg / jpg / png.  The original
            # only checked url[-4]/url[-5] and left fileName unbound for
            # anything else, so the image was silently lost through the
            # bare except; default to ".jpg" instead.
            ext = os.path.splitext(url)[1].lower()
            if ext not in (".jpg", ".jpeg", ".png"):
                ext = ".jpg"
            try:
                fileName = str(self.nameQueue.get(False)) + ext
            except Empty:
                continue  # name pool exhausted; drop the image
            with open(os.path.join(str(self.dirpath), fileName), "wb") as f:
                f.write(data)
def mkDir(dirName):
    """Return the absolute save directory, creating it on first use.

    The directory is placed next to the script (``sys.path[0]``).
    """
    target = os.path.join(sys.path[0], dirName)
    if os.path.exists(target):
        return target
    os.mkdir(target)
    return target
def main(word, page):
"""主函数"""
#转成url码
keyword = urllib.quote(word.decode(sys.stdin.encoding).encode('utf-8'))
while 1:
print keyword
#保存路径
dirpath = mkDir("result")
#页面页码队列
pageQueue = Queue()
#页面数据队列
dataQueue = Queue()
#图片链接队列
linkQueue = Queue()
#图片命名池
nameQueue = Queue()
for i in range (1,page+1):
pageQueue.put(i)
for i in range(1,page*70):
nameQueue.put(i)
collectList = ["collectThread1", "collectThread2", "collectThread3", "collectThread4"]
collectThread_List = []
for threadName in collectList:
thread = Hcollect(threadName, pageQueue, dataQueue, keyword)
thread.start()
collectThread_List.append(thread)
parseList = ["parseThread1", "parseThread2", "parseThread3", "parseThread4"]
parseThread_List = []
for threadName in parseList:
thread = Hparse(threadName, dataQueue, linkQueue)
thread.start()
parseThread_List.append(thread)
downloadList = ["downThread1", "downThread2", "downThread3", "downThread4"]
downloadThread_List = []
for threadName in downloadList:
thread = HdownLoad(threadName, linkQueue, nameQueue, dirpath)
thread.start()
downloadThread_List.append(thread)
while not pageQueue.empty():
pass
global COLLECT_EXIT
COLLECT_EXIT = True
for thread in collectThread_List:
thread.join()
print "----抓取完成----"
while not dataQueue.empty():
pass
global PARSE_EXIT
PARSE_EXIT = True
for thread in parseThread_List:
thread.join()
print "----处理完成-----"
while not linkQueue.empty():
pass
global DOWNLOAD_EXIT
DOWNLOAD_EXIT = True
for thread in downloadThread_List:
thread.join()
print "----下载完成----"
if __name__ == "__main__":
print "欢迎使用百度图片多线程下载"
print "仅支持单一关键词"
print "默认保存路径为同目录下的result"
print "="*60
word = raw_input("输入关键字:")
page = int(raw_input("请输入采集页数:"))
start = time.time()
main(word, page)
end = time.time()
print "----用时:%.2f s----"%(end-start)