Baidu Image Crawler

The request returns JSON, so the image links are easy to extract and no decryption is needed. Some write-ups I have seen add a decryption routine for the URL field; not needing one is all the better.
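
For reference, a minimal single-page sketch of the idea (a trimmed-down version of the request the full script below builds; "cat" is just a placeholder keyword, and the raw body can contain escape sequences that a strict json.loads rejects, which is why the script falls back to a regex):

# -*- coding:utf-8 -*-
# Minimal sketch: fetch one result page and print the image URLs.
import re
import urllib2

url = ("http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
       "&word=cat&pn=0&rn=60")
body = urllib2.urlopen(urllib2.Request(url)).read()
# Same ObjURL regex the full script uses; unescape the "\/" sequences
for link in re.findall(r'"ObjURL":"(.*?)"', body):
    print link.replace("\\/", "/")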


# -*- coding:utf-8 -*-

import os
import sys
import time
import urllib2
import urllib
import re
from threading import Thread
from Queue import Queue



PARSE_EXIT = False
COLLECT_EXIT = False
DOWNLOAD_EXIT = False


class Hcollect(Thread):
    """多线程采集页面"""

    def __init__(self, threadName, pageQueue, dataQueue, keyword):
        super(Hcollect, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.keyword = keyword
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1"
        }


    def run(self):
        while not COLLECT_EXIT:
            try:
                # pn is the result offset; step by 60 to match rn=60 in the URL
                pn = (self.pageQueue.get(False) - 1) * 60
                fullurl = "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord="+self.keyword+"&cl=2&lm=-1&ie=utf-8&oe=utf-8&st=-1&ic=0&word="+self.keyword+"&face=0&istype=2&nc=1&pn="+str(pn)+"&rn=60"
                request = urllib2.Request(fullurl, headers=self.headers)
                response = urllib2.urlopen(request)
                self.dataQueue.put(response.read())
            except Exception:
                # queue empty or request failed -- poll again
                pass


class Hparse(Thread):
    """处理response并且导出文件"""
    def __init__(self, threadName, dataQueue, linkQueue):
        super(Hparse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.linkQueue = linkQueue


    def run(self):
        while not PARSE_EXIT:
            try:
                content = self.dataQueue.get(False)
                # ObjURL values in the raw JSON escape slashes as "\/"
                pattern = re.compile(r'"ObjURL":"(.*?)"')
                links = pattern.findall(content)
                for url in links:
                    self.linkQueue.put(url.replace("\\/", "/"))
            except Exception:
                # queue empty -- keep polling until PARSE_EXIT is set
                pass


class HdownLoad(Thread):
    """多线程下载"""
    def __init__(self, threadName, linkQueue, nameQueue, dirpath):
        super(HdownLoad, self).__init__()
        self.linkQueue = linkQueue
        self.nameQueue = nameQueue
        self.dirpath = dirpath
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1"
        }


    def run(self):
        while not DOWNLOAD_EXIT:
            try:
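                # Take the next image link (non-blocking) and fetch it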
                url = self.linkQueue.get(False)
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(request)
                # Extensions vary -- mainly .jpeg, .jpg and .png
                # Pick the file name from the name pool, keep the extension
                if url[-5] == ".":
                    fileName = str(self.nameQueue.get(False)) + ".jpeg"
                elif url[-4] == ".":
                    fileName = str(self.nameQueue.get(False)) + url[-4:]
                else:
                    # No recognizable extension -- skip this URL
                    continue
                # Write the image into the save directory
                with open(str(self.dirpath)+"/"+fileName, "wb") as f:
                    f.write(response.read())

            except Exception:
                # queue empty or download failed -- move on to the next link
                pass



def mkDir(dirName):
    """
    Create the directory where results are saved
    """
    dirpath = os.path.join(sys.path[0], dirName)
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    return dirpath

def main(word, page):
    """主函数"""

    #转成url码
    keyword = urllib.quote(word.decode(sys.stdin.encoding).encode('utf-8'))
    while 1:
        print keyword

    # Save directory
    dirpath = mkDir("result")
    # Queue of page numbers to fetch
    pageQueue = Queue()
    # Queue of raw response bodies
    dataQueue = Queue()
    # Queue of image links
    linkQueue = Queue()
    # Pool of sequential file names
    nameQueue = Queue()

    for i in range(1, page+1):
        pageQueue.put(i)
    # 70 names per page leaves headroom over the 60 results requested per page
    for i in range(1, page*70):
        nameQueue.put(i)

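    # Spawn four collector threads to fetch result pages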
    collectList = ["collectThread1", "collectThread2", "collectThread3", "collectThread4"]
    collectThread_List = []
    for threadName in collectList:
        thread = Hcollect(threadName, pageQueue, dataQueue, keyword)
        thread.start()
        collectThread_List.append(thread)

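    # Spawn four parser threads to extract the image links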
    parseList = ["parseThread1", "parseThread2", "parseThread3", "parseThread4"]
    parseThread_List = []
    for threadName in parseList:
        thread = Hparse(threadName, dataQueue, linkQueue)
        thread.start()
        parseThread_List.append(thread)

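    # Spawn four downloader threads to save the images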
    downloadList = ["downThread1", "downThread2", "downThread3", "downThread4"]
    downloadThread_List = []
    for threadName in downloadList:
        thread = HdownLoad(threadName, linkQueue, nameQueue, dirpath)
        thread.start()
        downloadThread_List.append(thread)

    # Wait until every page number has been taken off the queue
    while not pageQueue.empty():
        time.sleep(0.1)

    global COLLECT_EXIT
    COLLECT_EXIT = True

    for thread in collectThread_List:
        thread.join()

    print "---- Collecting done ----"

    # Wait until every response body has been parsed
    while not dataQueue.empty():
        time.sleep(0.1)

    global PARSE_EXIT
    PARSE_EXIT = True

    for thread in parseThread_List:
        thread.join()

    print "---- Parsing done ----"

    # Wait until every image link has been taken
    while not linkQueue.empty():
        time.sleep(0.1)

    global DOWNLOAD_EXIT
    DOWNLOAD_EXIT = True

    for thread in downloadThread_List:
        thread.join()

    print "---- Downloading done ----"




if __name__ == "__main__":
    print "Multi-threaded Baidu image downloader"
    print "Only a single keyword is supported"
    print "Images are saved to the result folder next to the script by default"
    print "="*60

    word = raw_input("Keyword: ")
    page = int(raw_input("Number of pages to collect: "))
    start = time.time()
    main(word, page)
    end = time.time()
    print "---- Elapsed: %.2f s ----" % (end-start)
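
The script above targets Python 2 (urllib2, Queue, print statements, raw_input). If you want to run it under Python 3, the pieces map one-to-one; a rough, untested sketch of the equivalents:

# Python 3 equivalents of the Python 2 modules used above (a sketch)
from urllib.request import Request, urlopen   # replaces urllib2
from urllib.parse import quote                # replaces urllib.quote
from queue import Queue                       # replaces Queue.Queue

keyword = quote("cat")                        # placeholder keyword
url = ("http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
       "&word=" + keyword + "&pn=0&rn=60")
body = urlopen(Request(url)).read().decode("utf-8", "ignore")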


