A Python Program for Crawling Google Images

This article describes a Google Images crawler built with Python and Selenium. It searches by keyword and downloads the resulting images, with support for multithreaded downloads and a configurable download limit.


I read through some code other people had written, followed the general template, and put together my own version; I have tested it myself and it works.

Input: a text file, with one search keyword per line.

Features: one folder per category, a configurable number of download threads, and a configurable cap on the number of images downloaded.
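
For example, a search_imgs.txt listing three categories (these keywords are just illustrative) would look like the following, and would produce three corresponding folders:

cat
golden retriever
fire hydrant

The full script: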

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests
import os
from lxml import etree
import json
import threading
import ctypes
import inspect

import io
import sys
# Re-wrap stdout so console output survives a GBK-family Windows code page
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    'Connection': 'keep-alive',
    'content-type': 'application/json'
}


class myThread(threading.Thread):
    def __init__(self, threadID, urls_list, names_list, id_list, path):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.urls_list = urls_list
        self.names_list = names_list
        self.id_list = id_list
        self.path = path

    def _async_raise(self, tid, exctype):
        """Raise exctype inside the thread with id tid via the ctypes
        PyThreadState_SetAsyncExc hack; performs cleanup if needed."""
        tid = ctypes.c_long(tid)
        if not inspect.isclass(exctype):
            exctype = type(exctype)
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            tid, ctypes.py_object(exctype))
        if res == 0:
            raise ValueError("invalid thread id")
        elif res != 1:
            # """if it returns a number greater than one, you're in trouble,
            # and you should call it again with exc=NULL to revert the effect"""
            ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
            raise SystemError("PyThreadState_SetAsyncExc failed")

    def stop_thread(self):
        self._async_raise(self.ident, SystemExit)

    def run(self):
        print("Thread %s started" % self.threadID)
        for i in range(len(self.urls_list)):
            # Download each image and write it to <path>/<id>.jpg
            try:
                ir = requests.get(
                    self.urls_list[i], headers=headers, timeout=10)
                with open(self.path + '/%d.jpg' % (self.id_list[i]),
                          'wb') as f:
                    f.write(ir.content)
                print("download picture id: %d success" % self.id_list[i])
            except Exception as ex2:
                print('download error!!' + str(ex2))
                continue
        # self.stop_thread()
        print("Thread %s exited" % self.threadID)


class Crawler():
    def __init__(self, query, path, thread_count):
        self.url = base_url_part1 + query + base_url_part2
        self.search_query = query
        self.path = path
        self.thread_count = thread_count

    # Launch the Chrome browser driver
    def start_browser(self):
        chrome_options = Options()
        chrome_options.add_argument("--disable-infobars")
        # chrome_options.add_argument('--headless')
        # Path to the ChromeDriver executable
        executable_path = "C:/Anaconda3/chromedriver.exe"
        # Start Chrome
        driver = webdriver.Chrome(
            chrome_options=chrome_options, executable_path=executable_path)
        # Maximize the window: each pass can only scrape images inside the viewport
        driver.maximize_window()
        # Open the search results page in the browser
        driver.get(self.url)
        return driver

    def downloadImg(self, driver):
        """滑动滚动条至:加载更多处"""
        end = False
        while True:
            html_page = driver.page_source
            html = etree.HTML(html_page)
            pictures = html.xpath('//*[@id="rg_s"]/div')
            google_url = 'https://www.google.com'
            urls_list = []
            names_list = []
            for picture in pictures:
                url = picture.xpath('./div/text()')
                if url != []:
                    raw_data = str(url[0])
                    raw_data_dict = json.loads(raw_data)
                    urls_list.append(raw_data_dict["ou"])
                    name = picture.xpath(
                        './a[2]/div[@class="mVDMnf nJGrxf"]/text()')
                    names_list.append(str(name[0]))
            # Stop collecting once enough image URLs have been gathered
            if len(names_list) >= download_count:
                urls_list = urls_list[:download_count]
                names_list = names_list[:download_count]
                break
            if end is True:
                break
            # Scroll down to trigger lazy loading of more results
            for i in range(5):
                pos = i * 50000
                js = "document.documentElement.scrollTop=%d" % pos
                driver.execute_script(js)
                time.sleep(1)
            try:
                # Click the "show more results" button (its label is Chinese on the zh UI)
                driver.find_element_by_xpath(
                    "./*//input[@value='显示更多结果']").click()
            except:
                end = True
                continue
            time.sleep(1)
        file_write = open(
            self.path + '/' + self.search_query + '.txt',
            'w+',
            encoding='utf-8')
        # Write "id image-name" pairs to the txt file
        length = len(names_list)
        id_list = [i for i in range(length)]
        for i in id_list:
            file_write.write(str(i) + ' ' + names_list[i] + '\n')
        file_write.close()
        time.sleep(10)
        # Launch the download threads, each taking a contiguous slice of the URL list
        thread_list = []
        next_start = 0
        for i in range(thread_count):
            start_id = next_start
            end_id = int(float(length) / thread_count * (i + 1))
            end_id += 1
            next_start = end_id
            thread_list.append(
                myThread(i, urls_list[start_id:end_id],
                         names_list[start_id:end_id], id_list[start_id:end_id],
                         self.path))
            thread_list[i].start()
        for i in range(thread_count):
            thread_list[i].join()

    def run(self):
        driver = self.start_browser()
        self.downloadImg(driver)
        driver.close()
        print("{} download has finished.".format(self.search_query))


if __name__ == '__main__':
    start = time.time()
    # base_url_part1 and base_url_part2 are fixed; there is no need to change them
    base_url_part1 = 'https://www.google.com/search?q='
    base_url_part2 = '&source=lnms&tbm=isch'
    # Maximum number of images to download per keyword
    download_count = 2000
    # Read the search keywords
    file_read = open('search_imgs.txt', 'r+')
    search_list = file_read.readlines()
    totalPath = 'F:/张晋豪资料包/人工智能/视频分析资料/正式工作/爬虫/google_picture/picture/downloads2/'
    # Start a crawl for each keyword
    craw_list = []
    for search_query in search_list:
        search_query = search_query.strip()
        thread_count = 200  # number of download threads per category
        path = os.path.join(totalPath, search_query)
        try:
            if not os.path.exists(path):
                os.mkdir(path)
                time.sleep(1)
        except Exception as e:
            print(e)
        craw = Crawler(
            query=search_query, path=path, thread_count=thread_count)
        craw.run()
    end = time.time()
    print('all have been downloaded.')
    print('total cost time %d' % (end - start))

 

Updated code (each category only manages to download a dozen or so images... the page no longer exposes absolute .jpg addresses. I cracked the dozen-plus thumbnails that come base64-encoded, but the srcs without a real address suffix I can't handle for now; that will have to wait.)
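
In other words, most thumbnails now arrive inline as base64-encoded data URIs rather than as linked files. As a standalone sketch of just the decoding step the updated script relies on (save_thumbnail and its arguments are illustrative names, not part of the original code):

import base64

def save_thumbnail(src, out_path):
    # src is the <img> src attribute scraped from the results grid
    prefix = 'data:image/jpeg;base64,'
    if src.startswith(prefix):
        payload = src[len(prefix):]
        # altchars '-_' additionally tolerates URL-safe base64 input
        with open(out_path, 'wb') as fh:
            fh.write(base64.b64decode(payload, '-_'))

The updated script: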

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests
import os
from lxml import etree
import json
import threading
import ctypes
import inspect
import base64
import io
import sys
import urllib
import urllib.request
from io import StringIO
import urllib3.contrib.pyopenssl
urllib3.contrib.pyopenssl.inject_into_urllib3()

headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    'Connection': 'keep-alive',
    'content-type': 'application/json'
}


class myThread(threading.Thread):
    def __init__(self, threadID, urls_list, names_list, id_list, path):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.urls_list = urls_list
        self.names_list = names_list
        self.id_list = id_list
        self.path = path

    def _async_raise(self, tid, exctype):
        """Raise exctype inside the thread with id tid via the ctypes
        PyThreadState_SetAsyncExc hack; performs cleanup if needed."""
        tid = ctypes.c_long(tid)
        if not inspect.isclass(exctype):
            exctype = type(exctype)
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            tid, ctypes.py_object(exctype))
        if res == 0:
            raise ValueError("invalid thread id")
        elif res != 1:
            # """if it returns a number greater than one, you're in trouble,
            # and you should call it again with exc=NULL to revert the effect"""
            ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
            raise SystemError("PyThreadState_SetAsyncExc failed")

    def stop_thread(self):
        self._async_raise(self.ident, SystemExit)

    def run(self):
        print("Thread %s started" % self.threadID)
        for i in range(len(self.urls_list)):
            # Inline thumbnails come as base64-encoded data URIs: decode and save directly
            if self.urls_list[i].startswith('data:image/jpeg;base64,'):
                data = self.urls_list[i].replace('data:image/jpeg;base64,', '')
                # altchars '-_' additionally tolerates URL-safe base64 input
                image_data = base64.b64decode(data, '-_')
                with open(self.path + '/%d.jpeg' % (self.id_list[i]), 'wb') as fh:
                    fh.write(image_data)
            # Otherwise the src is a plain URL: fetch it over HTTP
            else:
                try:
                    # verify=False skips TLS certificate verification
                    ir = requests.get(self.urls_list[i], verify=False)
                    with open(self.path + '/%d.jpeg' % (self.id_list[i]), 'wb') as f:
                        f.write(ir.content)
                    print("download picture id: %d success" % self.id_list[i])
                except Exception as ex2:
                    print('download error!!' + str(ex2))
                    continue

        # self.stop_thread()  # raising SystemExit in our own thread would skip the final print
        print("Thread %s exited" % self.threadID)


class Crawler():
    def __init__(self, query, path, thread_count):
        self.url = base_url_part1 + query + base_url_part2
        self.search_query = query
        self.path = path
        self.thread_count = thread_count

    # Launch the Chrome browser driver
    def start_browser(self):
        chrome_options = Options()
        chrome_options.add_argument("--disable-infobars")
        chrome_options.add_argument("--no-sandbox")
        # chrome_options.add_argument('--headless')
        # Path to the ChromeDriver executable
        executable_path = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe"
        # Start Chrome
        driver = webdriver.Chrome(
            chrome_options=chrome_options, executable_path=executable_path)
        # Maximize the window: each pass can only scrape images inside the viewport
        driver.maximize_window()
        # Open the search results page in the browser
        driver.get(self.url)
        return driver

    def downloadImg(self, driver):
        """滑动滚动条至:加载更多处"""
        end = False
        count = 0
        while True:
            html_page = driver.page_source
            html = etree.HTML(html_page)
            pictures = html.xpath('//*[@id="islrg"]/div[1]/div')
            google_url = 'https://www.google.com'
            urls_list = []
            names_list = []
            for picture in pictures:
                url = picture.xpath('./a[1]/div[1]/img/@src')
                if url != []:
                    urls_list.append(url[0])
                    count += 1
                    names_list.append('%d.jpg' % (count))
            # Stop collecting once enough image URLs have been gathered
            if len(names_list) >= download_count:
                urls_list = urls_list[:download_count]
                names_list = names_list[:download_count]
                break
            if end is True:
                break
            # Scroll down to trigger lazy loading of more results
            for i in range(5):
                pos = i * 50000
                js = "document.documentElement.scrollTop=%d" % pos
                driver.execute_script(js)
                time.sleep(1)
            try:
                # Click the "show more results" button (its label is Chinese on the zh UI)
                driver.find_element_by_xpath(
                    "./*//input[@value='显示更多结果']").click()
            except:
                end = True
                continue
            time.sleep(1)
        file_write = open(
            self.path + '/' + self.search_query + '.txt',
            'w+',
            encoding='utf-8')
        # Write "id image-url" pairs to the txt file
        length = len(names_list)
        id_list = [i for i in range(length)]
        for i in id_list:
            file_write.write(str(i) + ' ' + urls_list[i] + '\n')
        file_write.close()
        time.sleep(10)
        # Launch the download threads, each taking a contiguous slice of the URL list
        thread_list = []
        next_start = 0

        for i in range(thread_count):
            start_id = next_start
            end_id = int(float(length) / thread_count * (i + 1))
            end_id += 1
            next_start = end_id
            thread_list.append(
                myThread(i, urls_list[start_id:end_id],
                         names_list[start_id:end_id], id_list[start_id:end_id],
                         self.path))
            thread_list[i].start()
        for i in range(thread_count):
            thread_list[i].join()

    def run(self):
        driver = self.start_browser()
        self.downloadImg(driver)
        driver.close()
        print("{} download has finished.".format(self.search_query))


if __name__ == '__main__':
    start = time.time()
    # base_url_part1 and base_url_part2 are fixed; there is no need to change them
    base_url_part1 = 'https://www.google.com/search?q='
    base_url_part2 = '&source=lnms&tbm=isch'
    # Maximum number of images to download per keyword
    download_count = 50
    # Read the search keywords
    file_read = open('search_imgs.txt', 'r+')
    search_list = file_read.readlines()
    totalPath = './picture/'
    # Start a crawl for each keyword
    craw_list = []
    for search_query in search_list:
        search_query = search_query.strip()
        thread_count = 10  # number of download threads per category
        path = os.path.join(totalPath, search_query)
        try:
            if not os.path.exists(path):
                os.mkdir(path)
                time.sleep(1)
        except Exception as e:
            print(e)
        craw = Crawler(
            query=search_query, path=path, thread_count=thread_count)
        craw.run()
    end = time.time()
    print('all have been downloaded.')
    print('total cost time %d' % (end - start))
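
Both listings use the Selenium 3 API. On Selenium 4, executable_path, the chrome_options keyword, and find_element_by_xpath have been removed, so the driver setup would need to change roughly as follows (a minimal sketch; the chromedriver path is a placeholder):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

options = Options()
options.add_argument("--disable-infobars")
# Placeholder path: point this at your local chromedriver
service = Service("C:/path/to/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
driver.get("https://www.google.com/search?q=cat&source=lnms&tbm=isch")
# find_element_by_xpath(...) becomes:
button = driver.find_element(By.XPATH, "//input[@value='显示更多结果']")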

 
