爬取百度图片-CSDN博客

本文介绍了一种使用Python的requests库从百度图片搜索中获取实际图片链接的方法。通过构造请求参数并解析返回的JSON数据，实现了对图片URL的有效解码。

使用requests包与百度图片服务器进行交互，得到返回的JSON数据后对其进行解析，最终获得百度图片的实际地址。

import requests
from fake_useragent import UserAgent


# Multi-character tokens that stand in for URL punctuation inside an
# obfuscated ``objURL``.  They must be expanded before the per-character
# pass, because the tokens themselves contain characters that the cipher
# below would otherwise rewrite.
_OBJ_URL_TOKENS = (
    ('_z2C$q', ':'),
    ('_z&e3B', '.'),
    ('AzdH3F', '/'),
)

# Single-character substitution table applied to the rest of ``objURL``.
_OBJ_URL_CHAR_TABLE = str.maketrans({
    'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f', '2': 'g',
    'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l', '4': 'm', 'g': 'n',
    '5': 'o', 'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't', '7': 'u',
    'e': 'v', 'o': 'w', '8': '1', 'd': '2', 'n': '3', '9': '4', 'c': '5',
    'm': '6', '0': '7', 'b': '8', 'l': '9', 'a': '0',
})


def _decode_obj_url(obj_url):
    """Return the plain URL encoded in an obfuscated ``objURL`` string."""
    for token, plain in _OBJ_URL_TOKENS:
        obj_url = obj_url.replace(token, plain)
    # Characters without a table entry (including the punctuation that was
    # just expanded) pass through unchanged.
    return obj_url.translate(_OBJ_URL_CHAR_TABLE)


def imgUrls(keyWord, userAgent, pn=0, rn=30):
    """Query Baidu image search and return the decoded image URLs.

    Args:
        keyWord: Search term; sent as both ``queryWord`` and ``word``.
        userAgent: Value sent in the User-Agent request header.
        pn: Index of the first image to request.
        rn: Number of images to request.

    Returns:
        dict: ``{'result': [url, ...], 'status': True}`` on success, or
        ``{'result': <error message>, 'status': False}`` when the server
        replies with a non-200 status or the payload cannot be parsed.

    Raises:
        requests.exceptions.RequestException: If the HTTP request itself
            fails (connection error, timeout, ...).
    """
    url = 'https://image.baidu.com/search/index'
    params = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': keyWord,
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '0',
        'word': keyWord,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'pn': pn,  # index of the first image to request
        'rn': rn,  # number of images to request
        'gsm': '1e',
        # NOTE(review): looks like a millisecond timestamp used as a
        # cache-busting key - confirm whether it should be regenerated.
        '1491808945838': ''
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    rep = requests.get(
        url,
        headers={'user-Agent': userAgent},
        params=params,
        timeout=10,
    )
    if rep.status_code != 200:
        return {
            'result': 'unexpected HTTP status: {}'.format(rep.status_code),
            'status': False,
        }

    try:
        payload = rep.json()
        # The previous implementation dropped the last ``data`` entry
        # unconditionally (presumably a trailing empty placeholder - confirm
        # against a live response).  Filtering on content skips such entries
        # wherever they occur and never discards a real result.
        urls = [
            _decode_obj_url(entry['objURL'])
            for entry in payload['data']
            if entry and 'objURL' in entry
        ]
    except Exception as err:
        # Boundary handler: any parsing problem is reported through the
        # status dict rather than raised, matching the documented contract.
        return {'result': str(err), 'status': False}
    return {'result': urls, 'status': True}


if __name__ == '__main__':
    # Demo run: search for a sample keyword, sending UserAgent().random as
    # the User-Agent header, and print whatever imgUrls returns.
    agent_source = UserAgent()
    print(imgUrls(keyWord='美女', userAgent=agent_source.random))