学习爬虫第一天


目标: 在必应图片搜索(https://www.bing.com/images/search)中按关键词搜索图片并下载到本地

第一步:分析 https://www.bing.com/images/search 是如何搜索图片的

根据抓包发现,是这个请求获取了图片地址
请添加图片描述

第二步:模拟请求获取数据

抓到这个请求,把这个请求多余没用的参数去掉

https://www.bing.com/images/search?q=%E5%9B%BE%E7%89%87&qs=n&form=QBIR&sp=-1&lq=0&pq=%E5%9B%BE%E7%89%87&sc=10-2&cvid=8FF468D5CBF64384837AC59E8D282684&cc=cn&first=1&cw=1732&ch=877

多次尝试去掉参数后,得到仍能正常返回结果的最简请求地址:

https://www.bing.com/images/search?q=%E5%9B%BE%E7%89%87

发现参数 q 就是输入的查询内容(上面的 %E5%9B%BE%E7%89%87 是“图片”两个字的 URL 编码)。
headers 先把抓到的全部放进去,然后逐个删除,直到再删就会影响请求结果为止。

# Full set of request headers as captured from the browser's network panel.
# Two deliberate differences from the raw capture:
#   * The HTTP/2 pseudo-headers (":authority", ":method", ":path", ":scheme")
#     are left out. The HTTP client derives them from the URL, and recent
#     versions of `requests` refuse header names that start with ":".
#   * The cookie is a placeholder. The captured value contains login/session
#     tokens, which must never be published; paste the cookie from your own
#     browser session if you want to experiment with it.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate, br, zsdch, zstd",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "max-age=0",
    "cookie": "PASTE_THE_COOKIE_FROM_YOUR_OWN_BROWSER_SESSION",
    "ect": "4g",
    "priority": "u=0, i",
    "referer": "https://www.bing.com/images/search?q=%E5%9B%BE%E7%89%87&qs=n&form=QBIR&sp=-1&lq=0&pq=%E5%9B%BE%E7%89%87&sc=10-2&cvid=8FF468D5CBF64384837AC59E8D282684&cc=cn&first=1&cw=1732&ch=864",
    "sec-ch-ua": "\"Chromium\";v=\"140\", \"Not=A?Brand\";v=\"24\", \"Microsoft Edge\";v=\"140\"",
    "sec-ch-ua-arch": "x86",
    "sec-ch-ua-bitness": "64",
    "sec-ch-ua-full-version": "140.0.3485.94",
    "sec-ch-ua-full-version-list": "\"Chromium\";v=\"140.0.7339.208\", \"Not=A?Brand\";v=\"24.0.0.0\", \"Microsoft Edge\";v=\"140.0.3485.94\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": "",
    "sec-ch-ua-platform": "Windows",
    "sec-ch-ua-platform-version": "10.0.0",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "sec-ms-gec": "B72C176FCDB07100B0664899BFF99B8D724DB59A415D8F1F378F7B8DF8429C52",
    "sec-ms-gec-version": "1-140.0.3485.94",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0",
    "x-client-data": "eyIxIjoiNyIsIjIiOiIxIiwiMyI6IjAiLCI0IjoiLTc4NzQzMjEyNTE3MDU0NzQ4MDEiLCI2Ijoic3RhYmxlIiwiOSI6ImRlc2t0b3AifQ==",
}

删到最后的结果

# Minimal header set left after trimming: only the User-Agent is sent.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0"
    ),
}

最后就是模拟请求发送即可

from requests_html import HTMLSession

# Ask for the search term and fetch the Bing image-search result page for it.
content = input('请输入需要搜集的名称:\n')
url = 'https://www.bing.com/images/search'
# Query-string parameters: `q` carries the search term.
params = {
    'q': content,
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0',
}
session = HTMLSession()
# Without a timeout a stalled connection would block the script forever.
response = session.get(url=url, params=params, headers=headers, timeout=10)
# Stop on 4xx/5xx instead of treating an error page as search results.
response.raise_for_status()
print(response.content.decode())

第三步:分析响应中的数据,进行提取

请添加图片描述

图片url

https://th.bing.com/th/id/OIP.SBp3ysaUDzlNhwvT-8j8zwHaLG?w=89&h=89&c=1&rs=1&qlt=70&r=0&o=7&dpr=1.1&pid=InlineBlock&rm=3

去掉多余参数

https://th.bing.com/th/id/OIP.SBp3ysaUDzlNhwvT-8j8zwHaLG

运行代码

from requests_html import HTMLSession

# Same request as in step two: read the search term, then fetch the result page.
content = input('请输入需要搜集的名称:\n')
url = 'https://www.bing.com/images/search'
# Query-string parameters: `q` carries the search term.
params = {
    'q': content,
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0',
}
session = HTMLSession()
# Without a timeout a stalled connection would block the script forever.
response = session.get(url=url, params=params, headers=headers, timeout=10)
# Stop on 4xx/5xx instead of treating an error page as search results.
response.raise_for_status()
print(response.content.decode())

对比多张图片的 url 可以发现,它们都是 https://th.bing.com/th/id/ + 图片名 的形式,但是在请求返回的数据中却找不到这样的图片地址
请添加图片描述

下载数据查看效果

# Write the raw response bytes to test.html so the page can be inspected.
with open('test.html', mode='wb') as html_file:
    html_file.write(response.content)

通过下载下来的数据发现,所有的图片地址的前缀都发生了变化
请添加图片描述

对比大部分图片 url 的前缀变化,发现 tse 后面只会出现 1~4 这几个数字
制作对应的正则表达式

# Decode the page fetched in the previous step; the regex works on text.
test = response.content.decode()
# Thumbnail URLs look like https://tse<1-4>.mm.bing.net/th/id/<image id>?<query>.
# The id part stops at "?" and also at quotes, whitespace, "<", ">", "&" and
# backslash, so a URL that has no query string cannot run on into the
# surrounding HTML.
pattern = r"""https?://tse[1-4]\.mm\.bing\.net/th/id/[^?\s"'<>&\\]+"""
matches = re.findall(pattern, test)
print(matches, len(matches))

结果都是在页面上显示的图片(有时请求得到的数量不同)

['https://tse1.mm.bing.net/th/id/OIP.mH9YLFEL5YdVxJM82mjVJQHaEo', 'https://tse4.mm.bing.net/th/id/OIP.xpapIFnN4RR4IfrjE8hxQgHaFj', 'https://tse1.mm.bing.net/th/id/OIP.25pIX3E7rof3lBoczgqZBQHaE8', 'https://tse1.mm.bing.net/th/id/OIP.H1cnfATgFswUlZBe1TSAvAHaLG', 'https://tse1.mm.bing.net/th/id/OIP.CUTSbJv8oxmGWCWWnwTdNAHaLG', 'https://tse2.mm.bing.net/th/id/OIP.0lD--rJCswh3q-Ipmsmi9wHaLG', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse1.mm.bing.net/th/id/OIP.tlGcdsox7WVHcUPN8hBESAHaLH', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse3.mm.bing.net/th/id/OIP.ivUksPG7i7TNbrNlURPuqQHaE8', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse2.mm.bing.net/th/id/OIP.RZt7uQNyRu1pCxAzBBi6awHaNK', 'https://tse2.mm.bing.net/th/id/OIP._cNnrXsNWK4q8UN38foLjAHaEA', 'https://tse3.mm.bing.net/th/id/OIP.IERNxYrvy4zPGsE55rStxQHaLH', 'https://tse3.mm.bing.net/th/id/OIP.ZUW4BFvV-CIzZEKsMtJZ2wHaLG', 'https://tse1.mm.bing.net/th/id/OIP.Kp9bMCG1JBEOnjrfFOzwIAHaEd', 'https://tse1.mm.bing.net/th/id/OIP.ggIWxdHfXOkXGB1z2zf25QHaE8', 'https://tse2.mm.bing.net/th/id/OIP.j1_0T0HAtieZ3dIPTTe6RgHaE8', 'https://tse2.mm.bing.net/th/id/OIP.ku_Hl8ZerIdev4aa6fSaxwHaKj', 'https://tse1.mm.bing.net/th/id/OIP.A9qBagKQ52CgOkqomTJwZgHaLH', 'https://tse4.mm.bing.net/th/id/OIP.uOpI0k4t28-1-bTVtATlQQHaLH', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse3.mm.bing.net/th/id/OIP.sTHRqt_4kQAZ8RJNmAfpVwHaLH', 'https://tse4.mm.bing.net/th/id/OIP.C_hichQIHZA8a173IUzzEwHaLG', 'https://tse2.mm.bing.net/th/id/OIP.R9Ijj_pqmMynYHQYNi606gHaLH', 'https://tse1.mm.bing.net/th/id/OIP.h2y-AlVHfwquj4SOffl9wgHaLH', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse2.mm.bing.net/th/id/OIP.Sb7Q3g4LgPnsn-otQjlQtQHaLH', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse4.mm.bing.net/th/id/OIP.10Ld1VARiSpgNbdaAQGrvQAAAA', 'https://tse2.mm.bing.net/th/id/OIP.5SmQSas17sb7mi402q9YigHaJ4', 
'https://tse3.mm.bing.net/th/id/OIP.UtAXZk5rj4E3camb6ZXYOgHaE8', 'https://tse1.mm.bing.net/th/id/OIP.cOA8hQOqJQ6jpWf2xjr4jAHaFT', 'https://tse4.mm.bing.net/th/id/OIP.uVDHCj3kuBleqcu-8m4xHwHaE7', 'https://tse3.mm.bing.net/th/id/OIP.abPqesflyzo7go60GpeVOgHaE7', 'https://tse1.mm.bing.net/th/id/OIP.j8IqL-lfJJhzLtxlGOBmdQHaLH', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse3.mm.bing.net/th/id/OIP.vI1XfBCcCRyr6lBB0Kkw2gHaNK', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse3.mm.bing.net/th/id/OIP.zZpTdijaGRjYe5AxeWdbqwHaJ4', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse3.mm.bing.net/th/id/OIP.9vnSGFUQ3gOptzYJ_RndpwHaLH', 'https://tse1.mm.bing.net/th/id/OIP.xdBvwDRHi9rm_DlwYE94VQHaE8', 'https://tse4.mm.bing.net/th/id/OIP.-V9lZkFr17YtkKhty2VqKwHaLH', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse3.mm.bing.net/th/id/OIP.kvklx7-6azdexGDbGRcNxAHaGB', 'https://tse2.mm.bing.net/th/id/OIP.OJPr9o8jB8JjRNg4OdjGQAHaDL', 'https://tse2.mm.bing.net/th/id/OIP.C9MFQWmPlja2kEjNiqAD-wHaFp', 'https://tse1.mm.bing.net/th/id/OIP.S1aX_EAzgsf86OziVsT_NgHaFd', 'https://tse3.mm.bing.net/th/id/OIP.KWisfz55UUr1OGK4v3ayjwHaD3', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse4.mm.bing.net/th/id/OIP.mfJc5nS38DVKgxP0vvfRkAHaJQ', 'https://tse3.mm.bing.net/th/id/OIP.PO3VS1U-qDXRwXb5LIVpNgHaLH', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA', 'https://tse2.mm.bing.net/th/id/OIP.f_V64PFPfjK5DtJRcs0s2gHaE8', 'https://tse2.mm.bing.net/th/id/OIP.R9Ijj_pqmMynYHQYNi606gHaLH', 'https://tse3.mm.bing.net/th/id/ODF.OTWJk1xWx9LlWBdoF478uA']

第四步:把数据保存到指定文件夹

# 3. Download every matched image into a folder named after the search term.
# makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair and does not
# fail when the folder is already there.
os.makedirs(content, exist_ok=True)
# The same thumbnail URL can occur many times in the page; dict.fromkeys drops
# the repeats while keeping first-seen order, so each image is fetched once.
for img_url in dict.fromkeys(matches):
    name = img_url.split('/')[-1]
    img_name = os.path.join(content, name + '.jpg')
    try:
        # A separate name keeps the search-page `response` intact; the timeout
        # prevents one stalled download from blocking the whole run.
        img_response = requests.get(img_url, timeout=10)
    except requests.RequestException as e:
        print(f'下载失败:{e}')
        continue
    if img_response.status_code != 200:
        print('获取图片失败:', img_url)
        continue
    try:
        with open(img_name, 'wb') as f:
            f.write(img_response.content)
    except OSError as e:
        print(f'下载失败:{e}')
        continue
    print('下载成功:', img_name)

图片会下载到以输入内容命名的文件夹下
请添加图片描述

完整代码

import os  
import re  
  
import requests  
from requests_html import HTMLSession  
  
  
# Thumbnail URLs in the result page look like
# https://tse<1-4>.mm.bing.net/th/id/<image id>?<query>. The id part stops at
# "?" and also at quotes, whitespace, "<", ">", "&" and backslash, so a URL
# without a query string cannot run on into the surrounding HTML.
IMAGE_URL_PATTERN = re.compile(
    r"""https?://tse[1-4]\.mm\.bing\.net/th/id/[^?\s"'<>&\\]+"""
)


def extract_image_urls(html):
    """Return the distinct thumbnail URLs found in *html*.

    Args:
        html: Text of the image-search result page.

    Returns:
        A list of URLs (query string stripped) in first-seen order, with
        repeated URLs removed.
    """
    return list(dict.fromkeys(IMAGE_URL_PATTERN.findall(html)))


def download_images(urls, folder, timeout=10):
    """Download each URL in *urls* into *folder* as ``<image id>.jpg``.

    Failures are reported on stdout and skipped; they do not stop the run.

    Args:
        urls: Iterable of image URLs.
        folder: Target directory; created if it does not exist.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The number of images written to disk.
    """
    os.makedirs(folder, exist_ok=True)
    saved = 0
    for img_url in urls:
        name = img_url.split('/')[-1]
        img_name = os.path.join(folder, name + '.jpg')
        try:
            img_response = requests.get(img_url, timeout=timeout)
        except requests.RequestException as e:
            print(f'下载失败:{e}')
            continue
        if img_response.status_code != 200:
            print('获取图片失败:', img_url)
            continue
        try:
            with open(img_name, 'wb') as f:
                f.write(img_response.content)
        except OSError as e:
            print(f'下载失败:{e}')
            continue
        print('下载成功:', img_name)
        saved += 1
    return saved


def main():
    """Ask for a search term, fetch the result page and download its images."""
    # 1. Fetch the HTML of the image-search result page.
    content = input('请输入需要搜集的名称:\n')
    url = 'https://www.bing.com/images/search'
    # Query-string parameters: `q` carries the search term.
    params = {
        'q': content,
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0',
    }
    session = HTMLSession()
    # Without a timeout a stalled connection would block the script forever.
    response = session.get(url=url, params=params, headers=headers, timeout=10)
    # Stop on 4xx/5xx instead of scanning an error page for image URLs.
    response.raise_for_status()

    # 2. Pull the image URLs out of the HTML.
    matches = extract_image_urls(response.content.decode())
    print(matches, len(matches))

    # 3. Download the images into a folder named after the search term.
    download_images(matches, content)


if __name__ == '__main__':
    main()

在这里插入图片描述

如果在阅读中遇到任何疑问,欢迎在评论区留言或者私信我,我很乐意与你交流!📬

喜欢本篇内容的话,记得 👍点赞、收藏并关注我,后续还有更多实用技巧和深度干货! 期待在评论区看到你的声音,我们一起成长、共同进步!😊
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值