TensorFlow study notes: web crawler (scraping images with request headers)

This post shows two ways to scrape images from a web page with Python and save them locally: regular expressions and lxml.etree. In both versions we set request headers, fetch the page with the requests library, parse out the image URLs and names (with a regex or with XPath), and then download and save the images.
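Both scripts below follow the same basic pattern: send a GET request that carries browser-like headers, pull image URLs out of the returned HTML, and write the image bytes to disk. A minimal sketch of that pattern (the URL and file name here are placeholders, not the real target site):

```python
import requests

headers = {"User-Agent": "Mozilla/5.0"}          # pretend to be a normal browser
img_url = "http://example.com/some_image.jpg"    # hypothetical image URL

resp = requests.get(img_url, headers=headers, timeout=10)
with open("some_image.jpg", "wb") as fp:
    fp.write(resp.content)                       # write the raw image bytes to disk
```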


1. Using regular expressions

```python
import requests
import re
import os
import time

# Request headers copied from a logged-in browser session; the Cookie keeps the
# session alive and the User-Agent / Referer make the request look like normal browsing.
headers = {
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
           "Accept-Encoding": "gzip, deflate",
           "Accept-Language": "en-US,zh-CN;q=0.8,zh;q=0.6,en;q=0.4",
           "Cache-Control": "no-cache",
           "Connection": "keep-alive",
#           "Cookie":"__cfduid=dd46a11309e3cb65a78d0caa56ed4daa51505911332;UM_distinctid=15ea106f08fe9-0ff84cf70e488a8-75246751-1fa400-15ea106f090c7; CNZZDATA5003870=cnzz_eid%3D1164409310-1505910966-null%26ntime%3D1506062459; Hm_lvt_93e96130e9baaf094d30957b36d95ccf=1505940143,1506081081; pgv_pvi=2985106432; _qddaz=QD.92erq0.v00p7w.j7thvo7t; tencentSig=5302248448;user=uid=623345387&pwd=a12345678; ASP.NET_SessionId=vvvcypp2d2o4cbnhn4bngu5b; IESESSION=alive; _qddab=3-z416qh.j7w1gs75; Hm_lpvt_93e96130e9baaf094d30957b36d95ccf=1506081081; pgv_si=s5910183936; _qdda=3-1.2x97cs; _qddamta_800058301=3-0",
           "Cookie":"__cfduid=d1cd5a3a0802ebb9cab7d2973f44bc5ca1505830099; UM_distinctid=15e9a791386a9-05d459887c7e78-36465d60-100200-15e9a79138762; pgv_pvi=7775123456; __guid=166899000.1935821023027210800.1505907169248.0813; yjs_id=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgNi4xKSBBcHBsZVdlYktpdC81MzcuMzYgKEtIVE1MLCBsaWtlIEdlY2tvKSBDaHJvbWUvNTUuMC4yODgzLjg3IFNhZmFyaS81MzcuMzZ8d3d3Lmhhb2kyMy5uZXR8MTUwNTkwNzE2OTczMnxodHRwOi8vd3d3Lmhhb2kyMy5uZXQvP3B0PXQ; tencentSig=8586044416; ASP.NET_SessionId=4w5cxxby0kzbrnbgv0t4caxq; IESESSION=alive; Hm_lvt_93e96130e9baaf094d30957b36d95ccf=1505872238,1505958438,1506041710,1506310782; Hm_lpvt_93e96130e9baaf094d30957b36d95ccf=1506310782; ctrl_time=1; pgv_si=s3970691072; user=uid=623345387&pwd=a12345678; monitor_count=7; CNZZDATA5003870=cnzz_eid%3D46441260-1505905552-%26ntime%3D1506322448; AJSTAT_ok_pages=1; AJSTAT_ok_times=11; _qddaz=QD.76tm55.fahzfd.j7roes3f; _qddamta_800058301=3-0; _qdda=3-1.2x97cs; _qddab=3-odccan.j7zuy9ei",
           "Pragma": "no-cache",
           "Referer": "http://www.haoi23.net/login.aspx?act=reg",
           "Upgrade-Insecure-Requests": "1",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36"
         }

def downloadPic(html, page):
    i = 0
    print('Found images on page ' + str(page) + ', starting download...')
    # Each match is a tuple: (image URL, the 4-5 word-character name in the following <td>)
    links_names = re.findall(r"<img src='(.*?)'.+?<td>(\w{4,5})</td>", html, re.S)
    for each, name in links_names:
        if not each:
            continue
        # Only keep entries whose name is exactly four digits
        if re.search(r'\d{4}', name):
            if len(name) == 4:
                print('Downloading image ' + str(i + 1) + ', name: ' + name + ', URL: ' + str(each))
                try:
                    pic = requests.get(each, timeout=10)
                except requests.exceptions.ConnectionError:
                    print('[Error] This image could not be downloaded')
                    continue
                time.sleep(1)
                path = 'pictures/' + str(page) + '_' + str(name) + '.jpg'
                with open(path, 'wb') as fp:
                    fp.write(pic.content)
        i += 1

if __name__ == '__main__':

    session=requests.Session()

    if not os.path.exists('pictures'):
        os.makedirs('pictures')

    start = input("Input start page: ")
    end = input("Input last page: ")
    while int(end) < int(start):
        print('The last page must be >= the start page! Please input again.')
        end = input("Input last page: ")

    base_url = 'http://www.haoi23.net/u_3mypic.aspx?day=all&s=0&e=23&page={pageNum}'
    for i in range(int(start), int(end)+1):
        url = base_url.format(pageNum=i)
        result = session.get(url, headers=headers)
        #print(result.text)
        downloadPic(result.text, i)
```
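The heavy lifting above is done by the `re.findall` pattern: with `re.S` (so `.` also matches newlines) it captures the `src` of each `<img>` tag together with the 4-5 word-character text of the `<td>` that follows it. A small self-contained demonstration on a made-up HTML fragment (not taken from the real page):

```python
import re

# Hypothetical fragment mimicking the structure the script expects
html = """
<tr><td><img src='http://example.com/img/a001.jpg'></td><td>1234</td></tr>
<tr><td><img src='http://example.com/img/b002.jpg'></td><td>5678</td></tr>
"""

pairs = re.findall(r"<img src='(.*?)'.+?<td>(\w{4,5})</td>", html, re.S)
print(pairs)
# [('http://example.com/img/a001.jpg', '1234'), ('http://example.com/img/b002.jpg', '5678')]
```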

2. Using lxml.etree (XPath) to find images

```python
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 21 21:55:26 2017

@author: Administrator
"""

import requests
import os
from lxml import etree
import time

headers = {
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
           "Accept-Encoding": "gzip, deflate",
           "Accept-Language": "en-US,zh-CN;q=0.8,zh;q=0.6,en;q=0.4",
           "Cache-Control": "no-cache",
           "Connection": "keep-alive",
#           "Cookie":"__cfduid=dd46a11309e3cb65a78d0caa56ed4daa51505911332;UM_distinctid=15ea106f08fe9-0ff84cf70e488a8-75246751-1fa400-15ea106f090c7; CNZZDATA5003870=cnzz_eid%3D1164409310-1505910966-null%26ntime%3D1506062459; Hm_lvt_93e96130e9baaf094d30957b36d95ccf=1505940143,1506081081; pgv_pvi=2985106432; _qddaz=QD.92erq0.v00p7w.j7thvo7t; tencentSig=5302248448;user=uid=623345387&pwd=a12345678; ASP.NET_SessionId=vvvcypp2d2o4cbnhn4bngu5b; IESESSION=alive; _qddab=3-z416qh.j7w1gs75; Hm_lpvt_93e96130e9baaf094d30957b36d95ccf=1506081081; pgv_si=s5910183936; _qdda=3-1.2x97cs; _qddamta_800058301=3-0",
           "Cookie":"__cfduid=d1cd5a3a0802ebb9cab7d2973f44bc5ca1505830099; UM_distinctid=15e9a791386a9-05d459887c7e78-36465d60-100200-15e9a79138762; pgv_pvi=7775123456; __guid=166899000.1935821023027210800.1505907169248.0813; yjs_id=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgNi4xKSBBcHBsZVdlYktpdC81MzcuMzYgKEtIVE1MLCBsaWtlIEdlY2tvKSBDaHJvbWUvNTUuMC4yODgzLjg3IFNhZmFyaS81MzcuMzZ8d3d3Lmhhb2kyMy5uZXR8MTUwNTkwNzE2OTczMnxodHRwOi8vd3d3Lmhhb2kyMy5uZXQvP3B0PXQ; tencentSig=8586044416; ASP.NET_SessionId=4w5cxxby0kzbrnbgv0t4caxq; IESESSION=alive; Hm_lvt_93e96130e9baaf094d30957b36d95ccf=1505872238,1505958438,1506041710,1506310782; Hm_lpvt_93e96130e9baaf094d30957b36d95ccf=1506310782; ctrl_time=1; pgv_si=s3970691072; _qddamta_800058301=3-0; _qddaz=QD.76tm55.fahzfd.j7roes3f; user=uid=623345387&pwd=a12345678; _qdda=3-1.1; _qddab=3-lm20l6.j7zmjqjr; CNZZDATA5003870=cnzz_eid%3D46441260-1505905552-%26ntime%3D1506306216; AJSTAT_ok_pages=3; AJSTAT_ok_times=10; monitor_count=5",
           "Pragma": "no-cache",
           "Referer": "http://www.haoi23.net/login.aspx?act=reg",
           "Upgrade-Insecure-Requests": "1",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36"
         }

def downloadPic(html, page):
    i = 0
    print('Found images on page ' + str(page) + ', starting download...')

    root = etree.HTML(html)
    table = root.xpath("//table")[0]
    # Skip the header row of the table
    trs = table.findall(".//tr")[1:]

    links = []
    names = []
    for tr in trs:
        img = tr.find(".//img")
        if img is None:
            continue
        links.append(img.attrib['src'])
        tds = tr.findall("./td")
        # The fourth <td> holds the image name; fall back to the file name if it is missing
        if len(tds) > 3 and tds[3].text:
            names.append(tds[3].text)
        else:
            names.append(os.path.splitext(img.attrib['src'].split('/')[-1])[0])

    for each, name in zip(links, names):
        if not each:
            continue
        print('Downloading image ' + str(i + 1) + ', name: ' + name + ', URL: ' + str(each))
        try:
            pic = requests.get(each, timeout=10)
        except requests.exceptions.ConnectionError:
            print('[Error] This image could not be downloaded')
            continue
        time.sleep(1)
        path = 'pictures/' + str(page) + '_' + str(name) + '.jpg'
        with open(path, 'wb') as fp:
            fp.write(pic.content)
        i += 1

if __name__ == '__main__':

    session=requests.Session()

    if not os.path.exists('pictures'):
        os.makedirs('pictures')

    start = input("Input start page: ")
    end = input("Input last page: ")
    while int(end) < int(start):
        print('The last page must be >= the start page! Please input again.')
        end = input("Input last page: ")

    base_url = 'http://www.haoi23.net/u_3mypic.aspx?day=all&s=0&e=23&page={pageNum}'
    for i in range(int(start), int(end)+1):
        url = base_url.format(pageNum=i)
        result = session.get(url, headers=headers)
        #print(result.text)
        downloadPic(result.text, i)
```
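The lxml version instead walks the first `<table>` on the page, skips the header row, and takes the image `src` plus the text of the fourth `<td>` from every remaining row. A small self-contained illustration of that traversal on a made-up table (not the real page markup):

```python
from lxml import etree

# Hypothetical table mimicking the layout the script assumes
html = """
<table>
  <tr><td>No.</td><td>Preview</td><td>Date</td><td>Name</td></tr>
  <tr><td>1</td><td><img src='http://example.com/img/a001.jpg'></td><td>09-21</td><td>1234</td></tr>
</table>
"""

root = etree.HTML(html)
table = root.xpath("//table")[0]
for tr in table.findall(".//tr")[1:]:       # skip the header row
    img = tr.find(".//img")
    tds = tr.findall("./td")
    print(img.attrib['src'], tds[3].text)   # http://example.com/img/a001.jpg 1234
```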



### Writing a Python crawler to scrape Youku videos

#### Target and tooling

Fetching data from Youku usually involves issuing web requests, parsing HTML documents, and handling the content returned by API endpoints. For these tasks the `requests` library issues HTTP/HTTPS requests, while `BeautifulSoup` or `lxml` is well suited to parsing structured HTML/XML. If you run into pages rendered by JavaScript, you may need a browser-automation framework such as Selenium[^1].

#### Handling login and verification

Some sites deploy fairly strict anti-scraping measures, such as CAPTCHA challenges or cookie-based authentication. Plan your strategy for this early in development: mimic the browsing patterns of a real user, rotate the User-Agent string periodically (see the sketch after this section), or even use a third-party service to get past image CAPTCHAs[^2].

#### Designing the extraction logic

For a large media platform like Youku, starting from the publicly accessible parts is usually the simplest effective approach: locate the URL template of the listing pages that contain the target resources, then iterate over the pages until a stop condition is met. At the same time, note how the detail-page links are constructed so that more valuable information can be dug out later[^3].

```python
import requests
from bs4 import BeautifulSoup

def fetch_video_list(page_num=1):
    url = f"https://www.youku.com/v_olist/c_97_p_{page_num}.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    video_items = []
    for item in soup.select('.v-link'):
        title = item['title']
        href = item['href']
        video_items.append({
            'title': title,
            'url': href
        })
    return video_items

if __name__ == "__main__":
    videos = fetch_video_list()
    for v in videos[:5]:
        print(f"{v['title']}: {v['url']}")
```

This code is only a basic demonstration; do not abuse the technique to illegally collect other people's copyrighted works!
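As noted in the login/verification paragraph above, one simple anti-blocking habit is to rotate the User-Agent header between requests. A minimal sketch, assuming you maintain your own small pool of agent strings (the ones below are just examples):

```python
import random
import requests

# A small pool of example User-Agent strings (illustrative only)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
    "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
]

def get_with_random_agent(url, **kwargs):
    """Issue a GET request with a randomly chosen User-Agent header."""
    headers = dict(kwargs.pop("headers", {}) or {})
    headers["User-Agent"] = random.choice(USER_AGENTS)
    return requests.get(url, headers=headers, **kwargs)

# Example usage (placeholder URL):
# resp = get_with_random_agent("http://example.com/list.aspx?page=1", timeout=10)
```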