用xpath来爬图

最新推荐文章于 2025-09-11 18:06:30 发布

转载最新推荐文章于 2025-09-11 18:06:30 发布 · 90 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：http://www.cnblogs.com/regit/p/8529214.html

文章标签：

#操作系统 #python

# xpath语法可参考http://www.w3school.com.cn/xpath/xpath_syntax.asp
# 本博客引用于https://zhuanlan.zhihu.com/something-python?topic=Python

# coding:utf-8
import requests
from lxml import html
import os
import time

def header(referer):
    """Build the HTTP request headers sent when fetching an image.

    Args:
        referer: Value for the ``Referer`` header; it is formatted to a
            string before being stored.

    Returns:
        A new dict of header names to values. Every entry except
        ``Referer`` is a fixed constant.
    """
    # The user agent is split across two literals purely for line length.
    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.115 Safari/537.36'
    )
    return {
        'Host': 'i.meizitu.net',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'User-Agent': user_agent,
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': format(referer),
    }

# 获取主页列表, 解析 html 的话最好使用html.fromstring(),它有xpath函数，可用于定位元素。
def getPage(pageNum, timeout=30):
    """Fetch one listing page and return the detail-page links found on it.

    Args:
        pageNum: Page number substituted into the listing URL.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the script indefinitely.

    Returns:
        A list of the ``href`` values of the links inside ``<ul id="pins">``,
        in document order. Each link is also printed.
    """
    baseUrl = 'http://www.mzitu.com/page/{}'.format(pageNum)
    response = requests.get(baseUrl, timeout=timeout)
    # html.fromstring() yields an element that supports xpath() lookups.
    selector = html.fromstring(response.content)
    urls = list(selector.xpath('//ul[@id="pins"]/li/a/@href'))
    for link in urls:
        print(link)
    return urls


# 图片链接列表， 标题
# url是详情页链接
def getPiclink(url, timeout=30):
    """Download every image of one album into a folder in the working directory.

    The detail page at ``url`` supplies the image count and the title; image
    ``k`` of the album is then looked up on the page ``<url>/<k>``. Images are
    saved as ``1.jpg``, ``2.jpg``, ... numbered by successful download, so a
    failed page does not leave a gap in the numbering.

    Args:
        url: Link to the album's detail page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        IndexError: If the page count or the title cannot be found on the
            detail page.
        ValueError: If the page count text is not an integer.
    """
    sel = html.fromstring(requests.get(url, timeout=timeout).content)
    # xpath() always returns a list, even for a single match, hence the [0].
    # a[last()-1] selects the second-to-last link of the pager, whose text is
    # used as the total number of pages.
    total = sel.xpath('//div[@class="pagenavi"]/a[last()-1]/span/text()')[0]
    title = sel.xpath('//h2[@class="main-title"]/text()')[0]

    dirName = u"【{}P】{}".format(total, title)
    # A path separator inside the title would make the folder name point at a
    # nested, non-existent path, so neutralise it.
    for sep in (os.sep, os.altsep):
        if sep:
            dirName = dirName.replace(sep, '_')
    # exist_ok lets the script be re-run for an album that was already started.
    os.makedirs(dirName, exist_ok=True)

    n = 1
    for pageIndex in range(1, int(total) + 1):
        link = '{}/{}'.format(url, pageIndex)
        try:
            s = html.fromstring(requests.get(link, timeout=timeout).content)
            # The image address is held in the src attribute of the <img>.
            jpgLink = s.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
            # Target file: <current directory>/<album folder>/<n>.jpg
            filename = os.path.join(os.path.abspath('.'), dirName, '{}.jpg'.format(n))
            print(u'开始下载图片:%s 第%s张' % (dirName, n))
            # Download before opening the file so a failed request does not
            # leave an empty .jpg behind, and reject HTTP error responses
            # instead of saving their body as an image.
            response = requests.get(jpgLink, headers=header(jpgLink), timeout=timeout)
            response.raise_for_status()
            with open(filename, "wb") as jpg:
                jpg.write(response.content)
        except Exception as exc:
            # Best effort: one bad page must not abort the album. Unlike a
            # bare except, this lets Ctrl-C through and reports the failure.
            print(u'下载失败，已跳过: %s (%s)' % (link, exc))
            continue
        n += 1

#下面一行代码的作用：文件作为脚本直接执行才会被执行下面代码，而import到其他脚本中是不会被执行的，http://www.dengfeilong.com/post/60.html
if __name__ == '__main__':
    # Ask for a listing page number, then download each album linked from it.
    requested_page = input(u'请输入页码：')
    for detail_url in getPage(requested_page):
        print(detail_url)
        getPiclink(detail_url)
        # Pause between albums. The original note here only says "lxml errors";
        # presumably the delay avoids them - TODO confirm.
        time.sleep(2)
		
		
		
# 执行的时候用python3, 并且要安装pip install lxml requests
#虚拟环境中装python3
#sudo apt-get install python-pip 
#sudo apt-get install python-virtualenv #安装本地虚拟环境管理工具 
#mkdir ~/django # 创建目录 
#cd ~/django
#virtualenv venv #在~/django目录下，创建一个venv的虚拟环境 
#source venv/bin/activate #开启虚拟环境

转载于:https://www.cnblogs.com/regit/p/8529214.html