# -*- coding: utf-8 -*- import sys import requests from lxml import etree import codecs reload(sys) sys.setdefaultencoding("utf-8") # 伯乐在线最新文章首页源码 3.html 存本地使加快请求速度 # open后为路径 建议用相对路径 第一个点表示当前文件上一级路径 content = codecs.open("./3.html", "r", encoding="utf-8").read() # print content doc = etree.HTML(content) # print doc # 获取所有文章封面img img_list = doc.xpath('''//div[@class="post floated-thumb"]//img''') # print img_list for img in img_list: try: print img.xpath("@src")[0] alt = img.xpath("@alt")[0] if alt: print alt else: print "该图片没有alt属性" print '--------------------' except IndexError as e: print '该图片没有alt属性' # 获取所有博客标题和详情 div_list = doc.xpath("//div[@class='post floated-thumb']") for div in div_list: a = div.xpath("div[@class='post-thumb']/a")[0] detai_url = a.xpath("@href")[0] title = a.xpath("@title")[0] img = a.xpath("img")[0] img_src = img.xpath("@src")[0] print title, detai_url, img_src print '---------------------------'
xpath 具体应用
最新推荐文章于 2025-03-25 09:23:23 发布