XPath
# Commonly used methods
# NOTE(review): `etree` and `text` are not defined in this snippet; the later snippets
# use `from lxml import etree` -- confirm this one is meant to be read the same way.
# When the HTML is held in a string:
# etree.HTML() parses the string `text` into an element tree that can be queried with XPath
html = etree.HTML(text)
# When the HTML lives in a file, etree.parse() with an HTMLParser parses that file instead
html = etree.parse('./test.html', etree.HTMLParser())
# Serialize the parsed tree back to markup; the result is a bytes object
result = etree.tostring(html)
# Decode the bytes with decode('utf-8') so they print as text
print(result.decode('utf-8'))
# xpath() is called on the parsed object; for a node-selecting expression like this
# one it returns the matching nodes as a list
result = html.xpath('//li')
接下来是XPath的语法。
匹配所有节点*
# XPath, continued
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
# '*' is the wildcard: '//*' matches every element node in the document
result = html.xpath('//*')
print(result)
返回的result是一个list
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
# A leading '//' matches nodes at any depth in the document, so '//li' selects every li node
result = html.xpath('//li') # a list of the matched elements
print(result)
# Take the first li node out of the list
print(result[0])
子节点/,子孙节点//
# Child / descendant nodes
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
# All a nodes anywhere beneath a ul node: '//' between the two steps matches
# descendants at any depth, whereas a single '/' would match direct children only
result = html.xpath('//ul//a')
print(result)
父节点..,parent::
# Parent node
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
# Find the a node whose href is "link4.html", step up to its parent with '..',
# and read that parent's class attribute
result = html.xpath('//a[@href="link4.html"]/../@class')
print(result)
# Parent node, second approach: the parent:: axis is the long form of '..'
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
# parent::* selects the parent element of the matched a node, whatever its tag,
# and /@class then reads that parent's class attribute
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(result)
属性匹配@
# Attribute matching
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
# The [@class="item-1"] predicate keeps only the li nodes whose class attribute
# is exactly "item-1"
result = html.xpath('//li[@class="item-1"]')
# Serialize each matched element so its markup is printed; tostring() returns bytes.
# The loop body must be indented -- without it the snippet is an IndentationError.
for item in result:
    print(etree.tostring(item))
文本获取text(),.text
# Getting text
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
# '//text()' collects the text nodes at any depth beneath each li whose class is "item-0"
result = html.xpath('//li[@class="item-0"]//text()')
# text() yields strings rather than elements, so the list is printed directly
# instead of serializing each item with etree.tostring()
print(result)
在Selenium中,find_element_by_xpath()只能定位元素节点,XPath以text()结尾时可能报错,应先定位元素再读取.text属性。
错误代码示例 :
# Incorrect: the XPath ends in text(), which selects a text node; Selenium's
# find_element_* lookups expect an element, so this call is expected to raise
driver.find_element_by_xpath('//*[@class="signTime"]/text()')
正确代码示例 :
# Correct: locate the element itself, then read its text through the .text property
driver.find_element_by_xpath('//*[@class="signTime"]').text
属性多值匹配contains()
# Matching an attribute that holds several values
# using the contains() function
from lxml import etree
text = """
<li class="li li-first"><a href="link.html">first item</a></li>
"""
html = etree.HTML(text)
# contains() takes the attribute (@name) as its first argument and a value as its
# second; the node matches when the attribute's value contains the given value.
# This is a substring test, so "li" matches the class string "li li-first".
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)
多属性匹配and
# Matching on several attributes at once
from lxml import etree
text = """
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
"""
html = etree.HTML(text)
# Conditions inside one predicate are joined with the `and` operator: the li must
# have a class containing "li" and a name attribute equal to "item"
result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
print(result)
按序选择last(),position()
# Selecting by position
from lxml import etree
text = """
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""
html = etree.HTML(text)
# Text of the first li (XPath positions start at 1, not 0)
result = html.xpath('//li[1]/a/text()')
print(result)
# Text of the last li
result = html.xpath('//li[last()]/a/text()')
print(result)
# Text of the first two li nodes (positions 1 and 2)
result = html.xpath('//li[position()<3]/a/text()')
print(result)
# last() is 5 for this document, so last()-2 selects the third li
result = html.xpath('//li[last()-2]/a/text()')
print(result)
节点轴选择ancestor::,attribute::,child::,descendant::,following::,following-sibling::
# Selecting along node axes
from lxml import etree
text = """
<div>
<ul class="drs">
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""
html = etree.HTML(text)
# ancestor::* -- every ancestor element of the first li
result = html.xpath('//li[1]/ancestor::*')
print(result)
# ancestor::div -- only the ancestors that are div elements
result = html.xpath("//li[1]/ancestor::div")
print(result)
# attribute::* -- the values of all attributes on the first li
result = html.xpath("//li[1]/attribute::*")
print(result)
# child::a[...] -- direct a children of the first li whose href is "link1.html"
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)
# descendant::span -- span elements at any depth beneath the first li
result = html.xpath('//li[1]/descendant::span')
print(result)
# following::*[2] -- of all elements after the first li in document order, take
# the second one (for this document, the a inside the second li)
result = html.xpath('//li[1]/following::*[2]')
# The loop body must be indented -- without it the snippet is an IndentationError.
for item in result:
    print(etree.tostring(item))
# following-sibling::* -- only the siblings that come after the first li,
# not the siblings that precede it
result = html.xpath('//li[1]/following-sibling::*')
print(result)
scrapy xpath 获取属性值
# Query the `value` attribute of p elements that are the first p child of their parent.
# NOTE(review): in Scrapy, response.xpath() presumably returns a SelectorList rather
# than a plain string; .get() / .getall() would extract the value(s) -- confirm
# what the code using export_number expects.
export_number = response.xpath('//p[1]/@value')