Python-lxml

最新推荐文章于 2023-12-13 21:18:11 发布

原创最新推荐文章于 2023-12-13 21:18:11 发布 · 644 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#爬虫

本文深入讲解了ElementTree API的核心概念Element及其操作方法，包括元素创建、属性设置、文本内容处理等，并介绍了如何利用XPath进行元素查找及遍历整个XML树。

Element 类

Element是ElementTree API的主要容器对象。大多数XML树功能都是通过这个类访问的。Elements可以通过工厂方法轻松创建：

>>> from lxml import etree
>>> root = etree.Element('root')
>>> ###元素的XML标签名可以通过元素的tag属性访问
>>> print(root.tag)
root
>>> ###元素以XML树结构组织，用append()方法来创建子元素并添加到父元素中
>>> root.append(etree.Element("child1"))
>>> ###SubElement工厂方法更加简便:
>>> child2 = etree.SubElement(root,"child2")


>>> child3 = etree.SubElement(root,"child3")
>>> ###打印root元素的内容
>>> print(etree.tostring(root,pretty_print=True))
b'<root>\n  <child1/>\n  <child2/>\n  <child3/>\n</root>\n'

Elements是列表

>>> ##为了更直接方便访问元素，元素尽可能模仿Python的list行为
>>> child1 = root[0]
>>> print(child1.tag)
child1
>>> print(len(root))
3
>>> root.index(root[1])###只有lxml.etree才有的功能
1
>>> childrens = list(root)
>>> for child in root:
	print(child.tag)

	
child1
child2
child3

>>> root.insert(0, etree.Element("child0"))
>>> start = root[:1]
>>> end = root[-1:]
>>> print(start[0].tag)
child0
>>> print(end[0].tag)
child3

>>>

判断对象是否为元素，以及元素是否有子元素：

>>> ###判断是否有元素
>>> print(etree.iselement(root))
True
>>> if len(root):
	print("has element")

	
has element

>>> ###删除和复制:
>>> for child in root:
	print(child.tag)

	
child0
child1
child2
child3
>>> root[0] = root[-1]
>>> for child in root:
	print(child.tag)

	
child3
child1
child2
>>> ###把最后一个元素赋值给第一个位置时，该元素是被移动而不是被复制，原来的第一个元素被替换，所以只剩下3个元素
>>>

注意：上面的行为和Python的list不同。lxml中一个元素只能有一个父节点和一个位置，赋值会移动元素；而Python的list赋值只是多了一个引用：

>>> ###Python list　赋值
>>> l = [1,2,3,4]
>>> l[0] = l[-1]
>>> l
[4, 2, 3, 4]
>>>

查询父对象是否存在

>>> root is root[0].getparent()
True
>>>

鉴于上面赋值或append会把元素从原位置移走的现象，如果想复制元素而不移动原对象，可以使用深拷贝

>>> ###深拷贝
>>> from copy import deepcopy
>>> element = etree.Element("neu")
>>> element.append(deepcopy(root[1]))
>>> print(element[0].tag)
child1
>>> print([c.tag for c in root])
['child3', 'child1', 'child2']
>>> root[0] is root[1].getprevious()
True
>>> root[1] is root[0].getnext()
True

下面是和上面做对比的例子：

>>> element = etree.Element("neu")
>>> element.append(root[1])
>>> print(element[0].tag)
child1
>>> print([c.tag for c in root])
['child3', 'child2']
>>>

元素有属性就是字典

>>> ###创建属性
>>> root = etree.Element("root",interesting="totally")
>>> etree.tostring(root)
b'<root interesting="totally"/>'
>>> ###打印属性值
>>> print(root.get("interesting"))
totally
>>> print(root.get("hello"))
None
>>> root.set("hello", "Huhu")
>>> print(root.get("hello"))
Huhu
>>> etree.tostring(root)
b'<root interesting="totally" hello="Huhu"/>'
>>> ###排序
>>> sorted(root.keys())
['hello', 'interesting']
>>> for name,value in sorted(root.items())
SyntaxError: invalid syntax
>>> for name,value in sorted(root.items()):
	print('%s = %r' % (name, value))

	
hello = 'Huhu'
interesting = 'totally'
>>> ###使用attrib属性
>>> attributes = root.attrib
>>> print(attributes["interesting"])
totally
>>> print(attributes.get("no-such-attribute"))
None
>>> attributes["hello"] = "Guten Tag"
>>> print(attributes['hello'])
Guten Tag
>>> print(root.get('hello'))
Guten Tag
>>> d = dict(root.attrib)
>>> sorted(d.items())
[('hello', 'Guten Tag'), ('interesting', 'totally')]
>>>

Element包含文字

>>> root = etree.Element("root")
>>> root.text = "TEXT"

>>> print(root.text)
TEXT

>>> etree.tostring(root)
b'<root>TEXT</root>'

XHTML中，文字可能嵌套在不同元素之间：

<html><body>Hello<br/>World</body></html>

用tail属性来创建嵌套文字

>>> html = etree.Element("html")
>>> body = etree.SubElement(html,"body")
>>> body.text = "TEXT"
>>> etree.tostring(html)
b'<html><body>TEXT</body></html>'
>>> br = etree.SubElement(body,"br")
>>> etree.tostring(html)
b'<html><body>TEXT<br/></body></html>'

设置tail，以及序列化时不输出tail（with_tail=False）

>>> br.tail = "TAIL"
>>> etree.tostring(html)
b'<html><body>TEXT<br/>TAIL</body></html>'
>>>
>>> ###不想打印tail
>>> etree.tostring(br)
b'<br/>TAIL'
>>> etree.tostring(br,with_tail=False)
b'<br/>'
>>>

打印text 去除 br 标签：

>>> etree.tostring(html,method="text")
b'TEXTTAIL'

使用XPath查找文字

>>> print(html.xpath("string()"))
TEXTTAIL
>>> print(html.xpath("//text()"))
['TEXT', 'TAIL']
>>>

如果经常使用同一个XPath表达式，可以用etree.XPath()把它编译成可重复调用的函数对象

>>> build_text_list = etree.XPath("//text()")
>>> print(build_text_list(html))
['TEXT', 'TAIL']
>>>

获取parent getparent()

>>> texts = build_text_list(html)
>>> print(texts[0])
TEXT
>>> parent = texts[0].getparent()
>>> print(parent.tag)
body
>>> print(texts[1])
TAIL
>>> print(texts[1].getparent().tag)
br
>>>

判断是文字内容还是尾巴(tail)

>>> print(texts[0].is_text)
True
>>> print(texts[1].is_text)
False
>>> print(texts[1].is_tail)
True
>>>

text()方法介绍完毕，string()和concat()方法不会告诉我们文字的原始位置

>>> stringify = etree.XPath("string()")
>>> print(stringify(html))
TEXTTAIL
>>> print(stringify(html).getparent())
None

Tree iteration

>>> ###树迭代
>>> root = etree.Element("root")
>>> etree.SubElement(root,"child").text = "Child 1"
>>> etree.SubElement(root,"child").text = "Child 2"
>>> etree.SubElement(root,"another").text = "Child 3"
>>> print(etree.tostring(root))
b'<root><child>Child 1</child><child>Child 2</child><another>Child 3</another></root>'
>>> print(etree.tostring(root, pretty_print=True))
b'<root>\n  <child>Child 1</child>\n  <child>Child 2</child>\n  <another>Child 3</another>\n</root>\n'
>>> for element in root.iteer():
	print("%s - %s" % (element.tag, element.text))

>>> for element in root.iter():
	print("%s - %s" % (element.tag, element.text))

root - None
child - Child 1
child - Child 2
another - Child 3
>>>

感兴趣的标签

>>> ###感兴趣标签
>>> for element in root.iter("child"):
	print("%s - %s" % (element.tag, element.text))

child - Child 1
child - Child 2
>>> for element in root.iter("another","child"):
	print("%s - %s" % (element.tag, element.text))

	
child - Child 1
child - Child 2
another - Child 3

检查是否是Element

>>> root.append(etree.Entity("#234"))
>>> root.append(etree.Comment("some comment"))

>>> for element in root.iter():
...     if isinstance(element.tag, basestring):  # or 'str' in Python 3
...         print("%s - %s" % (element.tag, element.text))
...     else:
...         print("SPECIAL: %s - %s" % (element, element.text))
root - None
child - Child 1
child - Child 2
another - Child 3
SPECIAL: ê - ê
SPECIAL: <!--some comment--> - some comment

>>> for element in root.iter(tag=etree.Element):
...     print("%s - %s" % (element.tag, element.text))
root - None
child - Child 1
child - Child 2
another - Child 3

>>> for element in root.iter(tag=etree.Entity):
...     print(element.text)
ê

序列化

>>> root = etree.XML('<root><a><b/></a></root>')

>>> etree.tostring(root)
b'<root><a><b/></a></root>'

>>> print(etree.tostring(root, xml_declaration=True))
<?xml version='1.0' encoding='ASCII'?>
<root><a><b/></a></root>

>>> print(etree.tostring(root, encoding='iso-8859-1'))
<?xml version='1.0' encoding='iso-8859-1'?>
<root><a><b/></a></root>

>>> print(etree.tostring(root, pretty_print=True))
<root>
  <a>
    <b/>
  </a>
</root>

序列化方式：method 可以是 'xml'（默认）、'html' 或 'text'

>>> root = etree.XML(
...    '<html><head/><body><p>Hello<br/>World</p></body></html>')

>>> etree.tostring(root) # default: method = 'xml'
b'<html><head/><body><p>Hello<br/>World</p></body></html>'

>>> etree.tostring(root, method='xml') # same as above
b'<html><head/><body><p>Hello<br/>World</p></body></html>'

>>> etree.tostring(root, method='html')
b'<html><head></head><body><p>Hello<br>World</p></body></html>'

>>> print(etree.tostring(root, method='html', pretty_print=True))
<html>
<head></head>
<body><p>Hello<br>World</p></body>
</html>

>>> etree.tostring(root, method='text')
b'HelloWorld'

编码方式：

>>> br = next(root.iter('br'))  # get first result of iteration
>>> br.tail = u'W\xf6rld'

>>> etree.tostring(root, method='text')  # doctest: +ELLIPSIS
Traceback (most recent call last):
  ...
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf6' ...

>>> etree.tostring(root, method='text', encoding="UTF-8")
b'HelloW\xc3\xb6rld'

ElementTree 类

ElementTree主要是一个包含根节点的树的文档包装器。它提供了一些序列化和一般文档处理的方法。

>>> root = etree.XML('''\
... <?xml version="1.0"?>
... <!DOCTYPE root SYSTEM "test" [ <!ENTITY tasty "parsnips"> ]>
... <root>
...   <a>&tasty;</a>
... </root>
... ''')

>>> tree = etree.ElementTree(root)
>>> print(tree.docinfo.xml_version)
1.0
>>> print(tree.docinfo.doctype)
<!DOCTYPE root SYSTEM "test">

>>> tree.docinfo.public_id = '-//W3C//DTD XHTML 1.0 Transitional//EN'
>>> tree.docinfo.system_url = 'file://local.dtd'
>>> print(tree.docinfo.doctype)
<!DOCTYPE root PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "file://local.dtd">
>>> print(etree.tostring(tree.getroot()))
<root>
  <a>parsnips</a>
</root>
>>> print(etree.tostring(tree))  # lxml 1.3.4 and later
<!DOCTYPE root PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "file://local.dtd" [
<!ENTITY tasty "parsnips">
]>
<root>
  <a>parsnips</a>
</root>

解析文字和文件

解析方法 fromstring() 和 parse()

fromstring()方法

>>> some_xml_data = "<root>data</root>"

>>> root = etree.fromstring(some_xml_data)
>>> print(root.tag)
root
>>> etree.tostring(root)
b'<root>data</root>'

XML()方法

>>> root = etree.XML("<root>data</root>")
>>> print(root.tag)
root
>>> etree.tostring(root)
b'<root>data</root>'

HTML()方法

>>> root = etree.HTML("<p>data</p>")
>>> etree.tostring(root)
b'<html><body><p>data</p></body></html>'