xml文本
# Sample XML document (root element <ops:world-patent-data>) holding one
# <ops:biblio-search> with two <ops:publication-reference> entries; it is the
# input of the lxml namespace example further down in this file.
# The literal opens with a newline, so the XML declaration is not the first
# character of the string.
# NOTE(review): XML parsers generally reject a declaration that is not at the
# very start of the input -- strip the text before parsing; confirm at call sites.
texts="""
<?xml version="1.0" encoding="UTF-8" ?> <?xml-stylesheet type="text/xsl" href="/3.2/style/exchange.xsl" ?> <ops:world-patent-data xmlns="http://www.epo.org/exchange" xmlns:ops="http://qw.org" xmlns:xlink="http://www.w3.org/1999/xlink"> <ops:biblio-search total-result-count="40"> <ops:query >200001</ops:query> <ops:search-result> <ops:publication-reference familyid="123121"> <document-id document-id-type="docdb"> <number>3097103</number> </document-id> </ops:publication-reference> <ops:publication-reference familyid="212311"> <document-id document-id-type="docdb"> <number>200073</number> </document-id> </ops:publication-reference> </ops:search-result> </ops:biblio-search> </ops:world-patent-data>
"""
1. 使用 Python 标准库自带的 xml.dom.minidom 模块处理 XML 文件
from xml.dom.minidom import parse
import xml.dom.minidom
def parse_xml(path="xml.xml"):
    """Parse a patent search-result XML file with ``xml.dom.minidom`` and print its fields.

    Prints, in order: the ``total-result-count`` attribute of the first
    ``ops:biblio-search`` element that has one, the text of the first
    ``ops:query`` element, and one ``{'familyid': ..., 'number': ...}`` dict
    per ``ops:publication-reference`` element.

    Args:
        path: File to parse. Defaults to ``"xml.xml"`` in the current working
            directory, which is the location the function always used before
            the parameter existed.

    Raises:
        IndexError: If the document has no ``ops:query`` element, the element
            is empty, or a ``document-id`` has no ``number`` child with text.
    """
    dom_tree = xml.dom.minidom.parse(path)
    collection = dom_tree.documentElement

    def get_result(tag, attribute):
        """Return ``attribute`` of the first ``tag`` element carrying it, else None."""
        for node in collection.getElementsByTagName(tag):
            if node.hasAttribute(attribute):
                return node.getAttribute(attribute)
        return None

    biblio_search_count = get_result("ops:biblio-search", "total-result-count")
    print('biblio_search_count:', biblio_search_count)

    ops_query = collection.getElementsByTagName("ops:query")[0].childNodes[0].data
    print('ops_query:', ops_query)

    for reference in collection.getElementsByTagName('ops:publication-reference'):
        dic = {}
        # Read the attribute from THIS element. Going through get_result()
        # would search the whole document again and hand back the first
        # element's familyid for every record.
        if reference.hasAttribute('familyid'):
            dic['familyid'] = reference.getAttribute('familyid')
        else:
            dic['familyid'] = None
        for document_id in reference.getElementsByTagName('document-id'):
            number_node = document_id.getElementsByTagName('number')[0]
            dic['number'] = number_node.childNodes[0].data
        print(dic)
2. 使用 lxml 处理 XML 文件(以 NASA 的 RSS 订阅源为例)
import requests
from lxml import etree
# Download the NASA "Image of the Day" RSS feed, then print the channel's
# description and title, the number of items, and one dict per item.
url = 'https://www.nasa.gov/rss/dyn/lg_image_of_the_day.rss'
# requests waits forever by default; a timeout keeps the script from hanging
# on an unresponsive server.
web = requests.get(url=url, timeout=30)
# Fail loudly on 4xx/5xx instead of handing an error page to the XML parser.
web.raise_for_status()
# Pass the raw bytes so lxml honours the encoding declared in the document.
response = etree.XML(web.content)

description = response.xpath('//rss/channel/description/text()')
print(description)
title = response.xpath('//rss/channel/title/text()')
print(title)

items = response.xpath('//rss/channel/item')
print(len(items))
for i in items:
    # Every xpath() call returns a list, so each value below is a list of
    # strings (empty when the child element or attribute is absent).
    item = {
        'title': i.xpath('./title/text()'),
        'link': i.xpath('./link/text()'),
        'description': i.xpath('./description/text()'),
        'enclosure': i.xpath('./enclosure/@url'),
        'pubDate': i.xpath('./pubDate/text()'),
        'guid': i.xpath('./guid/text()'),
    }
    print(item)
# Sample RSS 2.0 document: one <channel> with two <item> entries, each carrying
# title, link, description, enclosure, guid, pubDate and source children.
# No code visible in this file reads this variable.
text="""
<?xml version="1.0" encoding="utf-8" ?> <rss version="2.0" xml:base="http://www.nasa.gov/" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:media="http://search.yahoo.com/mrss/"> <channel> <title>NASA Image of the Day</title>
<description>The latest NASA "Image of the Day" image.</description>
<link>http://www.nasa.gov/</link>
<atom:link rel="self" href="http://www.nasa.gov/rss/dyn/lg_image_of_the_day.rss" />
<language>en-us</language>
<managingEditor>yvette.smith-1@nasa.gov</managingEditor>
<webMaster>brian.dunbar@nasa.gov</webMaster>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
<item> <title>Relaxing Inside the Space Station's Window to the World</title>
<link>http://www.nasa.gov/image-feature/relaxing-inside-the-space-stations-window-to-the-world</link>
<description>JAXA and Expedition 64 astronaut Soichi Noguchi relaxes at the end of the work day inside the seven-windowed cupola on the International Space Station.</description>
<enclosure url="http://www.nasa.gov/sites/default/files/thumbnails/image/iss064e007861.jpg" length="2881136" type="image/jpeg" />
<guid isPermaLink="false">http://www.nasa.gov/image-feature/relaxing-inside-the-space-stations-window-to-the-world</guid>
<pubDate>Thu, 03 Dec 2020 08:49 EST</pubDate>
<source url="http://www.nasa.gov/rss/dyn/lg_image_of_the_day.rss">NASA Image of the Day</source>
</item>
<item> <title>Awakening Newborn Stars</title>
<link>http://www.nasa.gov/image-feature/awakening-newborn-stars</link>
<description>Lying inside our home galaxy, the Milky Way, this Herbig–Haro object is a turbulent birthing ground for new stars in a region known as the Orion B molecular cloud complex, located 1,350 light-years away.</description>
<enclosure url="http://www.nasa.gov/sites/default/files/thumbnails/image/herbig-haro-jet.jpg" length="1773997" type="image/jpeg" />
<guid isPermaLink="false">http://www.nasa.gov/image-feature/awakening-newborn-stars</guid>
<pubDate>Wed, 02 Dec 2020 08:19 EST</pubDate>
<source url="http://www.nasa.gov/rss/dyn/lg_image_of_the_day.rss">NASA Image of the Day</source>
</item>
</channel>
</rss>
"""
3.lxml处理xml响应
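标签名中的冒号(:)用于分隔命名空间前缀和元素名,例如 ops:query 中的 ops 就是命名空间前缀。
XML 命名空间通过 xmlns 或 xmlns:前缀 属性声明,该属性放在元素的开始标签之中。
当命名空间被定义在元素的开始标签中时,所有带有相同前缀的子元素都会与同一个命名空间相关联。
XML 命名空间提供了避免元素命名冲突的方法。
注意:XPath 不支持默认命名空间,因此用 xmlns="..." 声明的默认命名空间在查询时也必须映射到一个自定义前缀(下面代码中的 x)。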
# Parse the in-memory sample document.
# strip() drops the newline that the triple-quoted literal places before the
# XML declaration (the parser rejects a declaration that is not at the very
# start of the input); encode() turns the str into bytes, which lxml requires
# when the text carries an encoding declaration.
response = etree.XML(texts.strip().encode('utf-8'))
# For a live HTTP response, pass the raw body instead:
#   response = etree.XML(requests.get(url).content)

dicts = {}
# XPath has no default namespace, so the document's default namespace
# (xmlns="http://www.epo.org/exchange") is bound to the explicit prefix 'x'.
namespace = {
    'x': "http://www.epo.org/exchange",
    'ops': "http://qw.org",
    'xlink': "http://www.w3.org/1999/xlink",
}
search_total_result_count = response.xpath(
    '//ops:biblio-search/@total-result-count', namespaces=namespace
)[0]
search_result = response.xpath(
    '//ops:search-result/ops:publication-reference', namespaces=namespace
)
dicts['items'] = []
# The [:1] slice limits processing to the first publication reference.
for j in search_result[:1]:
    dic = {}
    # familyid is an unprefixed attribute, so no namespace mapping is needed.
    dic['familyid'] = j.xpath('./@familyid')[0]
    dic['number'] = j.xpath('.//x:document-id/x:number/text()', namespaces=namespace)[0]
    # NOTE(review): within the lines visible here, `dic` is never appended to
    # dicts['items'] and search_total_result_count is never used -- confirm
    # whether code after this point stores them.