在进行网络数据抓取时,使用requests可以直接处理json格式的返回结果,但对于xml格式的返回结果,requests并没有提供直接的支持。python内建了xml解析器,下面的示例说明两种情况下对xml的解析。
解析xml文件
from xml.etree import ElementTree

xml_file = r'D:\BJMapSearch.xml'

# Load the document from disk; the root element is taken from the parsed tree.
try:
    tree = ElementTree.parse(xml_file)
except (IOError, ElementTree.ParseError):
    # Unreadable file or malformed XML: report it and stop with status -1.
    # (`return` is only legal inside a function, so a module-level script
    # has to exit instead.)
    print('解析xml文件出错')
    raise SystemExit(-1)
root = tree.getroot()  # get the root node

# Query the nodes: every <ht> under the first <hts> of the first <layer>.
# A single path query yields an empty list when <layer> or <hts> is absent,
# whereas chained find() calls would fail with AttributeError on None.
data_node = root.findall("layer[1]/hts[1]/ht")
for node in data_node:
    # Element.get returns None when the attribute is missing, so one lookup
    # replaces the membership test plus a second get().
    value = node.get(u'关联关系')
    if value is not None:
        print(value)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
from
xml
.
etree
import
ElementTree
xml_file
=
r
'D:\BJMapSearch.xml'
try
:
tree
=
ElementTree
.
parse
(
xml_file
)
root
=
tree
.
getroot
(
)
# 获取根节点
except
Exception
,
e
:
print
'解析xml文件出错'
return
-
1
# 对结点进行查询
data_node
=
root
.
find
(
"layer"
)
.
find
(
"hts"
)
.
findall
(
"ht"
)
for
node
in
data_node
:
if
u
'关联关系'
in
node
.
attrib
.
keys
(
)
:
print
node
.
attrib
.
get
(
u
'关联关系'
)
|
# xml结构如下 <Response> <count>10</count> <total>4706</total> <actualtotal>4706</actualtotal> <layer id="L10319" type="点">...</layer> </Response> # layer结点的结构如下 <hts Sum="10"> <ht></ht> <ht></ht> ... </hts>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# xml结构如下
<
Response
>
<
count
>
10
<
/
count
>
<
total
>
4706
<
/
total
>
<
actualtotal
>
4706
<
/
actualtotal
>
<
layer
id
=
"L10319"
type
=
"点"
>
.
.
.
<
/
layer
>
<
/
Response
>
# layer结点的结构如下
<
hts
Sum
=
"10"
>
<
ht
>
<
/
ht
>
<
ht
>
<
/
ht
>
.
.
.
<
/
hts
>
|
网络获取xml进行解析
from xml.etree import ElementTree

import requests

# Fetch the XML over HTTP. requests has no default timeout, so set one to
# avoid hanging forever on an unresponsive server.
xml_file = requests.get(
    'http://www.beijingmap.gov.cn/bjgtj/BJMapSearch?p=0%2C10&s=%2A&l=L10319&t=xml',
    timeout=10,
)
# Fail on an HTTP error status instead of trying to parse an error page.
xml_file.raise_for_status()

# fromstring returns the root element directly. It is given the raw bytes
# (`content`) rather than `text`, so the XML parser applies the encoding
# declared in the document itself instead of requests' guess at the charset.
root = ElementTree.fromstring(xml_file.content)

# Every <ht> under the first <hts> of the first <layer>; the path query
# yields an empty list when either parent element is absent.
data_node = root.findall("layer[1]/hts[1]/ht")
for node in data_node:
    # Element.get returns None when the attribute is missing.
    value = node.get(u'关联关系')
    if value is not None:
        print(value)
1
2
3
4
5
6
7
8
9
10
11
12
|
import
requests
from
xml
.
etree
import
ElementTree
xml_file
=
requests
.
get
(
'http://www.beijingmap.gov.cn/bjgtj/BJMapSearch?p=0%2C10&s=%2A&l=L10319&t=xml'
)
# fromstring方法直接返回root结点
root
=
ElementTree
.
fromstring
(
xml_file
.
text
)
data_node
=
root
.
find
(
"layer"
)
.
find
(
"hts"
)
.
findall
(
"ht"
)
for
node
in
data_node
:
if
u
'关联关系'
in
node
.
attrib
.
keys
(
)
:
print
node
.
attrib
.
get
(
u
'关联关系'
)
|
如果网络请求返回的结果比较大,一次性把整个响应读入内存再解析并不合适,需要改用流式(增量)解析的方式
response = requests.get(url, stream=True)
# If the server sent a Gzip or Deflate compressed response, decompress
# as we read the raw stream:
response.raw.decode_content = True

events = ElementTree.iterparse(response.raw)
# iterparse yields (event, element) pairs -- the event name comes first.
# With the default arguments only "end" events are reported.
for event, elem in events:
    # Do something with `elem` here. Once an element has been fully handled,
    # calling elem.clear() on it releases its children and keeps memory flat
    # on large documents.
    pass
1
2
3
4
5
6
7
8
9
|
response
=
requests
.
get
(
url
,
stream
=
True
)
# if the server sent a Gzip or Deflate compressed response, decompress
# as we read the raw stream:
response
.
raw
.
decode_content
=
True
events
=
ElementTree
.
iterparse
(
response
.
raw
)
for
elem
,
event
in
events
:
# do something with `elem`
|