对于要提取嵌套标签内所有文本内容的情况, 可以使用 string() 或 //text(), 注意两者的区别:
>>> from scrapy import Selector
>>>
>>> doc = "<p id='test'>hello<b>world!</b></p>"
>>>
>>> sel = Selector(text=doc, type='html')
>>>
>>> # 绝对路径 /p 匹配不到任何节点: HTML 解析器会把片段包进 <html><body> 中, 所以结果为空列表
>>> sel.xpath("/p[@id='test']/text()").extract()
[]
使用text()
>>> # 使用两个斜杠 (//)
>>> sel.xpath("//p[@id='test']/text()").extract()
[u'hello']
>>> # 这样提取出来是一个列表
>>> sel.xpath("//p[@id='test']//text()").extract()
[u'hello', u'world!']
>>>
使用string
>>> sel.xpath("//p[@id='test']").xpath('string(.)').extract()
[u'helloworld!']
>>>
>>> sel.xpath("string(//p[@id='test'])").extract()
[u'helloworld!']
>>>