4.1 XPath
from lxml import etree
# Sample markup: a single movie entry (title link, cast, release date, score).
text = """
<div class="board-item-main">
<div class="board-item-content">
<div class="movie-item-info">
<p class="name"><a href="/films/493549" title="大寒" data-act="boarditem-click" data-val="{movieId:493549}">大寒</a></p>
<p class="star">主演:张双兵,鲁园,许薇</p>
<p class="releasetime">上映时间:2018-08-14</p>
</div>
<div class="movie-item-number score-num">
<p class="score"><i class="integer">9.</i><i class="fraction">4</i></p>
</div>
</div>
</div>
"""
# Parse the fragment into an element tree that can be queried with XPath.
html = etree.HTML(text)
# 1st expression: every text node below a <p> that is a direct child of a <div>.
# 2nd expression: the class attribute of each of those <p> elements.
for expr in ('//div/p//text()', '//div/p/@class'):
    result = html.xpath(expr)
    print(result)
4.2 Beautiful Soup: 以猫眼电影排行某标签为例
import bs4
import re
# Sample markup: one entry of the Maoyan movie ranking board.
html = """
<div class="board-item-main">
<div class="board-item-content">
<div class="movie-item-info">
<p class="name"><a href="/films/345870" title="传奇的诞生" data-act="boarditem-click" data-val="{movieId:345870}">传奇的诞生</a></p>
<p class="star">
主演:凯文·德·保拉,贝利,索·豪黑
</p>
<p class="releasetime">上映时间:2018-09-07</p> </div>
<div class="movie-item-number score-num">
<p class="score"><i class="integer">9.</i><i class="fraction">4</i></p>
</div>
</div>
</div>
"""
soup = bs4.BeautifulSoup(html, 'lxml')
print(soup.prettify())  # re-indented rendering of the parsed document
print(soup.p.string)
print(soup.p)  # when several nodes match, attribute access returns only the first
print(soup.p.attrs)
print(soup.p.a['title'])
print(soup.p.contents)  # list of direct children
print(soup.p.parent)
# Print the .string of every <p>.
# NOTE: per the Beautiful Soup docs, .string is None for a tag with more than
# one child, so the score <p> (two <i> children) prints None.
for i in soup.find_all(name='p'):
    print(i.string)
# Strings containing '-', i.e. the release date. `string=` is the current
# name of the keyword formerly called `text=` (renamed in bs4 4.4.0).
print(soup.find_all(string=re.compile('-')))
print(soup.select('p')[0].select('a'))  # CSS selector, nested element selection
print(soup.select('p')[0].get_text())  # CSS selector, text of the selected element
4.3 pyquery 以西安购房登记信息的爬取为例
from pyquery import PyQuery
# Load and parse the page straight from the URL (needs network access).
doc = PyQuery(url='http://124.115.228.93/zfrgdjpt/jggs.aspx?qy=00&yxbh=0000000180')
# .items() is a generator yielding each matched <td> as its own PyQuery object.
for i in doc('td').items():
    print(i.text(), end=" ")
# Second pass: split the combined cell text on whitespace and tidy the output.
for item in doc('td').text().split():
    print(item, end=' ')
    if item.isdecimal():
        # All-digit token: end the current line. print('\n') emits the literal
        # newline plus print's own, so a blank line follows.
        print('\n')
tr = doc('.yxdjmdTable')  # select the node(s) with this class
tr.find('span').remove()  # drop the <span> descendants before reading the text
print(tr.text())