from lxml import etree
"""
1.读取html字符串
"""
# Sample fragment for section 1. Note that the last two <td> cells have no
# closing tag (presumably to show the HTML parser coping with broken markup).
text = """
<tr class="hots">
<td class="1">hot1</td>
<td class="2">hot2</td>
<td class="3">hot3</td>
<td class="4">hot4</td>
<td class="5">hot5
<td class="6">爬虫
</tr>
"""

# Build an element tree from the string and show the type of the result.
html = etree.HTML(text)
print(type(html))

# Serialize the tree to UTF-8 bytes, then decode them to text for printing.
serialized = etree.tostring(html, encoding='utf8')
result = serialized.decode('utf8')
print(result)

# 2. Parse an HTML file directly.
#    [etree.parse uses the XML parser by default]
# Hand etree.parse an explicit HTMLParser so the file is read as HTML.
html_path = r"/Users/dx/Desktop/(凡博)Python爬虫资料/3.Python爬虫数据提取-Xpath语法/课程资料/test.html"
parser = etree.HTMLParser(encoding='utf8')
html = etree.parse(html_path, parser=parser)

# Serialize the parsed document to UTF-8 bytes and decode it for printing.
serialized = etree.tostring(html, encoding='utf8')
result = serialized.decode('utf8')
print(result)
# Sample job-listing markup used by the XPath examples below: one <ul>/<li>
# holding a header <div id="top"> followed by ten row divs whose ids alternate
# between "even" and "odd". Each row has five <span>s: a link carrying the job
# title, then category, headcount, location and publish date.
text = \
"""
<ul class="ullist" padding="1" spacing="1">
<li>
<div id="top">
<span class="position" width="350">职位名称</span>
<span>职位类别</span>
<span>人数</span>
<span>地点</span>
<span>发布时间</span>
</div>
<div id="even">
<span class="l square">
<a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">python开发工程师</a>
</span>
<span>技术类</span>
<span>2</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
<div id="odd">
<span class="l square">
<a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">python后端</a>
</span>
<span>技术类</span>
<span>2</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
<div id="even">
<span class="l square">
<a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">高级Python开发工程师</a>
</span>
<span>技术类</span>
<span>2</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
<div id="odd">
<span class="l square">
<a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">python架构师</a>
</span>
<span>技术类</span>
<span>1</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
<div id="even">
<span class="l square">
<a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">Python数据开发工程师</a>
</span>
<span>技术类</span>
<span>1</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
<div id="odd">
<span class="l square">
<a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">高级图像算法研发工程师</a>
</span>
<span>技术类</span>
<span>1</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
<div id="even">
<span class="l square">
<a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">高级AI开发工程师</a>
</span>
<span>技术类</span>
<span>4</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
<div id="odd">
<span class="l square">
<a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">后台开发工程师</a>
</span>
<span>技术类</span>
<span>1</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
<div id="even">
<span class="l square">
<a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">Python开发(自动化运维方向)</a>
</span>
<span>技术类</span>
<span>1</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
<div id="odd">
<span class="l square">
<a target="_blank" href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">Python数据挖掘讲师 </a>
</span>
<span>技术类</span>
<span>1</span>
<span>上海</span>
<span>2018-10-23</span>
</div>
</li>
</ul>
"""
from lxml import etree
# Parse the sample markup held in `text` into an element tree.
html = etree.HTML(text)

# 1. Select every <div> element (node selection).
#    A node-selecting XPath query such as this one hands back a list.
divs = html.xpath('//div')
print(divs)

# Print each matched element as markup, followed by a separator line.
separator = "*" * 60
for node in divs:
    print(etree.tostring(node, encoding='utf8').decode('utf8'))
    print(separator)

# 2. Select one specific <div> (using a predicate).
# XPath positions are 1-based: //div[2] matches a <div> that is the second
# <div> child of its parent. Keep only the first such match.
second_divs = html.xpath('//div[2]')
div = second_divs[0]
markup = etree.tostring(div, encoding='utf8')
print(markup.decode('utf8'))

# 3. Select every <div> whose id attribute is 'even'.
# The [@id="even"] predicate filters elements by attribute value.
even_query = '//div[@id="even"]'
divs = html.xpath(even_query)

# Print each matched element as markup, followed by a separator line.
separator = "*" * 60
for node in divs:
    print(etree.tostring(node, encoding='utf8').decode('utf8'))
    print(separator)

# 4. Read the value of an attribute from the matched tags.
# A trailing @name step yields attribute values rather than elements.
id_query = '//div/@id'
href_query = '//a/@href'

# id of every <div> in the document.
divs = html.xpath(id_query)
print(divs)

# Link target of every <a> in the document.
hrefs = html.xpath(href_query)
print(hrefs)

# 5. Collect all the job information held inside the <div> elements.
def _first(node, query):
    """Return the first result of *query* evaluated on *node*, or None.

    The result is stripped of surrounding whitespace. Indexing
    ``node.xpath(query)[0]`` directly raises IndexError as soon as one row
    lacks the element; returning None keeps the remaining rows usable.
    """
    matches = node.xpath(query)
    return matches[0].strip() if matches else None


# The first <div> (id="top") is the column-header row, so it is skipped.
divs = html.xpath('//div')[1:]

# Build one dict per job row. The queries are relative (leading "."), so each
# one only searches inside the current row. Span positions follow the header
# row: 2 = category, 3 = headcount, 4 = location, 5 = publish date.
works = []
for div in divs:
    work = {
        "url": _first(div, './/a/@href'),
        "position": _first(div, './/a/text()'),
        "work_type": _first(div, './/span[2]/text()'),
        "nums": _first(div, './/span[3]/text()'),
        "area": _first(div, './/span[4]/text()'),
        "time": _first(div, './/span[5]/text()'),
    }
    works.append(work)