XPath parsing: the most commonly used, most convenient, and most efficient parsing approach, and also the most general-purpose one.
- How XPath parsing works:
    - 1. Instantiate an etree object and load the page source to be parsed into it.
    - 2. Call the etree object's xpath method with an XPath expression to locate tags and capture their content.
- Environment setup:
    - pip install lxml
- How to instantiate an etree object: from lxml import etree
    - 1. Load the source of a local HTML file into the etree object:
        etree.parse(filePath)
    - 2. Load page source fetched from the internet into the object:
        etree.HTML(page_text)  # pass the page-source string itself, not the quoted literal 'page_text'
    - xpath('XPath expression'): returns a list of every match.
- XPath expressions (each rule is demonstrated in the sketch after this list):
    - /: locates from the root node; each / denotes one level of the hierarchy.
    - //: denotes any number of levels; matching can start from any position in the document.
    - Attribute-based locating: //div[@class='song'], i.e. tag[@attrName="attrValue"]
    - Index-based locating: //div[@class="song"]/p[3]; indexing starts at 1, not 0.
    - Extracting text:
        - /text() gets only the text placed directly inside the tag
        - //text() gets all text inside the tag, including descendants (non-direct text)
    - Extracting attributes:
        /@attrName  ==>  //img/@src
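A minimal sketch tying the rules above together. The HTML string is hypothetical and exists only so each expression has something to match; for a local file you would use etree.parse(filePath, etree.HTMLParser()) instead of etree.HTML:

from lxml import etree

# Hypothetical page used purely for illustration
html = '''
<html><body>
  <div class="song">
    <p>first</p><p>second</p><p>third</p>
    <a href="#"><span>nested text</span></a>
    <img src="demo.jpg" alt="demo"/>
  </div>
</body></html>'''
tree = etree.HTML(html)

print(tree.xpath('/html/body/div'))                    # / : one level per slash, from the root
print(tree.xpath('//div[@class="song"]'))              # attribute-based locating
print(tree.xpath('//div[@class="song"]/p[3]/text()'))  # index starts at 1 -> ['third']
print(tree.xpath('//div[@class="song"]/a/text()'))     # direct text only -> [] (the text sits in a nested span)
print(tree.xpath('//div[@class="song"]/a//text()'))    # all descendant text -> ['nested text']
print(tree.xpath('//img/@src'))                        # attribute extraction -> ['demo.jpg']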
Scraping housing listing titles and prices
from lxml import etree
import requests

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    url = "https://sh.5i5j.com/ershoufang?pmf_group=baidu&pmf_medium=ocpc&pmf_plan=%E4%BA%8C%E6%89%8B%E6%88%BF%E5%93%81%E7%89%8C%E8%AF%8D&pmf_unit=%E5%93%81%E7%89%8C-%E7%9B%B8%E5%85%B3&pmf_keyword=%E6%88%91%E7%88%B1%E6%88%91%E5%AE%B6%E4%BA%8C%E6%89%8B%E6%88%BF%E7%BD%91&pmf_account=41&pmf_id=59219375325&gio_link_id=Yo1vYMX9&jzl_kwd=59219375325&jzl_ctv=15355652966&jzl_mtt=2&jzl_adt=cl2&jzl_ch=11&jzl_act=5274835&jzl_cpg=73143408&jzl_adp=2398666619&jzl_sty=0&jzl_dv=1&bd_vid=10937938088197617014"
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # First locate the li tags, one per listing
    li_list = tree.xpath('//ul[@class="pList"]/li')
    # the with-statement closes the file automatically
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            # Local (relative) parsing: the leading ./ keeps the search inside this
            # li tag; writing /h3 instead would restart from the document root
            title = li.xpath('.//h3/a/text()')[0]
            value = li.xpath('.//div[@class="jia"]//strong/text()')[0]
            print(title, value)
            fp.write(title + value + '\n')
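xpath() returns a list, so indexing with [0] raises IndexError on any listing that lacks the expected tag. A minimal defensive variant of the loop body, assuming the same li_list and open file handle as above:

for li in li_list:
    titles = li.xpath('.//h3/a/text()')
    prices = li.xpath('.//div[@class="jia"]//strong/text()')
    if not titles or not prices:
        continue  # skip listings missing a title or a price
    fp.write(titles[0] + prices[0] + '\n')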
Scraping 4K images
from lxml import etree
import requests
import os

if __name__ == "__main__":
    if not os.path.exists('./4k'):
        os.makedirs('./4k')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    url = "http://pic.netbian.com/4kdongman/"
    response = requests.get(url=url, headers=headers)
    # Avoid mojibake: apparent_encoding guesses the real encoding so the body
    # can be re-decoded correctly
    response.encoding = response.apparent_encoding
    page_text = response.text
    tree = etree.HTML(page_text)
    # First locate the li tags, one per thumbnail
    li_list = tree.xpath('//ul[@class="clearfix"]/li')
    for li in li_list:
        img_url_half = li.xpath('./a/img/@src')[0]
        img_url = "http://pic.netbian.com/" + img_url_half
        # Fetch the image binary (.content, not .text)
        img_content = requests.get(url=img_url, headers=headers).content
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        with open('./4k/' + img_name, 'wb') as fp:
            fp.write(img_content)
        print(img_name)
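For larger files it is easier on memory to stream the body instead of buffering it all in .content. A sketch of the same download step using requests' stream/iter_content interface, with img_url, img_name, and headers as above:

import requests

with requests.get(url=img_url, headers=headers, stream=True) as resp:
    with open('./4k/' + img_name, 'wb') as fp:
        # write the body in 8 KB chunks instead of one large bytes object
        for chunk in resp.iter_content(chunk_size=8192):
            fp.write(chunk)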
Scraping free templates (rar archives)
from lxml import etree
import requests
import os
import time

def down_load(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    # Avoid mojibake: apparent_encoding guesses the real encoding
    response.encoding = response.apparent_encoding
    page_text = response.text
    tree = etree.HTML(page_text)
    # The commented-out xpath returned an empty list, possibly because part of
    # that class value is added by JavaScript after the page loads and is
    # therefore absent from the raw HTML; the id-based path is used instead
    # div_list = tree.xpath('//div[@class="box col3 ws_block masonry-brick"]')
    div_list = tree.xpath('//div[@id="main"]/div/div/a')
    for div in div_list:
        # Extract the detail-page link and the template name
        href = "https:" + div.xpath('./@href')[0]
        name = div.xpath('./img/@alt')[0]
        print(href, name)
        # Visit the detail page
        response_download = requests.get(url=href, headers=headers)
        response_download.encoding = response_download.apparent_encoding
        page_download_text = response_download.text
        tree_download = etree.HTML(page_download_text)
        # Extract the candidate rar download links (one per mirror)
        down_load_list = tree_download.xpath('//div[@class="clearfix mt20 downlist"]/ul[@class="clearfix"]/li/a/@href')
        # Download the first mirror and write it to disk
        try:
            rar_file = requests.get(url=down_load_list[0], headers=headers).content
            # time.sleep(1)  # optional throttle between downloads
            with open("./muban/" + name + ".rar", "wb") as fp:
                fp.write(rar_file)
            print("success!!")
        except Exception:
            print("failed")

if __name__ == "__main__":
    if not os.path.exists('./muban'):
        os.makedirs('./muban')
    for page in range(1, 3):
        url = "https://aspx.sc.chinaz.com/query.aspx?keyword=免费&classID=864&page=%d" % page
        down_load(url)
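time is imported above but only used in a commented-out sleep. A light pause between result pages is a common courtesy toward the server; a minimal sketch of the same driver loop with a throttle added:

import time

for page in range(1, 3):
    url = "https://aspx.sc.chinaz.com/query.aspx?keyword=免费&classID=864&page=%d" % page
    down_load(url)
    time.sleep(1)  # pause between result pages to avoid hammering the server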
Summary: rar files, like images, are binary data, so they must be written from response.content (bytes), not response.text.
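A minimal sketch of the text/content distinction; the URL is hypothetical:

import requests

resp = requests.get('https://example.com/file.rar')  # hypothetical binary URL
as_text = resp.text      # str: body decoded with a guessed encoding; fine for HTML
as_bytes = resp.content  # bytes: the exact payload; required for images, rar, etc.

with open('file.rar', 'wb') as fp:  # binary mode 'wb' pairs with .content
    fp.write(as_bytes)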