什么是pyQuery
pyquery库是 jQuery 的 Python 实现,能够以jQuery的语法来操作解析 HTML 文档,易用性和解析速度都很好
安装
pip3 install pyquery
注意:由于 pyquery 依赖于 lxml ,要先安装 lxml ,否则会提示失败。
pip3 install lxml
PyQuery方法
方法名 | 方法实现的结果 |
---|---|
.html()和.text() | 获取相应的 HTML 块或者文本内容 |
selector | 通过选择器来获取目标内容 |
.eq(index) | 根据索引号获取指定元素(index 从 0 开始) |
.find() | 根据 CSS 选择器查找嵌套(后代)元素 |
.filter() | 根据 class、id 筛选指定元素 |
.attr() | 获取、修改属性值 |
.items() | 遍历标签(逐个返回 PyQuery 对象,可用于 for 循环) |
案例
from pyquery import PyQuery
import requests
#pip install lxml
class CollegateRank(object):
    """Download a college listing page and extract one record per school.

    The page is fetched with ``requests``, saved to ``page.html`` and parsed
    with ``PyQuery``; every parsed record is printed as a dict.
    """

    def get_page_data(self, url):
        """Fetch *url*, save the HTML to ``page.html`` and parse it.

        Nothing is written or parsed when the request does not yield a
        non-empty body (see ``send_request``).

        Args:
            url: Address of the listing page to download.
        """
        response = self.send_request(url=url)
        if not response:
            return
        # NOTE(review): 'gbk' presumably matches the target site's page
        # encoding; text that cannot be encoded as gbk will raise
        # UnicodeEncodeError here - confirm against the site.
        with open('page.html', 'w', encoding='gbk') as file:
            file.write(response)
        self.parse_page_data(response)

    def parse_page_data(self, response):
        """Parse the listing HTML with PyQuery and print each school record.

        Args:
            response: HTML text of the listing page.

        Returns:
            A list with one dict per ``dl`` entry, holding the keys ``url``,
            ``icon``, ``name``, ``adress``, ``tese``, ``type``, ``belong``,
            ``level`` and ``weburl``. Each dict is also printed.
        """
        pq = PyQuery(response)
        # find() selects descendants by CSS selector; filter() narrows the
        # current selection by id or class.  This chain is the step-by-step
        # spelling of the single selector 'div.scores_List dl'.
        ranks = pq.find('div').filter('.scores_List').find('dl')

        schools = []
        for dl in ranks.items():
            # Query the detail list items once per entry and index into the
            # result instead of re-running the selector for every field.
            details = dl('dd ul li')
            school_info = {
                'url': dl('dt a').eq(0).attr('href'),
                'icon': dl('dt a img').attr('src'),
                'name': dl('dt strong a').text(),
                'adress': details.eq(0).text(),
                'tese': details.eq(1).find('span').text(),
                'type': details.eq(2).text(),
                'belong': details.eq(3).text(),
                'level': details.eq(4).text(),
                'weburl': details.eq(5).text(),
            }
            print(school_info)
            schools.append(school_info)
        return schools

    def extract_first(self, data=None, defalut=None):
        """Return the first element of *data*, or *defalut* if there is none.

        Args:
            data: Any indexable sequence, or ``None``.
            defalut: Value returned when *data* is ``None`` or empty.
                (The parameter name is kept as-is for keyword callers.)

        Returns:
            ``data[0]`` when *data* has at least one element, else *defalut*.
        """
        # Guard against None explicitly: it is the declared default, and
        # len(None) would raise TypeError.
        if data is None or len(data) == 0:
            return defalut
        return data[0]

    def send_request(self, url, headers=None, timeout=10):
        """Send a GET request and return the body text on HTTP 200.

        Args:
            url: Address to request.
            headers: Optional request headers; a desktop Chrome User-Agent
                is sent when omitted or empty.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the caller indefinitely.

        Returns:
            The response text when the status code is 200, otherwise
            ``None``.

        Raises:
            requests.exceptions.RequestException: On connection errors or
                when the timeout expires (propagated from ``requests``).
        """
        headers = headers if headers else {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        response = requests.get(url=url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
if __name__ == '__main__':
    # Script entry point: scrape the college listing page, which saves the
    # HTML to page.html and prints every parsed school record.
    start_url = 'http://college.gaokao.com/schlist/'
    scraper = CollegateRank()
    scraper.get_page_data(start_url)