安装:pip3 install pyquery
初始化
字符串初始化
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html) #声明一个pyquery对象
print(doc("li")) #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
运行结果:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
URL初始化
from pyquery import PyQuery as pq #引入pyquery
doc = pq(url="http://www.baidu.com") #声明一个pyquery对象,传入一个链接也是一样的
print(doc("head")) #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
运行结果:
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/>......
font-size:13px;text-decoration:underline}</style></head>
文件初始化
from pyquery import PyQuery as pq #引入pyquery
doc = pq(filename="demo.html") #声明一个pyquery对象,传入一个文件;filename填写目标文件的路径,文件在当前目录下时只写文件名即可
print(doc("li")) #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
#这里没有实际准备demo.html文件;如果文件内容与第一个例子中的html相同,运行结果与字符串初始化的结果一致
基本CSS选择器
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
print(doc("#container .list li")) #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
运行结果:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
查找元素
子元素
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
items = doc(".list") #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
print(type(items))
print(items)
lis = items.find('li') #find方法查找当前标签内部满足条件的所有标签
print(type(lis))
print(lis)
运行结果:
①:
<class 'pyquery.pyquery.PyQuery'>
②:
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
③:
<class 'pyquery.pyquery.PyQuery'>
④:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
items = doc(".list") #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
lis = items.children() #children()表示查找所有的直接子元素
print(type(lis))
print(lis)
运行结果:
①:
<class 'pyquery.pyquery.PyQuery'>
②:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
items = doc(".list") #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
lis = items.children('.active') #children()表示查找所有的直接子元素;还可以传入一个参数,参数也是CSS选择器
print(lis) #例如这里查找的是class包含active的直接子元素(li标签)
运行结果:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
父元素
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
items = doc(".list") #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
lis = items.parent() #.parent()可以获取父元素
print(type(lis))
print(lis)
运行结果:
①:
<class 'pyquery.pyquery.PyQuery'>
②:
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
items = doc(".list") #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
lis = items.parents() #.parents()查找所有的祖先节点,先从最外层开始
print(lis)
运行结果:
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div><div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
items = doc(".list") #doc传入一个选择器;选择器就是CSS选择器;选id用#号,选class用.,选标签名什么都不加
lis = items.parents('.wrap') #也可以传入一个CSS选择器,根据传入的选择器再次进行筛选
print(lis)
运行结果:
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
兄弟元素
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
li = doc('.list .item-0.active') #.item-0.active中间没有空格表示并列,即同一个元素的class属性中必须同时包含item-0和active
print(li.siblings()) #siblings()获取所有的兄弟元素,不分前后
运行结果:
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
li = doc('.list .item-0.active') #.item-0.active中间没有空格表示并列,即同一个元素的class属性中必须同时包含item-0和active
print(li.siblings('.active')) #siblings()获取所有的兄弟元素,不分前后;此方法也可以传入CSS选择器对兄弟元素再次进行筛选
运行结果:
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
遍历
单个元素
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
li = doc('.item-0.active') #.item-0.active中间没有空格表示并列,即同一个元素的class属性中必须同时包含item-0和active
print(li) #查找到单个元素
运行结果:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
多个元素
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
lis = doc('li').items() #.items()返回一个生成器,可以用for循环遍历,遍历得到的每一项都是PyQuery对象;如果不调用.items()而直接遍历,得到的是lxml的元素对象,print时不会显示HTML代码
print(type(lis))
for li in lis: #查找的是多个元素,需要进行遍历
    print(li)
运行结果:
<class 'generator'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
获取信息
获取属性
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
lis = doc('.item-0.active a')
print(lis)
print(lis.attr('href')) #.attr此为获取属性的方法,传入的参数是属性的名称
print(lis.attr.href) #结果与上面一致,只是方法不同,都可以使用
运行结果:
①:<a href="link3.html"><span class="bold">third item</span></a>
②:link3.html
③:link3.html
获取文本
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
lis = doc('.item-0.active a')
print(lis)
print(lis.text()) #.text()获取所选标签(这里是a标签)内部的所有文本内容
运行结果:
①:<a href="link3.html"><span class="bold">third item</span></a>
②:third item
获取HTML
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
lis = doc('.item-0.active')
print(lis.html()) #获取lis下面的所有html代码,不包含自己本身
运行结果:
<a href="link3.html"><span class="bold">third item</span></a>
DOM操作:节点的操作、动态修改代码
addClass、removeClass
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
li = doc('.item-0.active') #选中一个li标签
print(li)
li.removeClass('active') #removeClass表示移除掉一个class
print(li)
li.addClass('active') #addClass表示添加一个class,追加到该元素已有的class属性中
print(li)
运行结果:
①:<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
②:<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
③:<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
attr、css
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
li = doc('.item-0.active') #选中一个li标签
print(li)
li.attr('name','link') #attr:第一个参数为属性名称,第二个参数为属性值;如果之前没有这个属性则新增该属性;如果之前有,则更改其值为新传入的值
print(li)
li.css('font_size','14px') #css:设置style属性;第一个参数为样式名,第二个参数为样式值,以键值对的形式写入style属性;样式名中的下划线会被转换为连字符,所以font_size在结果中显示为font-size
print(li)
运行结果:
①:<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
②:<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
③:<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
remove
html = '''
<div class="wrap">
Hello,World
<p>This is a paragraph.</p>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text()) #返回div下面所有的文本
wrap.find('p').remove() #调用find方法找到p标签,再调用remove方法删除p标签
print(wrap.text())
运行结果:
①:
Hello,World
This is a paragraph.
②:
Hello,World
其他DOM方法
https://pyquery.readthedocs.io/en/latest/api.html
伪类选择器
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq #引入pyquery
doc = pq(html)
li = doc('li:first-child') #获取作为其父元素第一个子元素的li标签
print(li)
li = doc('li:last-child') #获取最后一个标签
print(li)
li = doc('li:nth-child(2)') #获取顺位指定的标签,这里表示获取第二个标签;序号从1开始
print(li)
li = doc('li:gt(2)') #获取序号大于n的标签,这里表示获取序号大于2的标签(即第四个及以后);序号从0开始
print(li)
li = doc('li:nth-child(2n)') #获取位置为2n的标签,即所有处于偶数位(第2、4、……个)的标签;序号从1开始
print(li)
li = doc('li:contains(second)') #查找某个文本的标签,这里表示查找包含second文本的标签
print(li)
运行结果:
①:
<li class="item-0">first item</li>
②:
<li class="item-0"><a href="link5.html">fifth item</a></li>
③:
<li class="item-1"><a href="link2.html">second item</a></li>
④:
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
⑤:
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
⑥:
<li class="item-1"><a href="link2.html">second item</a></li>
更多CSS选择器可以查看
http://www.w3school.com.cn/css/index.asp