|
1
|
|
[code language="python"]
### select 传入tag标签
1. soup.select("title")
2. soup.select("p")
### 通过tag标签逐层查找
1. soup.select("body a")
2. soup.select("html head title")
### 找到某个tag标签下的直接子标签
1. soup.select("p > a")
2. soup.select("p > #link1")
### 找到兄弟节点标签:
1. soup.select("#link1 ~ .sister")
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
2. soup.select("#link1 + .sister")
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
###通过CSS的类名查找:
1. soup.select(".sister")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
2. soup.select("[class~=sister]")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
###通过tag的id查找:
1. soup.select("#link1")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
2. soup.select("a#link2")
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
###通过是否存在某个属性来查找:
soup.select('a[href]')
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
###通过属性的值来查找(CSS 属性选择器,并非正则表达式:= 完全匹配,^= 前缀匹配,$= 后缀匹配,*= 包含匹配):
1. soup.select('a[href="http://example.com/elsie"]')
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
2. soup.select('a[href^="http://example.com/"]')
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
3. soup.select('a[href$="tillie"]')
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
4. soup.select('a[href*=".com/el"]')
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
#实例:::
# coding: utf-8
"""Demo: query a downloaded HTML page with BeautifulSoup CSS selectors.

@author: songhao
@software: PyCharm
@file: demo.py
@time: 2017/7/5 5:26 PM
"""
import re
import sys

import requests
from bs4 import BeautifulSoup as bs4

if sys.version_info[0] < 3:
    # Python 2 only: switch the interpreter's default encoding to UTF-8,
    # presumably so that printing non-ASCII page text does not fail.
    # On Python 3 this hack is unnecessary and `reload` is not a builtin.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")

# NOTE(review): the URL is blank in the original listing (it looks like it
# was stripped when the post was published). requests rejects an empty URL,
# so set this to the page you want to scrape before running.
PAGE_URL = ""

# A timeout keeps the script from hanging forever on an unresponsive server.
html = requests.get(PAGE_URL, timeout=10).content
soup = bs4(html, 'lxml')

# Every <a> tag on the page.
links = soup.select('a')
for link in links:
    # Text of the tag: get_text() returns all text inside the tag, while
    # .string is None when the tag has more than one child.
    print(link.get_text())
    print(link.string)
    # Link target; an <a> without an href attribute raises KeyError.
    try:
        href = link['href']
    except KeyError:
        pass
    else:
        print(href)

# Every <img> tag on the page.
images = soup.select('img')
for img in images:
    # Image source; an <img> without a src attribute raises KeyError.
    try:
        src = img['src']
    except KeyError:
        pass
    else:
        print(src)

# All <p> tags nested inside elements with the class "article-content".
articles = soup.select('.article-content ')
for article in articles:
    for paragraph in article.select('p'):
        print(paragraph)

# All <img> tags that carry a src attribute, then each src value.
images_with_src = soup.select('img[src]')
print(images_with_src)
for img in images_with_src:
    print(img['src'])

# Attribute value starts with the given prefix.
print(soup.select('img[src^="http://qiniu."]'))
# Attribute value ends with the given suffix.
print(soup.select('img[src$=".jpg"]'))
# Attribute value contains the given substring (the original listing
# repeated this statement twice; the duplicate is dropped).
print(soup.select('div[class*="crayon"]'))
[/code]
常用的css选择器 http://www.168seo.cn/python/23660.html
56万+

被折叠的 条评论
为什么被折叠?



