beautifulsoup4-CSDN博客

本文详细介绍BeautifulSoup4的安装与基本使用方法，涵盖标签选择器、标准选择器及CSS选择器等核心功能，并提供实例帮助理解。

环境为：

Python3.6
windows
pycharm2017.2.4

安装：

# 安装beautifulsoup4
pip install beautifulsoup4

# 安装解析器
pip install lxml

# 另一个可供选择的解析器是纯Python实现的 html5lib，html5lib的解析方式与浏览器相同
pip install html5lib

基本使用

           # Basic usage: Beautiful Soup tolerates malformed input. The sample below
           # is incomplete HTML (no closing </body> or </html>), yet it still parses
           # into a BeautifulSoup object that can be printed with standard indentation.
           from bs4 import BeautifulSoup

           html_doc = """
           <html><head><title>The Dormouse's story</title></head>
           <body>
           <p class="title"><b>The Dormouse's story</b></p>
           <p class="story">Once upon a time there were three little sisters; and their names were
           <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
           <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
           <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
           and they lived at the bottom of a well.</p>
           <p class="story">...</p>
           """

           soup = BeautifulSoup(html_doc, 'lxml')  # parsing is error-tolerant
           res = soup.prettify()  # normalise indentation for a structured display
           print(res)

标签选择器

即直接通过标签名字选择，选择速度快；如果存在多个相同的标签，则只返回第一个。

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>

<p>first tag</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie<i>this i tag</i></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# Tag selector: reach a tag by using its name as an attribute of the soup.
# Each example is left commented out, followed by the value it prints.

# Select a tag by name
# print(soup.head)  # <head><title>The Dormouse's story</title></head>

# Get the name of a tag
# print(soup.p.name)  # p

# When several tags share the same name, only the first one is returned
# print(soup.p)  # <p>first tag</p>

# Get the content of a tag
# print(soup.p.string)  # first tag
# print(soup.a.string)  # None
# print(soup.p.text)  # first tag
# print(soup.a.text)  # Elsiethis i tag
# print(soup.a.contents)  # ['Elsie', <i>this i tag</i>]

# Note:
#   .contents lists every direct child of the selected tag, child tags included.
#   .string   returns the text only when the tag has a single child; when it has
#             several (the first <a> holds text plus an <i>), it returns None.
#   .text     returns all the text inside the tag, including text of child tags.

# Nested selection
# print(soup.head.title.string)  # The Dormouse's story
# print(soup.body.a.contents)  # ['Elsie', <i>this i tag</i>]
# print(soup.body.a.text)  # Elsiethis i tag
# print(soup.body.a.string)  # None
# print(soup.body.p.string)  # first tag

# Children and descendants
# print(soup.contents)  # list of the document's top-level nodes, which together contain the whole page
# print(soup.p.contents)  # ['first tag']
# print(soup.p.children)  # an iterator over all direct children of the tag
# print(list(soup.a.children))  # ['Elsie', <i>this i tag</i>]
# print(soup.p.descendants)  # <generator object descendants at 0x00000162FFB9D570>
# print(list(soup.a.descendants))  # every node nested under the first <a>: ['Elsie', <i>this i tag</i>, 'this i tag']
# for i, child in enumerate(soup.p.descendants):
#     print(i, child)  # 0 first tag

# Parent and ancestors
# print(soup.a.parent)  # the parent of the first <a>, i.e. the <p class="story"> element
# print(soup.a.parents)  # <generator object parents at 0x0000022F8747D570>
# print(list(soup.a.parents))  # every ancestor of the first <a>, walking up to the root of the document

# Siblings
# print(soup.a.next_siblings)  # generator object <generator object next_siblings at 0x000002418B9BD570>
# print(list(soup.a.next_siblings))

beautifulsoup4标签选择器

View Code

标准选择器

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>

<p>first tag</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie<i id="i1" class="i1">this i tag</i></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# Standard selectors: find() returns the first match, find_all() returns a list.
# Each example is left commented out, followed by the value it prints.

# Search by tag name
# print(soup.find_all('a'))  # every <a> tag in the document
# print(soup.find_all('a', id='link2'))  # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# print(soup.find(id='link2'))  # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# print(soup.find_all(attrs={"class": "sister"}))  # every tag whose class is "sister" (here the three <a> tags)
# print(soup.find_all(class_='sister'))   # same result: every tag whose class is "sister"
# Note: in soup.find_all(class_='sister') the keyword is class_ (trailing underscore)
# because class is a reserved word in Python; inside attrs={...} the plain "class" key is fine.

# Nested search
# print(soup.find_all('a')[0].find('i'))  # the <i> inside the first <a>: <i class="i1" id="i1">this i tag</i>


# Search by attribute
# print(soup.a.find_all(attrs={'id':'i1'}))  # [<i class="i1" id="i1">this i tag</i>]
# print(soup.a.find_all(attrs={"class":'i1'})) # [<i class="i1" id="i1">this i tag</i>]
# print(soup.find_all(id='i1'))  # [<i class="i1" id="i1">this i tag</i>]

# Search by text content: the match is exact (==), not a substring test (in)
# NOTE: Beautiful Soup 4.4.0+ calls this argument `string`; `text` is the older name.
# print(soup.p.find_all(text='first tag'))  # ['first tag']

beautifulsoup4标准选择器

View Code

CSS选择器

# Beautiful Soup provides the select() method for querying with CSS selectors.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
    <b>The Dormouse's story</b>
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
        <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    <div class='panel-1'>
        <ul class='list' id='list-1'>
            <li class='element'>Foo</li>
            <li class='element'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
        <ul class='list list-small' id='list-2'>
            <li class='element'><h1 class='yyyy'>Foo</h1></li>
            <li class='element xxx'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
    </div>
    and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# 1. CSS selectors
# Class selector, scoped to the first <p> tag
print(soup.p.select('.sister'))

# Selectors evaluated against the whole document, printed in this order
document_wide_selectors = (
    '.sister span',          # descendant selector under a class
    '#link1',                # id selector
    '#link1 span',           # descendant selector under an id
    '#list-2 .element.xxx',  # element carrying both classes, inside #list-2
)
for css in document_wide_selectors:
    print(soup.select(css))

# select() can be chained on a result, although a single selector would suffice
second_list = soup.select('#list-2')[0]
print(second_list.select('.element'))

# 2. Read the attributes, then 3. the text, of the <h1> inside #list-2
heading = soup.select('#list-2 h1')[0]
print(heading.attrs)
print(heading.get_text())

CSS选择器