beautifulsoup4的使用-CSDN博客

本文链接：https://blog.csdn.net/m0_46306264/article/details/143244536

文本内容

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>

获取标签

多个标签的情况下只返回第一个标签

# Parse inside a ``with`` block so the file handle is closed once the
# document has been read, instead of being left open.
with open("Index.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

>>> soup.title
<title>The Dormouse's story</title>
>>> soup.title.name
'title'
>>> soup.title.string
"The Dormouse's story"

获取父标签

>>> soup.title.parent
<head><title>The Dormouse's story</title></head>
>>> soup.title.parent.name
'head'
>>> soup.title.parent.string
"The Dormouse's story"

只返回第一个标签

>>> soup.p
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>
>>> soup.p.name
'p'
>>> soup.p.string
"The Chapter 1 of the Dormouse's story"

>>> soup.p.parent
<body>
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p></body>

>>> soup.a
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.a.name
'a'
>>> soup.a.string
'Elsie'

获取标签的属性值

>>> soup.p["class"]
['title']
>>> soup.a["href"]
'http://example.com/elsie'
>>> soup.a["id"]
'link1'

修改和删除标签的属性值

soup.a["href"] = "how can I see you ?" 
print(soup.a["href"])
# how can I see you ?

del soup.a["href"] 
print(soup.a)
# <a class="sister" id="link1">Elsie</a>

根据属性查找标签

>>> soup.find(href="http://example.com/elsie")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.find(id="link2")
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

>>> soup.find(class_="sister")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

find_all() 查找所有标签

>>> soup.find_all("title")
[<title>The Dormouse's story</title>]

>>> soup.find_all("p")
[<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>, 
 <p class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</p>, 
<p class="story">...</p>]

>>> soup.find_all("a")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

for link in soup.find_all("a"):
    print(link["id"],"--",link["class"],"--", link["href"])
"""
link1 -- ['sister'] -- http://example.com/elsie
link2 -- ['sister'] -- http://example.com/lacie 
link3 -- ['sister'] -- http://example.com/tillie
"""

soup.get_text()
"""
The Dormouse's story

The Chapter 1 of the Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""

三个对象

>>> type(soup)
<class 'bs4.BeautifulSoup'>

>>> type(soup.p)
<class 'bs4.element.Tag'>

>>> type(soup.p.string)
<class 'bs4.element.NavigableString'>

数组获取子标签

>>> soup.head.contents
[<title>The Dormouse's story</title>]

>>> soup.body.contents[3].contents
['Once upon a time there were three little sisters; and their names were\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, ',\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']

>>> soup.body.contents
"""
['\n', <p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>]
"""

children获取子标签

print(soup.body.children," ",type(soup.body.children)) 
# <list_iterator object at 0x0000026C0D7EDBD0>   <class 'list_iterator'>

p_3 = soup.body.contents[3].contents
print(p_3)
a_1=p_3[1]
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# 标签内的文本也是子节点
for child in a_1.children:
    print(child) # Elsie

for cc in a_1.contents:   # a_1.contents==['Elsie']
    print(cc)    # Elsie

descendants获取子孙标签

for child in soup.head.children:
    print(child)
# <title>The Dormouse's story</title>

for dd in soup.head.descendants:
    print(dd)
# <title>The Dormouse's story</title>
# The Dormouse's story

CSS选择

1. 标签名查找

print(soup.select("title"))  # [<title>The Dormouse's story</title>]
print(soup.select("b"))      # [<b>The Chapter 1 of the Dormouse's story</b>]

2. 类名查找

print(soup.select(".sister")) 
"""
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
"""

3. id名查找

print(soup.select("#link1"))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

4. 组合查找

print(soup.select("p #link3"))
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

子标签查找

print(soup.select("p > #link3"))
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

把pascal voc(.xml)格式转为yolo(.txt)格式

单个文件转换

from bs4 import BeautifulSoup

import os 

def pasvcal_voc_to_yolo(xml_file,class_mapping,output_path):
    """Convert one Pascal VOC annotation (.xml) into a YOLO label file (.txt).

    Every <object> whose <name> is a key of ``class_mapping`` becomes one line
    ``<class_index> <x_center> <y_center> <width> <height>``, where the box
    values are divided by the image width/height read from <size>.

    Args:
        xml_file: Path of the Pascal VOC XML file to read.
        class_mapping: Dict mapping class names to YOLO class indices; objects
            with any other name are skipped.
        output_path: Directory that receives the label file. The file is named
            after ``xml_file`` with its extension replaced by ``.txt``.

    Returns:
        None. No file is written when no object matches ``class_mapping``.
    """
    # Binary mode hands the raw bytes to the parser so the encoding is taken
    # from the document itself rather than from the platform's locale default.
    with open(xml_file, "rb") as xml_handle:
        soup = BeautifulSoup(xml_handle, "xml")

    width = int(soup.size.width.string)
    height = int(soup.size.height.string)

    yolo_format = []
    for obj in soup.find_all("object"):
        # ``obj.name`` is the tag's own name (a plain str), so ``obj.name.string``
        # raises; the <name> child has to be looked up with find().
        # ``obj.find("name").text`` works as well.
        class_name = obj.find("name").string
        if class_name not in class_mapping:
            continue
        class_index = class_mapping[class_name]

        # float() accepts both "12" and "12.5"; for integer-valued boxes the
        # results below are the same as with int().
        # Equivalent lookup: obj.find('bndbox').find('xmin').text
        box = obj.bndbox
        xmin = float(box.xmin.string)
        ymin = float(box.ymin.string)
        xmax = float(box.xmax.string)
        ymax = float(box.ymax.string)

        x_center = (xmin + xmax) / 2 / width
        y_center = (ymin + ymax) / 2 / height
        bbox_width = (xmax - xmin) / width
        bbox_height = (ymax - ymin) / height
        yolo_format.append(f"{class_index} {x_center} {y_center} {bbox_width} {bbox_height}")

    if not yolo_format:
        return

    # splitext swaps only the real extension, so a name such as "a.XML" or
    # "a.xml.bak" can never map back onto an unchanged (source) file name.
    txt_filename = os.path.splitext(os.path.basename(xml_file))[0] + ".txt"
    output_filename = os.path.join(output_path, txt_filename)

    # Write once, after all objects are collected, instead of rewriting the
    # whole file for every matching object.
    with open(output_filename, "w") as label_handle:
        label_handle.writelines(f"{label}\n" for label in yolo_format)
                 
# VOC class name -> YOLO class index used in the generated label file.
class_mapping = dict(car=0)

# Annotation to convert, and the directory that receives the .txt output
# (the current working directory).
dist = os.getcwd()
src = "./0.xml"

pasvcal_voc_to_yolo(xml_file=src, class_mapping=class_mapping, output_path=dist)