文本内容
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
获取标签
多个标签的情况下只返回第一个标签
# Parse Index.html with the lxml parser. BeautifulSoup reads the whole file
# while constructing the tree, so the handle can be closed right afterwards
# instead of being left open for the rest of the session.
with open("Index.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")
>>> soup.title
<title>The Dormouse's story</title>
>>> soup.title.name
'title'
>>> soup.title.string
"The Dormouse's story"
获取父标签
>>> soup.title.parent
<head><title>The Dormouse's story</title></head>
>>> soup.title.parent.name
'head'
>>> soup.title.parent.string
"The Dormouse's story"
只返回第一个标签
>>> soup.p
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>
>>> soup.p.name
'p'
>>> soup.p.string
"The Chapter 1 of the Dormouse's story"
>>> soup.p.parent
<body>
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p></body>
>>> soup.a
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.a.name
'a'
>>> soup.a.string
'Elsie'
获取标签的属性值
>>> soup.p["class"]
['title']
>>> soup.a["href"]
'http://example.com/elsie'
>>> soup.a["id"]
'link1'
修改和删除标签的属性值
soup.a["href"] = "how can I see you ?"
print(soup.a["href"])
# how can I see you ?
del soup.a["href"]
print(soup.a)
# <a class="sister" id="link1">Elsie</a>
根据属性查找标签
>>> soup.find(href="http://example.com/elsie")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.find(id="link2")
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
>>> soup.find(class_="sister")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
find_all() 查找所有标签
>>> soup.find_all("title")
[<title>The Dormouse's story</title>]
>>> soup.find_all("p")
[<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>,
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>,
<p class="story">...</p>]
>>> soup.find_all("a")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# Print the id, class list and href of every <a> tag in the document.
for anchor in soup.find_all("a"):
    print(anchor["id"], "--", anchor["class"], "--", anchor["href"])
"""
link1 -- ['sister'] -- http://example.com/elsie
link2 -- ['sister'] -- http://example.com/lacie
link3 -- ['sister'] -- http://example.com/tillie
"""
soup.get_text()
"""
The Dormouse's story
The Chapter 1 of the Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
三个对象
>>> type(soup)
<class 'bs4.BeautifulSoup'>
>>> type(soup.p)
<class 'bs4.element.Tag'>
>>> type(soup.p.string)
<class 'bs4.element.NavigableString'>
数组获取子标签
>>> soup.head.contents
[<title>The Dormouse's story</title>]
>>> soup.body.contents[3].contents
['Once upon a time there were three little sisters; and their names were\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, ',\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
>>> soup.body.contents
"""
['\n', <p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>]
"""
children获取子标签
print(soup.body.children," ",type(soup.body.children))
# <list_iterator object at 0x0000026C0D7EDBD0> <class 'list_iterator'>
p_3 = soup.body.contents[3].contents
print(p_3)
a_1=p_3[1]
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# The text inside a tag is itself a child node.
for node in a_1.children:
    print(node)  # Elsie
for item in a_1.contents:  # a_1.contents == ['Elsie']
    print(item)  # Elsie
descendants获取子孙标签
# Iterating .children of <head> prints only the <title> tag.
for direct_child in soup.head.children:
    print(direct_child)
# <title>The Dormouse's story</title>
# Iterating .descendants also prints the text nested inside <title>.
for node in soup.head.descendants:
    print(node)
# <title>The Dormouse's story</title>
# The Dormouse's story
CSS选择
1. 标签名查找
print(soup.select("title")) #[<title>The Dormouse's story</title>]
print(soup.select("b")) #[<b>The Chapter 1 of the Dormouse's story</b>]
2. 类名查找
print(soup.select(".sister"))
"""
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
"""
3. id名查找
print(soup.select("#link1"))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
4. 组合查找
print(soup.select("p #link3"))
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
子标签查找
print(soup.select("p > #link3"))
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
把pascal voc(.xml)格式转为yolo(.txt)格式
单个文件转换
from bs4 import BeautifulSoup
import os
def pasvcal_voc_to_yolo(xml_file,class_mapping,output_path):
    """Convert one Pascal VOC XML annotation file into a YOLO label file.

    Each <object> whose <name> is a key of ``class_mapping`` becomes one line
    ``<class_index> <x_center> <y_center> <width> <height>``, with all four
    box values divided by the image size read from <size>. Objects whose
    name is not in ``class_mapping`` are skipped. The label file is written
    to ``output_path`` under the XML file's base name with a ``.txt``
    extension.

    Args:
        xml_file: Path of the Pascal VOC annotation file to read.
        class_mapping: Mapping from class name to the class index written
            at the start of each label line.
        output_path: Directory that receives the ``.txt`` file; it is
            created if it does not exist yet.
    """
    # Binary mode lets the XML parser honour the encoding declared in the
    # document instead of decoding with the platform's default encoding.
    with open(xml_file, "rb") as xml_handle:
        soup = BeautifulSoup(xml_handle, "xml")

    # float() accepts both "640" and "640.0"; for integer strings the
    # resulting ratios are identical to the ones computed from int().
    width = float(soup.size.width.string)
    height = float(soup.size.height.string)

    yolo_format = []
    for obj in soup.find_all("object"):
        # obj.name is the Tag's own name attribute, so the <name> child has
        # to be looked up with find() (obj.find("name").text works as well).
        class_name = obj.find("name").string
        if class_name not in class_mapping:
            continue
        class_index = class_mapping[class_name]

        # Equivalent: float(obj.find('bndbox').find('xmin').text)
        bndbox = obj.bndbox
        xmin = float(bndbox.xmin.string)
        ymin = float(bndbox.ymin.string)
        xmax = float(bndbox.xmax.string)
        ymax = float(bndbox.ymax.string)

        x_center = (xmin + xmax) / 2 / width
        y_center = (ymin + ymax) / 2 / height
        bbox_width = (xmax - xmin) / width
        bbox_height = (ymax - ymin) / height
        yolo_format.append(f"{class_index} {x_center} {y_center} {bbox_width} {bbox_height}")

    # splitext swaps only the real extension. A plain str.replace would leave
    # "0.XML" or an extension-less name unchanged, so the label file could
    # overwrite the source annotation when both share a directory.
    stem = os.path.splitext(os.path.basename(xml_file))[0]
    if output_path:
        os.makedirs(output_path, exist_ok=True)
    output_filename = os.path.join(output_path, stem + ".txt")
    with open(output_filename, "w", encoding="utf-8") as label_file:
        label_file.writelines(f"{label}\n" for label in yolo_format)
# Example run: convert ./0.xml and write the label file into the current
# working directory.
class_mapping = {"car": 0}  # class name -> class index used in the label file
dist = os.getcwd()
src = "./0.xml"
pasvcal_voc_to_yolo(xml_file=src, class_mapping=class_mapping, output_path=dist)