1.BeautifulSoup对象的创建
作用：从 HTML / XML 文档中提取数据。
# 1. Import the module
from bs4 import BeautifulSoup

# 2. Create a BeautifulSoup object from a markup string, using the lxml parser
soup = BeautifulSoup('<html>data</html>', 'lxml')

# Printing the soup shows the markup after the parser has auto-corrected it
print(soup)
2.BeautifulSoup对象的find方法
# Step 1: bring BeautifulSoup into scope
from bs4 import BeautifulSoup

# Step 2: the sample document that the lookups below operate on
html = """ <html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""

# Step 3: parse the document with the lxml parser
soup = BeautifulSoup(markup=html, features='lxml')

# Step 4: look up the <title> tag (find returns the first match)
title = soup.find(name='title')
# print(title)
find 方法可以按三种方式进行查找：1. 标签名 2. 属性 3. 文本内容。
Tag 对象对应于 XML / HTML 文档中的标签，它的常用属性有：name、attrs、text。
# I. Search by tag name
# 5. Find the first <a> tag
a = soup.find('a')  # `a` is a Tag object exposing name, attrs and text
# print(a)
# Find all <a> tags
a_s = soup.find_all('a')  # returns a list of matching tags
# print(a_s)

# II. Search by attribute
# Find the tag whose id is "link1"
# Option 1: pass the attribute as a keyword argument
a = soup.find(id='link1')
# print(a)
# Option 2: pass a dict of attributes through `attrs`
a = soup.find(attrs={'id': 'link1'})
# print(a)

# III. Search by text content
# `string=` is the current keyword; `text=` is its deprecated pre-4.4.0 name
text = soup.find(string='Elsie')
print(text)
# Inspect the Tag object held in `a`: the tag itself, its Python type,
# and then its three commonly used attributes, one at a time.
print('a: ', a)
tag_type = type(a)
print('a的type: ', tag_type)

tag_name = a.name  # the tag's name
print('标签名: ', tag_name)

tag_attrs = a.attrs  # all attributes of the tag
print('标签所有属性: ', tag_attrs)

tag_text = a.text  # the text inside the tag
print('标签文本内容:', tag_text)