import re

import requests
from bs4 import BeautifulSoup
# Create a BeautifulSoup object from a live HTTP response.
# The timeout keeps the script from blocking forever if the server never answers.
r = requests.get("https://www.baidu.com", timeout=10)
r.encoding = "utf-8"  # decode the response body as UTF-8 when reading r.text
soup = BeautifulSoup(r.text, "html.parser")
print(soup)
print(soup.title)  # first <title> tag, markup included
print(soup.title.text)  # text inside that tag
print(type(soup.title))
print(type(soup))
# Create a BeautifulSoup object from a local HTML file.
# The with-block closes the file handle once the document has been parsed.
with open("d:\\2019\\sohu\\0.html", encoding="utf-8") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")
print(soup.title)
# Pretty-printed HTML output (uncomment to dump the formatted document):
#print(soup.prettify())
# Tag analysis
# The timeout keeps the script from blocking forever if the server never answers.
r = requests.get("https://www.baidu.com", timeout=10)
r.encoding = "utf-8"  # decode the response body as UTF-8 when reading r.text
soup = BeautifulSoup(r.text, "html.parser")
print(soup.title)  # first <title> tag
print(soup.head)  # first <head> tag
print(soup.p)  # first <p> tag
print(soup.a)  # first <a> tag
print(type(soup.a))
print(soup.a.name)  # name of the first <a> tag itself, prints 'a'
print(soup.a.attrs)  # all attributes of the first <a> tag, as a dict
# Example output:
# {'href': 'http://news.baidu.com', 'name': 'tj_trnews', 'class': ['mnav']}
for attr_name, attr_value in soup.a.attrs.items():
    print(attr_name, attr_value)
print(soup.a["href"])  # subscript access to an attribute
print(soup.a.get("href"))  # .get() access to the same attribute
soup.a["name"] = "baidu_news"  # overwrite an attribute value
print(soup.a["name"])
print(soup.a)
del soup.a["name"]  # remove the attribute from the tag
print(soup.a)
print(soup.a.get("name", 1))  # attribute is gone, so the default 1 is printed
# NavigableString (tag content) analysis
title_tag = soup.title  # first <title> tag of the current document
title_text = title_tag.text
title_string = title_tag.string  # text content of the first <title> tag
print(title_text)
print(title_string)
print(type(title_text))  # <class 'str'>
print(type(title_string))  # <class 'bs4.element.NavigableString'>
title_tag.string = "百度一下,你也不知道"  # replace the tag's content
print(title_tag.string)
# Structural analysis of an HTML document
# The with-block closes the file handle once the document has been parsed.
with open("d:\\2019\\sohu_html.html", encoding="utf-8") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")
# Uncomment to dump the formatted document:
#print(soup.prettify())
# Child nodes
print(soup.head.contents)  # direct children of <head>, as a list
print(type(soup.head.contents))  # list
# .children yields the same direct children lazily instead of as a list
children = soup.ul.children
print(children)
for child in children:
    print(child)
# Descendant nodes (children, grandchildren, ...)
# NOTE(review): the original indentation was lost; the separator is assumed
# to be printed after every descendant — confirm.
for node in soup.ul.descendants:
    print(node)
    print("**************")
# Text content of a single node versus multiple nodes
print(soup.body.string)  # None: <body> holds several nodes, so no single string can be chosen
print(soup.title.string)  # unambiguous: one string can be determined
print(soup.body.strings)  # prints the iterator object itself, not the strings
# NOTE(review): the original indentation was lost; the "*" separator is
# assumed to be printed after every string — confirm.
for text in soup.body.strings:
    print(text)
    print("************")
for text in soup.body.stripped_strings:  # blank lines are stripped out
    print(text)
# NOTE(review): the "#" separator is assumed to close the section once,
# after the loop — confirm.
print("##############")
# Parent nodes
print(soup.div.parent)
print(soup.body.parent)
content = soup.title.string
print(content)
print(content.parent)  # the tag that holds the string
print(content.parent.parent)
print(content.parent.parent.parent)
print("-----------------")
# Walk every ancestor of the title string, printing its name and counting them.
# NOTE(review): the original indentation was lost; the separator and the
# increment are assumed to run per ancestor, and the total to print once.
count = 0
for ancestor in content.parents:
    print(ancestor.name)
    print("*****************")
    count += 1
print(count)
# Sibling nodes
# next_sibling is applied twice because the immediate next sibling is
# whitespace (per the original author's note about this document).
print(soup.title.next_sibling.next_sibling)
print((soup.title.next_sibling.next_sibling).name)
# NOTE(review): the original indentation was lost; the separators below are
# assumed to be printed after every sibling — confirm.
for sibling in soup.link.next_siblings:
    print(sibling)
    print("*************")
# previous_sibling walks in the opposite direction
print(soup.title.previous_sibling.previous_sibling)  # the sibling before
for sibling in soup.title.previous_siblings:
    print(sibling)
    print("***************")
# Next / previous elements
print(soup.head.next_element.next_element)
print(soup.title.previous_element.previous_element)
print("###############")
# NOTE(review): the original indentation was lost; the separator is assumed
# to be printed after every element — confirm.
for element in soup.head.next_elements:
    print(element)
    print("***********")
print(soup.title.previous_elements)  # prints the iterator object itself, not the elements
# Searching the document tree
# The with-block closes the file handle once the document has been parsed.
with open("d:\\2019\\sohu_html.html", encoding="utf-8") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")
# find_all
# Filter by string: the tag name "link"
print(soup.find_all("link"))
# Filter by regular expression: the pattern ^l (names beginning with "l")
# NOTE(review): the original indentation was lost; the separator is assumed
# to be printed after every match — confirm.
for tag in soup.find_all(re.compile(r'^l')):
    print(tag)
    print("*************")
# Filter by list: the names "head" and "body"
print(soup.find_all(["head","body"]))