BeautifulSoup4
bs4简介
bs4是一个可以从html或者xml文件中提取数据的网页信息提取库
导航 查找 修改
安装:先执行 pip install lxml,再执行 pip install bs4
bs4的使用
# Basic bs4 usage
from bs4 import BeautifulSoup

# Sample page fragment used by the examples below.
# NOTE(review): the <title> text looks mis-decoded (mojibake); it is kept
# verbatim here -- confirm against the original page before relying on it.
html_doc = """
<html >
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>时尚气质唯美女生头åƒ_微信头åƒ_我è¦ä¸ªæ€§ç½‘</title>
<link rel="stylesheet" type="text/css" href="/source/css/common.css?1567576618.css" />
<link rel="stylesheet" type="text/css" href="/source/css/swipebox.css" />
<script href="http://www.baidu.com" type="text/javascript" charset="utf-8" src="/source/js/jquery.1.9.js"></script>
<script href="http://www.qq.com" type="text/javascript" charset="utf-8" src="/source/js/ZeroClipboard.js"></script>
<script href="http://www.52bqg.com" type="text/javascript" charset="utf-8" src="/source/js/common.js"></script>
</head>
"""

# Parse the document with the lxml parser.
bs = BeautifulSoup(html_doc, features='lxml')
# print(bs.prettify())
# print(bs.title)
# print(bs.title.name)  # the tag's name
# print(bs.title.string)  # the tag's text content
# print(bs.script)
# r = bs.find_all('script')
# print(len(r))
# print(r)

# Print the link address stored in the href attribute of every <script> tag.
links = bs.find_all('script')
for link in links:
    # print(link)
    print(link.get('href'))
bs4对象的种类
对象的种类
Tag
NavigableString
BeautifulSoup
Comment
# Kinds of objects
# Tag              an HTML/XML tag
# NavigableString  a navigable string
# BeautifulSoup    the bs (whole-document) object
# Comment          a comment
# print(type(bs)) # <class 'bs4.BeautifulSoup'>
# print(type(bs.title)) # <class 'bs4.element.Tag'>
# print(type(bs.meta)) # <class 'bs4.element.Tag'>
# print(type(bs.script)) # <class 'bs4.element.Tag'>
# print(bs.p.string) # 憨憨
# print(type(bs.title.string)) # <class 'bs4.element.NavigableString'>
# NOTE: despite its name, title_tag is bound to bs.p (a <p> lookup), not <title>.
# NOTE(review): the sample outputs below assume a document containing
# <p>憨憨</p>; the html_doc shown earlier in these notes has no <p> -- confirm.
title_tag = bs.p
# print(title_tag) # <p>憨憨</p>
# print(type(title_tag)) # <class 'bs4.element.Tag'>
# print(title_tag.name) # p
# print(title_tag.string) # 憨憨
#
# # Look up the value of the tag's class attribute
# print(title_tag['class'])
# Re-bind bs to a tiny document whose <a> tag contains only an HTML comment.
html_comment = '<a><!--注释憨憨--></a>'
bs = BeautifulSoup(html_comment,'lxml')
# Print the comment text held by <a> ...
print(bs.a.string)
# ... and its type (per the bs4 docs this should be bs4.element.Comment).
print(type(bs.a.string))
遍历子节点
遍历树 子节点
children
descendants
# Traversing the tree: child nodes
# First locate a Tag
# print(bs.title)
# print(bs.p)
# print(bs.a)
#
# all_p = bs.find_all('p')
# print(all_p)
# contents / children
# contents returns a list; children is an iterator
# Iteration (Java, C, Python): while loops, for...in..., iterable objects
# links = bs.contents
# print(type(links))
# print(links)
# html = '''
# <div>
# <a href ='#'>百度</a>
# <a href ='#'>腾讯</a>
# <a href ='#'>阿里</a>
# </div>
# '''
# # Goal: get the data under the <div> tag
# bs = BeautifulSoup(html,features='lxml')
# # links = bs.contents
# # print(type(links))
# # print(links)
#
# # links = bs.children # bs.children iterates over the direct children of this element
# # print(type(links))
#
# links = bs.div.children
# # print(list(links))
#
# for link in links:
#     print(link,end='')
# descendants: a generator (so it has no len()) over every descendant of the
# element -- children, grandchildren, and so on.
# NOTE(review): the original note calls the order "breadth-first"; iteration
# appears to follow document order instead -- confirm against the bs4 docs.
# print(len(bs.contents)) # 1
# print(len(bs.descendants)) # TypeError: object of type 'generator' has no len()
# for i in bs.descendants:
#     print('---------------')
#     print(i)
# string: the text inside a tag
# title_tag = bs.title
# print(title_tag)
# print(title_tag.string)
# head_tag = bs.title
# print(head_tag.string)
#
# print(bs.html.string)
# strings / stripped_strings
# strings = bs.strings
# stripped_strings yields the document's text fragments with surrounding
# whitespace removed (see the bs4 docs for .strings and .stripped_strings).
strings = bs.stripped_strings
for s in strings:
    print(s)
遍历树遍历父节点
遍历树 父节点
parent
next_sibling 和 previous_sibling
next_siblings 和 previous_siblings
# Traversing the tree: parent nodes
# parent / parents
# First get a title node
# title_tag = bs.title
# print(title_tag)
# print(title_tag.parent)
# print(bs.html.parent)
# a_tag = bs.a
# print(a_tag)
# print(a_tag.parent)
# for p in a_tag.parent:
#     print(p)
#     print('-----------------')
# next_sibling / previous_sibling and next_siblings / previous_siblings
# next_sibling: the next sibling node; previous_sibling: the previous sibling node
# next_siblings: all following siblings; previous_siblings: all preceding siblings
# html_doc = '<a><b>abc</b><c>bcd</c></a>'
# bs = BeautifulSoup(html_doc,'lxml')
# # print(bs.prettify())
# b_tag = bs.b
# # print(b_tag)
# # print(b_tag.next_sibling)
#
# c_tag = bs.c
# print(c_tag.next_sibling)
# print(c_tag.previous_sibling)
# a_tag = bs.a
# # print(a_tag)
# # print(a_tag.next_sibling)
# for x in a_tag.next_siblings:
#     print(x)

# Look up the element with id="link" and print each sibling that precedes it.
# find() returns None when nothing matches, so this demo assumes the current
# document really contains such an element.
a_tag = bs.find(id='link')
# print(a_tag)
for x in a_tag.previous_siblings:
    print(x)
搜索树
字符串的过滤器
# String filter: pass a tag name as a plain string
# Find the first <a> tag
a_tag = bs.find('a')
# Find all <a> tags
a_tags = bs.find_all('a')
print(a_tags)
正则表达式过滤器
import re
# Goal: find the page's <title>...</title> tag.
# The compiled pattern is matched against tag names.
print(bs.find(re.compile('title')))
找t打头的标签
# bs4 applies a regex filter with search(), so an unanchored 't' matches any
# tag name that merely contains "t" (e.g. html). Anchor with ^ to get only
# tag names that start with "t".
print(bs.find(re.compile(r'^t')))
列表过滤器
# List filter: find all <p> tags and all <a> tags
print(bs.find_all(['p','a']))
True过滤器
# True filter: per the bs4 docs, True matches every tag (but not text strings)
print(bs.find_all(True))
方法过滤器
# Function filter
# Keep tags whose name is longer than one character
def filter(tag):
    """Return True when the tag's name has more than one character.

    The argument only needs a ``name`` attribute holding a string.

    NOTE: this name shadows the built-in ``filter``; it is kept unchanged so
    existing references to the function keep working.
    """
    return len(tag.name) > 1
# Pass the predicate itself to find_all; it is called once per tag.
print(bs.find_all(filter))
find_all()方法
name: tag的名称
attrs:标签的属性
recursive:是否递归搜索
text: 文本的内容
limit:限制返回的条数
kwargs:关键字参数
# name: the tag name
# a_tags = bs.find_all('a')
# a_tags = bs.find_all('p','还是憨憨')
#
# print(a_tags)
# Keyword arguments
# a_tags = bs.find_all(id='link')
# print(a_tags)
# import re
#
# print(bs.find_all(id=re.compile('link')))
# text argument
# import re
# print(bs.find_all(text=re.compile('憨憨')))
# limit: cap the number of results returned
# print(bs.find_all('a',limit=1))
# print(bs.a)
# print(bs.find('a'))
# recursive: whether to search recursively
# First look up the <a> tags
# print(bs.find_all('a')) # finds all <a> tags
# NOTE: per the bs4 docs recursive=True is the default, so this call behaves
# like the one above; recursive=False would only consider direct children.
print(bs.find_all('a',recursive=True)) # # finds all <a> tags
find()方法
find_parent() find_parents()
# print(bs.find_all('a',limit=1))
# print(bs.find('a'))
# find_parent() find_parents()
# 找一个title节点
# title_tag = bs.title
# print(title_tag.find_parent('head'))
# s = bs.find(text='憨憨')
# print(s.find_parents('p'))
搜索兄弟节点 find_next_siblings() find_next_sibling()
# Searching siblings: find_next_siblings() / find_next_sibling()
# Start from the first <a> tag in the document
a_tag = bs.a
# print(a_tag)
# The next <a> sibling
# print(a_tag.find_next_sibling('a'))
# All following <a> siblings
print(a_tag.find_next_siblings('a'))
搜索兄弟节点find_previous_siblings() find_previous_sibling()
# find_previous_siblings() / find_previous_sibling()
# Start from the element whose id is "link"
a_tag = bs.find(id='link')
# print(a_tag)
# The closest preceding sibling
# print(a_tag.find_previous_sibling())
# All preceding siblings
print(a_tag.find_previous_siblings())
修改文档树
修改文档树
# Modifying the document tree
# 1. Change a tag's name and attributes
# 2. Change .string
# 3. append()
# 4. new_string() / new_tag()
# 5. decompose()  ** important: deletion
# Find the first element whose class is '还是憨憨'
r = bs.find(class_='还是憨憨')
# print(r)
# Remove that element from the tree, then print the remaining document
r.decompose()
print(bs)