from bs4 import BeautifulSoup
# Demo 1: extract the plain text from an HTML fragment.
s = '<h1>123</h1> <span>456<span>'
soup = BeautifulSoup(s, 'html.parser')
print(soup.text)  # Output: 123 456

# Demo 2: build the summary from the first 150 characters of the
# tag-stripped text, followed by an ellipsis.
# NOTE(review): `content` was never defined in the original snippet; the
# sample fragment stands in for it here. Presumably the real value is the
# submitted article HTML -- confirm against the caller.
content = s
soup = BeautifulSoup(content, "html.parser")
desc = soup.text[0:150] + "..."

# Demo 3: guard against XSS by filtering out <script> tags.
# NOTE: this only removes <script> elements; other vectors (e.g. inline
# event-handler attributes) are not touched by this loop.
# The literal uses double quotes outside because it contains single quotes.
s = "<h1>123</h1> <span>456<span> <script>alert('66')</script>"
# Re-parse so the loop works on the fragment that contains the script tag.
soup = BeautifulSoup(s, "html.parser")
for tag in soup.find_all():  # no arguments: iterate over every tag
    print(tag.name)
    if tag.name == "script":
        tag.decompose()  # remove the tag (and its contents) from the tree