#__author__ = 'DouYunQian'
#coding=utf-8
import re
from bs4 import BeautifulSoup

# Sample document used by every lookup below. Note that the literal has no
# closing </body> or </html> tag.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story2">...</p>
"""

soup = BeautifulSoup(html, "html.parser")

# --- Navigating by tag name -------------------------------------------------
print(soup.title)  # <title>The Dormouse's story</title>
print(soup.title.string)  # The Dormouse's story
print(soup.title.parent)  # <head><title>The Dormouse's story</title></head>
print(soup.p)  # first <p> tag only: the one with class="title"
print(soup.a)  # first <a> tag only: id="link1", whose body is an HTML comment
print(soup.p['class'])  # ['title']

# --- Searching with find() / find_all() -------------------------------------
print(soup.find_all("a"))  # list containing every <a> tag
print(soup.find(id="link2"))  # the <a> tag whose id is "link2"
# .string is only useful when the tag holds a single piece of text; once other
# tags are nested inside, prefer get_text().
print(soup.find(id="link2").string)  # Lacie
print(soup.find(id="link2").get_text())  # Lacie
print(soup.find("p", class_="title"))  # <p class="title" name="dromouse">...</p>
print(soup.find("p", {"class": "story2"}))  # <p class="story2">...</p>
# get_text() collects the text inside the tag no matter how many child tags
# it contains.
print(soup.find("p", {"class": "story"}).get_text())
print("===================")

# A compiled regex is matched against tag names: here the names that start
# with "b", i.e. <body> and <b>.
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
print("=============")

# Collect every tag whose href attribute matches the pattern (the three <a>
# links in this document).
# NOTE(review): all_href is not printed or otherwise used in this section.
all_href = soup.find_all(href=re.compile("http://example.com/.+"))
# NOTE(review): from here on the file repeats the Beautiful Soup walkthrough
# that precedes it, statement for statement, so every result is printed a
# second time. It is kept so the script's output is unchanged; consider
# deleting this copy. (An encoding declaration that used to sit on this line
# had no effect, since one is only honoured on the first two lines of a file.)
import re
from bs4 import BeautifulSoup

# Sample document used by every lookup below. Note that the literal has no
# closing </body> or </html> tag.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story2">...</p>
"""

soup = BeautifulSoup(html, "html.parser")

# --- Navigating by tag name -------------------------------------------------
print(soup.title)  # <title>The Dormouse's story</title>
print(soup.title.string)  # The Dormouse's story
print(soup.title.parent)  # <head><title>The Dormouse's story</title></head>
print(soup.p)  # first <p> tag only: the one with class="title"
print(soup.a)  # first <a> tag only: id="link1", whose body is an HTML comment
print(soup.p['class'])  # ['title']

# --- Searching with find() / find_all() -------------------------------------
print(soup.find_all("a"))  # list containing every <a> tag
print(soup.find(id="link2"))  # the <a> tag whose id is "link2"
# .string is only useful when the tag holds a single piece of text; once other
# tags are nested inside, prefer get_text().
print(soup.find(id="link2").string)  # Lacie
print(soup.find(id="link2").get_text())  # Lacie
print(soup.find("p", class_="title"))  # <p class="title" name="dromouse">...</p>
print(soup.find("p", {"class": "story2"}))  # <p class="story2">...</p>
# get_text() collects the text inside the tag no matter how many child tags
# it contains.
print(soup.find("p", {"class": "story"}).get_text())
print("===================")

# A compiled regex is matched against tag names: here the names that start
# with "b", i.e. <body> and <b>.
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
print("=============")

# Collect every tag whose href attribute matches the pattern (the three <a>
# links in this document).
# NOTE(review): all_href is not printed or otherwise used in the visible part
# of the file.
all_href = soup.find_all(href=re.compile("http://example.com/.+"))