# Walk-through of common BeautifulSoup lookups on a small HTML sample:
# attribute-style tag access, find()/find_all() by name, id and class,
# and regular-expression filters on tag names and attribute values.
import re

from bs4 import BeautifulSoup as bs

# Sample document. Two deliberate quirks drive the demos below:
#   * <title> contains both text and an <a> child, so .string is None;
#   * the first link's href is "exampledcom" (no dot), which separates the
#     unescaped-dot regex from the escaped-dot regex at the end.
# Backslash continuations keep the text on a single logical line.
html_doc = """ <html><head><title>The Dormouse's story<a>hello</a></title></head> \
<body> \
<p class="title"><b>The Dormouse's story</b></p> \
<p class="story">Once upon a time there were three little sisters; and their names were \
<a href="http://exampledcom/elsie" class="sister" id="link1">Elsie</a>, \
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and \
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; \
and they lived at the bottom of a well.</p> \
<p class="story">...</p> """

soup = bs(html_doc, "html.parser")

# Optional debugging output, disabled by default:
# print(soup.prettify())
# print(soup.a)

print(soup.title)
# .string is None here because <title> has more than one child.
print(soup.title.string)  # result: None
# get_text() concatenates the text of all descendants.
print(soup.title.get_text())  # result: The Dormouse's storyhello

# Attribute-style access returns the first matching tag.
print(soup.p)
print(soup.p['class'])

# find_all() returns a list-like result; iterate over it with a for loop.
print(soup.find_all('a'))
for link in soup.find_all('a'):
    print(link.string)

# Get the tag whose id is "link2".
print(soup.find(id='link2'))
# Get the text of the tag whose id is "link2".
print(soup.find(id='link2').get_text())

# Get the <p> tag whose class is "title". `class` is a Python keyword, so
# the filter is passed as an attribute dict instead of a keyword argument.
print(soup.find("p", {"class": "title"}))
print(soup.find("p", {"class": "story"}).get_text())

# Find every tag whose name starts with "b".
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

# Find every tag whose name contains "t".
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

# An unescaped "." in a regular expression matches any character, so this
# pattern also matches the "exampledcom" link.
dat = soup.find_all("a", href=re.compile(r"^http://example.com/"))
print(dat)

# Escaping it as "\." matches a literal dot only.
data = soup.find_all("a", href=re.compile(r"^http://example\.com/"))
print(data)

# Reference: https://blog.youkuaiyun.com/qq_29883591/article/details/52984181?locationNum=2&fps=1
# For details, see: https://blog.youkuaiyun.com/qq_29883591/article/details/52984181?locationNum=2&fps=1