# -*- coding:utf-8 -*-
"""Small BeautifulSoup demo: find tags in an inline HTML snippet.

The script parses a hard-coded HTML document and prints:
  1. every ``<a>`` tag (name, text, href),
  2. the ``<a>`` whose href is exactly ``http://www.baidu.com``,
  3. the first ``<a>`` whose href matches the regex ``moo``,
  4. the first ``<script>`` tag (name and text content).
"""
import re  # regular expressions, used for the href pattern lookup below

from bs4 import BeautifulSoup

html_doc = '''
<html>
<head>
    <script>
        location.replace(location.href.replace("https://","http://"));
    </script>
</head>
<body>
    <noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
    <a href="https://www.imooc.com/video/10687">happy</a>
    <a href="http://www.baidu.com">baidu</a>
</body>
</html>
'''

# html_doc is already a str, so no from_encoding is passed: BeautifulSoup
# ignores that argument for Unicode markup and emits a warning about it.
soup = BeautifulSoup(html_doc, 'html.parser')

# 1. All anchor tags in document order.
links = soup.find_all('a')
for link in links:
    print(link.name, link.get_text(), link["href"])

# 2. Anchor selected by an exact href value ('特定' = "specific").
print('特定')
link = soup.find('a', href='http://www.baidu.com')
print(link.name, link.get_text(), link["href"])

# 3. Anchor selected by a regex searched against href
#    ('正则匹配' = "regex match").
print('正则匹配')
link = soup.find('a', href=re.compile(r"moo"))
print(link.name, link.get_text(), link["href"])

# 4. First <script> tag ('获取script' = "get script").
print('获取script')
script = soup.find('script')
print(script.name, script.get_text())