from urllib.request import urlopen
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup


def getTitle(url):
    """Fetch *url* and return the first ``<h1>`` tag inside its ``<body>``.

    Args:
        url: Address of the HTML page to download.

    Returns:
        The BeautifulSoup tag object for the first ``<h1>`` under
        ``<body>``, or ``None`` when the page cannot be fetched
        (``HTTPError`` / ``URLError``), when the document has no
        ``<body>``, or when the body contains no ``<h1>``.
    """
    # Guard against a missing page (HTTP error) or an unreachable server.
    try:
        response = urlopen(url)
    except (HTTPError, URLError):
        return None

    # Close the connection as soon as the payload has been parsed.
    with response:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and to avoid bs4's warning).
        bs_obj = BeautifulSoup(response.read(), "html.parser")

    # ``bs_obj.body`` is None when there is no <body>; accessing ``.h1`` on it
    # raises AttributeError. A body without an <h1> simply yields None.
    try:
        return bs_obj.body.h1
    except AttributeError:
        return None


if __name__ == "__main__":
    title = getTitle("http://blog.sina.com.cn/s/blog_de7bd4850102xgad.html")
    if title is None:
        print("title could not be found")
    else:
        print(title)
爬虫入门_抓取html页面元素
最新推荐文章于 2025-06-24 09:00:56 发布