Method 1: XPath
from lxml import etree
import requests
from requests import exceptions
def get_response(url, headers=None, timeout=None):
    """
    Fetch a URL and return the requests.Response object, or None on failure.
    An assert fails when its condition is false, e.g.:
    assert response.status_code == 200, 'Request error, status code: %s' % str(response.status_code)
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except exceptions.RequestException as e:
        # Timeout, ConnectTimeout, ReadTimeout, HTTPError and ProxyError are all
        # subclasses of RequestException, so a single handler covers them.
        # Returning inside a finally block (as in the original) would swallow any
        # re-raised exception, so here the failure is reported and None is returned.
        print('Request failed: %s' % e)
        response = None
    return response
def get_content(etree_html, xpath):
    result = []
    content = etree_html.xpath(xpath)  # xpath() always returns a list
    for each in content:
        if each:                       # skip empty matches
            result.append(each)
    return result
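
A minimal usage sketch of the two helpers above; the target URL, headers and XPath expression here are placeholders for illustration, not values from the original article:

if __name__ == '__main__':
    url = 'https://example.com'              # placeholder URL
    headers = {'User-Agent': 'Mozilla/5.0'}  # simple UA header for the request
    response = get_response(url, headers=headers, timeout=10)
    if response is not None and response.status_code == 200:
        etree_html = etree.HTML(response.text)           # parse HTML into an lxml tree
        links = get_content(etree_html, '//a/text()')    # illustrative XPath: all link texts
        print(links)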

This article introduces four ways to extract web page elements in Python 3: XPath, regular expressions, and BeautifulSoup's find/find_all and select methods. It also provides an example of matching URLs and links to related resources.
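
As a rough illustration of the regular-expression approach mentioned above (the pattern and sample HTML below are placeholders, not the article's own example):

import re

html = '<a href="https://example.com/page1">link</a>'
# re.findall returns every non-overlapping match; the capture group keeps only the URL
urls = re.findall(r'href="(https?://[^"]+)"', html)
print(urls)  # ['https://example.com/page1']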