pip install beautifulsoup4
Parse the HTML and extract the hyperlinks
from bs4 import BeautifulSoup  # document-parsing library used to parse the HTML
import urllib.request  # needed for urlopen below

# Parse the HTML at the given URL and collect its hyperlinks
def parserHtml(url, finishedContents):
    # collection of links found on the page
    result = []
    try:
        response = urllib.request.urlopen(url)
        string = response.read()
        html = string.decode('utf-8')
        # parse the page into a document object
        soup = BeautifulSoup(html, 'html.parser')
        # invalid URL 1
        invalidLink1 = '#'
        # invalid URL 2
        invalidLink2 = 'javascript:void(0)'
        # find every <a> tag in the document
        for k in soup.find_all('a'):
            # read the href attribute
            link = k.get('href')
            # skip tags without an href
            if link is not None:
                # filter out invalid links
                if link == invalidLink1:
                    pass
                elif link == invalidLink2:
                    pass
                elif link.find("javascript:") != -1:
                    pass
                # filter out links that have already been processed
                elif link in finishedContents:
                    pass
                else:
                    result.append(link)
        result.sort()  # pass reverse=True for descending order
        return result, ''
    except Exception as e:
        return result, url + ': parse error ' + str(e)
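A minimal usage sketch of parserHtml, assuming a hypothetical directory-listing URL and an initially empty set of already-processed links (both are placeholders, not from the original post):

# Hypothetical usage: crawl one listing page and record the links handled so far
finished = set()
links, err = parserHtml('http://example.com/data/', finished)
if err:
    print(err)
else:
    for link in links:
        print(link)
        finished.add(link)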
Download the file behind a hyperlink
import urllib.request  # standard-library HTTP client
import logging

logger = logging.getLogger(__name__)  # module-level logger used below

# Download the file at the URL (e.g. a .tif) to the local path
def DownloadTif(url, filePath):
    try:
        urllib.request.urlretrieve(url, filePath)
    except Exception as e:
        logger.error(url + " download error: " + str(e))
    return
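A short example of calling DownloadTif; the URL, output directory, and file name are assumptions for illustration only:

# Hypothetical example: fetch one .tif into a local folder
import os

saveDir = 'downloads'  # assumed output directory
os.makedirs(saveDir, exist_ok=True)
tifUrl = 'http://example.com/data/sample.tif'  # placeholder URL
DownloadTif(tifUrl, os.path.join(saveDir, 'sample.tif'))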
Call the service at a given address and read the returned parameters
import requests
import json

# Call the service; `app` is assumed to be the Flask application whose logger is used below
def postRequest(url, data):
    try:
        res = requests.post(url=url, data=data)
        param = json.loads(res.text)
    except Exception as e:
        # res may not exist if the request itself failed, so log only the exception
        app.logger.error("request error: " + str(e))
        return '', str(e)
    if param["status"] == 1:
        # app.logger.info("request succeeded")
        return param["data"], ''
    # non-success status: return the raw response text as the error
    return '', res.text
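A hedged usage sketch of postRequest; the endpoint and form fields below are placeholders, and the function is assumed to return a (data, error) pair as defined above:

# Hypothetical call to the service
data, err = postRequest('http://example.com/api/convert', {'file': 'sample.tif'})
if err:
    print('request failed:', err)
else:
    print('returned data:', data)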
Parse the returned parameters (XML)
Install lxml
pip3 install lxml
from bs4 import BeautifulSoup

# Parse the XML text and return all elements whose name attribute is "file"
def parserXML(xmlText):
    soup = BeautifulSoup(xmlText, 'xml')  # the 'xml' parser requires lxml
    result = soup.find_all(attrs={"name": "file"})
    return result
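A quick check of parserXML with an inline XML snippet; the element names and structure are assumed for illustration and do not come from the real service response:

# Hypothetical XML response containing one element with name="file"
sample = '<response><param name="file">a.tif</param><param name="status">1</param></response>'
for node in parserXML(sample):
    print(node.text)  # prints: a.tif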