from urllib.parse import urljoin
from Common.PSoup import *
class HtmlCommon:
    """Helpers for post-processing an HTML string through the PSoup wrapper."""

    def handleHtmlString(self, htmlString, url, dic=None):
        """Inject extra data into the HTML body and make link/image URLs absolute.

        Args:
            htmlString: HTML markup to process.
            url: Base URL that every ``<a href>`` and ``<img src>`` value is
                resolved against with ``urllib.parse.urljoin``.
            dic: Optional mapping. Each entry is appended to ``<body>`` as
                ``<div id='KEY'>VALUE</div>``. Keys and values are inserted
                verbatim (no HTML escaping), so values may contain markup.

        Returns:
            The markup produced by the final ``html()`` call on the parsed
            document.
        """
        # A None default avoids sharing one dict object across calls.
        if dic is None:
            dic = {}

        psoup = PSoup()
        docBody = psoup.getPSoup(htmlString)
        bodyElement = docBody.find("body")

        # <editor-fold desc="Populate the data from the dictionary parameter">
        # f-string formatting yields the same text as concatenation for string
        # entries and also tolerates non-string keys/values.
        extraMarkup = "".join(
            f"<div id='{key}'>{value}</div>" for key, value in dic.items()
        )
        if bodyElement is not None:
            bodyElement.append(extraMarkup)
        else:
            # No <body> found: wrap the fragment in one and parse again.
            # NOTE(review): if PSoup.find returns an empty collection rather
            # than None when nothing matches, this branch never runs - confirm
            # against PSoup's contract.
            htmlString = "<body>" + htmlString + "</body>"
            docBody = psoup.getPSoup(htmlString)
            bodyElement = docBody.find("body")
            bodyElement.append(extraMarkup)
        htmlString = docBody.html()
        # </editor-fold>

        # <editor-fold desc="Replace the paths of A tags and Img tags">
        # Each tag type gets its own parse/serialise round-trip, in the same
        # order as before (<a> first, then <img>), because what html() emits
        # depends on PSoup and is not visible from here.
        for tagName, attrName in (("a", "href"), ("img", "src")):
            doc = psoup.getPSoup(htmlString)
            for element in doc.find(tagName).items():
                current = element.attr(attrName)
                if current is not None:
                    element.attr(attrName, urljoin(url, current))
            htmlString = doc.html()
        # </editor-fold>

        return htmlString
python 处理 Html
最新推荐文章于 2022-09-11 18:03:32 发布
6273

被折叠的 条评论
为什么被折叠?



