#-*- coding: utf-8 -*-
"""Locate the start and end offsets of the text content inside an HTML string.

The markup is cut into consecutive segments (tags and text runs); every
segment that is not of the form ``<...>`` is reported together with its
character offsets in the original string.
"""

html = """<div><div>div data >aaa<p>ppp data<a>hello,world.</a></p></div></div>"""


def _split_at_tag_starts(chunk):
    """Cut *chunk* in front of every ``<`` and return the non-empty pieces.

    Any text before the first ``<`` becomes its own piece; every following
    piece starts with ``<``. A chunk without ``<`` is returned unchanged as a
    single piece.
    """
    first = chunk.find("<")
    if first == -1:
        return [chunk]
    pieces = [chunk[:first]] if first else []
    # Splitting what follows the first "<" drops the separators, so each
    # piece gets its leading "<" put back.
    pieces.extend("<" + rest for rest in chunk[first + 1:].split("<"))
    return pieces


def split_segments(markup):
    """Split *markup* into consecutive tag and text segments.

    A chunk is closed at every ``>`` that has a ``<`` somewhere before it in
    the still-unconsumed input, and is then cut in front of each ``<``. A
    ``>`` seen before any ``<`` is therefore kept as ordinary text, while a
    ``<`` is always treated as the start of a new segment.

    Input left over after the last closing ``>`` (trailing text, or markup
    without any tag) is emitted as well, so joining the returned segments
    always reproduces *markup*.

    Args:
        markup: The HTML source string.

    Returns:
        A list of non-empty strings in source order.
    """
    segments = []
    chunk_start = 0
    seen_open = False
    for pos, char in enumerate(markup):
        if char == "<":
            seen_open = True
        elif char == ">" and seen_open:
            segments.extend(_split_at_tag_starts(markup[chunk_start:pos + 1]))
            chunk_start = pos + 1
            seen_open = False
    # Flush whatever follows the last complete tag; without this, trailing
    # text would be silently dropped.
    if chunk_start < len(markup):
        segments.extend(_split_at_tag_starts(markup[chunk_start:]))
    return segments


def content_spans(markup):
    """Return the position of every non-tag segment of *markup*.

    A segment counts as a tag only when it starts with ``<`` and ends with
    ``>``; everything else is reported as content.

    Args:
        markup: The HTML source string.

    Returns:
        A list of ``(segment_index, start, end)`` tuples where
        ``markup[start:end]`` is the content and ``segment_index`` is the
        segment's position in ``split_segments(markup)``.
    """
    spans = []
    offset = 0
    for index, segment in enumerate(split_segments(markup)):
        end = offset + len(segment)
        if not (segment.startswith("<") and segment.endswith(">")):
            spans.append((index, offset, end))
        offset = end
    return spans


def main():
    """Print the index, offsets and text of every content span in ``html``."""
    for index, start, end in content_spans(html):
        print(index, start, end)
        print(html[start:end])


if __name__ == "__main__":
    main()
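# 算法:获取html中内容的开始和结束位置 (Algorithm: find the start and end positions of text content in HTML)
# 于 2021-11-22 14:03:24 首次发布 (first published 2021-11-22 14:03:24)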