SGMLParser 无法正确解析 <br/>、<img src=""/> 等自闭合(单个)标签:
测试代码如下:
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass that prints every parse event to stdout.

    Character data is collected in ``self.testdata`` and printed as one
    ``data:`` line when another event arrives, when the repr of the buffer
    reaches 70 characters, or when the parser is closed.
    """

    def __init__(self, verbose=0):
        """Create the empty data buffer, then initialise the base parser."""
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer character data; flush once its repr is 70+ characters."""
        self.testdata += data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print the buffered character data, if any, and empty the buffer."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print('data: ' + repr(pending))

    def handle_comment(self, data):
        """Print a comment, eliding the middle of long ones."""
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            # Keep the first and last 32 characters of the repr.
            shown = shown[:32] + '...' + shown[-32:]
        print('comment: ' + shown)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its name="value" attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
            return
        # Pieces are joined with single spaces, so the closing '>' is
        # preceded by a space, e.g.: start tag: <table id="tab" >
        pieces = ['start tag: <' + tag]
        for name, value in attrs:
            pieces.append(name + '=' + '"' + value + '"')
        pieces.append('>')
        print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an entity reference the parser did not recognise."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a character reference the parser did not recognise."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print a declaration the parser did not recognise."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Close the base parser, then print any data still buffered."""
        SGMLParser.close(self)
        self.flush()
if __name__ == "__main__":
    # Demo driver: feed a small hand-written HTML page through TestSGMLParser,
    # which prints one line per parse event.
    #
    # Disabled alternative kept for reference: fetch a live page and run it
    # through BaseHTMLProcessor instead.
    #htmldata=urllib.urlopen("http://www.sogou.com").read().decode("gbk")
    #pros=BaseHTMLProcessor()
    #pros.feed(htmldata)
    #print pros.gethtmltext()

    # The sample contains XHTML-style self-closing tags (<br/>, <img ... />),
    # which are the construct being tested.
    htmldata="""<html><head><title>Google Page</title></head><body>
<table id="tab">
<tr id="tr1"><td id="tr1td1">tr1 td1</td><td>tr1 td2</td><td>tr1 td3</td></tr>
<tr id="tr2"><td id="tr2td1">tr2 td1</td><td>tr2 td2</td><td>tr2 td3</td></tr>
</table>
<br/>
<img src="http://www.baidu.com/img/baidu_logo.gif" id="baidulogo" /><br/>
<a href="http://baidu.com">baidu</a><br/>
<b>bold font</b><br/>
<script language="javascript">alert("hello, world ");</script>
<style>#tab{background-color:#fcdad5;}</style>
</body></html>
"""
    pros = TestSGMLParser()  # BaseHTMLProcessor()
    pros.feed(htmldata)
    # feed() alone can leave input unprocessed and leaves the character data
    # that follows the last tag sitting in pros.testdata. close() forces the
    # remaining input through the parser, and TestSGMLParser.close() then
    # prints that pending data (one extra trailing "data:" line).
    pros.close()
输出如下:
start tag: <html>
start tag: <head>
start tag: <title>
data: 'Google Page'
end tag: </title>
end tag: </head>
start tag: <body>
data: ' \n '
start tag: <table id="tab" >
data: '\n '
start tag: <tr id="tr1" >
start tag: <td id="tr1td1" >
data: 'tr1 td1'
end tag: </td>
start tag: <td>
data: 'tr1 td2'
end tag: </td>
start tag: <td>
data: 'tr1 td3'
end tag: </td>
end tag: </tr>
data: '\n '
start tag: <tr id="tr2" >
start tag: <td id="tr2td1" >
data: 'tr2 td1'
end tag: </td>
start tag: <td>
data: 'tr2 td2'
end tag: </td>
start tag: <td>
data: 'tr2 td3'
end tag: </td>
end tag: </tr>
data: '\n '
end tag: </table>
data: '\n '
start tag: <br>
data: '>\n <img src="http:'
end tag: </br>
data: '/www.baidu.com/img/baidu_logo.gif" id="baidulogo" />'
start tag: <br>
data: '>\n <a href="http:'
end tag: </br>
data: '/baidu.com">baidu'
end tag: </a>
start tag: <br>
data: '>\n <b>bold font<'
end tag: </br>
data: 'b>'
start tag: <br>
data: '>\n \n <script language="javascript">alert("hello, world ");<'
end tag: </br>
data: 'script>\n '
start tag: <style>
data: '#tab{background-color:#fcdad5;}'
end tag: </style>
data: '\n '
end tag: </body>
end tag: </html>
从输出可见:<br/> 和 <img/> 都没有被正确解析。例如 <br/> 被拆成了 start tag <br>、一段以 ">" 开头的 data 和 end tag </br>,而紧随其后的 <img .../> 则被当成了普通文本(data)。
错误根源可以在 sgmllib.py 中找到:它用正则把 "<tag/" 当作 SGML 的 SHORTTAG 简写(<tag/data/)来匹配,于是 <br/ 之后直到下一个 "/" 的内容都被当成了该标签的数据。
另有人报告:SGMLParser 解析时,会把标签事件属性(如 onclick)里 JS 代码中的大于号(>)误当作标签的结束标记。
这两个bug,其实都是正则表达式惹的祸。
所以做网页内容提取时,尽量不要用正则,而是先把 HTML 转换为 XHTML,再用 DOM 或 XPath 来解析。