python之HTMLParser解析HTML文档

HTMLParser实战教程

最新推荐文章于 2023-02-19 23:21:33 发布

转载最新推荐文章于 2023-02-19 23:21:33 发布 · 428 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：http://www.cnblogs.com/hester/p/5420605.html

文章标签：

#python #xhtml

HTMLParser是Python自带的模块，使用简单，能够很容易的实现HTML文件的分析。
本文主要简单讲一下HTMLParser的用法.

使用时需要定义一个从类HTMLParser继承的类，重定义函数：

handle_starttag( tag, attrs)
handle_startendtag( tag, attrs)
handle_endtag( tag)
handle_data(data)

更多属性及方法请查看源代码：

"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import markupbase
import re

# Regular expressions used for parsing

# Matches the next character that can start markup ('<') or a
# reference ('&').
interesting_normal = re.compile('[&<]')
# Matches the first two characters of an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named entity reference; group 1 is the name.  The match also includes
# one trailing character that is not a letter or digit (normally ';').
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal or hexadecimal character reference, plus one trailing
# character that is not a hex digit (normally ';').
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by an ASCII letter: the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# The '>' that terminates a processing instruction.
piclose = re.compile('>')
# End of a comment: '--', optional whitespace, '>'.
commentclose = re.compile(r'--\s*>')

# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# note: if you change tagfind/attrfind remember to update locatestarttagend too
# Group 1 is the tag name; trailing whitespace and any '/' that is not
# part of '/>' are consumed as well.
tagfind = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# this regex is currently unused, but left for backward compatibility
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# One attribute: group 1 is the name, group 2 the optional '=value' part,
# group 3 the value itself (still wrapped in its quotes, if any).
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

# Matches a start tag from '<' up to, but not including, its closing
# '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
# The '>' that terminates an end tag.
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# A complete, well-formed end tag; group 1 is the tag name.
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Error describing a parse failure and, optionally, where it happened.

    msg      -- non-empty description of the problem
    position -- (lineno, offset) pair; either element may be None when
                that part of the location is unknown
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        # Build the message piecewise: the location suffixes are only
        # added for the parts of the position that are known.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # offset is zero-based; report a one-based column.
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)


class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")


    def __init__(self):
        """Initialize and reset this instance."""
        # All instance state is set up by reset(), so a new parser and a
        # reset parser start from the same state.
        self.reset()

    def reset(self):
        """Put the parser back into its initial state.

        Any input that was fed but not yet processed is discarded.
        """
        # Scanner state: no CDATA element is open, so look for '<' and '&'.
        self.cdata_elem = None
        self.interesting = interesting_normal
        # Name of the most recent start tag and the pending input buffer.
        self.lasttag = '???'
        self.rawdata = ''
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        # end=1 tells goahead() that no further input will arrive.
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current parse position."""
        position = self.getpos()
        raise HTMLParseError(message, position)

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        # None until parse_starttag() has recorded a complete start tag.
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and scan for '<' and '&' again."""
        self.cdata_elem = None
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Scan self.rawdata and call the handlers for what is found.

        end -- false when called from feed() (more input may follow);
               true when called from close().

        Input that could not be processed yet is stored back into
        self.rawdata for the next call.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                # In CDATA mode, keep buffering until the end tag shows up.
                if self.cdata_elem:
                    break
                j = n
            # Everything before the interesting character is plain data.
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                # Each parse_* call returns the index just past the
                # construct, or a negative value if it is incomplete.
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # A '<' that starts no known construct is literal text.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more input.
                    break
                if k < 0:
                    if not end:
                        break
                    # No more input will come: emit the unterminated
                    # construct as data, up to and including the next '>',
                    # else up to the next '<', else a single character.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the trailing terminator.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # Only a ';' terminator belongs to the reference; any
                    # other terminating character is left in the input.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same terminator rule as for character references.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        # At end of input, flush what is left as data (unless a CDATA
        # element is still open).
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at rawdata[i].

        The text between '<?' and the next '>' is passed to handle_pi().
        Returns the index just past that '>', or -1 if it is missing.
        """
        buf = self.rawdata
        assert buf[i:i+2] == '<?', 'unexpected call to parse_pi()'
        closer = piclose.search(buf, i+2) # >
        if closer is None:
            return -1
        self.handle_pi(buf[i+2: closer.start()])
        return closer.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at rawdata[i] and dispatch it.

        The lower-cased tag name and a list of (name, value) attribute
        pairs are passed to handle_startendtag() for '<tag ... />' and to
        handle_starttag() otherwise.  Attribute names are lower-cased;
        values have surrounding quotes removed and are unescaped; an
        attribute without a value gets None.

        Returns the index just past the tag, or a negative value if the
        tag is not complete yet.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # bare attribute: no '=value' part at all
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip the matching pair of quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the closing bracket is left over, so
            # the whole tag text is passed through as plain data.  (The
            # line/column bookkeeping that used to sit here was computed
            # but never used, so it has been dropped.)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            # script/style content must not be scanned for markup
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at rawdata[i].

        Returns -1 when the buffer ends before the tag is complete.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            # The character right after the matched tag text decides
            # how the tag ends.
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # NOTE(review): next == "/" makes the startswith("/", j)
                # test above always true, so the two lines below look
                # unreachable.
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            # Any other character: stop at the end of the match, but always
            # advance by at least one character.
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag starting at rawdata[i] and call handle_endtag().

        Returns the index just past the tag, or -1 if no closing '>' has
        been seen yet.  While a CDATA element is open, any end tag other
        than the one for that element is passed on as plain data.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            # Not a well-formed end tag.
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    # '</>' is skipped without calling any handler
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            # Inside a CDATA element only its own end tag counts.
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an empty tag such as '<tag ... />'.

        The default implementation reports it as a start tag immediately
        followed by the matching end tag.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Hook for a start tag; does nothing by default.

        tag is the lower-cased tag name, attrs a list of (name, value)
        pairs.
        """
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Hook for an end tag; does nothing by default.

        tag is the lower-cased tag name.
        """
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Hook for a numeric character reference; does nothing by default.

        name is the text between '&#' and the terminating character.
        """
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Hook for a named entity reference; does nothing by default.

        name is the entity name without the leading '&'.
        """
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Hook for a chunk of text data; does nothing by default."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Hook for the text of a comment; does nothing by default."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Hook for a doctype declaration; does nothing by default.

        decl is the text between '<!' and the closing '>'.
        """
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Hook for a processing instruction; does nothing by default.

        data is the text between '<?' and the closing '>'.
        """
        pass

    def unknown_decl(self, data):
        """Hook that does nothing by default.

        NOTE(review): nothing in this file calls it; presumably the
        markupbase base class does, for declarations it does not
        recognize -- confirm there.
        """
        pass

    # Internal -- helper to remove special character quoting
    entitydefs = None
    def unescape(self, s):
        """Return *s* with '&name;' and '&#number;' references replaced.

        A reference that cannot be resolved is left in the text unchanged.
        """
        # Fast path: nothing to replace.
        if '&' not in s:
            return s
        def replaceEntities(s):
            # s arrives as a match object; from here on it is the text
            # between '&' and ';'.
            s = s.groups()[0]
            try:
                if s[0] == "#":
                    # numeric reference, decimal or (with x/X) hexadecimal
                    s = s[1:]
                    if s[0] in ['x','X']:
                        c = int(s[1:], 16)
                    else:
                        c = int(s)
                    return unichr(c)
            except ValueError:
                # not a valid number or code point: keep the original text
                return '&#'+s+';'
            else:
                # Reached only for named references (no '#' prefix).
                # Cannot use name2codepoint directly, because HTMLParser supports apos,
                # which is not part of HTML 4
                import htmlentitydefs
                # Build the name -> character table once and cache it on
                # the class.
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
                    for k, v in htmlentitydefs.name2codepoint.iteritems():
                        entitydefs[k] = unichr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    # unknown entity name: keep the original text
                    return '&'+s+';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)

可以看出，源代码中handle_xxxxxx函数体均是空的，需要自己继承并添加处理内容；否则函数不作任何处理。

1. 获取标签属性

tag是html标签的名称，attrs是由(属性, 值)元组(tuple)组成的列表(list)。

如一个标签为：<input type="hidden" name="NXX" id="IDXX" value="VXX" />

那么它的attrs列表为[('type', 'hidden'), ('name', 'NXX'), ('id', 'IDXX'), ('value', 'VXX')]
HTMLParser自动将tag和属性名都转为小写，属性值保持原样。

下面给出的例子抽取了html中的所有链接：

from HTMLParser import HTMLParser
 
class MyHTMLParser(HTMLParser):
    """Collect the href value of every <a> start tag fed to the parser."""

    def __init__(self):
        HTMLParser.__init__(self)
        # href values, in the order the links appear in the input
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Only anchors are of interest; tag names arrive lower-cased.
        if tag != "a":
            return
        self.links.extend(value for name, value in attrs if name == "href")
 
if __name__ == "__main__":
    # Three anchors with differently-cased tag and attribute names.
    sample = """
    <a href="www.google.com"> google.com</a>
    <A Href="www.pythonclub.org"> PythonClub </a>
    <A HREF = "www.sina.com.cn"> Sina </a>
    """
    parser = MyHTMLParser()
    parser.feed(sample)
    parser.close()
    print(parser.links)

输出为：

['www.google.com', 'www.pythonclub.org', 'www.sina.com.cn']

如果想抽取图形链接：

<img src='http://www.google.com/intl/zh-CN_ALL/images/logo.gif' />

就要重定义 handle_startendtag( tag, attrs) 函数

2. 获取标签内容　　

test1.html文件内容如下：

<html>
<head>
<title> XHTML 与 HTML 4.01 标准没有太多的不同</title>
</head>
<body>
i love you
</body>
</html>

2.1 第一个例子

import HTMLParser

class TitleParser(HTMLParser.HTMLParser):
    """Print a trace line for every start tag and end tag encountered."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.handledtags = ['title','body']
        self.processing = None

    def handle_starttag(self,tag,attrs):
        # Single-argument print(...) behaves the same as a statement and
        # as a function call.
        print('--------------')
        print('handle start func %s' % tag)

    def handle_endtag(self,tag):
        # 23 '=' characters, as in the sample output shown for this example.
        print('=======================')
        print('handle end func %s' % tag)

if __name__ == '__main__':
    # Read the sample page inside a with-block so the file is closed
    # before parsing starts.
    with open('test1.html') as fd:
        html = fd.read()
    tp=TitleParser()
    tp.feed(html)

运行结果：

--------------
handle start func html
--------------
handle start func head
--------------
handle start func title
=======================
handle end func title
=======================
handle end func head
--------------
handle start func body
=======================
handle end func body
=======================
handle end func html

相信大家已经看出来了，解析时碰到<***>，自动调用handle_starttag()；碰到</***>，自动调用handle_endtag()

2.2 添加handle_data方法

import HTMLParser

class TitleParser(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # self.taglevels=[]
        self.handledtags = ['title','body']
        self.processing = None

    def handle_starttag(self,tag,attrs):
        print '--------------'
        print 'handle start func',tag

    def handle_data(self,data):
        print '####'
        print 'handle data func'
        if data == '\n':
            print r'\n'
        else:
            print data,

    def handle_endtag(self,tag):
        print '======================='
        print 'handle end func',tag

if __name__ == '__main__':
    # Read the sample page inside a with-block so the file is closed
    # before parsing starts.
    with open('test1.html') as fd:
        html = fd.read()
    tp=TitleParser()
    tp.feed(html)

运行结果：

--------------
handle start func html
####
handle data func
\n
--------------
handle start func head
####
handle data func
\n
--------------
handle start func title
####
handle data func
 XHTML 与 HTML 4.01 标准没有太多的不同 =======================
handle end func title
####
handle data func
\n
=======================
handle end func head
####
handle data func
\n
--------------
handle start func body
####
handle data func

i love you
=======================
handle end func body
####
handle data func
\n
=======================
handle end func html

说明：

标签之间的文本（包括单独的换行符）都会触发handle_data()，无论它出现在<***>之后还是</***>之后。
html中第一行、第二行分别为<html>和<head>，后面无具体数据，只有回车换行，所以调用handle_data()，打印结果为换行；</head>、</html>同理。

2.2 解析需要的内容

import HTMLParser

class TitleParser(HTMLParser.HTMLParser):
    """Collect the text chunks found inside <title> and <body> elements."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.data = []
        # tag currently being collected, or None when outside of one
        self.processing = None
        self.handledtags = ['title','body']

    def handle_starttag(self,tag,attrs):
        if tag not in self.handledtags:
            return
        self.processing = tag

    def handle_data(self,data):
        if not self.processing:
            return
        self.data.append(data)

    def handle_endtag(self,tag):
        # stop collecting once the element that started it is closed
        if self.processing == tag:
            self.processing = None

if __name__ == '__main__':
    # Read the sample page inside a with-block so the file is closed
    # before parsing starts.
    with open('test1.html') as fd:
        html = fd.read()
    tp = TitleParser()
    tp.feed(html)
    for each in tp.data:
        # single-argument print(...) works as a statement and as a call
        print(each)

运行结果：

 XHTML 与 HTML 4.01 标准没有太多的不同

i love you

2.3 解析豆瓣热门电影实例

#encoding=utf8
import urllib2
from HTMLParser import HTMLParser
'''
<li class="ui-slide-item s" data-rater="6802" data-enough="True" data-intro="" data-actors="朴灿烈 / 袁姗姗 / 姜潮" data-director="金帝荣" data-region="中国大陆" data-duration="99分钟" data-ticket="https://movie.douban.com/subject/26564988/cinema/" data-trailer="https://movie.douban.com/subject/26564988/trailer" data-star="30" data-rate="5.3" data-release="2016" data-title="所以……和黑粉结婚了" data-dstat-viewport=".screening-bd" data-dstat-watch=".ui-slide-content" data-dstat-mode="click,expose" data-dstat-areaid="70_4">
'''

class MYPARSER(HTMLParser):
    """Collect movie records from <li> start tags that carry data-* attributes.

    Each <li> with a non-empty data-title attribute adds one dict with
    the keys 'actors', 'director', 'duration', 'title' and 'rate' to
    self.movies; an attribute that is missing is stored as None.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.movies = []

    def handle_starttag(self,tag,attrs):
        def _attr(attrlist,attrname):
            # value of the first attribute called attrname, else None
            for each in attrlist:
                if attrname == each[0]:
                    return each[1]
            return None

        if tag == 'li' and _attr(attrs,'data-title'):
            movie = {}
            movie['actors'] = _attr(attrs,'data-actors')
            movie['director'] = _attr(attrs,'data-director')
            # the attribute is spelled 'data-duration'; a misspelled name
            # never matches and always yields None
            movie['duration'] = _attr(attrs,'data-duration')
            movie['title'] = _attr(attrs,'data-title')
            movie['rate'] = _attr(attrs,'data-rate')
            self.movies.append(movie)

def movieparser(url):
    """Download the page at *url* and return the movies found in it.

    Returns the list of movie dicts collected by MYPARSER.
    """
    headers = {}
    # Request's second positional parameter is the request body (data),
    # and supplying a body turns the request into a POST.  Pass the
    # headers by keyword so this stays a plain GET.
    req = urllib2.Request(url,headers=headers)
    s = urllib2.urlopen(req)
    try:
        page = s.read()
    finally:
        # release the connection even if reading fails
        s.close()
    myparser = MYPARSER()
    myparser.feed(page)
    myparser.close()
    return myparser.movies


if __name__ == '__main__':
    url = 'https://movie.douban.com/'
    # one line per movie: title|rate|actors|director|duration
    for movie in movieparser(url):
        print('%(title)s|%(rate)s|%(actors)s|%(director)s|%(duration)s' % movie)

运行结果：

寒战2|7.2|郭富城 / 梁家辉 / 杨采妮|梁乐民|None
致青春·原来你还在这里|3.9|吴亦凡 / 刘亦菲 / 金世佳|周拓如|None
大鱼海棠|6.6|季冠霖 / 苏尚卿 / 许魏洲|梁旋|None
忍者神龟2：破影而出 Teenage Mutant Ninja Turtles: Out of the Shadows|6.4|梅根·福克斯 / 斯蒂芬·阿美尔 / 威尔·阿奈特|戴夫·格林|None
摇滚藏獒|6.8|郭德纲 / 郭麒麟 / 于谦|艾什·布兰农|None
发条城市|6.4|王宁 / 修睿 / 王自健|江涛|None
赏金猎人|5.5|李敏镐 / 钟汉良 / 唐嫣|申太罗|None
张震讲故事之合租屋|4.8|卢杉 / 傅亨 / 吴谨西|战越|None
惊天魔盗团2 Now You See Me 2|6.6|杰西·艾森伯格 / 伍迪·哈里森 / 戴夫·弗兰科|朱浩伟|None
海底总动员2：多莉去哪儿 Finding Dory|7.4|艾伦·德杰尼勒斯 / 艾伯特·布鲁克斯 / 艾德·奥尼尔|安德鲁·斯坦顿|None
独立日：卷土重来 Independence Day: Resurgence|5.9|利亚姆·海姆斯沃斯 / 杰夫·高布伦 / 比尔·普尔曼|罗兰·艾默里奇|None
丑小鸭历险记|3.3|朱可可 / 阿飞 / 夏倚轩|郑义|None
所以……和黑粉结婚了|5.3|朴灿烈 / 袁姗姗 / 姜潮|金帝荣|None
筷仙|2.7|胡影怡 / 朱璇 / 周骏|姬雨|None
古田会议|2.9|许铂岑 / 王韦智 / 王怡苏|陈健|None
魔轮|4.8|林心如 / 何润东 / 金世佳|王早|None

代码说明：