#-*-coding:utf-8-*-
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
import sys
class TitleParser(HTMLParser):
    """Collect the text that appears inside the ``<title>`` element.

    Feed markup through ``feed()`` and read the accumulated text from
    ``gettitle()`` (or directly from the ``title`` attribute).
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Text gathered so far from between <title> and </title>.
        self.title = ''
        # True while the parser is positioned inside a <title> element.
        self.readingtitle = False

    def handle_starttag(self, tag, attrs):
        """Start collecting text when a ``<title>`` tag opens."""
        if tag == 'title':
            self.readingtitle = True

    def handle_data(self, data):
        """Append ``data`` to the title while inside ``<title>``."""
        if self.readingtitle:
            self.title += data

    def handle_endtag(self, tag):
        """Stop collecting text when ``</title>`` is seen."""
        if tag == 'title':
            self.readingtitle = False

    def handle_entityref(self, name):
        """Handle a named reference such as ``&amp;``.

        Without this handler the referenced character (for example ``&``)
        would be missing from the collected title.  Known names are replaced
        by their ``entitydefs`` text; unknown names are kept verbatim as
        ``&name;``.
        """
        self.handle_data(entitydefs.get(name, '&' + name + ';'))

    def handle_charref(self, name):
        """Handle a numeric reference such as ``&#38;`` or ``&#x26;``.

        ``name`` is the text between ``&#`` and ``;``.  The base-class handler
        ignores these references, which would silently drop characters from
        the title, so they are decoded here.  A reference that cannot be
        decoded is kept verbatim as ``&#name;``.
        """
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
            # chr() raises ValueError for code points the running
            # interpreter cannot represent; fall back to the raw text then.
            text = chr(code)
        except (ValueError, OverflowError):
            text = '&#' + name + ';'
        self.handle_data(text)

    def gettitle(self):
        """Return the title text collected so far."""
        return self.title
# Parse ./c.html and print the text of its <title> element.
# The ``with`` block closes the file even if reading or parsing raises.
with open('./c.html') as fileHandle:
    titleParser = TitleParser()
    titleParser.feed(fileHandle.read())
# Single-argument parenthesised form prints the same line as the original
# ``print 'title is :', ...`` statement.
print('title is : ' + titleParser.gettitle())