对象:DataSource, SAXParser, ContentHandler, DataTarget
DataSource封装xml数据流,提供read接口
char read();
SAXParser 通过DataSource读取数据,产生事件并且回调ContentHandler的事件处理方法
void parse(DataSource sourceXml, ContentHandler handler);
ContentHandler,封装事件处理逻辑以及处理过程中的相关状态:
void startElement(tag, attributes)
void endElement(tag)
void text(textData)
如果需要,还可以有一个DataTarget对象,比如产生一棵DOM树或者转化成另一个xml,ContentHandler会去更新这个对象
例1,抓取lintcode.com的题目列表
from urllib2 import urlopen
from xml.sax import make_parser, ContentHandler
class MyHandler(ContentHandler):
    """SAX handler that collects the text found inside title ``<span>`` elements.

    A span is a "title" span when its ``class`` attribute is exactly
    ``'m-l-sm title'``. Every non-blank chunk of character data seen while
    inside such a span is stripped and appended to ``dataTarget``.
    """

    def __init__(self, dataTarget):
        """Store the output list; ``dataTarget`` is mutated in place."""
        ContentHandler.__init__(self)
        # Number of currently open <span> elements counted from the matching
        # title span (0 means "not inside a title span").
        self.__spanDepth = 0
        self.__dataTarget = dataTarget

    def startElement(self, tag, attrs):
        """Start capturing at a title span; track spans nested inside it."""
        if tag != 'span':
            return
        if self.__spanDepth > 0:
            # Nested span inside a title span: remember it so that its end
            # tag does not terminate the capture prematurely.
            self.__spanDepth += 1
        elif attrs.get('class') == 'm-l-sm title':
            # .get() instead of [] so spans without a class attribute are
            # simply ignored rather than raising KeyError.
            self.__spanDepth = 1

    def endElement(self, tag):
        """Leave one level of span nesting; capture stops when it reaches 0."""
        if tag == 'span' and self.__spanDepth > 0:
            self.__spanDepth -= 1

    def characters(self, data):
        """Append the stripped chunk to the target when inside a title span."""
        if self.__spanDepth > 0:
            content = data.strip()
            if content:
                self.__dataTarget.append(content)
infile = urlopen('http://www.lintcode.com/en/problem/')
dataTarget = []
parser = make_parser();
parser.setContentHandler(MyHandler(dataTarget))
parser.parse(infile)
for item in dataTarget : print item
例2,产生DOM
from xml.sax import parseString, ContentHandler
class XmlElement:
    """Minimal in-memory XML element: tag name, attributes, children and text."""

    def __init__(self, name):
        """Create an element called ``name`` with no children, attributes or text."""
        self.name = name
        self.children = []    # child elements, in the order they were added
        self.attributes = {}  # attribute name -> attribute value
        # Defined up front so that reading ``text`` on an element that never
        # received character data yields None instead of AttributeError.
        self.text = None

    def addChild(self, child):
        """Append ``child`` to the end of this element's child list."""
        self.children.append(child)

    def setAttribute(self, key, value):
        """Set (or overwrite) the attribute ``key`` to ``value``."""
        self.attributes[key] = value

    def setText(self, text):
        """Replace this element's text with ``text``."""
        self.text = text
class MakeDOMHandler(ContentHandler):
    """SAX handler that builds a tree of XmlElement nodes from the event stream.

    A synthetic ``"doc"`` element sits at the bottom of the stack so the real
    root element always has a parent to be attached to.
    """

    def __init__(self):
        ContentHandler.__init__(self)
        self.__doc = XmlElement("doc")
        # Stack of currently open elements; the last entry is the innermost.
        self.__stack = [self.__doc]

    def startElement(self, tag, attrs):
        """Create a node for ``tag``, copy its attributes, and make it current."""
        newNode = XmlElement(tag)
        for k, v in attrs.items():
            newNode.setAttribute(k, v)
        self.__stack[-1].addChild(newNode)
        self.__stack.append(newNode)

    def endElement(self, tag):
        """Close the innermost open element."""
        self.__stack.pop()

    def characters(self, text):
        """Append ``text`` to the text of the innermost open element.

        A SAX parser may deliver one run of character data in several calls,
        so the chunks are concatenated instead of overwriting each other.
        """
        node = self.__stack[-1]
        # getattr with a default: tolerate elements that have no text yet.
        previous = getattr(node, 'text', None)
        node.setText(text if previous is None else previous + text)

    def getRootElement(self):
        """Return the document's root element.

        Raises IndexError when no element has been parsed yet.
        """
        return self.__doc.children[0]
# Example 2 driver: parse a small in-memory document with MakeDOMHandler and
# keep a reference to the root element of the resulting tree.
handler = MakeDOMHandler()
parseString(
    '<Contact><name>Tom</name><address name="home">street A</address></Contact>',
    handler,
)
rootElement = handler.getRootElement()
LeetCode 爬虫
import urllib2, urllib, cookielib, HTMLParser,os
class LeetCodeParser(HTMLParser.HTMLParser):
    """Extract accepted submissions from LeetCode submission-list HTML.

    Only anchors inside the table whose id is ``result-testcases`` and that
    carry a ``class`` attribute are considered. An anchor with class
    ``inline-wrap`` supplies the problem title and link; any other such
    anchor is treated as the status column and supplies the submission link.

    ``result`` is updated in place and ends up shaped as::

        {title: {'problem_link': href, 'submissions': [href, ...]}}
    """

    def __init__(self, result):
        HTMLParser.HTMLParser.__init__(self)
        self.res = result
        self.__inTable, self.__inTitleCol, self.__inStatusCol = False, False, False
        self.__problem_link, self.__title_text, self.__submission_link = '', '', ''

    def handle_starttag(self, tag, attrList):
        """Track entry into the result table and into its title/status anchors."""
        attrs = dict(attrList)
        if tag == 'table':
            # .get(): tables without an id must be ignored, not raise KeyError.
            if attrs.get('id') == 'result-testcases':
                self.__inTable = True
        elif tag == 'a' and self.__inTable and 'class' in attrs:
            # An anchor without href (or with a valueless one) yields ''.
            href = attrs.get('href') or ''
            if attrs['class'] == 'inline-wrap':
                self.__inTitleCol = True
                self.__problem_link = href
            else:
                self.__inStatusCol = True
                self.__submission_link = href

    def handle_endtag(self, tag):
        """Leave the table or the current anchor."""
        if tag == 'table' and self.__inTable:
            self.__inTable = False
        elif tag == 'a':
            if self.__inTitleCol:
                self.__inTitleCol = False
            elif self.__inStatusCol:
                self.__inStatusCol = False
                # The status anchor closes the row's data: reset the title
                # so the next row accumulates its own.
                self.__title_text = ''

    def handle_data(self, data):
        """Accumulate title text; record the row when its status is 'Accepted'."""
        if self.__inTitleCol:
            self.__title_text += data
        elif self.__inStatusCol and data == 'Accepted':
            entry = self.res.setdefault(self.__title_text, {'submissions': []})
            entry['problem_link'] = self.__problem_link
            entry['submissions'].append(self.__submission_link)
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders.append(("User-agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"))
opener.open('https://leetcode.com/accounts/login/')
token = filter(lambda c : c.name == 'csrftoken', cj)[0].value
data = urllib.urlencode({'login': 'userId', 'password': 'pass', 'csrfmiddlewaretoken': token,'remember':'on' })
opener.addheaders.append(('Referer', 'https://leetcode.com/accounts/login/'))
opener.open( 'https://leetcode.com/accounts/login/', data)
#get the submission list
result = {}
parser = LeetCodeParser(result)
for pageNo in xrange(1, 115):
html = opener.open('https://leetcode.com/submissions/'+ str(pageNo) ).read()
parser.feed(html)
#grab submission and save code
base = 'c:\\personal\\leetcode\\'
existed = set([x[x.index(' ') + 1 : x.index('.')] for x in os.listdir(base)])
total, done, skipped = sum([len(v['submissions']) for v in result.values()]), 0, 0
print 'Total submissions: ', total, ', exisists: ', len(existed)
for problemName, problemData in result.items():
subList = problemData['submissions']
for j in xrange(len(subList)):
submission = problemName + '_' + str(len(subList) - j)
if submission in existed:
print submission, ' exists, skip.'
skipped += 1
continue
try:
pos = [0]
content = opener.open('https://leetcode.com' + subList[j]).read()
def getValue(key):
pos[0] = content.index(key, pos[0])
pos[0] = content.index('\'', pos[0]) + 1
end = content.index('\'', pos[0])
value = content[pos[0]: end]
pos[0] = end
return value
problemId = getValue('questionId')
language = getValue('getLangDisplay')
code = getValue('submissionCode')
convertTable = {'\u000D':'\n','\u000A':'','\u003B':';','\u003C':'<','\u003E':'>','\u003D':'=',\
'\u0026':'&','\u002D':'-','\u0022':'"','\u0009':'\t','\u0027':"'",'\u005C':'\\'}
for before, after in convertTable.items():
code = code.replace(before, after)
with open(base + problemId + ' ' + submission + '.'+ language, 'w') as f:
f.write('// ' + problemId + '\n' + '// ' + problemName + '\n' + '// https://leetcode.com/' + problemData['problem_link'] + '\n' + code)
done += 1
print submission + ' complete.', 'Done ', done
except Exception,e:
print submission + " failed.", Exception, e
print 'Total: ', total, ' Skipped:', skipped, ' Downloaded:', done

本文详细介绍了如何使用Python的xml.sax模块解析XML数据,包括数据流的封装、SAXParser的使用、ContentHandler的事件处理逻辑以及DataTarget的应用。实例展示了从LintCode网站抓取题目列表和构建DOM树的过程。

被折叠的 条评论
为什么被折叠?



