import re
import urllib
import string
import os
from HTMLParser2 import *
from urlparse import *
def get_abspath(url, page):
    """Resolve *page* (possibly relative) against the base *url*.

    Thin wrapper over urljoin so callers never build paths by hand.
    """
    absolute = urljoin(url, page)
    return absolute
class HtmlParserComplier(HTMLParser):
    """Compile an HTML template document into a flat rule program.

    After feed()-ing a template, ``self.rule`` holds a list of rules,
    each one of:
      ['goto', n]             -- jump to rule index n (from a <goto-N> tag)
      ['getitem']             -- emit the current record (from <getitem>)
      ['start', tag, attrs]   -- match an opening tag; attrs is a list of
                                 [name, field_or_literal, mode] triples
      ['data', field, mode]   -- capture element text into a '#field'
      ['end', tag]            -- match a closing tag
    """
    def __init__(self):
        # Compiled rule program, in template document order.
        self.rule = []
        HTMLParser.__init__(self)
        return
    def get_value(self, value):
        """Split a template value into (field, mode).

        Values beginning with '#' name a capture field and may carry a
        capture mode after a dash: '#soft_url-all' -> ('#soft_url', 'all').
        Anything else is a literal to match, returned with an empty mode.
        """
        # Guard: valueless attributes (e.g. <br nowrap>) arrive as None,
        # and empty strings are possible too; the original indexed
        # value[0] unconditionally and crashed on both.
        if not value or '#' != value[0]:
            return value, ''
        tmp = value.split('-')
        if 2 == len(tmp):
            return tmp
        else:
            # No mode (or a malformed multi-dash value): keep the field only.
            return tmp[0], ''
    def handle_starttag(self, tag, attrs):
        tag = tag.strip()
        item = []
        tmp = tag.split('-')
        if 'goto' == tmp[0]:
            # <goto-N>: unconditional jump to rule index N.
            item.append('goto')
            # int() replaces string.atoi(), which is deprecated in
            # Python 2 and removed in Python 3.
            item.append(int(tmp[1]))
            self.rule.append(item)
            return
        if 'getitem' == tmp[0]:
            # <getitem>: flush the record collected so far.
            item.append('getitem')
            self.rule.append(item)
            return
        # Ordinary tag: compile a 'start' rule with its attribute triples.
        item = ['start']
        item.append(tag)
        attr = []
        for name, value in attrs:
            v1, v2 = self.get_value(value)
            tmp2 = [name, v1, v2]
            attr.append(tmp2)
        item.append(attr)
        self.rule.append(item)
        return
    def handle_data(self, data):
        """Compile non-blank template text into a 'data' capture rule."""
        data = data.strip()
        if 0 == len(data):
            return
        v1, v2 = self.get_value(data)
        item = ['data', v1, v2]
        self.rule.append(item)
        return
    def handle_endtag(self, tag):
        """Compile a closing tag into an 'end' rule."""
        item = ['end', tag]
        self.rule.append(item)
        return
debug = 0
def debug_check(rule):
if 0 == debug:
return
print 'to find:',rule
print
def debug_found(str):
if 0 == debug:
return
print 'found:',str
print
class HtmlParserMatch(HTMLParser):
def __init__(self, rule):
self.rule = rule
self.init_item = None
self.at = 0
self.data = []
self.item = []
self.tmp = None
HTMLParser.__init__(self)
return
def set_initfunc(self, func):
self.init_item = func
self.item = self.init_item()
return
def get_value(self, para1, para2, value):
if 'all' == para2:
self.item[para1] = self.item[para1] + value
elif 'last' == para2:
self.item[para1] = value
elif 'first' == para2:
if '' == self.item[para1]:
self.item[para1] = value
else:
self.item[para1] = value
return
def check_tag(self):
if self.at >= len(self.rule):
return
debug_check(self.rule[self.at])
rule = self.rule[self.at]
if 'goto' == rule[0]:
self.at = rule[1]
return
if 'getitem' == rule[0]:
self.data.append(self.item)
self.item = self.init_item()
self.at = self.at + 1
return
if 'ignore' == rule[0]:
self.at = self.at + 1
return
return
def handle_starttag(self, tag, attrs):
debug_found('start '+tag)
self.check_tag()
if self.at >= len(self.rule):
return
rule = self.rule[self.at]
if 'start' != rule[0] or tag != rule[1]:
return
attr = rule[2]
checked_count = 0
for name,value in attrs:
for tmp in attr:
if tmp[0] == name:
if '#' == tmp[1][0]:
self.get_value(tmp[1], tmp[2], value)
checked_count = checked_count + 1
elif tmp[1] == value:
checked_count = checked_count + 1
if checked_count == len(attr):
self.tmp = None
self.at = self.at + 1
self.check_tag()
return
def handle_data(self, data):
debug_found('data '+data)
self.check_tag()
if self.at >= len(self.rule):
return
rule = self.rule[self.at]
if 'data' != rule[0] and None == self.tmp:
return
if 'data' == rule[0]:
self.at = self.at + 1
if '' != rule[2] and None == self.tmp:
self.tmp = rule
else:
rule = self.tmp
self.get_value(rule[1], rule[2], data)
self.check_tag()
return
def handle_endtag(self, tag):
debug_found('end '+tag)
self.check_tag()
if self.at >= len(self.rule):
return
rule = self.rule[self.at]
if 'end' != rule[0]:
return
if rule[1] == tag:
self.tmp = None
self.at = self.at + 1
self.check_tag()
return
def init_item():
    """Return a fresh, all-empty record for one scraped software entry."""
    fields = ('#soft_url', '#soft_name', '#soft_info',
              '#soft_size', '#soft_detail', '#soft_plantform')
    return dict.fromkeys(fields, '')
url = './test'
compiler_file = urllib.urlopen(url).read()
compiler = HtmlParserComplier()
compiler.feed(compiler_file)
print len(compiler.rule)
for t in compiler.rule:
print t
url = 'http://www.newhua.com/sort/151_1.htm'
page_file = urllib.urlopen(url).read()
getcontent = HtmlParserMatch(compiler.rule)
getcontent.set_initfunc(init_item)
getcontent.feed(page_file)
print len(getcontent.data)
for t in getcontent.data:
print 'soft_name:',t['#soft_name']
print 'soft_url:',t['#soft_url']
print 'soft_size:',t['#soft_size']
print 'soft_info:',t['#soft_info']
print 'soft_detail:',t['#soft_detail']
print