#!/usr/bin/python
# -*- coding: utf-8 -*-
# -*- coding: gb2312 -*-
import sgmllib
class MyParser(sgmllib.SGMLParser):
"A simple parser class."
def parse(self, s):
"Parse the given string 's'."
self.feed(s)
self.close()
def __init__(self, verbose=0):
"Initialise an object, passing 'verbose' to the superclass."
sgmllib.SGMLParser.__init__(self, verbose)
self.urls = []
self.descriptions = []
self.inside_a_element = 0
self.starting_description = 0
self.href = ""
def start_a(self, attributes):
"Process a hyperlink and its 'attributes'."
for name, value in attributes:
if name == "href":
self.urls.append(value)
self.inside_a_element = 1
self.href = value
self.starting_description = 1
def end_a(self):
"Record the end of a hyperlink."
self.inside_a_element = 0
def handle_data(self, data):
"Handle the textual 'data'."
if 1==self.inside_a_element:
if self.starting_description:
s = "<a href=%s>" %self.href + data + "</a>"
self.descriptions.append(s)
self.starting_description = 0
else:
self.descriptions[-1] += data
def get_urls(self):
"Return the list of urls."
return self.urls
def get_descriptions(self):
"Return a list of descriptions."
return self.descriptions
import urllib, sgmllib
# Fetch the page to work with.
f = urllib.urlopen("http://dzh.mop.com/dwdzh/list_46_0_0.html")
try:
    s = f.read()
finally:
    # Release the connection even if reading fails.
    f.close()

# Process the page.  MyParser must already be defined at this point.
myparser = MyParser()
myparser.parse(s)

# Print one "<a href=...>text</a>" line per link found.
# (Use myparser.get_urls() instead to list only the link targets.)
lists = myparser.get_descriptions()
for description in lists:  # do not name this 'list': it would shadow the builtin
    print(description)
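# -*- coding: utf-8 -*-
# -*- coding: gb2312 -*-
# NOTE: the two declarations above have no effect here; Python only honours
# an encoding declaration on line 1 or 2 of a file (PEP 263).  Everything
# below repeats the program above.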
import sgmllib
class MyParser(sgmllib.SGMLParser):
"A simple parser class."
def parse(self, s):
"Parse the given string 's'."
self.feed(s)
self.close()
def __init__(self, verbose=0):
"Initialise an object, passing 'verbose' to the superclass."
sgmllib.SGMLParser.__init__(self, verbose)
self.urls = []
self.descriptions = []
self.inside_a_element = 0
self.starting_description = 0
self.href = ""
def start_a(self, attributes):
"Process a hyperlink and its 'attributes'."
for name, value in attributes:
if name == "href":
self.urls.append(value)
self.inside_a_element = 1
self.href = value
self.starting_description = 1
def end_a(self):
"Record the end of a hyperlink."
self.inside_a_element = 0
def handle_data(self, data):
"Handle the textual 'data'."
if 1==self.inside_a_element:
if self.starting_description:
s = "<a href=%s>" %self.href + data + "</a>"
self.descriptions.append(s)
self.starting_description = 0
else:
self.descriptions[-1] += data
def get_urls(self):
"Return the list of urls."
return self.urls
def get_descriptions(self):
"Return a list of descriptions."
return self.descriptions
import urllib, sgmllib
# NOTE(review): this repeats the fetch-and-print sequence found earlier in
# this file, so the page is downloaded and printed a second time.
# Fetch the page to work with.
f = urllib.urlopen("http://dzh.mop.com/dwdzh/list_46_0_0.html")
try:
    s = f.read()
finally:
    # Release the connection even if reading fails.
    f.close()

# Process the page.  MyParser must already be defined at this point.
myparser = MyParser()
myparser.parse(s)

# Print one "<a href=...>text</a>" line per link found.
# (Use myparser.get_urls() instead to list only the link targets.)
lists = myparser.get_descriptions()
for description in lists:  # do not name this 'list': it would shadow the builtin
    print(description)