1. Crawler program
crawler.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys, urllib, hashlib, htmllib, os, formatter, string
# Collect every <a href> on a page, keyed by the link's anchor text.
class Parser(htmllib.HTMLParser):
    def __init__(self, verbose=0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)

    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
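
# A minimal sketch of how Parser is meant to be used (the sample HTML below is
# illustrative only and not part of the original program):
#   p = Parser()
#   p.feed('<a href="http://www.baidu.com/123">baidu</a>')
#   p.close()
#   print p.anchors    # {'baidu': ['http://www.baidu.com/123']}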
# Return the root of a url, e.g. http://www.baidu.com/123 returns http://www.baidu.com
def getmainurl(url):
    ind = url.find('/', len('http://'))
    if ind > 0:
        return url[:ind]
    else:
        return url
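
# Illustrative calls (assumed, not part of the original source):
#   getmainurl('http://www.baidu.com/123')    # -> 'http://www.baidu.com'
#   getmainurl('http://www.baidu.com')        # -> 'http://www.baidu.com' (no path, returned unchanged)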
# From a page's url and html, collect all the other URLs it links to
def getURL(url, html, queue):
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except:
        # skip pages whose HTML cannot be parsed
        return
ma