#!/usr/bin/python -u
import sys, urllib, hashlib, htmllib, os, formatter, string
class Parser(htmllib.HTMLParser):
    """HTML parser that collects the targets of anchor (<a href=...>) tags.

    After feed()/close(), self.anchors maps anchor text -> list of href
    values that appeared with that text.
    """
    def __init__(self, verbose=0):
        self.anchors = {}   # anchor text -> [href, ...]
        # Fix: initialize self.anchor so a stray </a> arriving before any
        # <a href> cannot raise AttributeError in anchor_end().
        self.anchor = None
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)

    def anchor_bgn(self, href, name, type):
        # Start buffering the anchor's text and remember its target.
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        # Stop buffering; record the href under the stripped anchor text.
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
        # Fix: reset so a second bare </a> cannot reuse a stale href.
        self.anchor = None
def getmainurl(url):
    """Return the scheme + host prefix of *url* (everything before the
    first path slash), or *url* unchanged if it has no path component.

    Fix: the original searched for '/' starting at len('http://'), which
    mangled https:// URLs (returned 'https:/'). We now skip whatever
    scheme separator is actually present; plain http behavior is unchanged.
    """
    sep = url.find('://')
    # Fall back to the original offset when there is no scheme, so
    # behavior for scheme-less input is preserved.
    start = sep + 3 if sep != -1 else len('http://')
    ind = url.find('/', start)
    if ind > 0:
        return url[:ind]
    else:
        return url
def getURL(url, html, queue):
    """Extract links from *html* (fetched from *url*) and append every
    not-yet-downloaded absolute URL to *queue* (a list, extended in place).
    """
    p = Parser()
    try:
        p.feed(html)
        p.close()
    # Fix: was a bare except, which also swallowed SystemExit and
    # KeyboardInterrupt. Malformed HTML just skips this page.
    except Exception:
        return
    mainurl = getmainurl(url)
    # Only the href lists are needed, not the anchor texts.
    for hrefs in p.anchors.values():
        for link in hrefs:
            if not link.startswith('http://'):
                # NOTE(review): this resolves every relative link against
                # the site root, not against the page's own directory, and
                # ignores '../' — urlparse.urljoin(url, link) would be the
                # correct resolution. Left as-is to preserve behavior.
                if mainurl[-1] != '/' and link[0] != '/':
                    link = mainurl + '/' + link
                else:
                    link = mainurl + link
            # One file per URL, named by the MD5 of the URL; enqueue only
            # URLs whose page has not already been saved to disk.
            filename = "d:\\web\\" + hashlib.md5(link).hexdigest() + ".html"
            if not os.path.isfile(filename):
                queue.append(link)
def BFS():
queue = ["http://www.xxxxxx.com/"]
while len(queue) != 0 :
print len(queue)
url = queue.pop(0);
#Init the URL
try:
wp = urllib.urlopen(url)
#Open Connection
except:
print url, "can not open this url"
wp.close()
continue
content = wp.read()
#get content
wp.close()
hashNum = hashlib.md5(url);
hashNum.digest()
filename = hashNum.hexdigest()
filename = filename + ".html"
filename = "d:\\web\\" + filename
if os.path.isfile(filename) == False:
fp = open(filename,"w")
#open file
fp.write(content)
#write
fp.close()
#close
else :
continue
getURL(url, content, queue)
def main():
    """Run the crawler forever: each BFS() pass drains its queue starting
    from the seed URL, then the crawl is restarted from scratch."""
    while 1:
        BFS()


if __name__ == '__main__':
    main()
# 一个爬虫例子 (a web-crawler example)
# 最新推荐文章于 2025-04-13 10:46:06 发布 (blog footer: "latest recommended article published 2025-04-13 10:46:06" — not code)