#!/usr/bin/python -u
import sys, urllib, hashlib, htmllib, os, formatter, string
class Parser(htmllib.HTMLParser):
    """HTML parser that collects the targets of anchor (<a href=...>) tags.

    After feed()/close(), self.anchors maps anchor text -> list of href
    values that appeared with that text.
    """
    def __init__(self, verbose=0):
        self.anchors = {}   # anchor text -> [href, ...]
        # Fix: initialize self.anchor so a stray </a> arriving before any
        # <a href> cannot raise AttributeError in anchor_end().
        self.anchor = None
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)

    def anchor_bgn(self, href, name, type):
        # Start buffering the anchor's text and remember its target.
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        # Stop buffering; record the href under the stripped anchor text.
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
        # Fix: reset so a second bare </a> cannot reuse a stale href.
        self.anchor = None
def getmainurl(url):
    """Return the scheme + host prefix of *url* (everything before the
    first path slash), or *url* unchanged if it has no path component.

    Fix: the original searched for '/' starting at len('http://'), which
    mangled https:// URLs (returned 'https:/'). We now skip whatever
    scheme separator is actually present; plain http behavior is unchanged.
    """
    sep = url.find('://')
    # Fall back to the original offset when there is no scheme, so
    # behavior for scheme-less input is preserved.
    start = sep + 3 if sep != -1 else len('http://')
    ind = url.find('/', start)
    if ind > 0:
        return url[:ind]
    else:
        return url
def getURL(url, html, queue):
    """Extract links from *html* (fetched from *url*) and append every
    not-yet-downloaded absolute URL to *queue* (a list, extended in place).
    """
    p = Parser()
    try:
        p.feed(html)
        p.close()
    # Fix: was a bare except, which also swallowed SystemExit and
    # KeyboardInterrupt. Malformed HTML just skips this page.
    except Exception:
        return
    mainurl = getmainurl(url)
    # Only the href lists are needed, not the anchor texts.
    for hrefs in p.anchors.values():
        for link in hrefs:
            if not link.startswith('http://'):
                # NOTE(review): this resolves every relative link against
                # the site root, not against the page's own directory, and
                # ignores '../' — urlparse.urljoin(url, link) would be the
                # correct resolution. Left as-is to preserve behavior.
                if mainurl[-1] != '/' and link[0] != '/':
                    link = mainurl + '/' + link
                else:
                    link = mainurl + link
            # One file per URL, named by the MD5 of the URL; enqueue only
            # URLs whose page has not already been saved to disk.
            filename = "d:\\web\\" + hashlib.md5(link).hexdigest() + ".html"
            if not os.path.isfile(filename):
                queue.append(link)
def BFS():
queue = ["http://www.xxxxxx.com/"]
while len(queue) != 0 :
print len(queue)
url = queue.pop(0);
#Init the URL
try:
wp = urllib.urlopen(url)
#Open Connection
except:
print url, "can not open this url"
wp.close()
continue
content = wp.read()
#get content
wp.close()
hashNum = hashlib.md5(url);
hashNum.digest()
filename = hashNum.hexdigest()
filename = filename + ".html"
filename = "d:\\web\\" + filename
if os.path.isfile(filename) == False:
fp = open(filename,"w")
#open file
fp.write(content)
#write
fp.close()
#close
else :
continue
getURL(url, content, queue)
def main():
    """Run the crawler forever: each BFS() pass drains its queue starting
    from the seed URL, then the crawl is restarted from scratch."""
    while 1:
        BFS()


if __name__ == '__main__':
    main()
# 一个爬虫例子 (a web-crawler example)
# 最新推荐文章于 2025-04-13 10:46:06 发布 (blog footer: "latest recommended article published 2025-04-13 10:46:06" — not code)