python 爬虫

最新推荐文章于 2025-12-02 11:40:57 发布

转载最新推荐文章于 2025-12-02 11:40:57 发布 · 104 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：http://www.cnblogs.com/frog2008/p/6845306.html

文章标签：

#爬虫 #python

爬虫程序的主要功能是把第 n 层网页中的链接所指向的页面也下载下来。
主程序流程：
爬虫启动
生成一个队列
循环处理队列，队列为空时跳出循环
网址出队列
下载网页，并找出下一层的链接
把新链接添加到队列

from sys import argv
from os import makedirs,unlink,sep
from os.path import dirname,exists,isdir,splitext
from string import replace ,find,lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse,urljoin
from formatter import DumbWriter,AbstractFormatter
from cStringIO import StringIO
import os,sys

# Path of this script as it was given on the command line (sys.argv[0]).
# NOTE(review): not referenced by any code visible in this section.
syspath=sys.argv[0]


class retri(object):
    """Download one URL to a local file and extract the links it contains.

    The local file path mirrors the URL (host followed by URL path) and is
    relative to the current working directory.
    """

    def __init__(self,url):
        """Store *url* and compute its local file path.

        Computing the path also creates the local directory that will hold
        the downloaded file (see filename()).
        """
        self.url=url
        self.file=self.filename(url)

    def filename(self,url,deffile='index.htm'):
        """Return the local path for *url*, creating its directory if needed.

        url     -- URL to map onto the local file system.
        deffile -- file name appended when the URL does not name a file
                   (bare host, or a path without a file extension).

        May raise OSError if the directory cannot be created.
        """
        parsedurl=urlparse(url,'http:',0)
        host,urlpath=parsedurl[1],parsedurl[2]
        if urlpath=='':
            # Bare host such as http://example.com: handled separately
            # because splitext() would mistake ".com" for a file extension.
            path=host+'/'+deffile
        else:
            path=host+urlpath
            if splitext(path)[1]=='':
                # No extension: treat the URL as a directory.
                if path[-1]=='/':
                    path+=deffile
                else:
                    path+='/'+deffile
        ldir=dirname(path)
        if sep !='/':
            # Use the platform's separator for the directory operations.
            ldir=replace(ldir,'/',sep)
        # ldir is empty when the URL has neither host nor directory part;
        # there is nothing to create then, and makedirs('') would raise.
        if ldir and not isdir(ldir):
            if exists(ldir):
                # A plain file occupies the name the directory needs.
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        """Fetch self.url into self.file.

        Returns whatever urlretrieve() returns on success.  On IOError it
        returns an error string that starts with '*'; callers test the
        first character to tell the two cases apart.
        """
        try:
            return urlretrieve(self.url,self.file)
        except IOError:
            return '*** error:invalid url "%s"'%self.url

    def parse_and_getlink(self):
        """Parse the downloaded file as HTML and return its anchor list.

        The parser is kept on self.parser; the formatted text it produces
        is written to a throw-away StringIO and discarded.
        """
        self.parser=HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # Close the file deterministically instead of leaking the handle.
        with open(self.file) as page:
            self.parser.feed(page.read())
        self.parser.close()
        return self.parser.anchorlist
    
    
    
class crawler(object):
    count=0
    def __init__(self,url):
        self.q=[url]
        self.seen=[]
        self.dom=urlparse(url)[1]
    
    def get_page(self,url):
        r=retri(url)
        retval=r.download()
        if retval[0]=='*':
            print retval,'.. skipping parse'
            return
        crawler.count+=1
        print '\n(',crawler.count,')'
        print 'url:',url
        print 'file:',retval[0]
        self.seen.append(url)
        
        links=r.parse_and_getlink()
        for eachlink in links:
            if eachlink[:4]!='http' and find(eachlink,'://')==-1:
                eachlink=urljoin(url,eachlink)
            print '* ',eachlink
            
            if find(lower(eachlink),'mailto:')!=-1:
                print '... discarded,mailto link'
                continue
            
            if eachlink not in self.seen:
                if find(eachlink,self.dom)==-1:
                    print '...discarded,not in domain'
                else:
                    if eachlink not in self.q:
                        self.q.append(eachlink)
                        print '...new,added to q'
                    else:
                        print '...discarded,already in q'
                        
            else:
                print '... discarded,already processed'
        
        
        
    def go(self):
        while self.q:
            url=self.q.pop()
            self.get_page(url)
            
            
def main():
    """Crawl from the URL given as first argument, or ask for one.

    When no command-line argument is present the URL is read from standard
    input; Ctrl-C or end-of-file at the prompt, or an empty URL from
    either source, ends the program without crawling.
    """
    args=argv[1:]
    if args:
        start_url=args[0]
    else:
        try:
            start_url=raw_input('enter starting url:')
        except (KeyboardInterrupt,EOFError):
            start_url=''
    if start_url:
        crawler(start_url).go()
    
# Start the crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__=='__main__':
    main()