Python Crawler

This post walks through the design and implementation of a simple crawler framework, covering its core modules: the controller, URL management, page downloading, parsing, and data collection. The code shows how to start crawling from a given URL and recursively fetch the new pages its links point to.

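The controller below imports its collaborators from a "spider" package. The original post never shows the file layout, but the import line implies roughly the following structure (the module names come straight from the import; treating them as files in a package directory, and the controller file name, are assumptions):

spider/
    __init__.py
    html_url.py        # Urlmanager
    html_dw.py         # DownLoader
    html_parse.py      # Parser
    html_collect.py    # Collect
spider_main.py         # SpiderMain controller (file name is hypothetical)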


1. Controller:

#coding=utf-8

'''
Created on 2016-07-12

@author: wenwen.huang
'''
from spider import html_url, html_dw, html_parse, html_collect


class SpiderMain(object):
    
    def __init__(self):
        self.urlManager = html_url.Urlmanager()    # tracks pending and visited URLs
        self.downloader = html_dw.DownLoader()     # fetches pages
        self.parser = html_parse.Parser()          # extracts links and data
        self.collector = html_collect.Collect()    # accumulates and writes results
        

    def start(self, rootUrl):
        count = 1
        self.urlManager.addUrl(rootUrl)
        while self.urlManager.hasUrl():
            try:
                newUrl = self.urlManager.getUrl()
                print 'start download %d --   %s' % (count, newUrl)
                html = self.downloader.startLoad(newUrl)
                newUrls, newData = self.parser.parse(html, newUrl)
                self.urlManager.addUrls(newUrls)
                self.collector.collect(newData)
                if count > 100:    # stop after about 100 pages
                    break
                count = count + 1
            except Exception, e:
                print "crawl failed:", e
                
        self.collector.writeFile()


if __name__ == '__main__':
    rootUrl = "http://baike.baidu.com/view/21087.htm"
    spider = SpiderMain()
    spider.start(rootUrl)
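
Running the script crawls up to about 100 Baike pages and then writes out.html. The progress lines follow the format string above, for example (the URL order varies from run to run, because pending URLs are kept in a set):

start download 1 --   http://baike.baidu.com/view/21087.htm
start download 2 --   http://baike.baidu.com/view/10812319.htm
...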


2. URL management:

#coding=utf-8
'''
Created on 2016-07-12

@author: wenwen.huang
'''


class Urlmanager(object):
    
    def __init__(self):
        self.newUrls = set()    # URLs waiting to be crawled
        self.oldUrls = set()    # URLs already crawled
    
    
    def addUrl(self, url):
        if url is None:
            return
        if url in self.newUrls or url in self.oldUrls:
            return
        self.newUrls.add(url)
        
    
    def hasUrl(self):
        return len(self.newUrls) != 0
    
    
    def getUrl(self):
        url = self.newUrls.pop()    # arbitrary order: newUrls is a set
        self.oldUrls.add(url)
        return url
    
    
    def addUrls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.addUrl(url)
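
A quick check of the deduplication behaviour (a minimal sketch; the pop order is arbitrary because newUrls is a set):

manager = Urlmanager()
manager.addUrl("http://baike.baidu.com/view/21087.htm")
manager.addUrl("http://baike.baidu.com/view/21087.htm")    # duplicate, ignored
print manager.hasUrl()    # True
url = manager.getUrl()    # moves the URL into oldUrls
manager.addUrl(url)       # already visited, ignored
print manager.hasUrl()    # False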
    


3. Download:

#coding=utf-8
'''
Created on 2016-07-12

@author: wenwen.huang
'''
import urllib2



class DownLoader(object):
    
    
    def startLoad(self, url):
        res = urllib2.urlopen(url)
        if res.getcode() != 200:
            return
        
        return res.read()

The version above has a problem: the response is never closed and there is no timeout, so a slow server can stall the crawl. Improved as follows:

    def startLoad(self, url):
        res = urllib2.urlopen(url, timeout=1)    # fail fast on slow servers
        if res.getcode() != 200:
            return
        
        result = res.read()
        res.close()                              # release the connection
        return result
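
Some servers reject the default urllib2 user agent. If downloads start failing, a Request object with an explicit User-Agent header can be passed to urlopen instead of the bare URL (a minimal sketch; the header value is just an example):

    def startLoad(self, url):
        # Present a browser-like User-Agent; some servers block the
        # default Python-urllib agent string.
        req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        res = urllib2.urlopen(req, timeout=1)
        if res.getcode() != 200:
            return
        result = res.read()
        res.close()
        return result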

4. Parsing:

#coding=utf-8
'''
Created on 2016-07-12

@author: wenwen.huang
'''
from bs4 import BeautifulSoup
import re
import urlparse


class Parser(object):
    
    
#     The parser targets markup like the following on Baidu Baike entry pages
#     (link text that was illegible in the original sample is shown as "..."):
#
#     Title:
#       <dd class="lemmaWgt-lemmaTitle-title">
#         <h1>Python</h1>
#       </dd>
#
#     Links to other entries:
#       <a target="_blank" href="/view/20965.htm">...</a>
#
#     Summary paragraph:
#       <div class="para" label-module="para">
        
        
    def __parseUrl(self, soup, parseUrl):
        newUrls = set()
        # Entry links look like /view/<id>.htm; resolve them against the page URL.
        links = soup.find_all("a", href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            url = link['href']
            url = urlparse.urljoin(parseUrl, url)
            newUrls.add(url)
            
        return newUrls
    
    def __parseData(self, soup, parseUrl):
        datas = {}
        # Entry title: <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        titleNode = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        datas['title'] = titleNode.get_text()
        datas['url'] = parseUrl
        # First summary paragraph: <div class="para">
        descNode = soup.find("div", class_="para")
        datas['desc'] = descNode.get_text()
        return datas
    
    def parse(self, html, parseUrl):
        if html is None or parseUrl is None:
            return None, None    # caller unpacks two values
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        newUrls = self.__parseUrl(soup, parseUrl)
        data = self.__parseData(soup, parseUrl)
        
        return newUrls, data
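
A self-contained check of the parser against a tiny hand-written page (the markup below is made up, shaped like the Baike fragments the parser expects):

html = '''
<html><body>
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="para">Python is a programming language.</div>
<a target="_blank" href="/view/20965.htm">related entry</a>
</body></html>
'''
parser = Parser()
urls, data = parser.parse(html, "http://baike.baidu.com/view/21087.htm")
print urls    # set([u'http://baike.baidu.com/view/20965.htm'])
print data['title'], data['desc']    # Python  Python is a programming language.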



5. Storage and display:

#coding=utf-8
'''
Created on 2016-07-12

@author: wenwen.huang
'''


class Collect(object):
    
    def __init__(self):
        self.list = []
    
    def collect(self, data):
        if data is None:
            return
        self.list.append(data)

    
    def writeFile(self):
        fout = open("out.html", "w")
        
        fout.write("<html>")
        fout.write("<head><meta charset='utf-8'>")
        fout.write('<link href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap.min.css" rel="stylesheet">')
        fout.write("</head>")
        fout.write("<body>")
        fout.write('<div class="container">')
        fout.write('<table class="table table-bordered">')
        
        for data in self.list:
            fout.write("<tr>")
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['desc'].encode('utf-8'))
            fout.write("</tr>")
    
        fout.write("</table>")
        fout.write("</div>")
        fout.write("</body>")
        fout.write("</html>")
        # Easy to overlook: close the file so the buffered HTML is flushed to disk.
        fout.close()
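
Since every field has to be encoded by hand before writing, an alternative is to open the file through the codecs module and write unicode directly (a minimal sketch of the same method under that approach):

    def writeFile(self):
        import codecs    # stdlib; files from codecs.open accept unicode directly
        fout = codecs.open("out.html", "w", encoding="utf-8")
        fout.write(u"<html><head><meta charset='utf-8'></head><body><table>")
        for data in self.list:
            # no per-field .encode('utf-8') needed: the stream encodes on write
            fout.write(u"<tr><td>%s</td><td>%s</td><td>%s</td></tr>"
                       % (data['url'], data['title'], data['desc']))
        fout.write(u"</table></body></html>")
        fout.close()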
  


Testing threads:

#coding=utf-8
'''
Created on 2016-07-13

@author: wenwen.huang
'''
from time import sleep, ctime
import threading

class MyThread(object):

    def __init__(self):
        self.threads = []
        
    def music(self, name):
        for i in range(2):
            print "listening to song %s  at %s \n" % (name, ctime())
            sleep(1)
            
    def movie(self, name):
        for i in range(2):
            print "watching movie %s  at %s \n" % (name, ctime())
            sleep(3)
    
    def useThread(self):
        t1 = threading.Thread(target=self.music, args=('爱情买卖',))
        self.threads.append(t1)
        t2 = threading.Thread(target=self.movie, args=('阿凡达',))
        self.threads.append(t2)
        for t in self.threads:
#             t.setDaemon(True)  # if enabled, child threads die as soon as the main thread exits
            t.start()
        
        for t in self.threads:
            t.join()    # wait for every child thread before the main thread continues
        
if __name__ == '__main__':
    test = MyThread()
    test.useThread()
    print 'all threads finished!'
#     test.music('黑色幽默')
#     test.movie('血色浪漫')
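
The same pattern suggests how the crawler itself could be parallelized: worker threads pull URLs from a thread-safe Queue instead of the single loop in SpiderMain. This is a minimal sketch, not part of the original framework; Queue is the Python 2 standard-library module:

import threading
from Queue import Queue

def worker(queue, downloader, parser, collector):
    # Each worker repeatedly takes a URL, then downloads and parses it.
    while True:
        url = queue.get()
        try:
            html = downloader.startLoad(url)
            newUrls, data = parser.parse(html, url)
            collector.collect(data)
            # Feeding newUrls back into the queue would also need a
            # lock-protected visited set; omitted here for brevity.
        except Exception, e:
            print e
        finally:
            queue.task_done()

# Usage sketch:
# queue = Queue()
# for i in range(4):
#     t = threading.Thread(target=worker, args=(queue, DownLoader(), Parser(), collector))
#     t.setDaemon(True)
#     t.start()
# queue.put(rootUrl)
# queue.join()    # returns once every queued URL has been processed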







