一个小爬虫

从网上借鉴了别人的代码，自己改写了一下：

#! /usr/bin/env python
#coding=utf-8
#sina小爬虫
import re, urllib

class reptitle():
    """Minimal Sina crawler: downloads a page and extracts links from its HTML.

    NOTE(review): the class name looks like a typo for 'reptile' but is kept
    unchanged for backward compatibility with existing callers.
    """

    def __init__(self, par_site):
        self.site = par_site    # start page, e.g. "http://www.sina.com.cn"
        self.content = ""       # raw HTML of the most recently fetched page
        self.urllist = []       # plain URLs collected by get_urls()
        self.newslist = []      # news-link snippets collected by get_news()
        self.url_count = 0      # total number of URLs collected so far

    def get_news(self, par_site):
        """Fetch par_site and collect Sina news links (through the closing </a>)."""
        self.content = urllib.urlopen(par_site).read()
        # e.g. http://news.sina.com.cn/c/2012-09-10/155925140890.shtml
        self.newslist = re.findall(r"http://news\.sina\.com\.cn/\w+[^<]*</a>", self.content, re.M)

    def get_urls(self, par_site):
        """Fetch par_site and collect every double-quoted absolute http URL."""
        self.content = urllib.urlopen(par_site).read()
        # Matches "http://host.domain.tld..." including the surrounding quotes.
        urls = re.findall(r"\"http://[a-z0-9A-Z]{1,}\.[a-z0-9A-Z]{1,}\.[a-z0-9A-Z]{1,}\S*\"", self.content, re.M)
        for item in urls:
            # Strip the surrounding double quotes the pattern captured.
            self.urllist.append(item.split('"')[1])
        # Bulk update instead of incrementing once per item.
        self.url_count += len(urls)

    def store_urls(self, par_path):
        """Write the collected URLs, numbered from 1, plus a total line, to par_path."""
        # 'with' guarantees the file is closed even if a write raises
        # (the original left the handle open on error); plain "w" suffices
        # since the file is only written, never read back.
        with open(par_path, "w") as file_url:
            for index, piece in enumerate(self.urllist, 1):
                file_url.write("%d  " % index + piece + "\n")
            # "totle" typo kept on purpose: it is part of the output format.
            file_url.write("\nURLS totle number is " + str(self.url_count) + "...\n")
            
if __name__ == '__main__':
    site = "http://www.sina.com.cn"   #http://不能少哦
    reptile_sina = reptitle(site)
    print "[+]Getting the urls...\n"
    reptile_sina.get_urls(site)
    reptile_sina.store_urls("sina_urls.txt")
    print "Get the urls finished!\n"
    print "Totle urls is " + str(reptile_sina.url_count)


 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值