# Adapted from code found online, with my own modifications.
#! /usr/bin/env python
#coding=utf-8
# A small crawler for sina.com.cn
import re
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
class reptitle():
    """A small web crawler: collects news links and outbound URLs from a page.

    Fixes over the original: uses a py2/py3-portable ``urlopen`` (the old
    ``urllib.urlopen`` is gone in Python 3), decodes the response bytes so
    the str regexes work on Python 3, and closes the output file via
    ``with`` instead of leaking the handle.
    """

    def __init__(self, par_site):
        self.site = par_site   # seed URL (must include the http:// scheme)
        self.content = ""      # page text of the most recently fetched URL
        self.urllist = []      # URLs accumulated by get_urls()
        self.newslist = []     # news-link anchors collected by get_news()
        self.url_count = 0     # running count of entries in urllist

    def get_news(self, par_site):
        """Fetch par_site and collect sina news links up to their closing </a>."""
        # decode: .read() returns bytes on Python 3; 'ignore' tolerates stray bytes
        self.content = urlopen(par_site).read().decode("utf-8", "ignore")
        # e.g. http://news.sina.com.cn/c/2012-09-10/155925140890.shtml
        # (re.M dropped: the pattern has no ^/$ anchors, so it was a no-op)
        self.newslist = re.findall(
            r"http://news\.sina\.com\.cn/\w+[^<]*</a>", self.content)

    def get_urls(self, par_site):
        """Fetch par_site and collect every double-quoted absolute http:// URL."""
        self.content = urlopen(par_site).read().decode("utf-8", "ignore")
        quoted = re.findall(
            r"\"http://[a-z0-9A-Z]{1,}\.[a-z0-9A-Z]{1,}\.[a-z0-9A-Z]{1,}\S*\"",
            self.content)
        for item in quoted:
            # strip the surrounding double quotes matched by the pattern
            self.urllist.append(item.split('"')[1])
            self.url_count += 1

    def store_urls(self, par_path):
        """Write the numbered URL list plus a total-count footer to par_path."""
        # "w" (not "w+"): we only write, and `with` guarantees the handle closes
        with open(par_path, "w") as file_url:
            for index, piece in enumerate(self.urllist, 1):
                file_url.write("%d " % index + piece + "\n")
            # "total" fixes the original's "totle" typo in the footer
            file_url.write("\nURLS total number is " +
                           str(self.url_count) + "...\n")
if __name__ == '__main__':
    # Seed URL — the http:// scheme prefix is required by the crawler.
    site = "http://www.sina.com.cn"
    reptile_sina = reptitle(site)
    # Parenthesized single-argument print works on both Python 2 and 3
    # (the original print-statement syntax is a SyntaxError on Python 3).
    print("[+]Getting the urls...\n")
    reptile_sina.get_urls(site)
    reptile_sina.store_urls("sina_urls.txt")
    print("Get the urls finished!\n")
    # "Total" fixes the original's "Totle" typo.
    print("Total urls is " + str(reptile_sina.url_count))