# Adapted from code found online, with my own modifications.
#! /usr/bin/env python
#coding=utf-8
# A small crawler for sina.com.cn
import re
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
class reptitle():
    """A small web crawler: collects news links and outbound URLs from a page.

    Fixes over the original: uses a py2/py3-portable ``urlopen`` (the old
    ``urllib.urlopen`` is gone in Python 3), decodes the response bytes so
    the str regexes work on Python 3, and closes the output file via
    ``with`` instead of leaking the handle.
    """

    def __init__(self, par_site):
        self.site = par_site   # seed URL (must include the http:// scheme)
        self.content = ""      # page text of the most recently fetched URL
        self.urllist = []      # URLs accumulated by get_urls()
        self.newslist = []     # news-link anchors collected by get_news()
        self.url_count = 0     # running count of entries in urllist

    def get_news(self, par_site):
        """Fetch par_site and collect sina news links up to their closing </a>."""
        # decode: .read() returns bytes on Python 3; 'ignore' tolerates stray bytes
        self.content = urlopen(par_site).read().decode("utf-8", "ignore")
        # e.g. http://news.sina.com.cn/c/2012-09-10/155925140890.shtml
        # (re.M dropped: the pattern has no ^/$ anchors, so it was a no-op)
        self.newslist = re.findall(
            r"http://news\.sina\.com\.cn/\w+[^<]*</a>", self.content)

    def get_urls(self, par_site):
        """Fetch par_site and collect every double-quoted absolute http:// URL."""
        self.content = urlopen(par_site).read().decode("utf-8", "ignore")
        quoted = re.findall(
            r"\"http://[a-z0-9A-Z]{1,}\.[a-z0-9A-Z]{1,}\.[a-z0-9A-Z]{1,}\S*\"",
            self.content)
        for item in quoted:
            # strip the surrounding double quotes matched by the pattern
            self.urllist.append(item.split('"')[1])
            self.url_count += 1

    def store_urls(self, par_path):
        """Write the numbered URL list plus a total-count footer to par_path."""
        # "w" (not "w+"): we only write, and `with` guarantees the handle closes
        with open(par_path, "w") as file_url:
            for index, piece in enumerate(self.urllist, 1):
                file_url.write("%d " % index + piece + "\n")
            # "total" fixes the original's "totle" typo in the footer
            file_url.write("\nURLS total number is " +
                           str(self.url_count) + "...\n")
if __name__ == '__main__':
    # Seed URL — the http:// scheme prefix is required by the crawler.
    site = "http://www.sina.com.cn"
    reptile_sina = reptitle(site)
    # Parenthesized single-argument print works on both Python 2 and 3
    # (the original print-statement syntax is a SyntaxError on Python 3).
    print("[+]Getting the urls...\n")
    reptile_sina.get_urls(site)
    reptile_sina.store_urls("sina_urls.txt")
    print("Get the urls finished!\n")
    # "Total" fixes the original's "Totle" typo.
    print("Total urls is " + str(reptile_sina.url_count))