If I don't write something soon, I'm really going to forget how to code.
Lately I've been studying cloud computing, and I'm slowly getting ready to write those notes up as well.
Today, to keep my hand in, I wrote a simple crawler. I had meant to do this a while ago but never actually got it working. It's very simple, and a good excuse to get familiar with Python again.
# coding=utf-8
import sys
import os
import re
import urllib
import urllib2
import threading
from time import sleep

base_uri = "http://www.youkuaiyun.com/"
connect_flag = True   # True until the first page has been fetched (drives the spinner)
running = False       # becomes True once crawling has started (drives the counter line)
LINK_RE = r'(http://[\w,\b,/,\.,-]*)'
IMAGE_RE = r"<img.*src\s?=\s?\"[^\w]*([^>]*?)\""
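# LINK_RE picks out absolute http:// URLs, IMAGE_RE picks out the src value of an
# <img> tag. For illustration (made-up input, not from a real page):
#   re.findall(LINK_RE,  'see <a href="http://www.youkuaiyun.com/article">')
#       -> ['http://www.youkuaiyun.com/article']
#   re.findall(IMAGE_RE, '<img class="logo" src="http://img.example.com/a.jpg">')
#       -> ['http://img.example.com/a.jpg']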
# bookkeeping: links still waiting to be crawled, links already taken, and counters
link_set = set()
link_set_copy = set()
link_set.add(base_uri)
link_set_copy.add(base_uri)
SUCCESS_COUNT = 1
ERROR_COUNT = 0
IMAGE_COUNT = 0
# a class that does the actual crawling
class Crawler(object):
    def __init__(self):
        # make sure the output directories exist, otherwise the open() calls below fail
        for d in ("./links", "./images"):
            if not os.path.isdir(d):
                os.makedirs(d)

    def writeLinks(self, link):
        # append every visited link to a log file
        file_obj = open("./links/links.txt", "a")
        file_obj.write(link + "\n")
        file_obj.close()
    def getImages(self, contents):
        # find every <img src=...> in the page and save it under ./images/
        global IMAGE_COUNT
        imagere = re.compile(IMAGE_RE)
        image_list = imagere.findall(contents)
        for image in image_list:
            try:
                # str.index() raises ValueError instead of returning -1,
                # so test for the substring directly
                if "http" not in image:
                    image = "http://" + image
                #print str(image)
                data = urllib.urlopen(image).read()
                f = open("./images/" + str(IMAGE_COUNT) + ".jpg", 'wb')
                f.write(data)
                f.close()
                IMAGE_COUNT += 1
            except Exception, e:
                # print "download image failed: " + image
                #print e
                pass
    def getLink(self, contents):
        '''
        :param contents: the HTML of the page that was just fetched
        :return: nothing; new links are added to the global link_set
        '''
        global link_set, link_set_copy
        linkre = re.compile(LINK_RE)
        link_list = linkre.findall(contents)
        #print "number of links: " + str(len(link_list))
        # only queue links that have not already been taken off the queue
        for link in link_list:
            if link not in link_set_copy:
                link_set.add(link)
    def req_contents(self):
        # take one link off the queue, fetch it, then harvest its images and links
        global link_set, link_set_copy, ERROR_COUNT, SUCCESS_COUNT, connect_flag, running
        uri = link_set.pop()
        link_set_copy.add(uri)
        #print uri
        self.writeLinks(uri)
        try:
            request = urllib2.Request(uri)
            response = urllib2.urlopen(request)
            result = response.read().decode('utf8')
            self.getImages(result)
            SUCCESS_COUNT += 1
            if connect_flag:
                connect_flag = False
            if not running:
                running = True
            self.getLink(result)
        except UnicodeDecodeError, e:
            ERROR_COUNT += 1
            # print "decoding error"
        # the specific URLError handler must come before the generic Exception
        # handler, otherwise it can never be reached
        except urllib2.URLError, e:
            ERROR_COUNT += 1
            # print e
        except Exception, e:
            ERROR_COUNT += 1
            # print e
    def start(self):
        global link_set
        #import pdb;pdb.set_trace()
        while len(link_set) > 0:
            self.req_contents()
# a spinner that keeps updating so you can tell the crawler is alive while the
# first request is still in flight (which can take a while)
class IsGrabbig(threading.Thread):
    def run(self):
        n = 1
        global connect_flag
        # print "grbbing ",
        while connect_flag:
            if n % 4 == 0:
                sys.stdout.write("\r || grabbing || \r")
            elif n % 4 == 1:
                sys.stdout.write("\r /\\ grabbing /\\ \r")
            elif n % 4 == 2:
                sys.stdout.write("\r -- grabbing -- \r")
            else:
                sys.stdout.write("\r \\/ grabbing \\/ \r")
            sys.stdout.flush()
            sleep(0.5)
            n += 1
        # I had always wanted to reproduce the Linux-style progress bar effect, but had
        # never tried printing to a single line and updating it in place. sys.stdout.write("\r")
        # does the job: \r moves the cursor back to the start of the line, so the next
        # write overwrites the previous output. There must be no newline in between.
        # (A standalone sketch of this trick appears right after the script.)
        sys.stdout.write("\r data fetched" + " " * 100 + "\n")
        sys.stdout.flush()
# a second line that keeps updating with the number of pages read so far
class LinkCount(threading.Thread):
    def run(self):
        global running, SUCCESS_COUNT, ERROR_COUNT
        while not running:
            sleep(0.1)
        while running:
            sys.stdout.write("\r pages fetched: " + str(SUCCESS_COUNT) + "\tfailed: " + str(ERROR_COUNT) + "\t")
            sys.stdout.flush()
            sleep(1)
def generateReport():
    global link_set_copy, link_set
    print "\n" + "*" * 10 + "Report" + "*" * 10
    print " " * 10 + "over" + " " * 10
    print "pages discovered in total: " + str(len(link_set_copy) + len(link_set))
    print "pages fetched: " + str(SUCCESS_COUNT)
    print "pages that failed to load: " + str(ERROR_COUNT)
    print "*" * 26
if __name__ == "__main__":
    # raw_input("please input the uri")
    # uri_confirm = raw_input("confirm the uri is : " + base_uri + " (Y/N) \n")
    # if uri_confirm is "N" or uri_confirm is "n":
    #     sys.exit(0)
    print "program is starting ... "
    IsGrabbig().start()
    crawler = Crawler()
    try:
        LinkCount().start()
        crawler.start()
        print
    except KeyboardInterrupt, e:
        print
    except UnicodeDecodeError, e:
        print
    finally:
        running = False        # stop the LinkCount thread
        connect_flag = False   # stop the spinner thread so the process can exit
        generateReport()
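The single-line progress effect mentioned in the comments of IsGrabbig boils down to rewriting the same terminal line with \r. A minimal standalone sketch of just that trick (a toy loop, not part of the crawler) looks like this:
# coding=utf-8
import sys
from time import sleep

for i in range(10):
    # \r moves the cursor back to the start of the line, so this write
    # overwrites whatever was printed in the previous iteration
    sys.stdout.write("\r progress: %d/10 " % (i + 1))
    sys.stdout.flush()   # force the partial line out immediately
    sleep(0.2)
sys.stdout.write("\n")   # move to a fresh line once the loop is done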
The crawler itself is very simple, and its output looks like this:
sh-3.2# python main.py
program is starting ...
 data fetched
 pages fetched: 11    failed: 5    ^C
**********Report**********
over
pages discovered in total: 739
pages fetched: 11
pages that failed to load: 5
**************************
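The script above targets Python 2 (urllib2, print statements, the old except Exception, e syntax). Just as a rough reference, the core fetch-and-extract step would look something like the following minimal Python 3 sketch, assuming urllib.request; it is illustrative only and not a drop-in replacement for the full script:
# coding=utf-8
import re
import urllib.request

LINK_RE = r'(http://[\w,\b,/,\.,-]*)'

def fetch_links(uri):
    # fetch the page, decode it, and pull out every absolute http:// link
    with urllib.request.urlopen(uri) as response:
        contents = response.read().decode('utf-8', errors='ignore')
    return re.findall(LINK_RE, contents)

if __name__ == "__main__":
    for link in fetch_links("http://www.youkuaiyun.com/"):
        print(link)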