#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 2014.3.9
# Known bugs:
# 1. Downloads were all dumped together; folders could not be created directly [fixed]
# 2. Failed (503) downloads were not retried [fixed]
# 3. The threads behave strangely [hard to say for now]
# 4. Some files will not open in XnView but open fine in the built-in viewer [hard to say for now]
# 5. pic_list vanishes by the third page? [fixed]
# 6. A whole list may get skipped outright [********TODO********]
# 7. The script may simply crash [probably depends on network conditions]
import threading, time
from Queue import Queue
import urllib, urllib2, os, re
import socket
socket.setdefaulttimeout(36)
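# A global socket timeout makes a hung urllib/urllib2 transfer raise an
# exception instead of blocking its worker thread forever.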
path = "E:/PIC/"  # download root (forward slashes work fine on Windows)
error = 1         # running count of failed downloads
def download(url, name):
    """Fetch url into the local file name; retry once, then log the failure."""
    try:
        urllib.urlretrieve(url, name)
    except Exception:
        try:
            urllib.urlretrieve(url, name)
        except Exception:
            global error
            print 'error ' + str(error)
            error += 1
            # Append so earlier failures are not overwritten ("w+" truncated the log)
            fw = open(path + "error.txt", "a")
            fw.write(url + ' ' + name.encode('utf8') + '\n')
            fw.close()
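# Note: urllib.urlretrieve does not raise on an HTTP error status such as 503;
# the error page itself gets saved as the target file. That is why the
# consumer below re-downloads anything smaller than 10 KB.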
# raw_input instead of input(): Python 2 input() would eval the typed text
page_number = int(raw_input("Please input the page number:\n"))
# Fetch the index page
list_page_request = urllib2.Request('http://animewallpaperstock.com/page/' + str(page_number))
list_page_open = urllib2.urlopen(list_page_request).read()
print 'list_page_open:', len(list_page_open)
# Regex out the "see more" (続きを見る) links to the gallery pages
re_tsuzuki = re.compile(r'<h2><a href="(.+?)">(.+?)</a></h2>')
list_url = re_tsuzuki.findall(list_page_open)
# list_url[i][0] is the gallery URL, list_url[i][1] is the gallery name
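# A sketch of the expected shape of list_url, with a hypothetical entry (the
# real URLs and titles come from the page itself):
#   [('http://animewallpaperstock.com/some-gallery.html', 'Some Gallery'), ...]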
class Producer(threading.Thread):
    def __init__(self, t_name, queue, list_url):
        threading.Thread.__init__(self, name=t_name)
        self.list_url = list_url
        self.queue = queue
    def run(self):
        re_pix = re.compile(r'<div class="resolution">(\d{3,4})x.+?</div')
        re_pic = re.compile(r'<a href="(.+?)" target=.+?width.+?alt="(.+?)"></a>')
        for i in range(len(self.list_url)):
            # Fetch one gallery (multi-picture) page
            print 'one mult page start'
            try:
                pic_pix = []
                pic_url_name = []
                # If the regex finds fewer than two picture URLs, assume the
                # fetch was bad and try the page once more
                for attempt in range(2):
                    pic_list_request = urllib2.Request(self.list_url[i][0])
                    pic_list_open = urllib2.urlopen(pic_list_request).read()
                    print 'new pic_list_open:', len(pic_list_open)
                    # Regex out the resolutions, picture URLs and picture names
                    pic_pix = re_pix.findall(pic_list_open)
                    pic_url_name = re_pic.findall(pic_list_open)  # [i][0] is url, [i][1] is name
                    if len(pic_url_name) >= 2:
                        break
                else:
                    print '********pic_url_name false********'
                for pic in range(len(pic_pix)):
                    if int(pic_pix[pic]) >= 1920:  # keep only pictures at least 1920 px wide
                        one = [pic_url_name[pic][0], self.list_url[i][1], pic_url_name[pic][1]]
                        self.queue.put(one)
                        print 'queue length: ', self.queue.qsize(), '\r',
            except Exception:
                print ' get false'
            # Throttle: wait while more than 15 downloads are still pending
            while self.queue.qsize() > 15:
                time.sleep(1)
            print 'one mult page finished,start next page'
        print "All mult pages got,done"
class Consumer(threading.Thread):
    def __init__(self, t_name, queue):
        threading.Thread.__init__(self, name=t_name)
        self.queue = queue
    def run(self):
        while 1:
            try:
                # Block up to 5 seconds for work; an empty queue raises Queue.Empty
                one = self.queue.get(True, 5)  # one = [pic url, gallery name, pic name]
                folder = path + one[1].decode('utf8')
                if not os.path.exists(folder):
                    os.makedirs(folder)
                path2 = folder + '/' + one[2].decode('utf8') + '.jpg'
                print '%s downloading ...' % one[2].decode('utf8')
                download(one[0], path2)
                print '%s downloaded' % one[2].decode('utf8')
                print 'queue length: ', self.queue.qsize()
                # A file under 10 KB is almost certainly an error page rather
                # than a picture, so fetch it once more
                if os.path.getsize(path2) / 1024 < 10:
                    download(one[0], path2)
                    if os.path.getsize(path2) / 1024 < 10:
                        print 'false'  # still tiny after the retry; give up on this one
            except Exception:  # the 5-second get timeout ends up here
                if self.queue.qsize() == 0:
                    print 'queue empty ,wait 3s...'
                    time.sleep(3)
                    if self.queue.qsize() == 0:
                        print 'queue empty ,break'
                        break
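# Exit heuristic: get(True, 5) raises Queue.Empty after five idle seconds, and
# a second empty check three seconds later guards against the producer merely
# being slower than the consumers for a moment.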
def start_new_consumer(queue):
    consumer = Consumer('Download ', queue)
    consumer.start()
def main():
    queue = Queue()
    producer = Producer('GET ', queue, list_url)
    producer.start()
    time.sleep(5)
    # Keep up to five consumers alive (main + producer + 5 consumers = 7
    # threads). Stop topping up once the producer is done and the queue is
    # drained, so that 'All finished' is actually reachable.
    while producer.isAlive() or queue.qsize() > 0:
        if threading.activeCount() < 7:
            start_new_consumer(queue)
        time.sleep(0.5)
    print 'All finished'
if __name__ == '__main__':
    main()
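A cleaner way to shut the pipeline down is to avoid the timeout heuristics
altogether: when the producer runs out of work it pushes one sentinel per
consumer, and each consumer exits when it sees one. A minimal standalone
sketch of that pattern, in Python 2 to match the script above (the numbers
stand in for the real [url, gallery, name] work items):

import threading
from Queue import Queue

SENTINEL = None  # end-of-work marker

def producer(queue, items, n_consumers):
    for item in items:
        queue.put(item)            # blocks when the queue is full (backpressure)
    for _ in range(n_consumers):   # one sentinel per consumer
        queue.put(SENTINEL)

def consumer(queue):
    while True:
        item = queue.get()
        if item is SENTINEL:       # producer finished and the queue is drained
            break
        print 'processing', item   # stand-in for download(url, name)

if __name__ == '__main__':
    queue = Queue(maxsize=15)      # a bounded queue throttles the producer for free
    workers = [threading.Thread(target=consumer, args=(queue,)) for _ in range(5)]
    for w in workers:
        w.start()
    producer(queue, range(20), len(workers))
    for w in workers:
        w.join()
    print 'All finished'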