#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 2014.3.9
# Known bugs:
# 1. Downloads were all dumped together; folders could not be created directly [fixed]
# 2. Failed (503) downloads were not retried [fixed]
# 3. The threads behave strangely [hard to say for now]
# 4. Some files will not open in XnView but open fine in the built-in viewer [hard to say for now]
# 5. pic_list vanishes by the third page? [fixed]
# 6. A whole list may get skipped outright [********TODO********]
# 7. The script may simply crash [probably depends on network conditions]
import threading, time
from Queue import Queue
import urllib, urllib2, os, re
import socket
socket.setdefaulttimeout(36)
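# A global socket timeout makes a hung urllib/urllib2 transfer raise an
# exception instead of blocking its worker thread forever.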
path = "E:/PIC/"  # download root (forward slashes work fine on Windows)
error = 1         # running count of failed downloads
def download(url, name):
    """Fetch url into the local file name; retry once, then log the failure."""
    try:
        urllib.urlretrieve(url, name)
    except Exception:
        try:
            urllib.urlretrieve(url, name)
        except Exception:
            global error
            print 'error ' + str(error)
            error += 1
            # Append so earlier failures are not overwritten ("w+" truncated the log)
            fw = open(path + "error.txt", "a")
            fw.write(url + ' ' + name.encode('utf8') + '\n')
            fw.close()
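# Note: urllib.urlretrieve does not raise on an HTTP error status such as 503;
# the error page itself gets saved as the target file. That is why the
# consumer below re-downloads anything smaller than 10 KB.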
# raw_input instead of input(): Python 2 input() would eval the typed text
page_number = int(raw_input("Please input the page number:\n"))
# Fetch the index page
list_page_request = urllib2.Request('http://animewallpaperstock.com/page/' + str(page_number))
list_page_open = urllib2.urlopen(list_page_request).read()
print 'list_page_open:', len(list_page_open)
# Regex out the "see more" (続きを見る) links to the gallery pages
re_tsuzuki = re.compile(r'<h2><a href="(.+?)">(.+?)</a></h2>')
list_url = re_tsuzuki.findall(list_page_open)
# list_url[i][0] is the gallery URL, list_url[i][1] is the gallery name
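# A sketch of the expected shape of list_url, with a hypothetical entry (the
# real URLs and titles come from the page itself):
#   [('http://animewallpaperstock.com/some-gallery.html', 'Some Gallery'), ...]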
class Producer(threading.Thread):
    def __init__(self, t_name, queue, list_url):
        threading.Thread.__init__(self, name=t_name)
        self.list_url = list_url
        self.queue = queue
    def run(self):
        re_pix = re.compile(r'<div class="resolution">(\d{3,4})x.+?</div')
        re_pic = re.compile(r'<a href="(.+?)" target=.+?width.+?alt="(.+?)"></a>')
        for i in range(len(self.list_url)):
            # Fetch one gallery (multi-picture) page
            print 'one mult page start'
            try:
                pic_pix = []
                pic_url_name = []
                # If the regex finds fewer than two picture URLs, assume the
                # fetch was bad and try the page once more
                for attempt in range(2):
                    pic_list_request = urllib2.Request(self.list_url[i][0])
                    pic_list_open = urllib2.urlopen(pic_list_request).read()
                    print 'new pic_list_open:', len(pic_list_open)
                    # Regex out the resolutions, picture URLs and picture names
                    pic_pix = re_pix.findall(pic_list_open)
                    pic_url_name = re_pic.findall(pic_list_open)  # [i][0] is url, [i][1] is name
                    if len(pic_url_name) >= 2:
                        break
                else:
                    print '********pic_url_name false********'
                for pic in range(len(pic_pix)):
                    if int(pic_pix[pic]) >= 1920:  # keep only pictures at least 1920 px wide
                        one = [pic_url_name[pic][0], self.list_url[i][1], pic_url_name[pic][1]]
                        self.queue.put(one)
                        print 'queue length: ', self.queue.qsize(), '\r',
            except Exception:
                print ' get false'
            # Throttle: wait while more than 15 downloads are still pending
            while self.queue.qsize() > 15:
                time.sleep(1)
            print 'one mult page finished,start next page'
        print "All mult pages got,done"
class Consumer(threading.Thread):
    def __init__(self, t_name, queue):
        threading.Thread.__init__(self, name=t_name)
        self.queue = queue
    def run(self):
        while 1:
            try:
                # Block up to 5 seconds for work; an empty queue raises Queue.Empty
                one = self.queue.get(True, 5)  # one = [pic url, gallery name, pic name]
                folder = path + one[1].decode('utf8')
                if not os.path.exists(folder):
                    os.makedirs(folder)
                path2 = folder + '/' + one[2].decode('utf8') + '.jpg'
                print '%s downloading ...' % one[2].decode('utf8')
                download(one[0], path2)
                print '%s downloaded' % one[2].decode('utf8')
                print 'queue length: ', self.queue.qsize()
                # A file under 10 KB is almost certainly an error page rather
                # than a picture, so fetch it once more
                if os.path.getsize(path2) / 1024 < 10:
                    download(one[0], path2)
                    if os.path.getsize(path2) / 1024 < 10:
                        print 'false'  # still tiny after the retry; give up on this one
            except Exception:  # the 5-second get timeout ends up here
                if self.queue.qsize() == 0:
                    print 'queue empty ,wait 3s...'
                    time.sleep(3)
                    if self.queue.qsize() == 0:
                        print 'queue empty ,break'
                        break
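# Exit heuristic: get(True, 5) raises Queue.Empty after five idle seconds, and
# a second empty check three seconds later guards against the producer merely
# being slower than the consumers for a moment.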
def start_new_consumer(queue):
    consumer = Consumer('Download ', queue)
    consumer.start()
def main():
    queue = Queue()
    producer = Producer('GET ', queue, list_url)
    producer.start()
    time.sleep(5)
    # Keep up to five consumers alive (main + producer + 5 consumers = 7
    # threads). Stop topping up once the producer is done and the queue is
    # drained, so that 'All finished' is actually reachable.
    while producer.isAlive() or queue.qsize() > 0:
        if threading.activeCount() < 7:
            start_new_consumer(queue)
        time.sleep(0.5)
    print 'All finished'
if __name__ == '__main__':
    main()
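A cleaner way to shut the pipeline down is to avoid the timeout heuristics
altogether: when the producer runs out of work it pushes one sentinel per
consumer, and each consumer exits when it sees one. A minimal standalone
sketch of that pattern, in Python 2 to match the script above (the numbers
stand in for the real [url, gallery, name] work items):

import threading
from Queue import Queue

SENTINEL = None  # end-of-work marker

def producer(queue, items, n_consumers):
    for item in items:
        queue.put(item)            # blocks when the queue is full (backpressure)
    for _ in range(n_consumers):   # one sentinel per consumer
        queue.put(SENTINEL)

def consumer(queue):
    while True:
        item = queue.get()
        if item is SENTINEL:       # producer finished and the queue is drained
            break
        print 'processing', item   # stand-in for download(url, name)

if __name__ == '__main__':
    queue = Queue(maxsize=15)      # a bounded queue throttles the producer for free
    workers = [threading.Thread(target=consumer, args=(queue,)) for _ in range(5)]
    for w in workers:
        w.start()
    producer(queue, range(20), len(workers))
    for w in workers:
        w.join()
    print 'All finished'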