python下多线程爬虫爬取斗图网的所有最新图片

网易云课堂上的教程,写在这里方便以后查看。先来一个简单的多线程例子:

#encoding:utf-8

import threading
import random
import time

# Producer/consumer demo: producer threads add random amounts to a
# shared balance, consumer threads withdraw from it.

MONEY = 0  # shared balance, mutated by both producer and consumer threads
gLock = threading.Lock()  # guards every read-modify-write of MONEY
def procuder():
    """Producer thread body: forever add a random amount (10..100) to
    the shared MONEY balance, pausing 0.5 s between deposits.

    Intended to run as a daemon-style Thread target; never returns.
    """
    global MONEY
    while True:
        random_money = random.randint(10, 100)
        # `with` guarantees the lock is released even if the update raises.
        with gLock:
            MONEY += random_money
        # Bug fix: the original passed `threading.currentThread` without
        # calling it, printing the function object instead of the thread.
        print('生产者%s--生产了%d' % (threading.current_thread().name, random_money))
        time.sleep(0.5)


def customer():
    """Consumer thread body: forever try to withdraw a random amount
    (10..100) from the shared MONEY balance, pausing 0.5 s between
    attempts. Prints a warning when the balance is insufficient.

    Intended to run as a Thread target; never returns.
    """
    global MONEY
    while True:
        random_money = random.randint(10, 100)
        # Bug fix: the original checked MONEY *outside* the lock, so the
        # balance could change between the check and the deduction and go
        # negative. Check and deduct under a single lock acquisition.
        with gLock:
            balance = MONEY
            enough = MONEY >= random_money
            if enough:
                MONEY -= random_money
        if enough:
            # Bug fix: currentThread was not called in the original,
            # printing the function object instead of the thread.
            print('消费者%s--消费了%d' % (threading.current_thread().name, random_money))
        else:
            print('余额不足,欲消费:%d,但是仓库剩余:%d' % (random_money, balance))
        time.sleep(0.5)


def p_c_test():
    """Launch the demo: three producer threads and three consumer
    threads, all sharing the MONEY balance."""
    for target in [procuder] * 3 + [customer] * 3:
        threading.Thread(target=target).start()


# Script entry point: run the producer/consumer demo.
if __name__ =="__main__":
    p_c_test()

接着是爬取图片:



#encoding: utf-8


import requests
import urllib.request
from bs4 import BeautifulSoup
import os
import threading

'''
def GetUrl():
    url = []
    first_url = 'http://www.doutula.com/photo/list/?page={}'
    for i in range(1,1076):
        url.append(first_url.format(i))
    return url
print(GetUrl())
'''

# Shared work queues for the crawler: producers pop listing pages from
# PAGE_URL_LIST and push image URLs onto FACE_URL_LIST; consumers pop
# from FACE_URL_LIST and download. gLock serializes access to both.
BASE_PAGE_URL = 'http://www.doutula.com/photo/list/?page='
PAGE_URL_LIST = [BASE_PAGE_URL + str(page) for page in range(1, 1111)]
FACE_URL_LIST = []  # URLs of individual meme images, filled by producers
gLock = threading.Lock()

def download_image(url):
    """Download a single image *url* into the local 'images' directory.

    The file name is the last path segment of the URL.
    """
    filename = url.split('/')[-1]
    # Bug fix: urlretrieve crashed with FileNotFoundError when the
    # 'images' directory did not already exist.
    os.makedirs('images', exist_ok=True)
    path = os.path.join('images', filename)
    urllib.request.urlretrieve(url, filename=path)

# Fetch one listing page and download every meme image found on it.
def get_page(page_url):
    """Scrape *page_url* for lazy-loaded image tags and download each
    image referenced by its data-original attribute."""
    soup = BeautifulSoup(requests.get(page_url).content, 'lxml')
    image_tags = soup.find_all('img', attrs={'class': 'img-responsive lazy image_dta'})
    for tag in image_tags:
        download_image(tag['data-original'])

def procuder():
    """Producer thread body: pop listing pages off PAGE_URL_LIST, scrape
    each page's image URLs and push them onto FACE_URL_LIST.

    Returns once PAGE_URL_LIST is exhausted.
    """
    while True:
        gLock.acquire()
        if len(PAGE_URL_LIST) == 0:
            gLock.release()
            break
        page_url = PAGE_URL_LIST.pop()
        gLock.release()
        # Network fetch happens outside the lock so other threads can work.
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, 'lxml')
        img_list = soup.find_all('img', attrs={'class': 'img-responsive lazy image_dta'})
        gLock.acquire()
        for img in img_list:
            url = img['data-original']
            # Some pages use protocol-relative URLs; prepend a scheme.
            if not url.startswith('http'):
                url = 'http:' + url
            # Bug fix: the original re-read img['data-original'] here,
            # discarding the scheme fix just applied above.
            FACE_URL_LIST.append(url)
        gLock.release()

def customer():
    """Consumer thread body: pop image URLs off FACE_URL_LIST and save
    each one into the local 'images' directory.

    NOTE(review): busy-waits while FACE_URL_LIST is empty and never
    terminates, even after all producers finish — preserved from the
    original design; a sentinel/queue.Queue would be cleaner.
    """
    # Bug fix: urlretrieve crashed with FileNotFoundError when the
    # 'images' directory did not already exist.
    os.makedirs('images', exist_ok=True)
    while True:
        gLock.acquire()
        if len(FACE_URL_LIST) == 0:
            gLock.release()
            continue
        face_url = FACE_URL_LIST.pop()
        gLock.release()
        # File name is the last path segment of the image URL.
        filename = face_url.split('/')[-1]
        path = os.path.join('images', filename)
        urllib.request.urlretrieve(face_url, filename=path)


def main():
    """Start 5 producer threads (scrape image URLs from listing pages)
    and 5 consumer threads (download the images).

    Bug fix: the original additionally iterated PAGE_URL_LIST serially,
    calling get_page() on every page while the producer threads were
    concurrently pop()-ing from the same list — duplicating every
    download and mutating the list while iterating it. The producer
    threads already consume the whole list, so the serial pass is gone.
    """
    # Producers: crawl listing pages for image URLs.
    for _ in range(5):
        threading.Thread(target=procuder).start()
    # Consumers: download the collected image URLs to disk.
    for _ in range(5):
        threading.Thread(target=customer).start()

# Script entry point: launch the crawler's producer/consumer threads.
if __name__ == "__main__":
    main()










评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值