头歌：多线程、多进程爬虫

最新推荐文章于 2024-08-14 18:20:48 发布

原创最新推荐文章于 2024-08-14 18:20:48 发布 · 1.6k 阅读

7 ·

CC 4.0 BY-SA版权

文章标签：

#爬虫

头歌专栏收录该内容

2 篇文章

订阅专栏

部署运行你感兴趣的模型镜像

step1/web/index.html文件下，将所有alt=""填入step1/images

step1/student.py文件源码

import requests
from lxml import etree
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import Pool
import os
import threading
import psutil
# URL伪装
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
}
def downimg(img_src):
    start_time = time.time()
    
    name = img_src.split('/')[-1].split('.')[0]
    img_url = "http://127.0.0.1:8080" + img_src
    img = requests.get(img_url)
    dir_path = 'step1/images'
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    img_path = dir_path + '/' + name + '.jpg'  # 图片的最终存储路径
    print(img_url, name + '.jpg', '开始下载。。。')
    thread = threading.currentThread()
    process = psutil.Process(os.getpid())
    print("线程ID：%s, 进程ID：%s"
          % (thread.ident, process.pid))
    #********** Begin *********#
    """保存图片"""
    with open(img_path, 'wb')as file:
        file.write(img.content)
    #********** End *********#
    finisTime = time.time() - start_time
    print(name + ".jpg 用时为：" + str(finisTime) + " second")
def parsePage():
    url = "http://127.0.0.1:8080/imgs/"
    response = requests.get(url=url, headers=header)
    html_content = response.text
    #********** Begin *********#
    """解析网页"""
    html = etree.HTML(html_content)
    item_list = html.xpath("//div[@class='box']/div/a/img/@src")
    print(item_list)
    s_time = time.time()
    
    #********** End *********#
    """非线程操作"""
    # for item in item_list:
    #     downimg(item)
    #********** Begin *********#
    """线程操作方式"""
    thread = []
    for item in item_list:
        
        thread.append(threading.Thread(target=downimg, args=(item, )))
    for t in thread:
        t.start()
    for t in thread:
        t.join()
    #********** End *********#
    print('总耗时: %s' % (time.time() - s_time))

希望可以给到帮助

您可能感兴趣的与本文相关的镜像