Python实现爬取京东网页的图片

最新推荐文章于 2024-01-19 15:00:13 发布

左边Luckyboy

最新推荐文章于 2024-01-19 15:00:13 发布

阅读量730

点赞数

分类专栏： Python

本文链接：https://blog.youkuaiyun.com/qq_31187881/article/details/79274801

版权

Python 专栏收录该内容

14 篇文章

订阅专栏

import random
from urllib import request
import re
import os
import time
import threadpool


def getAmason(i,j):
    """Download product thumbnails from JD search-result pages ``i`` .. ``j - 1``.

    Despite the name, the function queries ``search.jd.com`` (the hard-coded
    keyword is the URL-encoded Chinese word for "dress"). For every page
    index in the half-open range ``[i, j)`` it fetches the result page with a
    randomly chosen User-Agent, extracts the 220x282 thumbnail URLs with a
    regular expression, and saves each image into a fixed local directory.
    It sleeps five seconds after each page.

    Args:
        i: First page index to fetch (inclusive), an int.
        j: Page index to stop at (exclusive), an int. Nothing is fetched
            when ``j <= i``.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        urllib.error.URLError: If a page or image request fails.
        UnicodeDecodeError: If a result page is not valid UTF-8.
        OSError: If the target directory or an image file cannot be written.
    """
    agentsList = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    ]

    # Raw strings: the backslashes below are literal characters, not escape
    # sequences, so they must not be left to the string-literal parser.
    saveDir = r"H:\Python Project\day19\img"

    # Compiled once instead of once per page. Group 1 is the image location
    # without the scheme (host + path); the repeated path segment is
    # non-capturing so findall() yields plain strings.
    imgPattern = re.compile(
        r'<img width="220" height="282".*(img\d{1,}\.\w+\.com(?:/\w+)+\.jpg)"'
    )

    # NOTE(review): this offset always starts from 50 and grows by
    # (page index + 48) per iteration, regardless of ``i``; confirm that this
    # is what JD's ``s`` query parameter expects.
    num = 50
    for iNum in range(i, j):
        k = iNum * 2 - 1
        num = num + iNum + 48
        urlPath="http://search.jd.com/Search?keyword=%E8%BF%9E%E8%A1%A3%E8%A3%99&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E8%BF%9E%E8%A1%A3%E8%A3%99&page="+str(k)+"&s="+str(num)+"&click=0"
        print(urlPath)

        requestPath = request.Request(urlPath)
        requestPath.add_header("User-Agent", random.choice(agentsList))

        # The context manager closes the HTTP response even if read/decode
        # raises, so connections are not leaked across pages.
        with request.urlopen(requestPath) as responseYama:
            res = responseYama.read().decode("utf-8")

        imgList1 = imgPattern.findall(res)
        print(len(imgList1))

        if imgList1:
            # urlretrieve does not create missing parent directories.
            os.makedirs(saveDir, exist_ok=True)

        for imgLocation in imgList1:
            # The pattern guarantees at least one "/", so the last segment
            # is the file name.
            imgfinename = imgLocation.rsplit("/", 1)[-1]
            imgpath = os.path.join(saveDir, imgfinename)
            request.urlretrieve("http://" + imgLocation, imgpath)

        time.sleep(5)

if __name__ == '__main__':
    # One (start, stop) page range per worker; getAmason treats the pair as
    # a half-open interval, so together these cover pages 1 through 99.
    pageRanges = [(1, 26), (26, 51), (51, 76), (76, 100)]

    # Each work item is an (args, kwargs) pair; no keyword arguments are
    # passed, hence None in the second slot.
    jobSpecs = [(pageRange, None) for pageRange in pageRanges]

    # Four workers, one per page range.
    workerPool = threadpool.ThreadPool(4)

    for job in threadpool.makeRequests(getAmason, jobSpecs):
        workerPool.putRequest(job)

    # Block until every queued request has been processed.
    workerPool.wait()

    print("结束")