"""Download product thumbnails from JD.com search-result pages.

The page range 1..99 is split into four sub-ranges that are crawled
concurrently by a small ``threadpool`` worker pool.
"""

import os
import random
import re
import time
from urllib import request

import threadpool

# Browser User-Agent strings; one is picked at random for every page request.
USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
)

# Search URL; ``page`` and ``offset`` fill the "page" and "s" query parameters.
SEARCH_URL_TEMPLATE = (
    "http://search.jd.com/Search?keyword=%E8%BF%9E%E8%A1%A3%E8%A3%99"
    "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E8%BF%9E%E8%A1%A3%E8%A3%99"
    "&page={page}&s={offset}&click=0"
)

# Matches the scheme-less image address ("img<N>.<host>.com/.../<name>.jpg")
# that follows a 220x282 <img> tag.  The gap is matched lazily so that every
# image on a line is found, not only the last one; the path group is
# non-capturing so findall() yields plain strings.
IMAGE_PATTERN = re.compile(
    r'<img width="220" height="282".*?(img\d{1,}\.\w+\.com(?:/\w+)+\.jpg)"'
)

# Raw string: the backslashes are literal path separators, not escapes.
DEFAULT_SAVE_DIR = r"H:\Python Project\day19\img"

REQUEST_TIMEOUT_SECONDS = 30  # keeps a stalled connection from hanging a worker
PAGE_DELAY_SECONDS = 5  # pause between consecutive result pages


def _fetch_page(url):
    """Return the UTF-8 decoded body of *url*, sent with a random User-Agent."""
    page_request = request.Request(url)
    page_request.add_header("User-Agent", random.choice(USER_AGENTS))
    # The context manager closes the HTTP response even if read()/decode() fails.
    with request.urlopen(page_request, timeout=REQUEST_TIMEOUT_SECONDS) as response:
        return response.read().decode("utf-8")


def _download_image(image_ref, save_dir):
    """Save the image at scheme-less address *image_ref* into *save_dir*."""
    file_name = image_ref.rsplit("/", 1)[-1]
    target_path = os.path.join(save_dir, file_name)
    try:
        request.urlretrieve("http://" + image_ref, target_path)
    except OSError as exc:  # URLError/HTTPError are OSError subclasses
        # One broken image must not abort the rest of the page.
        print("failed to download {}: {}".format(image_ref, exc))


def getAmason(i, j, save_dir=DEFAULT_SAVE_DIR):
    """Crawl search-result pages with index ``i <= index < j`` and save images.

    For each index the search page is fetched, every thumbnail address
    matched by ``IMAGE_PATTERN`` is downloaded into *save_dir*, and the
    function then sleeps ``PAGE_DELAY_SECONDS`` before the next page.
    Network and file errors are reported on stdout and the affected page or
    image is skipped.

    Args:
        i: First page index (inclusive).
        j: Last page index (exclusive).
        save_dir: Directory that receives the images; created if missing.

    Raises:
        OSError: If *save_dir* cannot be created.
    """
    os.makedirs(save_dir, exist_ok=True)

    # NOTE(review): the "s" value restarts from 50 for every call and grows by
    # index + 48 per page; this reproduces the original arithmetic -- confirm
    # it matches what the search endpoint expects.
    offset = 50
    for page_index in range(i, j):
        page_number = page_index * 2 - 1
        offset += page_index + 48
        url = SEARCH_URL_TEMPLATE.format(page=page_number, offset=offset)
        print(url)

        try:
            html = _fetch_page(url)
        except OSError as exc:
            # Skip this page but keep crawling the rest of the range.
            print("failed to fetch {}: {}".format(url, exc))
        else:
            image_refs = IMAGE_PATTERN.findall(html)
            print(len(image_refs))
            for image_ref in image_refs:
                _download_image(image_ref, save_dir)

        time.sleep(PAGE_DELAY_SECONDS)


def main():
    """Crawl page indexes 1..99 using four worker threads."""
    page_ranges = [(1, 26), (26, 51), (51, 76), (76, 100)]
    # threadpool.makeRequests takes (positional_args, keyword_args) pairs.
    work_items = [(page_range, None) for page_range in page_ranges]

    pool = threadpool.ThreadPool(4)
    for work_request in threadpool.makeRequests(getAmason, work_items):
        pool.putRequest(work_request)
    pool.wait()
    print("结束")


if __name__ == '__main__':
    main()
Python实现爬取京东网页的图片
最新推荐文章于 2024-01-19 15:00:13 发布