豆瓣用户相册爬虫

#coding:utf-8
import requests
from pyquery import PyQuery as pq
import os
import random

def gethtml(url):
    """Fetch *url* and return the raw response body.

    The request is sent with the module-level ``headers`` and routed through
    the module-level ``proxiess`` proxy.

    Args:
        url: Address of the page or image to download.

    Returns:
        The response body as ``bytes``, or ``None`` when the request fails
        (network error, invalid URL/header, timeout, or an HTTP error status).
    """
    # requests expects a mapping of scheme -> proxy URL; accept a bare proxy
    # URL string as well and expand it so it applies to both schemes.
    proxy_map = proxiess
    if isinstance(proxy_map, str):
        proxy_map = {"http": proxy_map, "https": proxy_map}
    try:
        # Without a timeout a dead proxy would block the crawler forever.
        page = requests.get(url, headers=headers, proxies=proxy_map, timeout=15)
        # Treat 4xx/5xx answers as failures so an error page is never
        # handed back as if it were the requested content.
        page.raise_for_status()
    except requests.RequestException:
        print("Error")
        return None
    return page.content

def getimgsrc(code):
    """Collect the image URLs found in an album page.

    Args:
        code: HTML of the album page, as accepted by ``PyQuery``.

    Returns:
        A list with the ``src`` attribute of every ``img`` element matched by
        the selector ``.photolst .photo_wrap a img``, in document order.
        Elements that have no usable ``src`` are skipped.
    """
    doc = pq(code)
    imgsrc = []
    for item in doc(".photolst .photo_wrap a img").items():
        src = item.attr("src")
        # Skip images without a src so no empty URL is passed on to the
        # downloader.
        if src:
            imgsrc.append(src)
    return imgsrc

def get_img(imglist):
    """Download every URL in *imglist* and store it as ``<pagenum>.jpg``.

    Files are written into the module-level ``path`` directory, which is
    created if it does not exist yet. The module-level counter ``pagenum``
    supplies the file name and is incremented once per URL, whether or not
    the download succeeded; the main loop also uses it as the ``start``
    offset of the next album page, so it must keep advancing.

    Args:
        imglist: Iterable of image URLs.
    """
    global pagenum
    # Build full file paths instead of calling os.chdir() for every image,
    # which changed the working directory of the whole process.
    if not os.path.isdir(path):
        os.makedirs(path)
    for imgs in imglist:
        # Download before opening the file, so a failed request neither
        # leaves an empty .jpg behind nor crashes on fb.write(None).
        img = gethtml(imgs)
        if img is None:
            print("第"+str(pagenum)+"张图片下载失败,已跳过")
        else:
            with open(os.path.join(path, "%s.jpg" % pagenum), 'wb') as fb:
                fb.write(img)
                print("正在下载第"+str(pagenum)+"张图片")
        pagenum += 1

# Request headers sent with every download.
# No 'Host' entry: requests derives it from the URL, and a fixed value would
# be wrong because both album pages and image files (different hosts) are
# fetched with these headers.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    'Cache-Control': 'no-cache',
    'Referer': 'https://www.douban.com/',
    # Header values must be strings; requests rejects an int here.
    'Upgrade-Insecure-Requests': '1',
    'Content-Type': 'text/html; charset=UTF-8'
}
# Pool of HTTP proxies; one is picked at random for the whole run.
proxies = [
    "http://119.101.116.253:9999",
    "http://119.101.112.97:9999",
    "http://111.177.178.139:9999",
    "http://111.177.171.78:9999",
    "http://111.181.32.45:9999",
    "http://115.203.99.9:9999",
    "http://119.101.115.194:9999",
    "http://110.52.234.64:9999",
    "http://111.177.166.126:9999",
    "http://119.39.238.55:9999",
]
# random.choice covers every entry; randint(1, 10) skipped index 0 and
# raised IndexError for 10. requests wants a scheme -> proxy URL mapping,
# so the chosen proxy is registered for both http and https.
proxiess = dict.fromkeys(("http", "https"), random.choice(proxies))
print(proxiess["http"])
# Raw string: "\d" is not a valid escape sequence in a normal literal.
path = r"F:\douban"
# Number of images handled so far; used as file name and as page offset.
pagenum = 0
    

# Walk through the album page by page. `pagenum` is advanced by get_img()
# and doubles as the `start` offset of the next page.
try:
    while True:
        url = "https://www.douban.com/photos/album/127882234/"+"?start="+str(pagenum)
        htmlcode = gethtml(url)
        if htmlcode is None:
            # The page could not be fetched; nothing more to do.
            break
        imglist = getimgsrc(htmlcode)
        if not imglist:
            # A page without photos marks the end of the album. Without this
            # check `pagenum` never changes and the same URL would be
            # requested forever.
            break
        get_img(imglist)
except Exception as err:
    # Report unexpected failures instead of presenting them as a normal
    # finish; KeyboardInterrupt is deliberately not caught.
    print("Error: %s" % err)
print("完成")
        

 
 

转载于:https://my.oschina.net/u/4069811/blog/3003607

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值