Batch-downloads images from jandan.net. Note: this script can miss some images (related to the site's rate limits), and printed output may have text-encoding problems.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# URL pattern: http://jandan.net/ooxx/page-<pageIndex>#comments, pageIndex in 1..3000
import os
import random
import re
import urllib2
from multiprocessing import Pool, cpu_count
import BeautifulSoup
import time
import socket
timeout = 10
socket.setdefaulttimeout(timeout)
# Fetch one image URL and save it to disk
def saveImg(imageURL, pageIndex, fileName):
    try:
        # Pick a random User-Agent and send a Referer pointing at the
        # listing page, so the request looks like a normal page view
        user_agent = [
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
            'Opera/9.25 (Windows NT 5.1; U; en)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
            'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
            'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
            'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7',
            'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0',
        ]
        agent = random.choice(user_agent)  # a header string, not a dict
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', agent), ('Accept', '*/*'),
                             ('Referer', 'http://jandan.net/ooxx/page-' + str(pageIndex))]
        u = opener.open(imageURL)
        data = u.read()
        f = open(fileName, 'wb')
        f.write(data)
        f.close()
    except urllib2.HTTPError, e:
        print u"Bad image URL, HTTP code: %s" % e.code
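# The header notes that this script can miss images because of the site's
# limits. A minimal retry sketch (an addition, not part of the original
# script; the retry count and backoff values are assumptions). saveImg
# swallows HTTPError itself, so this wrapper only retries errors that
# propagate, e.g. socket timeouts.
def saveImgWithRetry(imageURL, pageIndex, fileName, retries=3):
    for attempt in xrange(retries):
        try:
            saveImg(imageURL, pageIndex, fileName)
            return True
        except Exception:
            time.sleep(2 * (attempt + 1))  # back off a little more each time
    print u"giving up on %s after %d attempts" % (imageURL, retries)
    return False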
# Collect the image URLs on one listing page
def getAllImgs(pageIndex):
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    url = "http://jandan.net/ooxx/page-" + str(pageIndex) + "#comments"
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    r = response.read()
    # Parse with BeautifulSoup, then run a regex over the pretty-printed HTML
    soup = BeautifulSoup.BeautifulSoup(r).prettify()
    pattern = re.compile('<div class="text">.*?<p.*?<img src="(.*?)"', re.S)
    images = re.findall(pattern, soup)
    return images  # list of image URLs
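# The regex above runs over prettify() output and is brittle if the markup
# changes. A sketch of an alternative (an addition, not part of the original
# script) that walks the parse tree with BeautifulSoup 3 directly; the div
# class "text" is taken from the regex above.
def getAllImgsBs(pageIndex):
    url = "http://jandan.net/ooxx/page-" + str(pageIndex) + "#comments"
    request = urllib2.Request(
        url, headers={'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'})
    soup = BeautifulSoup.BeautifulSoup(urllib2.urlopen(request).read())
    urls = []
    for div in soup.findAll('div', {'class': 'text'}):
        for img in div.findAll('img'):
            src = img.get('src')
            if src:
                urls.append(src)
    return urls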
# Save every image from one page, numbered 1..n inside the page directory
def saveImgs(images, pageIndex=1):
    number = 1
    for imageURL in images:
        print imageURL
        splitPath = imageURL.split(".")
        fTail = splitPath.pop()  # file extension
        fileName = str(pageIndex) + "/" + str(number) + "." + fTail
        # Protocol-relative URLs ("//ww3.sinaimg.cn/...") need a scheme
        if str(imageURL).startswith("http"):
            saveImg(imageURL, pageIndex, fileName)
        else:
            saveImg("http:" + imageURL, pageIndex, fileName)
        number += 1
# Create a directory named after the page number
def mkdir(pageIndex=1):
    path = str(pageIndex)
    if not os.path.exists(path):
        print u"Created a folder named", path
        os.makedirs(path)
        return True
    else:
        # Directory already exists, nothing to create
        print u"A folder named", path, u"already exists"
        return False
# Save all images on one page
def savePageInfo(pageIndex):
    time.sleep(2)  # be gentle with the server
    images = getAllImgs(pageIndex)  # image URLs on this page
    mkdir(pageIndex)                # directory named after the page number
    saveImgs(images, pageIndex)

# Sequential fallback: save pages start..end one by one
def savePagesInfo(start, end):
    for i in range(start, end + 1):
        savePageInfo(i)
# Download pages in parallel with a process pool
def pool(start, end):
    p = Pool(cpu_count())
    startTime = time.time()
    pages = [i for i in xrange(start, end + 1)]
    result = p.map_async(savePageInfo, pages)
    print "Waiting for all subprocesses to finish..."
    flag = True
    while flag:
        time.sleep(5)
        # _cache is a private Pool attribute: non-empty while tasks are pending
        proces_count = len(p._cache)
        if proces_count != 0:
            print "%s tasks still running (main process %s)" % (proces_count, os.getpid())
        else:
            print u"all processes are finished!"
            flag = False
    p.close()
    p.join()
    endTime = time.time()
    print u"download finished, used time is", endTime - startTime
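# The loop above polls Pool._cache, a private attribute that may change
# between Python versions. A minimal sketch of the documented alternative
# (an addition, not part of the original script): block on the AsyncResult
# returned by map_async instead of polling.
def poolWait(start, end):
    p = Pool(cpu_count())
    result = p.map_async(savePageInfo, range(start, end + 1))
    p.close()          # no more tasks will be submitted
    result.wait()      # block until every page task has finished
    p.join()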
if __name__ == "__main__":
    pool(1, 50)