#-*- coding: utf-8 -*-
import re
import urllib
import urllib2
import os
class Spider:
    """Download the ``.jpg`` images referenced by a web page.

    The page is fetched, scanned for ``src="....jpg" pic_ext`` attributes,
    and every matching image is saved into a target directory as
    ``0.jpg``, ``1.jpg``, ... in the order the URLs appear in the page.
    """

    # Captures the URL of every ``src="....jpg"`` attribute that is directly
    # followed by a ``pic_ext`` attribute. Compiled once for all calls.
    _IMAGE_SRC_RE = re.compile(r'src="(.+?\.jpg)" pic_ext')

    def downLoadImage(self, url, imageDirName):
        """Fetch ``url`` and save every matching image into ``imageDirName``.

        Args:
            url: Address of the HTML page to scan.
            imageDirName: Directory receiving the images; created if missing.
        """
        html = self.__getHtml(url)
        imageList = self.__getImageUrls(html)
        self.__saveImage(imageDirName, imageList)

    def __mkdir(self, dirName):
        """Create ``dirName`` (including parents) unless it already exists.

        Raises:
            OSError: If the directory cannot be created, e.g. because a
                regular file with the same name is in the way.
        """
        if os.path.isdir(dirName):
            # Report the actual directory name (the builtin ``dict`` type
            # was printed here before).
            print("%s already exists" % dirName)
        else:
            os.makedirs(dirName)

    def __getHtml(self, url):
        """Return the raw body of ``url`` as read from ``urllib.urlopen``."""
        response = urllib.urlopen(url)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    def __getImageUrls(self, html):
        """Return the list of image URLs found in ``html`` (may be empty)."""
        return self._IMAGE_SRC_RE.findall(html)

    def __saveImage(self, dirName, imageList):
        """Download each URL in ``imageList`` into ``dirName``.

        Files are numbered consecutively from 0. A URL whose download fails
        with ``urllib2.URLError`` is reported and skipped without consuming
        a number, so the saved files stay gap-free.
        """
        self.__mkdir(dirName)
        imageIndex = 0
        for imageUrl in imageList:
            try:
                response = urllib2.urlopen(imageUrl)
                try:
                    data = response.read()
                finally:
                    response.close()
            except urllib2.URLError as e:
                # Not every URLError subclass is guaranteed to carry a
                # ``reason`` attribute; fall back to the exception itself.
                print(getattr(e, "reason", e))
                continue
            fileName = os.path.join(dirName, "%d.jpg" % imageIndex)
            # ``with`` closes the file handle even if the write fails.
            with open(fileName, "wb") as imageFile:
                imageFile.write(data)
            imageIndex += 1
if __name__ == "__main__":
    # Script entry point: scrape one fixed forum thread and store its
    # images in the "picture" directory relative to the working directory.
    spider = Spider()
    spider.downLoadImage(
        url='http://tieba.baidu.com/p/2460150866',
        imageDirName="picture",
    )