网页图片爬虫:
第一个爬虫抓取bing主页图片,共24张
第二个抓取贴吧图片
第三个抓取图虫图片
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# -*- author:miko-*-
# python3抓取bing主页所有背景图片
import urllib.request
import urllib,re,sys,os
def get_bing_backphoto():
    """Download the Bing homepage background images into ``img/``.

    Queries the HPImageArchive endpoint once for every index 0..23, pulls
    each ``"url"`` value out of the response text with a regular expression
    and saves the image under ``img/``, named after the last path segment of
    its URL. Every image URL is printed before it is downloaded.

    Exits the process with status -1 if the endpoint answers with the
    literal body ``null``.
    """
    os.makedirs('img', exist_ok=True)
    # Compiled once: the pattern does not depend on the loop index.
    reg = re.compile('"url":"(.*?)","urlbase"', re.S)
    for i in range(0, 24):
        url = 'http://cn.bing.com/HPImageArchive.aspx?format=js&idx='+str(i)+'&n=1&nc=1361089515117&FORM=HYLH1'
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('utf-8')
        # Compare after decoding: the bytes returned by read() never equal
        # the str 'null', so checking the raw body could not detect it.
        if html == 'null':
            print('open & read bing error!')
            sys.exit(-1)
        for imgurl in reg.findall(html):
            # e.g. http://s.cn.bing.net/az/hprichbg/rb/LongJi_ZH-CN8658435963_1366x768.jpg
            name = imgurl.rsplit('/', 1)[-1]
            savepath = 'img/' + name
            print(imgurl)
            urllib.request.urlretrieve(imgurl, savepath)
# Run the Bing background-image downloader.
get_bing_backphoto()
#coding=utf-8
import urllib.request
import re
import urllib,re,sys,os
def getHtml(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Prints an error message and exits the process with status -1 when the
    body is the literal text ``null``.
    """
    # The context manager closes the connection once the body is read.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    # Compare after decoding: the bytes returned by read() never equal the
    # str 'null', so checking the raw body could not detect it.
    if html == 'null':
        print('open & read bing error!')
        sys.exit(-1)
    return html
def getImg(html):
    """Download every matching ``.jpg`` image referenced in *html*.

    An image is any ``src="....jpg"`` attribute that is immediately followed
    by `` pic_ext``. Matches are saved as ``baidu/0.jpg``, ``baidu/1.jpg``,
    ... in the order they appear, and each URL is printed once its download
    has finished. The ``baidu`` directory is created if it is missing.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('baidu', exist_ok=True)
    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    for x, imgurl in enumerate(imgre.findall(html)):
        urllib.request.urlretrieve(imgurl, 'baidu/%s.jpg' % x)
        print(imgurl)
# Example image URL: http://imgsrc.baidu.com/forum/pic/item/16391f30e924b89915f86eb06f061d950b7bf677.jpg
# Fetch the Tieba thread page, then download the images found in it.
html = getHtml("http://tieba.baidu.com/p/2460150866")
getImg(html)
#print (getImg(html))
#-*- encoding: utf-8 -*-
'''
Created on 2015-7-30
@author: Miko
'''
import urllib.request
import urllib,re,sys,os,time
import uuid
# Collect the second-level (detail page) URLs.
def findUrl2(html):
    """Return the distinct detail-page URLs found in *html*.

    Two URL shapes are recognised: ``http://tuchong.com/<digits>/<digits>/``
    and ``http://<name>.tuchong.com/<digits>/`` where ``<name>`` does not end
    in ``photos``. Duplicates are dropped and the remaining URLs keep the
    order in which they first appear.
    """
    # Dots are escaped so they match a literal '.' instead of any character.
    re1 = r'http://tuchong\.com/\d+/\d+/|http://\w+(?<!photos)\.tuchong\.com/\d+/'
    # dict.fromkeys de-duplicates in a single pass while preserving
    # first-seen order (dicts keep insertion order).
    return list(dict.fromkeys(re.findall(re1, html)))
# Fetch the HTML text of a page.
def getHtml(url):
    """Fetch *url* and return its body decoded as UTF-8."""
    # The context manager closes the connection once the body is read.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
# Download the photos of a page to local disk.
def download(html_page, pageNo, base_dir='D:\\TuChong'):
    """Save every Tuchong photo referenced in *html_page* to disk.

    Photos are URLs of the form ``http://photos.tuchong.com/.../f/....jpg``.
    Each one is stored as ``<base_dir>/<year>-<month>-<day>/<pageNo>/<uuid>.jpg``
    (date taken from the local clock, not zero-padded), with a one second
    pause after every download. The list of matched URLs, each target path
    and each URL are printed along the way.

    Args:
        html_page: HTML text to scan for photo URLs.
        pageNo: Listing page number; used as the innermost folder name.
        base_dir: Root folder for the downloads.

    Returns:
        The value returned by ``urllib.request.urlretrieve`` for the last
        photo, or ``None`` when *html_page* contains no photo URL.
    """
    today = time.localtime()
    foldername = '%d-%d-%d' % (today.tm_year, today.tm_mon, today.tm_mday)
    # Lazy quantifiers and an excluded set of delimiter characters keep two
    # URLs on the same line from being merged into one match; dots are
    # escaped so they only match a literal '.'.
    re2 = r'http://photos\.tuchong\.com/[^\s"\'<>]+?/f/[^\s"\'<>]+?\.jpg'
    imglist = re.findall(re2, html_page)
    print(imglist)
    download_img = None
    if not imglist:
        return download_img
    # The destination folder is the same for every photo: build it once.
    picpath = os.path.join(base_dir, foldername, str(pageNo))
    os.makedirs(picpath, exist_ok=True)
    for imgurl in imglist:
        target = os.path.join(picpath, '%s.jpg' % uuid.uuid1())
        print("The photos location is:" + target)
        download_img = urllib.request.urlretrieve(imgurl, target)
        # Pause between requests.
        time.sleep(1)
        print(imgurl)
    return download_img
# def callback(blocknum, blocksize, totalsize):
# '''回调函数
# @blocknum: 已经下载的数据块
# @blocksize: 数据块的大小
# @totalsize: 远程文件的大小
# '''
# print str(blocknum),str(blocksize),str(totalsize)
# if blocknum * blocksize >= totalsize:
# print '下载完成'
def quitit():
    """Print a farewell message and terminate the program with status 0."""
    print("Bye!")
    # sys.exit is always available; the bare exit() helper is added by the
    # site module and is intended for the interactive interpreter.
    sys.exit(0)
if __name__ == '__main__':
    print(''' *****************************************
** Welcome to Spider for TUCHONG **
** Created on 2015-7-30 **
** @author: miko **
*****************************************''')
    # The page number is fixed here; the prompt below only runs when this
    # value is not a number or is greater than 100.
    pageNo = '10'
    while not pageNo.isdigit() or int(pageNo) > 100:
        if pageNo == 'quit':
            quitit()
        print("Param is invalid , please try again.")
        # input() is the Python 3 replacement for Python 2's raw_input().
        pageNo = input("Input the page number you want to scratch >")
    # Crawl the portrait tag listing on Tuchong.
    html = getHtml("http://tuchong.com/tags/%E4%BA%BA%E5%83%8F/?page=" + str(pageNo))
    detllst = findUrl2(html)
    # Visit every detail page found on the listing and save its photos.
    for detail in detllst:
        html2 = getHtml(detail)
        download(html2, pageNo)
    print("Finished.")