python入门
前几天自己看了一下python的语法,根据网上的教程自己写了一个小小的爬虫,可以爬出淘宝mm的展示图片,然后自动保存到目录。
import random
import sys
reload(sys)
import requests
import urllib
from bs4 import BeautifulSoup
import time
def gDownloadWithFilename(url,savePath,file):
    """Download *url* and save it as savePath + file.

    Uses the module-level ``requests`` with the script's browser
    User-Agent, so the server returns the real image instead of an
    error page (a likely cause of the fixed-size 5M downloads the
    author observed with the header-less ``urllib.URLopener``).

    Errors are printed and swallowed so that one failed image does
    not abort the whole crawl.
    """
    # NOTE: parameter name ``file`` shadows a builtin; kept unchanged
    # for backward compatibility with existing callers.
    # 参数检查, 现忽略 (parameter validation is ignored for now)
    try:
        resp = requests.get(url, headers=headerss)
        # ``with`` guarantees the handle is closed even when write()
        # fails (the original leaked the handle on error).
        with open(savePath + file, 'w+b') as fp:
            fp.write(resp.content)
    except IOError as error:
        # requests' network exceptions derive from IOError, so this
        # still catches connection problems as the original intended.
        print("DOWNLOAD %s ERROR!==>>%s" % (url, error))
    except Exception as e:
        # original did '"..." + e' which itself raised TypeError;
        # str(e) makes the report actually work.
        print("Exception==>>" + str(e))
    return
# Entry page: Taobao model showcase listing, page 3.
url='https://mm.taobao.com/json/request_top_list.htm?page=3'
# Browser-like User-Agent so the server serves the normal HTML page
# instead of blocking the crawler.
headerss = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.933.400 QQBrowser/9.4.8699.400',
}
data = requests.get(url, headers=headerss)
soup = BeautifulSoup(data.text, 'lxml')
# Running number used to name the saved image files (0.jpg, 1.jpg, ...).
count=0
# (redundant second "import time" removed — already imported at the top)
# Walk every model-avatar link on the index page; each href points to a
# personal page whose <img> tags are downloaded one by one.
# Hoisted: the original re-ran soup.find_all(...) and list.index(img)
# inside the loop (O(n^2), and .index is wrong for duplicate tags);
# enumerate() gives the true position in a single pass.
avatars = soup.find_all(class_="lady-avatar")
for idx, img in enumerate(avatars):
    print(img.get('href'))
    src_1 = "http:" + img.get('href')
    data2 = requests.get(src_1, headers=headerss)
    soup2 = BeautifulSoup(data2.text, 'lxml')
    # ``with`` closes the log file immediately; the original reopened
    # it on every inner iteration and leaked the handles.
    with open("list_img_taobao.txt", "a") as fo:
        fo.write("=====" + str(idx) + "============")
    for imgg in soup2.find_all("img"):
        src = imgg.get('src')
        print(src)
        if src is None:
            # some <img> tags carry no src attribute; the original
            # crashed here on "http:" + None
            continue
        time.sleep(1)  # be polite to the server between downloads
        jpg_url = "http:" + src
        # TODO (per author's note below): only download URLs that
        # really end in .jpg, and split saves into per-model folders.
        with open("list_img_taobao.txt", "a") as fo:
            fo.write("第" + str(count) + " " + jpg_url + "\n")
        gDownloadWithFilename(jpg_url, "./img/", str(count) + ".jpg")
        count += 1
有这样几个问题
一个是python的冒号我经常忘掉
另外下载之前似乎应该检查一下下载的后缀,为jpg才应该下载;然后保存可以再优化一下,分为几个文件夹保存;最后,下载下来的图片有时候大小莫名其妙地固定是5M,有可能是因为我网络不好的问题,再另存为一下就好了。