Analyzing the network requests shows that the data comes back as JSON.
The endpoint is "https://v.taobao.com/micromission/req/selectCreatorV3.do".
Its pagination parameter is currentPage; change this parameter and re-submit the request to fetch one page of model profiles at a time.
The Taonvlang (Taobao model) site naturally has some anti-scraping measures: a bare request to the endpoint just returns an "illegal request" error, and no model data at all.
By deleting the entries in headers one at a time and re-sending the request, I found that the referer header alone is enough for a successful request; not even a user-agent is needed.
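For example, here is a minimal probe of that finding (the endpoint, the currentPage parameter and the referer value are the same ones used in Method 2 below; the data -> result layout of the JSON is taken from that code as well):

import requests

url = "https://v.taobao.com/micromission/req/selectCreatorV3.do"
# the referer header alone passes the check; no user-agent required
headers = {"referer": "https://v.taobao.com/v/content/live?catetype=704"}

r = requests.post(url, data={"currentPage": 1}, headers=headers)
print(r.status_code)
# peek at the first record of page 1
print(r.json()["data"]["result"][0])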
Countering the anti-scraping measures
Method 1
Install pip3:
sudo apt-get install python3-pip
Install BeautifulSoup:
sudo pip3 install beautifulsoup4
Install selenium:
sudo pip3 install selenium
from urllib.request import urlopen
import re
import os
from bs4 import BeautifulSoup
from selenium import webdriver

# Main function
def main():
    driver = webdriver.Firefox()
    driver.get("https://mm.taobao.com/search_tstar_model.htm?")
    bsObj = BeautifulSoup(driver.page_source, "lxml")
    # dump the visible text of the model list (name/city pairs) to a file
    fp = open('mm.txt', 'w+')
    fp.write(driver.find_element_by_id("J_GirlsList").text)
    MMsinfoUrl = bsObj.findAll("a", {"href": re.compile(r"\/\/.*\.htm\?(userId=)\d*")})
    imagesUrl = bsObj.findAll("img", {"src": re.compile(r"gtd.*\.jpg")})
    fp.close()
    fp = open('mm.txt', 'r+')
    items = fp.readlines()
    # pair every two lines of mm.txt (name, city) with the matching profile
    # link; this pairing step was missing from the original listing, so the
    # block below is a best-guess reconstruction of how contents is built
    contents = []
    for n in range(min(len(items) // 2, len(MMsinfoUrl))):
        contents.append([[items[2 * n].strip(), items[2 * n + 1].strip()],
                         MMsinfoUrl[n]["href"]])
    i = 0
    while i < 5:  # only crawl the first five models
        print("MM's name: " + contents[i][0][0] + " with " + contents[i][0][1])
        print("saving...... " + contents[i][0][0] + " in the folder")
        perMMpageUrl = "https:" + contents[i][1]
        path = '/home/lgz/pythontest/mmphoto/' + contents[i][0][0]
        mkdir(path)
        getperMMpageImg(perMMpageUrl, path)
        i += 1
    fp.close()
    # download the list-page thumbnails as mm1.jpeg, mm2.jpeg, ...
    number = 1
    for imageUrl in imagesUrl:
        url = "https:" + str(imageUrl["src"])
        html = urlopen(url)
        data = html.read()
        fileName = '/home/lgz/pythontest/mmphoto/mm' + str(number) + '.jpeg'
        fph = open(fileName, "wb")
        print("loading MM......" + fileName)
        fph.write(data)
        fph.flush()
        fph.close()
        number += 1
    driver.close()

# create a folder for one model if it does not exist yet
def mkdir(path):
    isExists = os.path.exists(path)
    if not isExists:
        print("creating a new folder named " + path)
        os.makedirs(path)
    else:
        print("folder already exists!")

# open one model's profile page and save every photo on it under MMpath
def getperMMpageImg(MMURL, MMpath):
    owndriver = webdriver.Firefox()
    owndriver.get(MMURL)
    ownObj = BeautifulSoup(owndriver.page_source, "lxml")
    perMMimgs = ownObj.findAll("img", {"src": re.compile(r"\/\/img\.alicdn.*\.jpg")})
    number = 2
    for perMMimg in perMMimgs:
        ImgPath = "https:" + str(perMMimg["src"])
        print(ImgPath)
        try:
            html = urlopen(ImgPath)
            data = html.read()
            fileName = MMpath + "/" + str(number) + '.jpg'
            fp = open(fileName, 'wb')
            print("loading her photo as " + fileName)
            fp.write(data)
            fp.flush()
            fp.close()
            number += 1
        except Exception:
            print("Address Error!")
    owndriver.close()

if __name__ == '__main__':
    main()
Method 2
Crawl the data through the API and dump it straight into MongoDB.
import requests
from pymongo import MongoClient
import time

client = MongoClient()
db = client.taonvlang
my_set = db.database

# url = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
url = "https://v.taobao.com/micromission/req/selectCreatorV3.do"

def get_data(page):
    for i in range(1, page + 1):
        # the referer header alone is enough to pass the anti-scraping check
        headers = {
            "referer": "https://v.taobao.com/v/content/live?catetype=704"
        }
        data = {
            'currentPage': i
        }
        try:
            r = requests.post(url, data=data, headers=headers)
            if r.status_code == 200:
                response = r.json()['data']['result']
                print("crawling page {}".format(i))
                # print(len(response))
                # insert() is deprecated in pymongo 3; insert_many() takes the result list
                my_set.insert_many(response)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                time.sleep(1)
        except Exception as e:
            print("an exception occurred!")
            print(e)
            continue
    print("crawl finished")

if __name__ == '__main__':
    get_data(5)
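To sanity-check what actually landed in MongoDB, here is a quick query sketch (the collection is the one filled above; the field names realName, city and avatarUrl are the ones the download script below relies on):

from pymongo import MongoClient

client = MongoClient()
coll = client.taonvlang.database

# total number of stored profiles
print(coll.count_documents({}))

# peek at a few records via the fields the download script uses
for doc in coll.find().limit(3):
    print(doc.get('realName'), doc.get('city'), doc.get('avatarUrl'))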
Crawl the images from the stored links and save them to disk.
from pymongo import MongoClient
import requests

client = MongoClient()
db = client.taonvlang
my_Coll = db.database
cursor = my_Coll.find()

# download one avatar image and return its raw bytes
def get_tu(tuurl):
    url = 'http:{}'.format(tuurl)
    r = requests.get(url)
    return r.content

# build a file name like name-city-height-weight.jpg from one document
def get_fullname(doc):
    name = doc['realName']
    weight = doc['weight']
    height = doc['height']
    city = doc['city']
    namelist = [name, city, height, weight]
    fullname = '{}-{}-{}-{}.jpg'.format(*namelist)
    return fullname

# write the image bytes to disk
def save_tu(content, fullname):
    with open(r'C:\Users\15810\Desktop\python代码\jpg\{}'.format(fullname), 'wb') as f:
        f.write(content)
        print('{} saved\n'.format(fullname))

if __name__ == '__main__':
    for doc in cursor:
        # print(doc['avatarUrl'])
        tu = get_tu(doc['avatarUrl'])
        name = get_fullname(doc)
        save_tu(tu, name)
Method 3
#-*- coding:utf-8 -*-
'''Zheng 's BUG'''
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os

class Crawl(object):
    # crawl the model-library list pages
    def getMMsInfo(self):
        url = 'https://www.taobao.com/markets/mm/mmku'
        # chromedriver must be 3.4 or newer, otherwise "element not clickable" errors occur
        driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver")
        driver.get(url)
        try:
            # wait until the pagination bar has loaded
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "skip-wrap")))  # does the pager appear within 10 seconds?
            print("pager found")
            # hand page_source over to BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # total number of pages
            pageNum = soup.find('span', class_="skip-wrap").find('em').text
            print("pages: " + pageNum)
            print("start crawling avatars!")
            # the first page has to be saved directly: its page number is the
            # current page, so it cannot be reached by clicking
            # each model's info lives in one cons_li block
            mms = soup.find_all('div', class_="cons_li")
            # for every model element, extract her name and avatar
            self.saveMMS(mms)
            # click through the pager from page 2 to the last page
            for i in range(2, int(pageNum) + 1):
                # wait for the page-number link, so the element is clickable
                element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, str(i))))
                # curpage = driver.find_element_by_partial_link_text(str(i))
                print(i)
                element.click()
                # wait until the new page has finished loading
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "skip-wrap")))
                # parse the refreshed page
                soup = BeautifulSoup(driver.page_source, "html.parser")
                mms = soup.find_all('div', class_="cons_li")
                # for every model element, extract her name and avatar
                self.saveMMS(mms)
                print("finished page " + str(i))
        finally:
            driver.quit()

    # handle the cons_li entries of one page
    def saveMMS(self, mms):
        for mm in mms:
            name = mm.find('div', class_="item_name").find("p").text
            # get("src") and attrs["src"] are equivalent here
            img = mm.find('div', class_='item_img').find('img').get("src")
            # create the storage folder if it does not exist yet
            dirpath = os.getcwd() + "\\美人\\"
            if not os.path.exists(dirpath):
                os.makedirs(dirpath)
            namepath = os.getcwd() + "\\美人\\" + name + ".jpg"
            self.saveImg(img, namepath)

    # save a single photo
    def saveImg(self, imageURL, fileName):
        if imageURL is None:
            return
        if 'http' not in imageURL:  # skip images whose src is not a proper URL
            return
        # fetch the image bytes
        u = requests.get(imageURL, stream=True).content
        try:
            with open(fileName, 'wb') as jpg:
                jpg.write(u)
        except IOError:
            print("error writing the image!")

    # entry point
    def start(self):
        print("Crawling the Taonvlang model library (美人库) and saving the avatars into the 美人 folder")
        self.getMMsInfo()
        print("download finished!")

tbmm = Crawl()
tbmm.start()