This project follows a tutorial by the blogger Zhu Ge (猪哥); the original post is at https://blog.youkuaiyun.com/u014044812/article/details/99584382
Zhu Ge published both the tutorial and the source code, and the data crawling below was completed by following his tutorial. Thanks a lot, Zhu Ge, hehe!
# Part 1: the login script (save it as taobao_login.py, since Part 2 imports it)
import re
import os
import json
import requests
s = requests.Session()
# Path of the serialized cookies file
COOKIES_FILE_PATH = 'taobao_login_cookies.txt'
class TaoBaoLogin:
def __init__(self, session):
"""
账号登录对象
"""
        # URL that checks whether a captcha is required
self.user_check_url = 'https://login.taobao.com/member/request_nick_check.do?_input_charset=utf-8'
        # URL that verifies the Taobao username and password
self.verify_password_url = "https://login.taobao.com/member/login.jhtml"
        # URL visited with the st code
self.vst_url = 'https://login.taobao.com/member/vst.htm?st={}'
        # Taobao personal homepage
self.my_taobao_url = 'http://i.taobao.com/my_taobao.htm'
        # Taobao username
self.username = '***********'
        # Important Taobao parameter; copy it from your browser or a packet-capture tool, it can be reused
        self.ua = 'copy the latest ua value from your browser'
        # The encrypted password; copy it from your browser or a packet-capture tool, it can be reused
        self.TPL_password2 = 'copy the latest TPL_password2 value from your browser'
        # Request timeout in seconds
self.timeout = 20
        # Session object used to share cookies across requests
self.session = session
if not self.username:
            raise RuntimeError('Please fill in your Taobao username')
def _user_check(self):
"""
检测账号是否需要验证码
:return:
"""
data = {
'username': self.username,
'ua': self.ua
}
try:
response = self.session.post(self.user_check_url, data=data, timeout=self.timeout)
response.raise_for_status()
except Exception as e:
            print('Captcha-check request failed, reason:')
raise e
needcode = response.json()['needcode']
        print('Slider captcha required: {}'.format(needcode))
return needcode
def _verify_password(self):
"""
验证用户名密码,并获取st码申请URL
:return: 验证成功返回st码申请地址
"""
verify_password_headers = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Origin': 'https://login.taobao.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': 'https://login.taobao.com/member/login.jhtml?from=taobaoindex&f=top&style=&sub=true&redirect_url=https%3A%2F%2Fi.taobao.com%2Fmy_taobao.htm',
}
        # Form data posted to login.taobao.com; if login fails, copy your own form data from the browser
verify_password_data = {
'TPL_username': self.username,
'ncoSig': ' ',
'ncoSessionid': ' ',
'ncoToken': '20349efd4946649f6cdbdede0436cf77271e0d83',
'slideCodeShow': 'false',
'useMobile': 'false',
'lang': 'zh_CN',
'loginsite': '0',
'newlogin': '0',
'TPL_redirect_url': 'https://www.taobao.com/',
'from': 'tbTop',
'fc': 'default',
'style': 'default',
'css_style': ' ',
'keyLogin': 'false',
'qrLogin': 'true',
'newMini': 'false',
'newMini2': 'false',
'tid': ' ',
'loginType': '3',
'minititle': ' ',
'minipara': ' ',
'pstrong': ' ',
'sign': ' ',
'need_sign': ' ',
'isIgnore': ' ',
'full_redirect': ' ',
'sub_jump': ' ',
'popid': ' ',
'callback': ' ',
'guf': ' ',
'not_duplite_str': ' ',
'need_user_id': ' ',
'poy': ' ',
'gvfdcname': '10',
'gvfdcre': '68747470733A2F2F7777772E74616F62616F2E636F6D2F',
'from_encoding': ' ',
'sub': ' ',
'TPL_password_2': self.TPL_password2,
'loginASR': '1',
'loginASRSuc': '1',
'allp': ' ',
'oslanguage': 'zh-CN',
'sr': '1920*1080',
'osVer': ' ',
'naviVer': 'chrome|80.03987122',
'osACN': 'Mozilla',
'osAV': '5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
'osPF': 'Win32',
'miserHardInfo': ' ',
'appkey': '00000000',
'nickLoginLink': ' ',
'mobileLoginLink': 'https://login.taobao.com/member/login.jhtml?spm=a21bo.2017.754894437.1.5af911d9L8zNXH&f=top&redirectURL=https://www.taobao.com/&useMobile=true',
'showAssistantLink': ' ',
'um_token': 'T412F2B4BE69D3F11CD1701849AD7DAFFD9EF3B65F22038E2929CC42B3B',
'ua': self.ua
}
try:
response = self.session.post(self.verify_password_url, headers=verify_password_headers,
data=verify_password_data,
timeout=self.timeout)
response.raise_for_status()
        except Exception as e:
            print('Username/password verification request failed, reason:')
            raise e
        # Extract the st-code application URL from the returned page
        apply_st_url_match = re.search(r'<script src="(.*?)"></script>', response.text)
        # Return it if found
if apply_st_url_match:
            print('Username/password verified; st-code application URL: {}'.format(apply_st_url_match.group(1)))
return apply_st_url_match.group(1)
else:
            raise RuntimeError('Username/password verification failed! response:{}'.format(response.text))
def _apply_st(self):
"""
申请st码
:return: st码
"""
apply_st_url = self._verify_password()
try:
response = self.session.get(apply_st_url)
response.raise_for_status()
except Exception as e:
            print('st-code application request failed, reason:')
raise e
st_match = re.search(r'"data":{"st":"(.*?)"}', response.text)
if st_match:
            print('Got st code successfully: {}'.format(st_match.group(1)))
return st_match.group(1)
else:
            raise RuntimeError('Failed to get the st code! response:{}'.format(response.text))
def login(self):
"""
使用st码登录
:return:
"""
# 加载cookies文件
if self._load_cookies():
return True
        # Check whether a slider captcha is needed
self._user_check()
st = self._apply_st()
headers = {
'Host': 'login.taobao.com',
'Connection': 'Keep-Alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
try:
response = self.session.get(self.vst_url.format(st), headers=headers)
response.raise_for_status()
except Exception as e:
            print('st-code login request failed, reason:')
raise e
        # On success, extract the redirect URL to the user's Taobao homepage
my_taobao_match = re.search(r'top.location.href = "(.*?)"', response.text)
if my_taobao_match:
            print('Logged in to Taobao; redirect URL: {}'.format(my_taobao_match.group(1)))
self._serialization_cookies()
return True
else:
            raise RuntimeError('Login failed! response:{}'.format(response.text))
def _load_cookies(self):
        # 1. Check whether the serialized cookies file exists
if not os.path.exists(COOKIES_FILE_PATH):
return False
        # 2. Load the cookies
self.session.cookies = self._deserialization_cookies()
        # 3. Check whether the cookies have expired
        try:
            self.get_taobao_nick_name()
        except Exception:
            os.remove(COOKIES_FILE_PATH)
            print('Cookies expired; deleting the cookies file!')
            return False
        print('Loaded Taobao login cookies successfully!')
return True
def _serialization_cookies(self):
"""
序列化cookies
"""
cookies_dict = requests.utils.dict_from_cookiejar(self.session.cookies)
with open(COOKIES_FILE_PATH, 'w+', encoding='utf-8') as file:
json.dump(cookies_dict, file)
        print('Saved the cookies file successfully!')
def _deserialization_cookies(self):
"""
反序列化cookies
"""
with open(COOKIES_FILE_PATH, 'r+', encoding='utf-8') as file:
cookies_dict = json.load(file)
cookies = requests.utils.cookiejar_from_dict(cookies_dict)
return cookies
def get_taobao_nick_name(self):
"""
获取淘宝昵称
:return: 淘宝昵称
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
try:
response = self.session.get(self.my_taobao_url, headers=headers)
response.raise_for_status()
except Exception as e:
            print('Request for the Taobao homepage failed! Reason:')
raise e
        # Extract the Taobao nickname
nick_name_match = re.search(r'<input id="mtb-nickname" type="hidden" value="(.*?)"/>', response.text)
if nick_name_match:
            print('Logged in to Taobao; your nickname is: {}'.format(nick_name_match.group(1)))
return nick_name_match.group(1)
else:
            raise RuntimeError('Failed to get the Taobao nickname! response:{}'.format(response.text))
if __name__ == '__main__':
ul = TaoBaoLogin(s)
ul.login()
ul.get_taobao_nick_name()
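Before moving on, it helps to know what the login script leaves behind: the cookies file is just a JSON dict (written by _serialization_cookies via requests.utils.dict_from_cookiejar). The following is a minimal sketch, using only calls that already appear in the script above, of how to load that file into a fresh Session by hand; it assumes taobao_login_cookies.txt already exists from a successful login.
import json
import requests
# Load the JSON dict that _serialization_cookies() wrote to disk
with open('taobao_login_cookies.txt', 'r', encoding='utf-8') as file:
    cookies_dict = json.load(file)
# Rebuild a cookiejar and attach it to a brand-new session,
# mirroring what _deserialization_cookies() does internally
fresh_session = requests.Session()
fresh_session.cookies = requests.utils.cookiejar_from_dict(cookies_dict)
# fresh_session can now make logged-in requests until the cookies expire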
# Part 2: the goods spider (imports TaoBaoLogin from taobao_login.py above)
import os
import re
import json
import time
import random
import requests
import pandas as pd
from retrying import retry
from taobao_login import TaoBaoLogin
# Suppress urllib3 warnings (verify=False is used below)
requests.packages.urllib3.disable_warnings()
# Login and crawling must share the same Session object
req_session = requests.Session()
# Path of the Excel file for the scraped Taobao goods
GOODS_EXCEL_PATH = 'taobao_goods.xlsx'
class GoodsSpider:
def __init__(self, q):
self.q = q
        # Request timeout in seconds
self.timeout = 15
self.goods_list = []
        # Log in to Taobao first
tbl = TaoBaoLogin(req_session)
tbl.login()
@retry(stop_max_attempt_number=3)
def spider_goods(self, page):
        # Taobao shows 44 items per page, so s is the offset of this page's first item
        s = page * 44
        # Note: adding ajax=true to the URL returns a slider captcha, meaning the request was
        # flagged as a crawler (Taobao likely checks whether the JSON is fetched without the
        # JS-rendered page). The callback parameter is not needed either.
search_url = f"https://s.taobao.com/search?data-key=s&data-value={s+44}&_ksTS=1582786023656_730&initiative_id=tbindexz_20170306&ie=utf8&spm=a21bo.2017.201856-taobao-item.2&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&imgfile=&q={self.q}&suggest=history_1&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=3&ntoffset=0&p4ppushleft=1%2C48"
        # Request headers
headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
'referer': 'https://www.taobao.com'
}
response = req_session.get(search_url, headers=headers, proxies=None,
verify=False, timeout=self.timeout)
        # Extract the g_page_config JSON embedded in the page source
        goods_match = re.findall(r'g_page_config = (.*);', response.text)
        if not goods_match:
            print('Failed to extract the goods data from the page')
            raise RuntimeError('g_page_config not found in the response')
        goods_str = goods_match[0]
goods_list = self._get_goods_info(goods_str)
        # Inspect the goods list
        print(goods_list)
        # self._save_excel(goods_list)  # uncomment to save the results to Excel
def _get_goods_info(self, goods_str):
"""
解析json数据,并提取标题、价格、商家地址、销量、评价地址
"""
        goods_json = json.loads(goods_str)  # JSON string to dict
goods_items = goods_json['mods']['itemlist']['data']['auctions']
goods_list = []
for goods_item in goods_items:
goods = {'title': goods_item['raw_title'],
'price': goods_item['view_price'],
'location': goods_item['item_loc'],
'sales': goods_item['view_sales'],
'comment_url': goods_item['comment_url']}
goods_list.append(goods)
return goods_list
def _save_excel(self, goods_list):
"""
将json数据生成excel文件
"""
# pandas没有对excel没有追加模式,只能先读后写
if os.path.exists(GOODS_EXCEL_PATH):
df = pd.read_excel(GOODS_EXCEL_PATH)
            df = pd.concat([df, pd.DataFrame(goods_list)], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
else:
df = pd.DataFrame(goods_list)
        writer = pd.ExcelWriter(GOODS_EXCEL_PATH)
        # The columns argument fixes the column order in the generated Excel file
        df.to_excel(excel_writer=writer, columns=['title', 'price', 'location', 'sales', 'comment_url'],
                    index=False, sheet_name='Sheet')
        # close() also saves the file (ExcelWriter.save() and to_excel's encoding argument were removed in pandas 2.0)
        writer.close()
def patch_spider_goods(self):
"""
批量爬取淘宝商品
如果爬取20多页不能爬,可以分段爬取
"""
# 写入数据前先清空之前的数据
# if os.path.exists(GOODS_EXCEL_PATH):
# os.remove(GOODS_EXCEL_PATH)
# 批量爬取10页试试
for i in range(0, 5):
            print('Page %d' % (i + 1))
self.spider_goods(i)
            # Pause for a while between pages
time.sleep(random.randint(20, 30))
if __name__ == '__main__':
    gs = GoodsSpider("硬盘")  # search keyword: "硬盘" (hard drives)
gs.patch_spider_goods()
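One practical note on the scraped fields: view_sales comes back as a display string (on Taobao search pages it typically looks like '1000+人付款'), so it cannot be summed or sorted directly. Below is a small sketch of a converter; the assumed string format is an observation about the page, not part of Zhu Ge's original code, so adjust the regex if the format differs.
import re

def parse_view_sales(view_sales):
    """Convert a sales string such as '1000+人付款' or '1.5万人付款' to an int.

    Assumes the string starts with a number, optionally followed by '+' or '万';
    returns 0 when no number is found.
    """
    match = re.search(r'(\d+(?:\.\d+)?)(万)?', view_sales)
    if not match:
        return 0
    value = float(match.group(1))
    if match.group(2):  # '万' means ten thousand
        value *= 10000
    return int(value)

# Example: parse_view_sales('1.5万人付款') returns 15000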
Keep it up, and keep studying crawling, data mining and data cleaning!!! Let's go, 2020!!!