文章目录
前言
本篇文章仅供 Python 学习交流,禁止用于商业用途。代码中需要优化的地方还有很多,比如爬取页数的设置,又比如按条件排序后再爬取。想拿走就拿走,毕竟也得不到什么;这里只是对自己的成果做一下记录,仅此而已。
一、前期准备
1.安装浏览器驱动(ChromeDriver)
1.下载与本机 Chrome 版本对应的驱动
2.配置环境变量
将下载好的 chromedriver 放入谷歌浏览器(Chrome)的安装目录下,然后把该目录添加到系统环境变量 PATH 中
3.再把 chromedriver 复制一份到 Python 的安装文件夹
二、使用步骤
1.引入库selenium
import time
from urllib.parse import quote

import xlwt
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
2.爬虫编写
class Taobao:
    """Selenium-driven Taobao scraper that exports search results to ``.xls``.

    The scraped columns are accumulated in the module-level lists
    ``info_list``, ``price_list``, ``deal_list``, ``url_list``, ``name_list``
    and ``position_list``; the script entry point must create them before
    :meth:`run` is called.  The worksheet is shared through the module-level
    name ``sheet``, which :meth:`run` assigns.
    """

    # Multiplier applied to the page index to build the ``s`` query parameter.
    # NOTE(review): presumably the number of items per result page -- confirm.
    _PAGE_STEP = 44

    def __init__(self):
        self.url = 'http://www.taobao.com'
        self.savepath = "淘宝数据.xls"  # path the workbook is saved to
        # Workbook object that receives the scraped rows.
        self.book = xlwt.Workbook(encoding="utf-8", style_compression=0)

    @classmethod
    def _search_url(cls, keyword, page_index):
        """Return the search URL for *keyword* at result page *page_index*.

        The keyword is percent-encoded so that spaces, ``&``, ``#`` or
        non-ASCII characters cannot corrupt the query string.
        """
        return 'https://s.taobao.com/search?q={}&s={}'.format(
            quote(keyword), cls._PAGE_STEP * page_index)

    def do_slider(self, wait, driver, offset=258):
        """Drag the slider-captcha handle horizontally and release it.

        :param wait: ``WebDriverWait`` used to locate the handle ``#nc_1_n1z``.
        :param driver: WebDriver the action chain is bound to.
        :param offset: horizontal distance passed to ``move_by_offset``
            (defaults to 258, the value that used to be hard-coded).
        :return: None
        """
        slider_go = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#nc_1_n1z'))
        )
        # Build an action chain bound to the browser.
        action = ActionChains(driver)
        action.reset_actions()
        # Press and hold the handle, drag it, then let go.  Without the
        # release the mouse button would stay pressed after the drag.
        action.click_and_hold(slider_go)
        action.move_by_offset(xoffset=offset, yoffset=0)
        action.release()
        action.perform()
        time.sleep(1)

    def search_product(self, username, password, keyword, driver, wait):
        """Search for *keyword*, then fill in and submit the login form.

        After submitting, a slider captcha is handled on a best-effort basis
        if one shows up within the timeout of *wait*.

        :param username: account name typed into ``#fm-login-id``.
        :param password: password typed into ``#fm-login-password``.
        :param keyword: search keyword typed into the search box.
        :param driver: WebDriver showing the Taobao home page.
        :param wait: ``WebDriverWait`` used for all explicit waits.
        """
        # Type the keyword and start the search.
        driver.find_element(By.ID, 'q').send_keys(keyword)
        time.sleep(2)
        driver.find_element(By.CLASS_NAME, 'btn-search').click()
        time.sleep(2)
        # Locate the login form controls.
        input_username = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-id'))
        )
        input_password = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-password'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#login-form > div.fm-btn > button'))
        )
        input_username.send_keys(username)
        time.sleep(2)
        input_password.send_keys(password)
        time.sleep(2)
        submit.click()
        # Alternative QR-code login, kept for reference:
        # driver.find_element(By.CSS_SELECTOR, '.iconfont.icon-qrcode').click()
        # driver.find_element(By.CSS_SELECTOR, '[type=submit]').click()
        # time.sleep(10)

        # Check whether a slider captcha appeared.  ``wait.until`` either
        # returns the element or raises ``TimeoutException``.
        try:
            wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#nc_1__scale_text > span'))
            )
        except TimeoutException:
            print('未发现滑块')
        else:
            print('发现滑块验证码')
            time.sleep(2)
            try:
                self.do_slider(wait, driver)
                submit.click()
            except WebDriverException as exc:
                # Best effort: report the failure instead of hiding it, but
                # let the caller carry on as before.
                print('滑块验证处理失败: %s' % exc)
        # NOTE: this only signals that the login flow finished; the login
        # result itself is not verified here.
        print('登录成功')

    def drop_down(self, driver):
        """Scroll the page down in five steps (10%, 30%, ... 90% of its height).

        A short pause precedes every step.
        """
        for x in range(1, 11, 2):
            time.sleep(0.5)
            j = x / 10
            js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
            driver.execute_script(js)

    def get_product(self, driver):
        """Append the fields of every product on the current page to the
        module-level result lists.

        All six fields of an item are read first and appended together, so
        the lists keep the same length even if a lookup raises midway.
        """
        columns = (info_list, price_list, deal_list,
                   url_list, name_list, position_list)
        items = driver.find_elements(
            By.XPATH, '//div[@class="items"]/div[@class="item J_MouserOnverReq "]')
        for item in items:
            row = (
                item.find_element(By.XPATH, './/div[@class="row row-2 title"]').text,
                item.find_element(By.XPATH, './/a[@class="J_ClickStat"]').get_attribute('trace-price'),
                item.find_element(By.XPATH, './/div[@class="deal-cnt"]').text,
                # This is the ``src`` of the product picture.
                item.find_element(By.XPATH, './/div[@class="pic"]/a/img').get_attribute('src'),
                item.find_element(By.XPATH, './/div[@class="shop"]/a/span[2]').text,
                item.find_element(
                    By.XPATH, './/div[@class="row row-3 g-clearfix"]/div[@class="location"]').text,
            )
            for column, value in zip(columns, row):
                column.append(value)

    def saveDate(self, all_list):
        """Write the header row and all data rows to ``sheet`` and save the book.

        :param all_list: six parallel lists in the order title, price, sales,
            link, shop name, shipping location.  Rows are built with ``zip``,
            so the output stops at the shortest list.
        """
        col = ("商品标题", "商品价格", "商品销量", "商品链接", "店铺名称", "商品发货地")
        # Header row: write every column title, including the last one.
        for col_idx, title in enumerate(col):
            sheet.write(0, col_idx, title)
        # Data rows start at row 1, right below the header.
        for row_idx, row in enumerate(zip(*all_list), start=1):
            for col_idx, value in enumerate(row):
                sheet.write(row_idx, col_idx, value)
        self.book.save(self.savepath)

    def run(self, username, password, pages=5):
        """Run the whole scrape: log in, collect *pages* result pages, save.

        :param username: Taobao account name.
        :param password: Taobao account password.
        :param pages: total number of result pages to scrape, counting the
            page shown right after the search (defaults to 5, the value that
            used to be hard-coded).
        """
        global sheet
        sheet = self.book.add_sheet('淘宝数据', cell_overwrite_ok=True)  # create the worksheet
        keyword = input('请输入淘宝关键字:')
        options = webdriver.ChromeOptions()
        # options.add_argument("--headless")  # enable to run Chrome without a window
        driver = webdriver.Chrome(options=options)
        try:
            # Hide ``navigator.webdriver`` from the page to reduce detection.
            driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": '''
                Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
                })
                '''
            })
            wait = WebDriverWait(driver, 5)
            driver.get(self.url)
            self.search_product(username, password, keyword, driver, wait)
            self.drop_down(driver)
            self.get_product(driver)
            # Only the follow-up pages use the implicit wait; set it once
            # instead of on every iteration.
            driver.implicitly_wait(10)
            for num in range(1, pages):
                page_url = self._search_url(keyword, num)
                driver.get(page_url)
                print(page_url)
                self.drop_down(driver)
                self.get_product(driver)
        finally:
            # Always shut the browser session down, even when scraping fails.
            driver.quit()
        all_list = [info_list, price_list, deal_list, url_list, name_list, position_list]
        # Store the collected data in the workbook.
        self.saveDate(all_list)
if __name__ == '__main__':
    # Module-level result columns; Taobao.get_product appends to them and
    # Taobao.run hands them to saveDate, so they must exist before run().
    info_list, price_list, deal_list = [], [], []
    url_list, name_list, position_list = [], [], []

    # Placeholder credentials -- replace with a real account before running.
    username = '您的手机号'
    password = '浏览器登录的密码'

    scraper = Taobao()
    scraper.run(username, password)