When scraping data with Selenium, the first thing is to understand where the problem lies:
after a click opens a new page, we cannot locate any of its elements, because the window handle is still pointing at the old page instead of the newly opened one. All we need to do is switch the handle to the new page, and everything works again.
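Stripped of the Zhaopin specifics, the pattern looks like the minimal sketch below. The URL and the XPath are placeholders, not part of the real spider, and `driver.switch_to.window()` is the same operation as the older `driver.switch_to_window()` used in the full code further down:

from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://example.com/list")                          # placeholder URL
driver.find_element_by_xpath("//a[@target='_blank']").click()   # placeholder link that opens a new window

# The driver is still focused on the ORIGINAL window, so elements of the
# new page cannot be located until the handle is switched.
original = driver.current_window_handle   # handle of the original window
handles = driver.window_handles           # handles of all open windows, as a list

driver.switch_to.window(handles[-1])      # focus the newly opened window
# ... locate and read elements of the new page here ...

driver.close()                            # close the new window
driver.switch_to.window(original)         # return to the original window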
Let's take Zhaopin (智联招聘) as an example and go straight to the code:
# -*- coding: utf-8 -*-
import scrapy
import time
from selenium import webdriver
from tutorial.items import TutorialItem
class jobboleSpider(scrapy.Spider):
    # Log in first. Note that this block sits in the class body, so it runs once,
    # when Scrapy imports the spider module.
    global driver  # make the driver a module-level global so parse()/auto_info() can use it
    name = 'tutorial'
    allowed_domains = ["tutorial.org"]
    start_urls = ['https://passport.zhaopin.com/org/login?DYWE=1500951340319.109686.1500951340.1500951340.1&y7bRbP=dponrhKTzMKTzMKTuTndxC6SfMXcLLK6ijFVr6jEz.V']
    driver = webdriver.Firefox()
    driver.get(start_urls[0])
    time.sleep(10)
    uname = ''     # your Zhaopin account name
    password = ''  # your Zhaopin password
    driver.find_element_by_xpath(".//*[@id='LoginName']").send_keys(uname)    # fill in the username
    driver.find_element_by_xpath(".//*[@id='Password']").send_keys(password)  # fill in the password
    time.sleep(5)
    driver.find_element_by_xpath(".//*[@id='loginbutton']").click()
    time.sleep(5)
    driver.get("https://rdsearch.zhaopin.com/Home/SearchByCustom?source=rd")
    driver.find_element_by_xpath(".//*[@id='SF_1_1_1']").send_keys("java")  # job keyword to search for
    driver.find_element_by_xpath(".//*[@id='searchSubmit']/button").click()
    time.sleep(10)
    # How many pages of results are there?
    global start_page  # starting page
    global page_count  # total number of pages
    page_index = str(driver.find_element_by_xpath(".//*[@id='rd-resumelist-pageNum']").text)  # text like "current/total"
    nowPag, pageCount = page_index.split("/")
    page_count = int(pageCount)
    print("Total number of pages =====================")
    print(page_count)
    x = 1  # initial value for the num / start_page defaults of parse()
    def parse(self, response, num=x, start_page=x):
        # num numbers the output files, start_page tracks the pagination;
        # both default to the class-level x (1) on the first call.
        j = 1  # row index in the result table; each resume occupies two <tr> rows
        for i in range(1, 31):  # 30 resumes per result page
            title = "html/body/div[6]/div[5]/form/table/tbody/tr[" + str(j) + "]/td[2]/a"
            driver.find_element_by_xpath(title).click()  # opens the resume in a new window
            time.sleep(3)
            # Switch the handle to the newly opened page
            handles = driver.window_handles    # all open window handles, returned as a list
            z = driver.current_window_handle   # handle of the current (result list) page
            driver.switch_to_window(driver.window_handles[1])  # move the handle to the new window
            time.sleep(2)
            # The method that actually scrapes the data
            self.auto_info(response, num)
            # Close the current (resume) window
            driver.close()
            # Move the handle back to the result list
            driver.switch_to_window(driver.window_handles[0])
            j = j + 2
            num += 1
            print("-----------------------")
            print(num)
        # Turn to the next page
        if start_page < page_count:  # page forward only while pages are left
            time.sleep(1)
            driver.find_element_by_xpath("html/body/div[6]/div[5]/div/div[3]/a[3]").click()
            time.sleep(2)
            start_page += 1
            print("Turning page ============================================")
            print(start_page)
            print(num)
            self.parse(response, num=num, start_page=start_page)
        else:
            print("------- Scraping finished ---------")
    def auto_info(self, response, num):
        print("Resume content ---------------------------------")
        path = "D:/test2/java/" + str(num) + ".txt"  # output directory must already exist
        print(path)
        f = open(path, 'a', encoding='utf-8')  # append; UTF-8 so Chinese text is written safely
        for tag in driver.find_elements_by_xpath(".//*[@id='resumeContentBody']/div"):
            try:
                f.write(tag.text)
                f.write("\n")
            except:
                continue  # skip any block whose text cannot be written
        f.close()
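To run the spider, use the usual Scrapy command from the project root (the `tutorial` project with `tutorial.items.TutorialItem` is assumed to exist already):

scrapy crawl tutorial

One caveat: the code above uses the Selenium 3 style helpers. On Selenium 4 the `find_element_by_xpath` and `switch_to_window` methods have been removed, so there you would write `driver.find_element(By.XPATH, ...)` (with `from selenium.webdriver.common.by import By`) and `driver.switch_to.window(handle)` instead.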