import logging
import re
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class LagouSpider(object):
    """Selenium-driven crawler for the lagou.com job list behind ``self.url``.

    ``run`` walks every page of the result list, opens each posting in a
    second browser tab, extracts its fields and appends one dict per posting
    to ``self.positions``.
    """

    # Location of the chromedriver executable handed to Selenium.
    driver_path = r"D:\Python_pycharm\PyCharm Community Edition 2018.3.5\chromedriver.exe"

    # Maximum number of seconds to wait for a detail page to render.
    detail_timeout = 10

    def __init__(self):
        """Start a Chrome session and initialise the (empty) result list."""
        # NOTE(review): ``executable_path`` is the Selenium 3 style of locating
        # the driver; newer Selenium releases expect a Service object instead.
        # Confirm against the installed Selenium version.
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.positions = []

    def run(self):
        """Crawl every result page, then shut the browser down.

        The loop stops when the last pager element carries the
        ``pager_next_disabled`` class. The driver is quit in all cases,
        including when an exception escapes, so no browser process is left
        behind; ``self.positions`` keeps whatever was collected.
        """
        try:
            self.driver.get(self.url)
            while True:
                self.parse_list_page(self.driver.page_source)
                next_btn = self.driver.find_element(
                    By.XPATH, "//div[@class='pager_container']/span[last()]"
                )
                # get_attribute returns None when the attribute is absent.
                if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                    break
                next_btn.click()
                time.sleep(1)
        finally:
            self.driver.quit()

    def parse_list_page(self, source):
        """Visit every posting linked from one result page.

        Args:
            source: HTML of a result-list page.

        Each link is opened in a new window, parsed with
        ``parse_detail_links`` and the window is closed again. Focus always
        returns to the list window, even if parsing a posting fails.
        """
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        list_window = self.driver.current_window_handle
        log = logging.getLogger(__name__)

        for link in links:
            # Pass the URL as a script argument instead of splicing it into
            # the JavaScript text, so quotes in the URL cannot break the call.
            self.driver.execute_script("window.open(arguments[0])", link)
            detail_window = next(
                (h for h in self.driver.window_handles if h != list_window),
                None,
            )
            if detail_window is None:
                log.warning("no new window opened for %s; skipping", link)
                continue

            self.driver.switch_to.window(detail_window)
            try:
                # window.open returns before the page has loaded; wait until
                # the elements that parse_detail_links reads are present.
                WebDriverWait(self.driver, self.detail_timeout).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//dd[@class='job_request']//span")
                    )
                )
                self.parse_detail_links(self.driver.page_source)
            except TimeoutException:
                log.warning("detail page did not render in time: %s", link)
            finally:
                self.driver.close()
                time.sleep(1)
                self.driver.switch_to.window(list_window)

    def parse_detail_links(self, source):
        """Extract one posting from a detail page and store it.

        Args:
            source: HTML of a job detail page.

        Appends a dict with the keys ``position_name``, ``salary``, ``city``,
        ``experience``, ``education`` and ``desc`` to ``self.positions``.
        A page that lacks the title or has fewer than four ``job_request``
        spans is logged and skipped instead of raising ``IndexError``.
        """
        html = etree.HTML(source)
        name_nodes = html.xpath("//h1[@class='name']/text()")
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        if not name_nodes or len(job_request_spans) < 4:
            logging.getLogger(__name__).warning(
                "page does not look like a job detail page; skipping"
            )
            return

        salary, city, experience, education = (
            self._first_text(span) for span in job_request_spans[:4]
        )
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()

        self.positions.append({
            "position_name": name_nodes[0],
            "salary": salary,
            "city": self._clean_field(city),
            "experience": self._clean_field(experience),
            "education": self._clean_field(education),
            "desc": desc,
        })

    @staticmethod
    def _first_text(node):
        """Return the first text fragment under *node*, stripped, or ``""``."""
        texts = node.xpath('.//text()')
        return texts[0].strip() if texts else ""

    @staticmethod
    def _clean_field(text):
        """Remove all whitespace and ``/`` separators from *text*."""
        return re.sub(r"[\s/]", '', text)
if __name__ == '__main__':
    # Script entry point: crawl the whole result list, then dump every
    # collected posting to stdout.
    spider = LagouSpider()
    spider.run()
    print(spider.positions)