import re
import pymysql
import random
from time import sleep
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
class LagouSpider(object):
    """Selenium-driven crawler for Lagou job search results.

    Walks the paginated result list at ``self.url``, opens every job's
    detail page in a second browser tab, extracts a handful of fields with
    lxml and stores one row per job in the MySQL table ``lagou_job``.
    """

    # Location of the chromedriver binary handed to ``webdriver.Chrome``.
    driver_path = r'D:\Python\Python38\Scripts\chromedriver.exe'

    def __init__(self):
        """Set the search-results URL the crawl starts from."""
        self.url = 'https://www.lagou.com/jobs/list_python%E5%B7%A5%E7%A8%8B%E5%B8%88/p-city_215?&cl=false&fromSearch=true&labelWords=sug&suginput=python'

    def run(self):
        """Crawl every result page, following the "next" pager button.

        Returns:
            The string ``'结束'`` once the pager's last ``<span>`` carries
            the ``pager_next_disabled`` class.

        The browser is always shut down on exit, including on errors, so a
        failed crawl does not leave a Chrome/chromedriver process behind.
        """
        pager_xpath = '//div[@class="pager_container"]/span[last()]'
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        try:
            self.driver.get(self.url)
            # A red-envelope promo dialog may pop up; dismiss it if present.
            # find_elements returns an empty list instead of raising when
            # the dialog is not shown.
            for cancel_btn in self.driver.find_elements(By.CLASS_NAME, 'body-btn')[:1]:
                cancel_btn.click()
            while True:
                # Wait for the pager first and only then snapshot the DOM;
                # reading page_source before the wait can capture a page
                # that has not finished rendering.
                WebDriverWait(driver=self.driver, timeout=200).until(
                    EC.presence_of_all_elements_located((By.XPATH, pager_xpath))
                )
                source = self.driver.page_source
                self.url_page(source)
                next_btn = self.driver.find_element(By.XPATH, pager_xpath)
                if 'pager_next_disabled' in (next_btn.get_attribute("class") or ''):
                    return '结束'
                next_btn.click()
                # Randomised pause between result pages.
                sleep(random.randint(2, 5))
        finally:
            self.driver.quit()

    def url_page(self, source):
        """Visit every job detail link found in one result page.

        Args:
            source: HTML text of a search-results page.
        """
        tree = etree.HTML(source)
        links = tree.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.request_detail(link)
            # Fixed pause between consecutive detail pages.
            sleep(2)

    def request_detail(self, url):
        """Open ``url`` in a new tab, parse it, then return to the list tab.

        Args:
            url: Address of a job detail page.

        The detail tab is closed and focus is restored to the originating
        window even when parsing or the database insert raises, so the
        next call still starts from the result list.
        """
        list_handle = self.driver.current_window_handle
        # Pass the URL as a script argument rather than splicing it into
        # the JavaScript source, so quotes in the URL cannot break the call.
        self.driver.execute_script('window.open(arguments[0])', url)
        # NOTE(review): assumes the newly opened tab is the last handle.
        self.driver.switch_to.window(self.driver.window_handles[-1])
        try:
            sleep(random.randint(2, 6))
            source = self.driver.page_source
            self.parse_deati_page(source)
            sleep(random.randint(2, 4))
        finally:
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    @staticmethod
    def _clean_job_detail(fragments):
        """Normalise raw description text nodes.

        Removes literal ``<br>`` tags, collapses every whitespace run
        (including newlines) to a single space, trims the ends and drops
        fragments that end up empty.

        Args:
            fragments: Iterable of strings.

        Returns:
            List of cleaned, non-empty strings in the original order.
        """
        cleaned = []
        for fragment in fragments:
            text = re.sub(r'<br\s*/?>', '', fragment)
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                cleaned.append(text)
        return cleaned

    def parse_deati_page(self, source):
        """Extract the job fields from a detail page and store them.

        Args:
            source: HTML text of a job detail page.
        """
        html = etree.HTML(source)
        company_name = html.xpath('//h4[@class="company"]/text()')
        position_name = html.xpath("//span[@class='pos-name']/text()")
        salary = html.xpath('//span[@class="salary"]/text()')
        publish_time = html.xpath('//p[@class="publish_time"]/text()')
        job_place = html.xpath('//div[@class="work_addr"]/a/text()')
        job_details = html.xpath("//div[@class='job-detail']/text()")
        # The previous pattern r'<br>,\n, ' only matched that exact literal
        # character sequence, so nothing was ever cleaned; the helper
        # handles tags and whitespace separately.
        ask = self._clean_job_detail(job_details)
        position = {
            'name': company_name,
            'position': position_name,
            'salary': salary,
            'publish': publish_time,
            'job_detail': ask,
            # Drops the last link text; presumably a non-address anchor
            # such as a "view map" link — TODO confirm against the page.
            'job_place': job_place[0:-1]
        }
        print(position['job_place'])
        self.insert_db(position)

    def insert_db(self, content):
        """Insert one job record into the ``lagou_job`` table.

        Args:
            content: Dict with the keys ``name``, ``position``, ``salary``,
                ``publish``, ``job_detail`` and ``job_place``; each value
                is a list of strings that is concatenated before storage.

        The connection is closed even if the insert or commit fails.
        """
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        conn = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            database='book_manager',
            port=3306
        )
        try:
            name = ''.join(content['name'])
            position = ''.join(content['position'])
            salary = ''.join(content['salary'])
            publish = ''.join(content['publish'])
            job_detail = ''.join(content['job_detail'])
            job_place = ''.join(content['job_place'])
            sql = "insert into lagou_job(name,position,salary,publish," \
                  "job_detail,job_place) values (%s,%s,%s,%s,%s,%s)"
            params = (name, position, salary, publish, job_detail, job_place)
            with conn.cursor() as cursor:
                cursor.execute(sql, params)
            conn.commit()
        finally:
            conn.close()
def main():
    """Build a LagouSpider and run the crawl."""
    LagouSpider().run()


if __name__ == '__main__':
    # Only start crawling when executed as a script, not on import.
    main()
# 效果图如下 (the resulting screenshot is shown below)