拉勾网翻页爬行插入数据库表_lagou网翻页-CSDN博客

本文链接：https://blog.csdn.net/yeyedewen/article/details/108687924
import re
import pymysql
import random
from time import sleep
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0

class LagouSpider(object):
    """Crawl Lagou job-search result pages with Selenium and store each job in MySQL.

    ``run`` opens the search URL in Chrome, walks the result pages via the
    pager's last element, opens every job link found on a page in a second
    browser tab, extracts the job fields with lxml XPath queries and inserts
    one row per job into the ``lagou_job`` table.
    """

    # Path of the ChromeDriver executable handed to Selenium.
    driver_path = r'D:\Python\Python38\Scripts\chromedriver.exe'

    # XPath of the last <span> in the pager; the code treats it as the
    # "next page" button and stops when its class contains
    # 'pager_next_disabled'.
    next_btn_xpath = '//div[@class="pager_container"]/span[last()]'

    # Keyword arguments for pymysql.connect(); override on a subclass or an
    # instance to point the spider at a different database.
    db_config = {
        'host': 'localhost',
        'user': 'root',
        'password': 'root',
        'database': 'book_manager',
        'port': 3306,
    }

    # Markup residue removed from job-description fragments. The alternatives
    # are matched independently; '\xa0' is the character an HTML parser
    # produces for '&nbsp;'.
    _DETAIL_NOISE = re.compile(r'<br\s*/?>|&nbsp;?|\xa0|\n')

    def __init__(self):
        """Set the search-result URL the crawl starts from."""
        self.url = 'https://www.lagou.com/jobs/list_python%E5%B7%A5%E7%A8%8B%E5%B8%88/p-city_215?&cl=false&fromSearch=true&labelWords=sug&suginput=python'

    def run(self):
        """Crawl every result page, then shut the browser down.

        Returns:
            The string '结束' once the pager's last element is disabled.
        """
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        try:
            self.driver.get(self.url)
            # Dismiss the red-envelope popup if it is there; a missing popup
            # must not abort the whole crawl.
            popup_buttons = self.driver.find_elements(By.CLASS_NAME, 'body-btn')
            if popup_buttons:
                popup_buttons[0].click()
            while True:
                # Wait for the pager BEFORE reading the page source so the
                # snapshot is taken from a rendered page.
                WebDriverWait(driver=self.driver, timeout=200).until(
                    EC.presence_of_element_located(
                        (By.XPATH, self.next_btn_xpath)))
                source = self.driver.page_source
                self.url_page(source)
                # Re-locate the button after the detail tabs were processed.
                next_btn = self.driver.find_element(
                    By.XPATH, self.next_btn_xpath)
                if 'pager_next_disabled' in next_btn.get_attribute("class"):
                    return '结束'
                next_btn.click()
                # Randomized pause before reading the next page.
                sleep(random.randint(2, 5))
        finally:
            # Always release the browser process, even when the crawl fails.
            self.driver.quit()

    def url_page(self, source):
        """Visit every job-detail link found in one result page.

        Args:
            source: HTML source of a search-result page.
        """
        page = etree.HTML(source)
        links = page.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.request_detail(link)
            sleep(2)

    def request_detail(self, url):
        """Open a job-detail page in a new tab, parse it, and close the tab.

        Args:
            url: Address of the job-detail page.
        """
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script('window.open(arguments[0])', url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        try:
            # Fixed randomized delay before reading the page (no explicit
            # wait is performed on the detail page).
            sleep(random.randint(2, 6))
            source = self.driver.page_source
            self.parse_deati_page(source)
            sleep(random.randint(2, 4))
        finally:
            # Close the detail tab and return to the result list even when
            # parsing or the database insert fails, so the next link still
            # opens as the second window.
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])

    @staticmethod
    def _clean_detail(text):
        """Return one job-description fragment without markup residue."""
        return LagouSpider._DETAIL_NOISE.sub('', text).strip()

    def parse_deati_page(self, source):
        """Extract the job fields from a detail page and store them.

        Args:
            source: HTML source of a job-detail page.
        """
        html = etree.HTML(source)
        company_name = html.xpath('//h4[@class="company"]/text()')
        position_name = html.xpath("//span[@class='pos-name']/text()")
        salary = html.xpath('//span[@class="salary"]/text()')
        publish_time = html.xpath('//p[@class="publish_time"]/text()')
        job_place = html.xpath('//div[@class="work_addr"]/a/text()')
        job_details = html.xpath("//div[@class='job-detail']/text()")
        ask = [self._clean_detail(fragment) for fragment in job_details]
        position = {
            'name': company_name,
            'position': position_name,
            'salary': salary,
            'publish': publish_time,
            'job_detail': ask,
            # The last address link is dropped -- presumably a non-address
            # link such as "view map"; TODO confirm against the page.
            'job_place': job_place[0:-1]
        }
        print(position['job_place'])
        self.insert_db(position)

    def insert_db(self, content):
        """Insert one job record into the ``lagou_job`` table.

        Args:
            content: Mapping with the keys 'name', 'position', 'salary',
                'publish', 'job_detail' and 'job_place'; each value is a
                list of strings that is concatenated into a single column
                value.
        """
        columns = ('name', 'position', 'salary', 'publish',
                   'job_detail', 'job_place')
        params = tuple(''.join(content[column]) for column in columns)
        sql = "insert into lagou_job(name,position,salary,publish," \
              "job_detail,job_place) values (%s,%s,%s,%s,%s,%s)"
        conn = pymysql.connect(**self.db_config)
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql, params)
            conn.commit()
        finally:
            # Close the connection even when the insert or commit raises.
            conn.close()
def main():
    """Build a LagouSpider and run the full crawl."""
    LagouSpider().run()

# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()