使用 Scrapy 框架爬取实习僧 Python 岗位第一页内的所有岗位名称

最新推荐文章于 2024-07-29 15:43:44 发布

Amazing_DAI

最新推荐文章于 2024-07-29 15:43:44 发布

阅读量291

点赞数

分类专栏： #python爬虫

本文链接：https://blog.youkuaiyun.com/Dai_Ge/article/details/106566504

版权

#python爬虫专栏收录该内容

3 篇文章

订阅专栏

import scrapy
from Scrapy.items import PositionItem
from bs4 import BeautifulSoup
import requests



class ShixisengSpider(scrapy.Spider):
    """Collect the job titles listed on the first Shixiseng results page.

    ``parse`` reads the detail-page link of every posting on the listing page
    in ``start_urls`` and schedules one Scrapy request per link.
    ``parse_position`` then extracts the job title from each detail page,
    prints it, and yields a ``PositionItem`` with ``url_cur`` and ``name`` set.
    """

    name = 'shixiseng'
    # allowed_domains must contain bare domain names, not URLs; a full URL
    # here makes Scrapy emit a warning and breaks offsite filtering.
    allowed_domains = ['shixiseng.com']
    start_urls = ['https://www.shixiseng.com/interns?page=1&keyword=Python&type=intern&area=&months=&days=&degree=&official=&enterprise=&salary=-0&publishTime=&sortType=&city=%E8%BF%90%E5%9F%8E&internExtend=']

    # XPath selecting the detail-page href of each posting on the listing page.
    _JOB_LINK_XPATH = (
        '//*[@id="__layout"]/div/div[2]/div[2]/div[1]/div[1]/div[1]'
        '//div/div[1]/div[1]/p[1]/a/@href'
    )

    def parse(self, response):
        """Schedule a request for every job detail page linked from the listing.

        Args:
            response: The Scrapy response for the listing page.

        Yields:
            One request per posting link; ``index`` (1-based position on the
            listing page) is forwarded to ``parse_position``.
        """
        hrefs = response.xpath(self._JOB_LINK_XPATH).getall()
        for index, href in enumerate(hrefs, start=1):
            # response.follow resolves relative hrefs against the page URL and
            # lets Scrapy download the page instead of blocking the reactor
            # with a synchronous HTTP call inside the callback.
            yield response.follow(
                href,
                callback=self.parse_position,
                cb_kwargs={'index': index},
            )

    def parse_position(self, response, index):
        """Extract the job title from one detail page.

        Args:
            response: The Scrapy response for a job detail page.
            index: 1-based position of this posting on the listing page.

        Yields:
            A ``PositionItem`` with ``url_cur`` (the detail page URL) and
            ``name`` (the job title text), when the title element is present.
        """
        soup = BeautifulSoup(response.text, "html.parser")
        title_box = soup.find(attrs={'class': 'new_job_name'})
        name_tag = title_box.find('span') if title_box is not None else None
        if name_tag is None:
            # The page layout differs from what is expected; skip this posting
            # rather than crash the whole crawl with an AttributeError.
            self.logger.warning('No job title found on %s', response.url)
            return

        # A fresh item per posting, so yielded items never share state.
        position = PositionItem()
        position['url_cur'] = response.url
        position['name'] = name_tag.get_text()
        # Detail pages are downloaded concurrently, so lines may print out of
        # order; ``index`` still identifies the posting's place on the listing.
        print(index, ': ', position['name'])
        yield position