Scrapy: spider pipelines and JSON data

Some commands

pip install scrapy                      # Scrapy handles requests asynchronously
scrapy startproject <project name>
cd scrape1
scrapy genspider example example.com
scrapy crawl <spider name>              # run the spider

If no pipeline is enabled, the commands below can still generate an output file
(this goes with the commented-out block in bb.py that builds and returns a list of dicts):

scrapy crawl <spider name> -o <file path>.<extension>
scrapy crawl bb -o index/bb.csv
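
Instead of the -o flag, the feed export can also be configured once in settings.py; a minimal sketch, assuming a Scrapy version that supports the FEEDS setting (the output path index/bb.json is just an example):

FEEDS = {
    # one entry per output file
    "index/bb.json": {
        "format": "json",
        "encoding": "utf8",
    },
}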

Assuming the spider name is bb:

bb.py

import scrapy
# The auto-generated import statement needs to be changed:
#from scrape1.scrape1.items import Scrape1Item
# If PyCharm cannot resolve the path: right-click the project folder -> Mark Directory as -> Sources Root
from scrape1.items import Scrape1Item

class BbSpider(scrapy.Spider):
    name = "bb"
    #allowed_domains = ["bb.com"]
    start_urls = ["<start URL>"]

    def parse(self, response):
        res_data=response.json()['Data']['Posts']
        # ls=[]
        # for item in res_data:
        #     title=item['title']
        #     name=item['owner']['name']
        #     view=item['stat']['view']
        #     danmaku=item['stat']['danmaku']
        #     desc=item['desc'].replace('\n','').replace('\r','').replace(' ','')
        #     #print(title,name,view,danmaku,desc)
        #     ls.append({'title':title,
        #               'name':name,
        #                'view':view,
        #                'danmaku':danmaku,
        #                'desc':desc})
        # return ls


        for i in res_data:
            # job title
            RecruitPostName=i['RecruitPostName']
            # location
            LocationName=i['LocationName']
            # last updated time
            LastUpdateTime=i['LastUpdateTime']
            # job category
            CategoryName=i['CategoryName']

            items=Scrape1Item()
            # bind the extracted values to the item object's fields
            # a normal object would use: object.attribute = value
            # an Item object is used exactly like a dict: d[key] = value
            # so here: items[field_name] = value
            items['RecruitPostName']=RecruitPostName
            items['LocationName']=LocationName
            items['LastUpdateTime']=LastUpdateTime
            items['CategoryName']=CategoryName
            # yield the item inside the loop to hand it over to the pipeline
            yield items

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

# Workflow: define the Item class -> create an Item object -> bind the data to it -> yield the item to the pipeline
class Scrape1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # fields

    RecruitPostName =scrapy.Field()
    LocationName = scrapy.Field()
    LastUpdateTime = scrapy.Field()
    CategoryName = scrapy.Field()

pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
# Save items to a plain-text file
class Scrape1Pipeline:
    # runs once when the spider starts
    def open_spider(self,spider):
        # keep the file object on self
        # (assigning to self.f creates the attribute if it does not exist, otherwise it overwrites it)
        self.f=open('index/腾讯招聘.txt', 'a', encoding='utf-8')

    # Scrapy calls this method automatically:
    # it runs once for every item object yielded by the spider
    def process_item(self, item, spider):
        # item receives the yielded Scrape1Item object; spider receives the running spider (BbSpider)

        # The commented block below also writes every item, but it reopens the file for each one:
        # with open('index/排行榜数据.txt', 'a', encoding='utf-8') as f:
        #     s = f"名称{item['RecruitPostName']}区域{item['LocationName']}时间{item['LastUpdateTime']}工种{item['CategoryName']}\n"
        #     f.write(s)

        # Opening the file only once (in open_spider) and just writing here is more efficient
        s=f"名称:{item['RecruitPostName']}区域:{item['LocationName']}时间:{item['LastUpdateTime']}工种:{item['CategoryName']}\n"
        self.f.write(s)
        return item

    # runs once after the spider finishes
    def close_spider(self,spider):
        self.f.close()
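
Because the items are plain key/value data, the same open/write/close pattern also works for JSON output. A minimal sketch of a JSON Lines pipeline (the class name and output path are assumptions; it would also need its own entry in ITEM_PIPELINES):

import json
from itemadapter import ItemAdapter

class JsonLinesPipeline:
    # open the output file once when the spider starts
    def open_spider(self, spider):
        self.f = open('index/腾讯招聘.jsonl', 'a', encoding='utf-8')

    # write each item as one JSON object per line
    def process_item(self, item, spider):
        self.f.write(json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False) + '\n')
        return item

    # close the file once after the spider finishes
    def close_spider(self, spider):
        self.f.close()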


from openpyxl import Workbook
class ExcelPipeline:
    # create the workbook and the header row when the spider starts
    def open_spider(self,spider):
        self.wb=Workbook()
        self.sh=self.wb.active
        self.sh.title="腾讯招聘"
        self.sh.append(['名称','区域','时间','工种'])

    # append one row per item
    def process_item(self, item, spider):
        self.sh.append([item['RecruitPostName'],item['LocationName'],item['LastUpdateTime'],item['CategoryName']])
        return item

    # runs once after the spider finishes: save the workbook
    def close_spider(self,spider):
        self.wb.save('./index/腾讯招聘.xlsx')



import pymysql

class MysqlPipeline:
    # open the database connection when the spider starts
    def open_spider(self,spider):
        try:
            self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', database='cheshi')
            self.cur = self.conn.cursor()
        except:
            # if the connection fails, mark it so close_spider can skip closing
            self.conn=0
            self.cur=0

    # insert one row per item
    def process_item(self, item, spider):
        sql = f"INSERT INTO tunxunzhaoping VALUES (NULL,'{item['RecruitPostName']}','{item['LocationName']}','{item['LastUpdateTime']}','{item['CategoryName']}')"
        try:
            self.cur.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    # runs once after the spider finishes
    def close_spider(self,spider):
        if self.conn != 0 and self.cur != 0:
            self.cur.close()
            self.conn.close()
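
Building the SQL with an f-string works until a field contains a quote character. pymysql also supports parameterized queries, which handle the escaping; a sketch of the same insert inside process_item (same table and column order assumed):

        sql = "INSERT INTO tunxunzhaoping VALUES (NULL, %s, %s, %s, %s)"
        try:
            # pymysql substitutes and escapes the values itself
            self.cur.execute(sql, (item['RecruitPostName'], item['LocationName'],
                                   item['LastUpdateTime'], item['CategoryName']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()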

settings.py

Changes to make:
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"


# only output error-level logs
LOG_LEVEL = "ERROR"


# enable the pipelines
ITEM_PIPELINES = {
   # value = numeric priority: the lower the number, the earlier the pipeline runs
   # with equal numbers they run in the order they are listed here;
   # in practice each pipeline usually gets a distinct value between 0 and 1000
   "scrape1.pipelines.Scrape1Pipeline": 300,
   "scrape1.pipelines.MysqlPipeline": 300,
   "scrape1.pipelines.ExcelPipeline": 300,
}

Pagination can be written like this:

import scrapy
# The auto-generated import statement needs to be changed:
#from scrape1.scrape1.items import Scrape1Item
# If PyCharm cannot resolve the path: right-click the project folder -> Mark Directory as -> Sources Root
from scrape1.items import Scrape1Item

class BbSpider(scrapy.Spider):
    name = "bb"
    #allowed_domains = ["bb.com"]
    start_urls = ["https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1764041749332&countryId=&cityId=&bgIds=&productId=&categoryId=40001001,40001002,40001003,40001004,40001005,40001006&parentCategoryId=&attrId=1&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn"]
    page=2
    count=1
    def parse(self, response):
        res_data=response.json()['Data']['Posts']
        if not res_data:
            return
 
        for i in res_data:
            # job title
            RecruitPostName=i['RecruitPostName']
            # location
            LocationName=i['LocationName']
            # last updated time
            LastUpdateTime=i['LastUpdateTime']
            # job category
            CategoryName=i['CategoryName']
            # post id (not used here yet; needed for the detail page later)
            PostId=i['PostId']

            items=Scrape1Item()
            # bind the extracted values to the item's fields (dict-style access)
            items['RecruitPostName']=RecruitPostName
            items['LocationName']=LocationName
            items['LastUpdateTime']=LastUpdateTime
            items['CategoryName']=CategoryName

            #print(self.count,items)
            self.count+=1
            # yield the item inside the loop to hand it over to the pipeline
            yield items

        # build the request for the next page and increment the page counter
        next_url=f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1764041749332&countryId=&cityId=&bgIds=&productId=&categoryId=40001001,40001002,40001003,40001004,40001005,40001006&parentCategoryId=&attrId=1&keyword=&pageIndex={self.page}&pageSize=10&language=zh-cn&area=cn'
        self.page+=1
        # request the next page; on success the response is handed back to parse
        yield scrapy.Request(url=next_url, callback=self.parse)

After pagination, add the detail-page data:

import scrapy
# The auto-generated import statement needs to be changed:
#from scrape1.scrape1.items import Scrape1Item
# If PyCharm cannot resolve the path: right-click the project folder -> Mark Directory as -> Sources Root
from scrape1.items import Scrape1Item

class BbSpider(scrapy.Spider):
    name = "bb"
    #allowed_domains = ["bb.com"]
    start_urls = ["https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1764041749332&countryId=&cityId=&bgIds=&productId=&categoryId=40001001,40001002,40001003,40001004,40001005,40001006&parentCategoryId=&attrId=1&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn"]
    page=2
    count=1
    def parse(self, response):
        res_data=response.json()['Data']['Posts']
        if not res_data:
            return
        for i in res_data:
            # job title
            RecruitPostName=i['RecruitPostName']
            # location
            LocationName=i['LocationName']
            # last updated time
            LastUpdateTime=i['LastUpdateTime']
            # job category
            CategoryName=i['CategoryName']
            # post id, needed to build the detail-page URL
            PostId=i['PostId']

            items=Scrape1Item()
            # bind the extracted values to the item's fields (dict-style access)
            items['RecruitPostName']=RecruitPostName
            items['LocationName']=LocationName
            items['LastUpdateTime']=LastUpdateTime
            items['CategoryName']=CategoryName

            detail_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1764132429731&postId={PostId}&language=zh-cn'

            # The detail page is parsed differently from the list page, so it gets its own callback;
            # the partially filled item is passed along through meta
            yield scrapy.Request(url=detail_url, callback=self.detail_parse, meta={'i': items})

            #print(self.count,items)
            self.count+=1
            # the item is no longer yielded here: detail_parse yields it after adding the requirement text
            #yield items

        # build the request for the next page and increment the page counter
        next_url=f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1764041749332&countryId=&cityId=&bgIds=&productId=&categoryId=40001001,40001002,40001003,40001004,40001005,40001006&parentCategoryId=&attrId=1&keyword=&pageIndex={self.page}&pageSize=10&language=zh-cn&area=cn'
        self.page+=1
        # request the next page; on success the response is handed back to parse
        yield scrapy.Request(url=next_url, callback=self.parse)

    def detail_parse(self, response):
        # retrieve the item that parse attached to the request
        items=response.meta['i']
        res_data=response.json()['Data']
        # the detail API returns the job requirement text
        Requirement=res_data['Requirement']
        items['Requirement']=Requirement
        #print(items)
        yield items
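
Passing the item through meta works; recent Scrapy versions (1.7+) also accept cb_kwargs, which hands it to the callback as a normal parameter. A sketch of the same hand-off (fragment of the spider class, not a drop-in replacement):

            # in parse(), instead of meta={'i': items}:
            yield scrapy.Request(url=detail_url, callback=self.detail_parse,
                                 cb_kwargs={'items': items})

    def detail_parse(self, response, items):
        # the item arrives as a normal parameter instead of via response.meta
        items['Requirement'] = response.json()['Data']['Requirement']
        yield items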

The other files (items.py and the pipelines) also need this job-requirement field added accordingly.
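
For example, a sketch of the two obvious changes (the field name Requirement comes from detail_parse above):

# items.py — add the new field
class Scrape1Item(scrapy.Item):
    RecruitPostName = scrapy.Field()
    LocationName = scrapy.Field()
    LastUpdateTime = scrapy.Field()
    CategoryName = scrapy.Field()
    Requirement = scrapy.Field()

# pipelines.py — e.g. in Scrape1Pipeline.process_item, include it in the output line
s = (f"名称:{item['RecruitPostName']}区域:{item['LocationName']}"
     f"时间:{item['LastUpdateTime']}工种:{item['CategoryName']}"
     f"要求:{item['Requirement']}\n")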
