Some commands
pip install scrapy    (Scrapy is an asynchronous crawling framework)
scrapy startproject <project name>
cd scrape1
scrapy genspider example example.com
Run the spider: scrapy crawl <spider name>
If no pipeline is enabled, the command below can still generate an output file,
using the commented-out block in bb.py that returns the parsed data directly (see the sketch after bb.py):
scrapy crawl <spider name> -o <file path>.<extension>
scrapy crawl bb -o index/bb.csv
The walkthrough below assumes the spider name is bb.
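For orientation, startproject plus genspider produce a layout roughly like this (middlewares.py also exists but is not touched in this walkthrough):

scrape1/
├── scrapy.cfg
└── scrape1/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── bb.py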
bb.py
import scrapy
# The auto-generated import needs to be changed
# from scrape1.scrape1.items import Scrape1Item
# If PyCharm cannot resolve the path, right-click the project directory and choose Mark Directory as Sources Root
from scrape1.items import Scrape1Item


class BbSpider(scrapy.Spider):
    name = "bb"
    # allowed_domains = ["bb.com"]
    start_urls = ["<URL>"]

    def parse(self, response):
        res_data = response.json()['Data']['Posts']
        # ls = []
        # for item in res_data:
        #     title = item['title']
        #     name = item['owner']['name']
        #     view = item['stat']['view']
        #     danmaku = item['stat']['danmaku']
        #     desc = item['desc'].replace('\n', '').replace('\r', '').replace(' ', '')
        #     # print(title, name, view, danmaku, desc)
        #     ls.append({'title': title,
        #                'name': name,
        #                'view': view,
        #                'danmaku': danmaku,
        #                'desc': desc})
        # return ls
        for i in res_data:
            # Job title
            RecruitPostName = i['RecruitPostName']
            # Location
            LocationName = i['LocationName']
            # Last update time
            LastUpdateTime = i['LastUpdateTime']
            # Job category
            CategoryName = i['CategoryName']
            # print(RecruitPostName, LocationName, LastUpdateTime, CategoryName)
            items = Scrape1Item()
            # Bind the four extracted values to the item's four fields
            # object.attribute = value
            # An Item object is used like a dict: item[key] = value
            items['RecruitPostName'] = RecruitPostName
            items['LocationName'] = LocationName
            items['LastUpdateTime'] = LastUpdateTime
            items['CategoryName'] = CategoryName
            # items['desc'] = desc
            # Submit the item inside the loop with the yield keyword
            yield items
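For the -o export mentioned at the top (no Item class, no pipeline), parse can simply yield plain dicts and Scrapy's feed exporter writes them out. A minimal sketch, assuming the same Tencent list API; the spider name bb_export is hypothetical:

import scrapy

class BbExportSpider(scrapy.Spider):
    name = "bb_export"          # hypothetical name, only for this sketch
    start_urls = ["<URL>"]      # same list-page URL placeholder as above

    def parse(self, response):
        # Yielded dicts are picked up by "scrapy crawl bb_export -o index/bb.csv"
        for i in response.json()['Data']['Posts']:
            yield {
                'RecruitPostName': i['RecruitPostName'],
                'LocationName': i['LocationName'],
                'LastUpdateTime': i['LastUpdateTime'],
                'CategoryName': i['CategoryName'],
            }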
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


# First define the Item class --> create an Item object --> bind the data to the object --> submit the item to the pipeline
class Scrape1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Fields
    RecruitPostName = scrapy.Field()
    LocationName = scrapy.Field()
    LastUpdateTime = scrapy.Field()
    CategoryName = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# Save to a text file
class Scrape1Pipeline:
    # Runs once when the spider starts
    def open_spider(self, spider):
        # File object
        # If the attribute does not exist on the object it is added, otherwise it is modified
        # object.attribute = value
        self.f = open('index/腾讯招聘.txt', 'a', encoding='utf-8')

    # Scrapy calls this method automatically:
    # every time the spider yields an item object, this method runs once
    def process_item(self, item, spider):
        # print(1)
        # Two parameters: item receives the yielded item object, spider receives the running spider object (BbSpider)
        # with open('index/排行榜数据.txt', 'a', encoding='utf-8') as f:
        #     s = f"标题{item['title']}作者{item['name']}浏览{item['view']}评论{item['danmaku']}\n"
        #     f.write(s)
        # The code below also writes all the data, but it opens the file once for every submitted item:
        # with open('index/排行榜数据.txt', 'a', encoding='utf-8') as f:
        #     s = f"名称{item['RecruitPostName']}区域{item['LocationName']}时间{item['LastUpdateTime']}工种{item['CategoryName']}\n"
        #     f.write(s)
        # Opening the file only once (in open_spider) still writes all the data and is more efficient
        s = f"名称:{item['RecruitPostName']}区域:{item['LocationName']}时间:{item['LastUpdateTime']}工种:{item['CategoryName']}\n"
        self.f.write(s)
        return item

    # Runs once after the spider closes
    def close_spider(self, spider):
        self.f.close()


from openpyxl import Workbook


class ExcelPipeline:
    def open_spider(self, spider):
        self.wb = Workbook()
        self.sh = self.wb.active
        self.sh.title = "腾讯招聘"
        self.sh.append(['名称', '区域', '时间', '工种'])

    def process_item(self, item, spider):
        self.sh.append([item['RecruitPostName'], item['LocationName'], item['LastUpdateTime'], item['CategoryName']])
        return item

    # Runs once after the spider closes
    def close_spider(self, spider):
        self.wb.save('./index/腾讯招聘.xlsx')


import pymysql


class MysqlPipeline:
    def open_spider(self, spider):
        try:
            self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', database='cheshi')
            self.cur = self.conn.cursor()
        except Exception:
            self.conn = 0
            self.cur = 0

    def process_item(self, item, spider):
        # Note: a parameterized query (self.cur.execute(sql, params)) would be safer against quoting issues
        sql = f"INSERT INTO tunxunzhaoping VALUES (NULL,'{item['RecruitPostName']}','{item['LocationName']}','{item['LastUpdateTime']}','{item['CategoryName']}')"
        try:
            self.cur.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    # Runs once after the spider closes
    def close_spider(self, spider):
        if self.conn != 0 and self.cur != 0:
            self.cur.close()
            self.conn.close()
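The INSERT above relies on a table whose first column auto-increments, followed by four text columns. A sketch of creating it, assuming column names that mirror the item fields (the original table definition is not shown in the post):

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', database='cheshi')
cur = conn.cursor()
# Shape matches "INSERT INTO tunxunzhaoping VALUES (NULL, ..., ..., ..., ...)" above
cur.execute("""
    CREATE TABLE IF NOT EXISTS tunxunzhaoping (
        id INT AUTO_INCREMENT PRIMARY KEY,
        RecruitPostName VARCHAR(255),
        LocationName VARCHAR(255),
        LastUpdateTime VARCHAR(255),
        CategoryName VARCHAR(255)
    ) DEFAULT CHARSET=utf8mb4
""")
conn.commit()
cur.close()
conn.close()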
settings.py
Modify the following settings:
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
# Only output error-level logs
LOG_LEVEL = "ERROR"
# Enable the pipelines
ITEM_PIPELINES = {
    # The value is a numeric priority
    # The smaller the number, the higher the priority
    # With equal numbers, pipelines run in the order of the dict's key-value pairs
    "scrape1.pipelines.Scrape1Pipeline": 300,
    "scrape1.pipelines.MysqlPipeline": 300,
    "scrape1.pipelines.ExcelPipeline": 300,
}
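If you want the execution order to be explicit rather than relying on dict order, you could give each pipeline a distinct priority (the numbers below are just an example):

ITEM_PIPELINES = {
    "scrape1.pipelines.Scrape1Pipeline": 300,   # smallest number, runs first
    "scrape1.pipelines.ExcelPipeline": 301,
    "scrape1.pipelines.MysqlPipeline": 302,     # runs last
}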
Pagination can be written like this:
import scrapy
# The auto-generated import needs to be changed
# from scrape1.scrape1.items import Scrape1Item
# If PyCharm cannot resolve the path, right-click the project directory and choose Mark Directory as Sources Root
from scrape1.items import Scrape1Item


class BbSpider(scrapy.Spider):
    name = "bb"
    # allowed_domains = ["bb.com"]
    start_urls = ["https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1764041749332&countryId=&cityId=&bgIds=&productId=&categoryId=40001001,40001002,40001003,40001004,40001005,40001006&parentCategoryId=&attrId=1&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn"]
    page = 2
    count = 1

    def parse(self, response):
        res_data = response.json()['Data']['Posts']
        if not res_data:
            return
        for i in res_data:
            # Job title
            RecruitPostName = i['RecruitPostName']
            # Location
            LocationName = i['LocationName']
            # Last update time
            LastUpdateTime = i['LastUpdateTime']
            # Job category
            CategoryName = i['CategoryName']
            # Post id
            PostId = i['PostId']
            items = Scrape1Item()
            # Bind the extracted values to the item's fields (item[key] = value, just like a dict)
            items['RecruitPostName'] = RecruitPostName
            items['LocationName'] = LocationName
            items['LastUpdateTime'] = LastUpdateTime
            items['CategoryName'] = CategoryName
            # print(self.count, items)
            self.count += 1
            # items['desc'] = desc
            # Submit the item inside the loop with the yield keyword
            yield items
        # Issue the next request
        # Build the request object
        next_url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1764041749332&countryId=&cityId=&bgIds=&productId=&categoryId=40001001,40001002,40001003,40001004,40001005,40001006&parentCategoryId=&attrId=1&keyword=&pageIndex={self.page}&pageSize=10&language=zh-cn&area=cn'
        self.page += 1
        # Request the next page; the response object is handed back to the parse method for parsing
        yield scrapy.Request(url=next_url, callback=self.parse)
After pagination, add the detail-page data:
import scrapy
# The auto-generated import needs to be changed
# from scrape1.scrape1.items import Scrape1Item
# If PyCharm cannot resolve the path, right-click the project directory and choose Mark Directory as Sources Root
from scrape1.items import Scrape1Item


class BbSpider(scrapy.Spider):
    name = "bb"
    # allowed_domains = ["bb.com"]
    start_urls = ["https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1764041749332&countryId=&cityId=&bgIds=&productId=&categoryId=40001001,40001002,40001003,40001004,40001005,40001006&parentCategoryId=&attrId=1&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn"]
    page = 2
    count = 1

    def parse(self, response):
        res_data = response.json()['Data']['Posts']
        if not res_data:
            return
        for i in res_data:
            # Job title
            RecruitPostName = i['RecruitPostName']
            # Location
            LocationName = i['LocationName']
            # Last update time
            LastUpdateTime = i['LastUpdateTime']
            # Job category
            CategoryName = i['CategoryName']
            # Post id
            PostId = i['PostId']
            items = Scrape1Item()
            # Bind the extracted values to the item's fields (item[key] = value, just like a dict)
            items['RecruitPostName'] = RecruitPostName
            items['LocationName'] = LocationName
            items['LastUpdateTime'] = LastUpdateTime
            items['CategoryName'] = CategoryName
            detail_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1764132429731&postId={PostId}&language=zh-cn'
            # The detail page is parsed differently from the list page, so they cannot share one parse method;
            # pass the partially filled item along through meta
            yield scrapy.Request(url=detail_url, callback=self.detail_parse, meta={'i': items})
            # print(self.count, items)
            self.count += 1
            # items['desc'] = desc
            # The item is now yielded from detail_parse instead
            # yield items
        # Issue the next request
        # Build the request object
        next_url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1764041749332&countryId=&cityId=&bgIds=&productId=&categoryId=40001001,40001002,40001003,40001004,40001005,40001006&parentCategoryId=&attrId=1&keyword=&pageIndex={self.page}&pageSize=10&language=zh-cn&area=cn'
        self.page += 1
        # Request the next page; the response object is handed back to the parse method for parsing
        yield scrapy.Request(url=next_url, callback=self.parse)

    def detail_parse(self, response):
        items = response.meta['i']
        res_data = response.json()['Data']
        Requirement = res_data['Requirement']
        items['Requirement'] = Requirement
        # print(items)
        yield items
The other files also need the job-requirement field added accordingly (Requirement in items.py, plus the pipelines that write it out).
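A sketch of those changes, assuming the items.py and text-file pipeline shown above (the output label 要求 is my choice; the Excel headers and the MySQL table would need an extra column in the same way):

# items.py: declare the extra field
import scrapy

class Scrape1Item(scrapy.Item):
    RecruitPostName = scrapy.Field()
    LocationName = scrapy.Field()
    LastUpdateTime = scrapy.Field()
    CategoryName = scrapy.Field()
    Requirement = scrapy.Field()    # job requirement text from the detail API


# pipelines.py: write the extra field as well
class Scrape1Pipeline:
    def open_spider(self, spider):
        self.f = open('index/腾讯招聘.txt', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        s = (f"名称:{item['RecruitPostName']}区域:{item['LocationName']}"
             f"时间:{item['LastUpdateTime']}工种:{item['CategoryName']}"
             f"要求:{item['Requirement']}\n")
        self.f.write(s)
        return item

    def close_spider(self, spider):
        self.f.close()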