First, the crawler is written with the Scrapy framework. The crawler code basically follows the post by the blogger linked here; I only added the middlewares part. There is still a lot about crawling I don't fully understand, since I only know the basic principles and the Scrapy framework, but I'm not a crawler engineer after all, so I'll fill in the gaps when I have time.
Data Acquisition
The crawler code is as follows.
Items:
import scrapy
from scrapy.item import Item, Field

class Lagou2Item(scrapy.Item):
    # One field per value scraped from each job posting
    name = Field()       # company name
    location = Field()   # work location
    position = Field()   # job title
    exprience = Field()  # experience requirement
    money = Field()      # salary range
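Each Field() only declares a key; an instantiated Lagou2Item is then filled in like a dict inside the spider. A tiny usage sketch (the values are made up purely for illustration):

item = Lagou2Item()                 # behaves like a dict
item['name'] = 'Example Company'    # made-up value for illustration
item['money'] = '15k-25k'           # made-up value for illustration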
Middlewares:
import scrapy
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random

class LagouwangSpiderMiddleware(object):
    # Assigns a random User-Agent to every outgoing request
    def __init__(self, user_agent):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        # Read the list of User-Agent strings from MY_USER_AGENT in settings.py
        return cls(
            user_agent=crawler.settings.get('MY_USER_AGENT')
        )

    def process_request(self, request, spider):
        # Pick a random User-Agent for this request
        agent = random.choice(self.user_agent)
        request.headers['User-Agent'] = agent
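Note that although the class is named LagouwangSpiderMiddleware, it implements process_request, so it has to be registered as a downloader middleware. For it to take effect, settings.py also needs a MY_USER_AGENT list for random.choice to pick from. A minimal sketch, assuming the project package is named lagouwang (the UA strings and the priority value 543 are just examples):

# settings.py (sketch)
MY_USER_AGENT = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
]

DOWNLOADER_MIDDLEWARES = {
    # disable Scrapy's built-in UserAgentMiddleware so it does not overwrite the random one
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # the package name 'lagouwang' is an assumption; adjust it to the actual project name
    'lagouwang.middlewares.LagouwangSpiderMiddleware': 543,
}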
Pipelines:
from scrapy import signals
import json
import codecs
from openpyxl import Workbook

class LagouPipeline(object):
    def __init__(self):
        # Collect results in an Excel workbook
        self.workbook = Workbook()
        self.ws = self.workbook.active
        self.ws.append(['公司名称', '工作地点', '职位名称', '经验要求', '薪资待遇'])  # header row
        #self.file = codecs.open('lagouwang.json', 'w', encoding='utf-8')