(1) Create the Item
import scrapy

class zhaopinItem(scrapy.Item):
    company = scrapy.Field()  # company name
    content = scrapy.Field()  # job requirements / description
    url = scrapy.Field()      # link to the detail page
    pay = scrapy.Field()      # salary
    zhiwei = scrapy.Field()   # job title
    didian = scrapy.Field()   # location
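A scrapy.Item behaves like a dict with a fixed set of keys, which is what the JSON pipeline in the next step relies on when it calls dict(item). A quick illustration with made-up values (the spider stores lists in each field, because extract() returns lists):

item = zhaopinItem()
item['company'] = [u'SomeCompany']   # made-up example value
item['pay'] = [u'10000-15000']
print dict(item)   # -> {'company': [u'SomeCompany'], 'pay': [u'10000-15000']}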
(2) Add JSON support in pipelines.py
import json
import codecs

class JsonWithEncodingCnblogsPipeline(object):
    def __init__(self):
        # write the scraped items to a local JSON file
        FILE_SOURCE = '/Users/DDD/PycharmProjects/truespider/truespider/Json/'
        filename = '招聘.json'
        true_path = FILE_SOURCE + filename
        self.file = codecs.open(true_path, 'w', encoding='utf-8')
        self.first_item = True

    def process_item(self, item, spider):
        # one JSON object per item, keeping non-ASCII text readable
        line = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider (not spider_close) when the spider finishes
        self.file.close()
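Note that writing each object followed by ",\n" produces a comma-separated stream of objects rather than one valid JSON document. If you want output that standard tools can load directly, a common alternative is JSON Lines (one object per line, no trailing comma). A minimal sketch, not the original code; the output path is an assumption:

import json
import codecs

class JsonLinesPipeline(object):
    """Sketch of a JSON Lines variant: one JSON object per line."""
    def __init__(self):
        self.file = codecs.open('zhaopin.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()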
Register the pipeline in settings.py:

ITEM_PIPELINES = {
    'truespider.pipelines.JsonWithEncodingCnblogsPipeline': 300,
}
(3) Write the spider
# -*- coding:utf-8 -*-
import scrapy
from truespider.Item.zhaopin_Item import zhaopinItem
from scrapy.http import Request

class zhaopinSpider(scrapy.spiders.Spider):
    name = 'zhaopin'
    allowed_domains = ['sou.zhaopin.com']
    start_urls = [
        'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python%E5%B7%A5%E7%A8%8B%E5%B8%88&sm=0&p=1'
    ]

    # The detail pages have a messy structure, so parsing them is only stubbed out here
    def parse2(self, response):
        for info in response.xpath('//div[@class="tab-cont-box"]/div[@class="tab-inner-cont"]'):
            print info.extract()

    def parse(self, response):
        items = []
        for info in response.xpath('//div[@class="newlist_list_content"]/table')[1:]:
            item = zhaopinItem()
            item['company'] = info.xpath('tr/td[@class="gsmc"]/a/text()').extract()
            item['content'] = info.xpath('tr[@class="newlist_tr_detail"]/td/div[@class="newlist_detail"]/div[@class="clearfix"]/ul/li[@class="newlist_deatil_last"]/text()').extract()
            item['url'] = info.xpath('tr/td[@class="zwmc"]/div/a/@href').extract()
            item['pay'] = info.xpath('tr/td[@class="zwyx"]/text()').extract()
            item['zhiwei'] = info.xpath('tr/td[@class="zwmc"]/div/a/text()').extract()
            items.append(item)
            yield item  # handed to the JSON pipeline
        # follow the extracted detail-page URLs
        for item in items:
            # the detail URL falls outside allowed_domains, so pass dont_filter=True
            yield Request(item['url'][0], callback=self.parse2, dont_filter=True)
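The .extract() calls above return lists, which is why the follow-up request uses item['url'][0]. If you prefer storing single strings in the item (which is also easier on the MySQL pipeline in step 5), recent Scrapy versions offer extract_first() on selectors. A sketch of the same assignments, not the author's code:

item['company'] = info.xpath('tr/td[@class="gsmc"]/a/text()').extract_first()
item['url'] = info.xpath('tr/td[@class="zwmc"]/div/a/@href').extract_first()
# item['url'] is now a plain string, so the follow-up request becomes:
# yield Request(item['url'], callback=self.parse2, dont_filter=True)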
(4) Add a main file to make running the spider easy
# -*- coding:utf-8 -*-
from scrapy import cmdline
cmdline.execute('scrapy crawl zhaopin'.split())
(5) Save to the database
First create a table with a database admin tool (I used phpMyAdmin), setting up the column names and types.
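For reference, here is a rough script equivalent of that table. This is only a sketch: the TEXT column types, the table name zhaopin and the database name dbname are assumptions chosen to match the INSERT statement below; adjust them to your own setup.

# -*- coding:utf-8 -*-
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='pass',
                       db='dbname', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS zhaopin (
        id      INT AUTO_INCREMENT PRIMARY KEY,
        company TEXT,
        pay     TEXT,
        url     TEXT,
        zhiwei  TEXT,
        content TEXT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()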
Then add the following to pipelines.py:
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors

class MySQLStorePipeline(object):
    def __init__(self):
        # connection parameters for the local MySQL database
        dbargs = dict(
            host='127.0.0.1',
            db='dbname',
            user='root',
            passwd='pass',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )
        # adbapi runs the blocking MySQLdb calls in a thread pool
        self.dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)

    def process_item(self, item, spider):
        res = self.dbpool.runInteraction(self.insert_into_table, item)
        return item

    def insert_into_table(self, conn, item):
        conn.execute('insert into zhaopin(company,pay,url,zhiwei,content) VALUES(%s,%s,%s,%s,%s)', (
            item['company'],
            item['pay'],
            item['url'],
            item['zhiwei'],
            item['content']
        ))
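runInteraction returns a Twisted Deferred; the code above ignores it, so database errors vanish silently. If you want failures logged, one option is to attach an errback. A sketch of the two methods on MySQLStorePipeline (the _handle_error name is made up, and spider.logger assumes a reasonably recent Scrapy version):

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self.insert_into_table, item)
        d.addErrback(self._handle_error, item, spider)  # log failures instead of dropping them
        return item

    def _handle_error(self, failure, item, spider):
        spider.logger.error(failure)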
Finally, register both pipelines in settings.py:

ITEM_PIPELINES = {
    'truespider.pipelines.JsonWithEncodingCnblogsPipeline': 300,
    'truespider.pipelines.MySQLStorePipeline': 300,
}
Result: