Installing Scrapy
pip install Scrapy
This tutorial uses scraping the China Weather Network (www.weather.com.cn) as a worked example.
1. Command to create a Scrapy project:
scrapy startproject <project_name>
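For example, running scrapy startproject demo03 (the project name used by the code below) generates a skeleton roughly like this; depending on your Scrapy version a middlewares.py may also appear:

demo03/
    scrapy.cfg            # deploy configuration file
    demo03/               # the project's Python module
        __init__.py
        items.py          # item definitions go here
        pipelines.py      # item pipelines go here
        settings.py       # project settings
        spiders/          # spiders live here
            __init__.py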
2. Command to create a spider:
scrapy genspider <spider_filename> <domain_to_crawl>
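For this tutorial that would be, for example, scrapy genspider weather3 weather.com.cn, which creates a stub under spiders/ that looks roughly like this (the exact template varies by Scrapy version):

import scrapy

class Weather3Spider(scrapy.Spider):
    name = 'weather3'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://weather.com.cn/']

    def parse(self, response):
        pass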
3. Define the Item
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class Demo03Item2(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # observation time
    date = scrapy.Field()
    # weather condition
    state = scrapy.Field()
    # temperature
    temp = scrapy.Field()
    # wind direction
    wind = scrapy.Field()
    # relative humidity
    humidity = scrapy.Field()
    # air quality
    air = scrapy.Field()
    # precipitation
    amount = scrapy.Field()
    # wind speed
    speed = scrapy.Field()
    # city name
    city = scrapy.Field()
    # area code
    areacode = scrapy.Field()
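An Item behaves like a dictionary restricted to the declared fields. A quick sketch of how it is used (the field values here are made up for illustration):

from demo03.items02 import Demo03Item2

item = Demo03Item2()
item['city'] = '北京 海淀'        # sample value only
item['temp'] = '23'
print(item['city'], dict(item))   # dict-style access and conversion both work
# item['foo'] = 1 would raise KeyError: 'foo' is not a declared field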
Writing your own spider file
Open the spider file you just created and write the crawling logic there. Below is my example:
import scrapy
import pandas as pd
from demo03.items02 import Demo03Item2
import datetime
import json
import os


class WeatherSpider(scrapy.Spider):
    name = "weather3"

    # Load the city weather codes.
    # An alternative loader using the csv module (requires `import csv`):
    # def fileload(filename='weather.csv'):
    #     csvfile = open(filename, encoding='GBK')
    #     data = csv.reader(csvfile)
    #     dataset = []
    #     for line in data:
    #         dataset.append(line)
    #     csvfile.close()
    #     return dataset
    Env_path = os.getcwd()
    City_WeatherCode = pd.read_csv(Env_path + '/weather.csv', encoding='GBK')
    WeatherCode = City_WeatherCode['citycode']
    start_urls = []
    for code in WeatherCode:
        start_urls.append('http://www.weather.com.cn/weather1d/%s.shtml' % code)

    def parse(self, response):
        self.logger.info('A response from %s just arrived!', response.url)
        # Extract the city weather code from the URL
        url = response.url
        split_url = url.split('/')
        get_areaCode = split_url[-1].split('.')
        areaCode = get_areaCode[0]
        sel = scrapy.Selector(response)
        title = sel.xpath('//div[@class="crumbs fl"]/a')
        city_list = title.xpath('text()').extract()
        city = ' '.join(city_list)
        span = sel.xpath('//div[@class="crumbs fl"]/span')
        area_list = span.xpath('text()').extract()
        area = area_list[-1]
        script = sel.xpath('//script')
        # Scraping the weather condition (currently unused, hence state is None below):
        # scriptText = script[2].xpath('text()').extract_first()
        # scriptNewText = scriptText[23:193]
        # scriptArray = scriptNewText.split('\",\"')
        # stateArray = []
        # for index in scriptArray:
        #     indexArray = index.split(',')
        #     stateArray.append(indexArray[2])
        # itm = iter(stateArray)
        # state = next(itm)
        # Scrape the real-time weather data
        NowDate = (datetime.datetime.now()).strftime('%Y-%m-%d')  # today's date
        pastTime = (datetime.datetime.now() - datetime.timedelta(hours=24)).strftime('%Y-%m-%d')  # yesterday's date
        scriptText = script[3].xpath('text()').extract_first()
        scriptStr = scriptText[23:-2]  # strip the JS wrapper around the JSON payload
        scriptJson = json.loads(scriptStr)
        WeatherList = scriptJson['od']['od2']
        List_len = len(WeatherList)
        label = 0
        for i in range(1, List_len):
            weather = Demo03Item2()
            weather['city'] = city + ' ' + area
            weather['areacode'] = areaCode
            weather_i = WeatherList[i]
            # The hourly list runs backwards from now; once we pass hour '00'
            # the remaining entries belong to the previous day.
            if WeatherList[i - 1]['od21'] == '00':
                label = 1
            if label == 1:
                weather['date'] = pastTime + ' ' + weather_i['od21']  # hour of day
            else:
                weather['date'] = NowDate + ' ' + weather_i['od21']   # hour of day
            weather['state'] = None                  # weather condition not scraped here
            weather['temp'] = weather_i['od22']      # temperature
            weather['wind'] = weather_i['od24']      # wind direction
            weather['speed'] = weather_i['od25']     # wind speed
            weather['amount'] = weather_i['od26']    # precipitation
            weather['humidity'] = weather_i['od27']  # relative humidity
            weather['air'] = weather_i['od28']       # air quality index (AQI)
            yield weather
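With the spider written, run it from the project root. Scrapy's standard -o option dumps the scraped items to a file, which is handy for checking the output before the database pipeline below is enabled (the filename here is just an example):

scrapy crawl weather3 -o weather_dump.json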
As you can see, writing a crawler boils down to parsing a specific page with a suitable method and extracting the content you want. The above is a fairly basic approach that parses the page with XPath; you can also parse pages with regular expressions and other techniques (see the sketch below). Parsing the fetched pages is both the most tedious and the most important part of crawling. The individual parsing methods are simple; the difficulty lies in combining them fluently to get exactly what you want.
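For instance, the area code that the spider above extracts with string splitting could just as well be pulled out with a regular expression. A minimal sketch (the pattern is mine, not from the original code, and 101010100 is a sample city code):

import re

url = 'http://www.weather.com.cn/weather1d/101010100.shtml'
# capture the numeric weather code between the last '/' and '.shtml'
match = re.search(r'/(\d+)\.shtml$', url)
if match:
    areaCode = match.group(1)   # '101010100'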
To enable an Item Pipeline component, you must add its class to the ITEM_PIPELINES setting, as in the following example:
ITEM_PIPELINES = {
    'myproject.pipelines.PricePipeline': 300,
    'myproject.pipelines.JsonWriterPipeline': 800,
}
This setting lives in settings.py; the generated project already contains a commented-out version, so you usually only need to uncomment it and adjust the class path. The integer values determine the order in which pipelines run (lower values run first) and are conventionally chosen in the 0-1000 range.
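For this project, the uncommented setting would look something like this (assuming the default module path that scrapy startproject generates; adjust the class name if yours differs):

ITEM_PIPELINES = {
    'demo03.pipelines.Demo03Pipeline': 300,
}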
Writing the pipeline
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import cx_Oracle


class Demo03Pipeline(object):
    def getDbConn(self):
        # Placeholder host/port/SID and credentials; substitute your own
        tns_name = cx_Oracle.makedsn('111.111.111.111', 1111, 'aaaa')
        connection = cx_Oracle.connect('aaaa', 'aaaa', tns_name, encoding='utf-8')
        return connection

    def process_item(self, item, spider):
        con = self.getDbConn()
        cur = con.cursor()
        # Use bind variables rather than string formatting to avoid SQL
        # injection and quoting problems
        sql = ("insert into CNWEATHER3(AREACODE,CITY,DATER,JN_STATE,TEMP,WIND,"
               "SPEED,HUMIDITY,AIR,JN_AMOUNT) "
               "values(:1,:2,:3,:4,:5,:6,:7,:8,:9,:10)")
        lis = (
            item['areacode'], item['city'], item['date'], item['state'], item['temp'], item['wind'], item['speed'],
            item['humidity'], item['air'], item['amount'])
        try:
            cur.execute(sql, lis)
        except Exception as e:
            print("Insert Error:", e)
            con.rollback()
        else:
            con.commit()
            print('Inserted', cur.rowcount, 'weather record(s)')
        cur.close()
        con.close()
        return item
That completes the pipeline; one further refinement is sketched below.
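Opening and closing a database connection for every single item is expensive. Scrapy pipelines provide open_spider and close_spider hooks that are well suited to connection reuse; a minimal sketch using the same placeholder connection details as above:

import cx_Oracle

class Demo03Pipeline(object):
    def open_spider(self, spider):
        # one connection for the whole crawl (placeholder DSN and credentials)
        tns_name = cx_Oracle.makedsn('111.111.111.111', 1111, 'aaaa')
        self.con = cx_Oracle.connect('aaaa', 'aaaa', tns_name, encoding='utf-8')

    def close_spider(self, spider):
        self.con.close()

    def process_item(self, item, spider):
        # same insert logic as above, but reusing self.con
        ...
        return item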