Scrapy basics

This article shows how to build a crawler project with the Scrapy framework, using the China Weather Network (weather.com.cn) as an example. It covers the project and spider creation commands, defining an Item, and parsing pages with XPath, and it also walks through basic Item Pipeline configuration and implementation, since parsing and the pipeline components are where most of the work happens.


Install Scrapy

pip install Scrapy

This tutorial uses scraping weather data from the China Weather Network as the running example.
1. Create a Scrapy project

scrapy startproject <project_name>

2. Create a spider

scrapy genspider <spider_name> <domain>
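
For this tutorial the two commands might look like the following (demo03 matches the import paths used later in this article, and weather3 matches the name attribute in the spider below):

scrapy startproject demo03
cd demo03
scrapy genspider weather3 weather.com.cn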

3. Define the Item

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Demo03Item2(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # observation time
    date = scrapy.Field()
    # weather condition
    state = scrapy.Field()
    # temperature
    temp = scrapy.Field()
    # wind direction
    wind = scrapy.Field()
    # relative humidity
    humidity = scrapy.Field()
    # air quality
    air = scrapy.Field()
    # precipitation
    amount = scrapy.Field()
    # wind speed
    speed = scrapy.Field()
    # city name
    city = scrapy.Field()
    # area (weather station) code
    areacode = scrapy.Field()
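
Note that the spider in the next section imports this class with from demo03.items02 import Demo03Item2, so here the item class is assumed to live in demo03/items02.py rather than the default items.py. Scrapy items are used like dictionaries, e.g. (illustrative values only):

item = Demo03Item2()
item['city'] = 'Beijing'
item['temp'] = '25'
print(dict(item))  # {'city': 'Beijing', 'temp': '25'}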

Write your own spider
Open the spider file that was just created and write the crawling logic.
Here is the example I wrote:

import scrapy
import pandas as pd
from demo03.items02 import Demo03Item2
import datetime
import json
import os


class WeatherSpider(scrapy.Spider):

    name = "weather3"

    # Earlier version that loaded the city data with the csv module:
    # def fileload(filename='weather.csv'):
    #     csvfile = open(filename, encoding='GBK')
    #     data = csv.reader(csvfile)
    #     dataset = []
    #     for line in data:
    #         dataset.append(line)
    #     csvfile.close()
    #     return dataset

    # Load the city weather codes and build the start URLs
    Env_path = os.getcwd()
    City_WeatherCode = pd.read_csv(Env_path + '/weather.csv', encoding='GBK')
    WeatherCode = City_WeatherCode['citycode']
    start_urls = []
    for code in WeatherCode:
        start_urls.append('http://www.weather.com.cn/weather1d/%s.shtml' % code)

    def parse(self, response):
        self.logger.info('A response from %s just arrived!', response.url)
        # Extract the city weather code from the URL
        url = response.url
        split_url = url.split('/')
        get_areaCode = split_url[-1].split('.')
        areaCode = get_areaCode[0]

        sel = scrapy.Selector(response)
        # The breadcrumb links hold the province/city names
        title = sel.xpath('//div[@class="crumbs fl"]/a')
        city_list = title.xpath('text()').extract()
        city = ' '.join(city_list)
        span = sel.xpath('//div[@class="crumbs fl"]/span')
        area_list = span.xpath('text()').extract()
        area = area_list[-1]

        script = sel.xpath('//script')
        # Earlier version that scraped the weather condition from another script block:
        # scriptText = script[2].xpath('text()').extract_first()
        # scriptNewText = scriptText[23:193]
        # scriptArray = scriptNewText.split('\",\"')
        # stateArray = []
        # for index in scriptArray:
        #     indexArray = index.split(',')
        #     stateArray.append(indexArray[2])
        # itm = iter(stateArray)
        # state = next(itm)

        # Scrape the hourly observations embedded as JSON in a <script> tag
        NowDate = (datetime.datetime.now()).strftime('%Y-%m-%d')  # today's date
        pastTime = (datetime.datetime.now() - datetime.timedelta(hours=24)).strftime('%Y-%m-%d')  # yesterday's date
        scriptText = script[3].xpath('text()').extract_first()
        scriptStr = scriptText[23:-2]  # strip the JavaScript variable assignment, leaving raw JSON
        scriptJson = json.loads(scriptStr)
        WeatherList = scriptJson['od']['od2']
        List_len = len(WeatherList)
        label = 0
        for i in range(1, List_len):
            weather = Demo03Item2()

            weather['city'] = city + ' ' + area
            weather['areacode'] = areaCode
            weather_i = WeatherList[i]

            # Once the hour field rolls past '00', the remaining records belong to the previous day
            if WeatherList[i - 1]['od21'] == '00':
                label = 1
            if label == 1:
                weather['date'] = pastTime + ' ' + weather_i['od21']  # hour of observation
            else:
                weather['date'] = NowDate + ' ' + weather_i['od21']   # hour of observation
            weather['state'] = None
            weather['temp'] = weather_i['od22']      # temperature
            weather['wind'] = weather_i['od24']      # wind direction
            weather['speed'] = weather_i['od25']     # wind speed
            weather['amount'] = weather_i['od26']    # precipitation
            weather['humidity'] = weather_i['od27']  # relative humidity
            weather['air'] = weather_i['od28']       # air quality index (AQI)

            yield weather
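
The spider assumes a GBK-encoded weather.csv in the working directory containing at least a citycode column of weather.com.cn city codes, for example:

citycode
101010100
101020100

With that file in place, the spider is started from the project directory with:

scrapy crawl weather3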

As you can see, writing a crawler boils down to parsing specific pages with an appropriate technique and extracting the content you want. The example above uses a fairly basic approach: XPath. Pages can also be parsed with regular expressions and other methods, all of which are individually simple. Parsing the fetched pages is the most tedious and most important part of crawling; the difficulty is not in any single method but in combining them fluently to pull out exactly what you need. A quick sketch of both styles follows.
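
Here is a minimal sketch (using made-up HTML, not the real weather page) that extracts the breadcrumb text once with XPath and once with a regular expression:

import re
from scrapy import Selector

html = '<div class="crumbs fl"><a href="#">Hebei</a> > <span>Shijiazhuang</span></div>'
sel = Selector(text=html)

# XPath: select the text of every <a> inside the breadcrumb div
city_list = sel.xpath('//div[@class="crumbs fl"]/a/text()').extract()

# Regular expression: pull the same text straight out of the raw HTML
city_list_re = re.findall(r'<a[^>]*>([^<]+)</a>', html)

print(city_list, city_list_re)  # ['Hebei'] ['Hebei']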

To enable an Item Pipeline component, you must add its class to the ITEM_PIPELINES setting, as in the following example:

ITEM_PIPELINES = {
    'myproject.pipelines.PricePipeline': 300,
    'myproject.pipelines.JsonWriterPipeline': 800,
}

This configuration lives in settings.py; the project template already contains a commented-out ITEM_PIPELINES block, so usually you only need to uncomment it and point it at your own pipeline class.
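
For this project the uncommented entry would look roughly like this (assuming the pipeline class below is placed in the default demo03/pipelines.py):

ITEM_PIPELINES = {
    'demo03.pipelines.Demo03Pipeline': 300,
}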

Writing the pipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import cx_Oracle


class Demo03Pipeline(object):

    def getDbConn(self):
        # Connection details below are placeholders
        tns_name = cx_Oracle.makedsn('111.111.111.111', 1111, 'aaaa')
        connection = cx_Oracle.connect('aaaa', 'aaaa', tns_name, encoding='utf-8')
        return connection

    def process_item(self, item, spider):
        con = self.getDbConn()
        cur = con.cursor()
        sql = ("insert into CNWEATHER3(AREACODE,CITY,DATER,JN_STATE,TEMP,WIND,"
               "SPEED,HUMIDITY,AIR,JN_AMOUNT) "
               "values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')")
        lis = (item['areacode'], item['city'], item['date'], item['state'],
               item['temp'], item['wind'], item['speed'],
               item['humidity'], item['air'], item['amount'])
        try:
            cur.execute(sql % lis)
        except Exception as e:
            print("Insert Error:", e)
            con.rollback()
        else:
            con.commit()
        print('Inserted', cur.rowcount, 'weather record(s)')
        cur.close()
        con.close()
        return item

That covers writing the pipeline.
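
One possible refinement (a sketch, not the author's code): open the Oracle connection once per crawl in open_spider/close_spider and use bind variables instead of string formatting, so every item reuses the same connection. Connection details remain placeholders:

import cx_Oracle


class Demo03Pipeline(object):

    def open_spider(self, spider):
        # Called once when the spider starts
        dsn = cx_Oracle.makedsn('111.111.111.111', 1111, 'aaaa')
        self.con = cx_Oracle.connect('aaaa', 'aaaa', dsn, encoding='utf-8')
        self.cur = self.con.cursor()

    def close_spider(self, spider):
        # Called once when the spider finishes
        self.cur.close()
        self.con.close()

    def process_item(self, item, spider):
        sql = ("insert into CNWEATHER3(AREACODE,CITY,DATER,JN_STATE,TEMP,WIND,"
               "SPEED,HUMIDITY,AIR,JN_AMOUNT) "
               "values(:1,:2,:3,:4,:5,:6,:7,:8,:9,:10)")
        params = (item['areacode'], item['city'], item['date'], item['state'],
                  item['temp'], item['wind'], item['speed'],
                  item['humidity'], item['air'], item['amount'])
        try:
            self.cur.execute(sql, params)
        except Exception as e:
            print("Insert Error:", e)
            self.con.rollback()
        else:
            self.con.commit()
        return item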
