Scraping Anjuke (安居客) listings with Python + Scrapy and saving the data to MySQL, SQLite, and MongoDB

The code is fairly self-explanatory, so it is only lightly commented. Questions are welcome in the comments and I'll answer them all.
Spider module:

# -*- coding: utf-8 -*-

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from anjuke2.items import Anjuke2Item

class Anju2Spider(CrawlSpider):
    name = 'anju2'
    allowed_domains = ['sh.zu.anjuke.com']
    start_urls = ['http://sh.zu.anjuke.com/']
    
    # Extract pagination links from the page navigation bar; the rule below follows them
    page_link = LinkExtractor(restrict_xpaths='//div[@class="page-content"]/div/a')

    rules = ( 
        Rule(page_link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Every listing is a div under list-content; the first two and the last
        # div are not listings, so slice them off
        infolist = response.xpath('//div[@class="list-content"]/div')
        infolist = infolist[2:-1]
        for house in infolist:
            try:
                name = house.xpath('.//div[@class="zu-info"]/h3/a/text()')[0].extract()
                huxing = house.xpath('.//div[@class="zu-info"]/p/text()')[0].extract().split(' ')[-1]
                louceng = house.xpath('.//div[@class="zu-info"]/p/text()')[2].extract()
                mianji = house.xpath('.//div[@class="zu-info"]/p/text()')[1].extract()
                addrss = house.xpath('.//div[@class="zu-info"]/address/a/text()')[0].extract()
                chuzufangshi = house.xpath('.//div[@class="zu-info"]/p[2]/span[1]/text()')[0].extract()
                rent = house.xpath('.//div[@class="zu-side"]/p/strong/text()')[0].extract()
                
                item = Anjuke2Item()
                # The current city is read from the city selector at the top of the page
                city = response.xpath('//div[@class="cityselect"]/div[1]/text()')[0].extract().split(' ')[-1]

                item['city'] = city
                item['name'] = name
                item['huxing'] = huxing
                item['louceng'] = louceng
                item['mianji'] = mianji
                item['addrss'] = addrss
                item['chuzufangshi'] = chuzufangshi
                item['rent'] = rent
                yield item
            except Exception as e:
                print(e)
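
If you prefer to launch the crawl from a script rather than the scrapy crawl anju2 command line, a minimal sketch using Scrapy's CrawlerProcess (run it from the project root so the project settings are picked up):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('anju2')   # spider name defined above
process.start()          # blocks until the crawl finishes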

Pipeline module:
# Write to a local file

import json
class Anjuke2Pipeline(object):
    def open_spider(self,spider):
        self.fp=open('上海.txt','w',encoding='utf8')
    def close_spider(self,spider):
        self.fp.close()

    def process_item(self, item, spider):
        # Serialize each item as one JSON object per line
        dic = dict(item)
        string = json.dumps(dic, ensure_ascii=False)
        self.fp.write(string + '\n')
        return item
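
Because the pipeline writes one JSON object per line, the file can be read back with a few lines (a minimal sketch, assuming the 上海.txt produced above):

import json

with open('上海.txt', encoding='utf8') as fp:
    houses = [json.loads(line) for line in fp if line.strip()]
print(len(houses))            # number of listings collected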

# Write to MongoDB

import pymongo
class mongodbPipeline(object):
    def open_spider(self,spider):
        self.client=pymongo.MongoClient(host='localhost',port=27017)
    def close_spider(self,spider):
        self.client.close()
    def process_item(self, item, spider):
        db = self.client.anjuke
        clo = db.zufang
        # insert_one replaces the deprecated insert() in pymongo 3+
        clo.insert_one(dict(item))
        return item
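
To check what was written, the same collection can be queried back with pymongo (a minimal sketch; the database and collection names match the pipeline above):

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client.anjuke.zufang
print(collection.count_documents({}))      # total number of listings stored
for doc in collection.find().limit(3):
    print(doc['name'], doc['rent'])
client.close()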

# Write to MySQL via pymysql

import pymysql
class mysqlPipeline(object):
    def open_spider(self,spider):
        self.connect = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', database='anjuke', charset='utf8')
    def close_spider(self,spider):
        self.connect.close()
    def process_item(self,item,spider):
        self.save_mysql(item)
        return item
    def save_mysql(self, item):
        cursor = self.connect.cursor()
        # Parameterized query: the driver escapes the values itself
        sql = ('insert into zufang(city,title,huxing,louceng,mianji,addrss,chuzufangshi,rent) '
               'values(%s,%s,%s,%s,%s,%s,%s,%s)')
        values = (item['city'], item['name'], item['huxing'], item['louceng'],
                  item['mianji'], item['addrss'], item['chuzufangshi'], item['rent'])
        try:
            cursor.execute(sql, values)
            self.connect.commit()
        except Exception as e:
            print(e)
            self.connect.rollback()
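
This pipeline assumes the anjuke database already contains a zufang table. A minimal sketch of creating it (column names match the INSERT above; the varchar lengths are an assumption):

import pymysql

connect = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                          password='123456', database='anjuke', charset='utf8')
cursor = connect.cursor()
cursor.execute('''create table if not exists zufang(
                    city varchar(50),
                    title varchar(100),
                    huxing varchar(50),
                    louceng varchar(50),
                    mianji varchar(50),
                    addrss varchar(100),
                    chuzufangshi varchar(50),
                    rent varchar(50))''')
connect.commit()
connect.close()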

# Write to SQLite

import sqlite3
class sqllitPipeline(object):
    def open_spider(self, spider):
        self.db = sqlite3.connect('sql.db')
        self.cur = self.db.cursor()
        # Create the table on startup if it does not exist yet
        sql1 = '''create table if not exists zufang(
                    city char(50)  not null,
                    title char(50) not null,
                    huxing char(50) not null,
                    louceng char(50) not null,
                    mianji char(50) not null,
                    addrss char(50) not null,
                    chuzufangshi char(50) not null,
                    rent char(50) not null)'''
        self.cur.execute(sql1)
    def close_spider(self,spider):
        self.db.close()

    def process_item(self,item,spider):
        self.save_to_sqlite(item)

        return item
    def save_to_sqlite(self, item):
        # sqlite3 uses ? placeholders for parameterized queries
        sql = ('insert into zufang(city,title,huxing,louceng,mianji,addrss,chuzufangshi,rent) '
               'values(?,?,?,?,?,?,?,?)')
        values = (item['city'], item['name'], item['huxing'], item['louceng'],
                  item['mianji'], item['addrss'], item['chuzufangshi'], item['rent'])
        try:
            self.cur.execute(sql, values)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
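
To confirm the rows landed in sql.db, they can be read back with the standard sqlite3 module (a small sketch):

import sqlite3

db = sqlite3.connect('sql.db')
cur = db.cursor()
cur.execute('select city, title, rent from zufang limit 5')
for row in cur.fetchall():
    print(row)
db.close()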

Items module:

import scrapy

class Anjuke2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    city = scrapy.Field()
    name = scrapy.Field()
    huxing = scrapy.Field()
    louceng = scrapy.Field()
    mianji = scrapy.Field()
    addrss = scrapy.Field()
    chuzufangshi = scrapy.Field()
    rent = scrapy.Field()

Settings module:

BOT_NAME = 'anjuke2'

SPIDER_MODULES = ['anjuke2.spiders']
NEWSPIDER_MODULE = 'anjuke2.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3

ITEM_PIPELINES = {
   'anjuke2.pipelines.Anjuke2Pipeline': 300,
   # 'anjuke2.pipelines.mongodbPipeline': 301,
   # 'anjuke2.pipelines.mysqlPipeline': 302,
   # 'anjuke2.pipelines.sqllitPipeline': 303,
}
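
To write to the local file and all three databases in a single run, enable every pipeline at once; the number is the priority, and lower values run first:

ITEM_PIPELINES = {
   'anjuke2.pipelines.Anjuke2Pipeline': 300,
   'anjuke2.pipelines.mongodbPipeline': 301,
   'anjuke2.pipelines.mysqlPipeline': 302,
   'anjuke2.pipelines.sqllitPipeline': 303,
}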