总结:总的来说不是很难,只是提取的字段有些多。总共获取了120多条南京房租信息。
1 爬取的item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class YoutxnanjinItem(scrapy.Item):
    """Container for one Nanjing rental listing scraped from youtx.com.

    Each attribute is a scrapy.Field() slot populated by the spider's
    parse() callback. Field names are part of the pipeline contract and
    must not be renamed (note the existing spellings ``homeDetai`` and
    ``homeThirth`` are kept as-is for compatibility).
    """

    # Title of the listing
    homeName = scrapy.Field()
    # URL of the listing page
    homeLine = scrapy.Field()
    # Nightly (unit) rental price
    homeSinglePrice = scrapy.Field()
    # Street address of the property
    homeAddress = scrapy.Field()
    # Recent booking/activity details for the listing
    homeDetai = scrapy.Field()
    # Discounted price for stays of 7+ days
    homeSeven = scrapy.Field()
    # Discounted price for stays of 30+ days
    homeThirth = scrapy.Field()
    # Host (landlord) name
    homePerson = scrapy.Field()
    # Host avatar image
    homePersonImg = scrapy.Field()
    # Link attached to the host avatar
    homePersonLink = scrapy.Field()
    # Main photo of the property
    homePicBg = scrapy.Field()
    # Link attached to the main photo
    homePicLink = scrapy.Field()
    # NOTE(review): two fields were sketched but never enabled in the
    # original (brand-shop info "homePinPai", star-host "homeStarrPerson");
    # add them here if the spider ever extracts those values.
我就问:是不是注释很详细?
2 spider里面的内容
#encoding=utf8
import scrapy
from youtxNanJin.items import YoutxnanjinItem
class NanJinDefault(scrapy.Spider):
name = 'youtx'
allowed_domains = ['youtx.com']
start_urls = ["http://www.youtx.com/nanjing/longrent1-page{}".format(n) for n in range(0,6)]
def parse(self, response):
# print(response.body)
node_list = response.xpath("//div[@class='duanzu houseList']/ul/li[@class='clearfix']")
# print(node_list)
for node in node_list:
item = YoutxnanjinItem()
homeName = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/text()").extract()
homeLink = node.xpath("./div[@clas