1. create a scrapy project
>>> scrapy startproject appstore
2. define extracted data schema
edit appstore/appstore/items.py and add the following:
import scrapy
class AppstoreItem(scrapy.Item):
    """Container for the fields scraped from a Huawei appstore page."""
    title = scrapy.Field()  # app display name
    url = scrapy.Field()    # detail-page URL
    appid = scrapy.Field()  # identifier parsed from the URL
    intro = scrapy.Field()  # short description text
3. create huawei_spider.py under appstore/appstore/spiders/ (this example extracts data from the Huawei appstore)
import scrapy
import re
from scrapy.selector import Selector
from appstore.items import AppstoreItem
class HuaweiSpider(scrapy.Spider):
    """Scrape app title, URL, id and intro from the Huawei appstore listing page."""

    name = "huawei"
    allowed_domains = ["huawei.com"]
    start_urls = ["http://appstore.huawei.com/more/all"]

    def parse(self, response):
        """Yield one AppstoreItem per app entry on the listing page.

        Fixes over the original:
        - response.xpath(...) is used directly; wrapping the response in
          Selector(response) is redundant in modern Scrapy.
        - extract_first() may return None for a malformed entry; such
          entries are now skipped instead of crashing with AttributeError.
        - Values are kept as str; the old .encode('utf-8') produced bytes
          that break str formatting in the pipeline under Python 3.
        """
        for div in response.xpath('//div[@class="game-info whole"]'):
            title = div.xpath('.//h4[@class="title"]/a/text()').extract_first()
            url = div.xpath('.//h4[@class="title"]/a/@href').extract_first()
            intro = div.xpath('.//p[@class="content"]/text()').extract_first()
            if not (title and url):
                continue  # malformed entry: skip rather than crash
            match = re.match(r'http://.*/(.*)', url)
            if match is None:
                continue  # URL does not carry an app id
            item = AppstoreItem()
            item['title'] = title
            item['url'] = url
            item['appid'] = match.group(1)
            item['intro'] = intro
            yield item
4. enable data pipeline in Scrapy
edit appstore/appstore/settings.py
# Register the item pipeline; 300 is its order among pipelines (lower runs first).
ITEM_PIPELINES = {
'appstore.pipelines.AppstorePipeline':300,
}
# Wait 5 seconds between requests to avoid hammering the server.
DOWNLOAD_DELAY=5
edit appstore/appstore/pipelines.py
class AppstorePipeline(object):
    """Append each scraped item to appstore.dat as a tab-separated line."""

    def __init__(self):
        # BUG FIX: the file was opened in binary mode ('wb') while
        # process_item writes str, which raises TypeError on Python 3.
        # Open in text mode with an explicit UTF-8 encoding instead.
        self.file = open('appstore.dat', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write appid, title and intro for one item, then pass it on."""
        line = "{0}\t{1}\t{2}\n".format(item['appid'], item['title'], item['intro'])
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Called by Scrapy when the spider finishes; flush and close the file
        # (the original leaked the open file handle).
        self.file.close()
5. run your spider
cd appstore
scrapy crawl huawei
cat appstore.dat
Following the URLs found on the homepage
1. define extracted data schema -- add a new field to the schema
recommended = scrapy.Field()
2. modify huawei_spider.py
import scrapy
import re
from scrapy.selector import Selector
from appstore.items import AppstoreItem
class HuaweiSpider(scrapy.Spider):
    """Crawl the Huawei appstore listing page, follow each app's detail
    page, and extract per-app fields plus its recommended apps."""

    name = "huawei"
    allowed_domains = ["huawei.com"]
    start_urls = ["http://appstore.huawei.com/more/all"]

    def parse(self, response):
        """Follow every app detail link found on the listing page."""
        for href in response.xpath('//h4[@class="title"]/a/@href').extract():
            yield scrapy.Request(href, callback=self.parse_item)

    def parse_item(self, response):
        """Build an AppstoreItem from one app detail page.

        Fixes over the original:
        - Values are kept as str; the old .encode('utf-8') produced bytes
          that break str formatting in the pipeline under Python 3.
        - extract_first() results and re.match() are checked for None so a
          malformed page is skipped instead of raising AttributeError.
        - The recommended-apps string is assembled with str.join instead of
          repeated += concatenation.
        """
        match = re.match(r'http://.*/(.*)', response.url)
        if match is None:
            return  # URL does not carry an app id
        item = AppstoreItem()
        item['url'] = response.url
        item['appid'] = match.group(1)
        item['title'] = response.xpath(
            '//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
        ).extract_first()
        item['intro'] = response.xpath(
            '//meta[@name="description"]/@content').extract_first()
        parts = []
        for div in response.xpath('//div[@class="open-info"]'):
            rec_url = div.xpath('./p[@class="name"]/a/@href').extract_first()
            rec_name = div.xpath('./p[@class="name"]/a/text()').extract_first()
            rec_match = re.match(r'http://.*/(.*)', rec_url) if rec_url else None
            if rec_match and rec_name:
                parts.append("{0}:{1},".format(rec_match.group(1), rec_name))
        item['recommended'] = "".join(parts)
        yield item
3. run the spider
>>> scrapy crawl huawei
>>> cat appstore.dat