Introduction
In today's data-driven world, web crawlers have become an essential tool for collecting information from the internet. This article shows how to build an efficient e-commerce crawler with Python's Scrapy framework, using JD.com product data as the working example.
Technology Stack
We use the following stack:
- Python 3.8+: the primary programming language
- Scrapy: a mature crawling framework that covers the full crawler development workflow
- Scrapy-Redis: distributed crawling support
- MongoDB: storage for the scraped data
- Docker: containerized deployment
Environment Setup
First make sure Python 3.8+ is installed, then install the required dependencies:
pip install scrapy scrapy-redis pymongo
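To confirm the toolchain is in place, Scrapy ships a version command:

scrapy version

Note that Scrapy-Redis also expects a reachable Redis server, and the storage pipeline below expects a MongoDB instance; both come up again in the Running the Crawler section.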
Creating the Project
Create a new Scrapy project:
scrapy startproject jd_spider
cd jd_spider
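This generates the standard Scrapy project layout; the files edited below are items.py, settings.py, pipelines.py, and the spiders/ directory:

jd_spider/
    scrapy.cfg
    jd_spider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py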
Implementing the Crawler
1. Define the data model
Define the data structure we want to scrape in items.py:
import scrapy


class JdProductItem(scrapy.Item):
    # Basic product information
    product_id = scrapy.Field()  # product ID (SKU)
    title = scrapy.Field()       # product title
    price = scrapy.Field()       # price
    brand = scrapy.Field()       # brand
    shop = scrapy.Field()        # shop name
    comments = scrapy.Field()    # number of reviews
    good_rate = scrapy.Field()   # positive-review rate
    params = scrapy.Field()      # product specifications
    url = scrapy.Field()         # product URL
    crawl_time = scrapy.Field()  # time the item was scraped
2. Write the spider
Create jd_product.py in the spiders directory:
import scrapy
import json
import datetime
from jd_spider.items import JdProductItem
from scrapy_redis.spiders import RedisSpider


class JdProductSpider(RedisSpider):
    name = 'jd_product'
    redis_key = 'jd_product:start_urls'

    # URL template for JD category listing pages
    list_url_template = 'https://list.jd.com/list.html?cat={cat_id}&page={page}&sort={sort}'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Category ID; this example uses mobile phones
        self.cat_id = '9987,653,655'
        self.sort = 'sort_rank_asc'  # sort order
        self.max_page = 100          # maximum number of list pages to crawl

    def start_requests(self):
        """Generate the initial list-page requests."""
        for page in range(1, self.max_page + 1):
            url = self.list_url_template.format(
                cat_id=self.cat_id,
                page=page,
                sort=self.sort
            )
            yield scrapy.Request(url=url, callback=self.parse_list)

    def parse_list(self, response):
        """Parse a product listing page."""
        # Extract the SKU IDs of all products on the page
        sku_ids = response.css('li.gl-item::attr(data-sku)').getall()
        for sku_id in sku_ids:
            if not sku_id:
                continue
            # Request the product detail page
            detail_url = f'https://item.jd.com/{sku_id}.html'
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'sku_id': sku_id}
            )

    def parse_detail(self, response):
        """Parse a product detail page."""
        item = JdProductItem()
        sku_id = response.meta['sku_id']

        # Basic information
        item['product_id'] = sku_id
        item['title'] = response.css('div.sku-name::text').get('').strip()
        item['url'] = response.url

        # The price comes from a separate API
        price_url = f'https://p.3.cn/prices/mgets?skuIds=J_{sku_id}'
        yield scrapy.Request(
            url=price_url,
            callback=self.parse_price,
            meta={'item': item}
        )

    def parse_price(self, response):
        """Parse the price API response."""
        item = response.meta['item']
        try:
            price_data = json.loads(response.text)[0]
            item['price'] = price_data.get('p', '')
        except (ValueError, IndexError):
            item['price'] = ''

        # Fetch the review summary next
        comments_url = (
            'https://club.jd.com/comment/productCommentSummaries.action'
            f'?referenceIds={item["product_id"]}'
        )
        yield scrapy.Request(
            url=comments_url,
            callback=self.parse_comments,
            meta={'item': item}
        )

    def parse_comments(self, response):
        """Parse the review summary API response."""
        item = response.meta['item']
        try:
            comments_data = json.loads(response.text)
            comments = comments_data['CommentsCount'][0]
            item['comments'] = comments.get('CommentCountStr', '')
            item['good_rate'] = comments.get('GoodRate', '')
        except (ValueError, KeyError, IndexError):
            item['comments'] = ''
            item['good_rate'] = ''

        # Revisit the detail page for brand and shop information
        yield scrapy.Request(
            url=item['url'],
            callback=self.parse_brand_shop,
            meta={'item': item},
            dont_filter=True  # this URL was already crawled, so bypass the dupefilter
        )

    def parse_brand_shop(self, response):
        """Parse brand, shop, and specification information."""
        item = response.meta['item']

        # Brand
        brand = response.css('ul#parameter-brand li a::text').get('')
        item['brand'] = brand.strip() if brand else ''

        # Shop name
        shop = response.css('div.popbox-inner h3 a::text').get('')
        item['shop'] = shop.strip() if shop else ''

        # Product specifications
        params = {}
        for li in response.css('ul#parameter2 li'):
            text = li.css('::text').getall()
            if len(text) >= 2:
                key = text[0].strip()
                value = text[1].strip()
                if key and value:
                    params[key] = value
        item['params'] = params

        # Record when the item was scraped
        item['crawl_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        yield item
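A note on the start URLs: because the spider subclasses RedisSpider and sets redis_key, scrapy-redis would normally pull its start URLs from that Redis list; the start_requests override above generates the list-page URLs locally instead. The crawl is still distributed, since the scheduler (configured below) stores all requests in Redis and the shared dupefilter stops multiple workers from fetching the same page twice. If you would rather seed the crawl from Redis, push URLs into the key, for example:

redis-cli lpush jd_product:start_urls "https://list.jd.com/list.html?cat=9987,653,655&page=1&sort=sort_rank_asc"

(The spider would then also need a callback for those URLs, e.g. a parse method delegating to parse_list, since scrapy-redis routes Redis-fed URLs to parse by default.)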
3. Configure the project
Add the following configuration to settings.py:
# Basic settings
BOT_NAME = 'jd_spider'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

# Ignore robots.txt
ROBOTSTXT_OBEY = False

# Concurrency and politeness
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 8

# Enable Redis-based distributed scheduling
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_URL = 'redis://localhost:6379/0'

# Enable the item pipelines
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    'jd_spider.pipelines.MongoDBPipeline': 400,
}

# AutoThrottle extension
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 5
AUTOTHROTTLE_TARGET_CONCURRENCY = 4.0

# Logging
LOG_LEVEL = 'INFO'
LOG_FILE = 'jd_spider.log'
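One optional scrapy-redis setting worth knowing about, not shown above: by default the Redis request queue and dupefilter are flushed when the spider closes. Setting SCHEDULER_PERSIST keeps them, so an interrupted crawl can resume where it stopped:

# Keep the Redis queue and dupefilter across runs (pause/resume support)
SCHEDULER_PERSIST = True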
4. Implement the data storage pipeline
Implement MongoDB storage in pipelines.py:
import pymongo
from itemadapter import ItemAdapter


class MongoDBPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'jd_products')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        collection_name = item.__class__.__name__
        self.db[collection_name].update_one(
            {'product_id': item['product_id']},
            {'$set': ItemAdapter(item).asdict()},
            upsert=True
        )
        return item
Then add the MongoDB configuration to settings.py:
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'jd_products'
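After a crawl, it is worth spot-checking what actually landed in MongoDB. A minimal sketch, assuming the default settings above:

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
collection = client['jd_products']['JdProductItem']
print(collection.count_documents({}))
print(collection.find_one({}, {'_id': 0, 'title': 1, 'price': 1}))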
Running the Crawler
With the Redis service running, start the crawler:
scrapy crawl jd_product
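If you prefer not to install the backing services locally, the Docker item from the tech stack covers them. A minimal sketch (the image tags are assumptions, not prescribed by this project):

docker run -d --name jd-redis -p 6379:6379 redis:7
docker run -d --name jd-mongo -p 27017:27017 mongo:6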
Data Visualization
The scraped data supports some simple analysis. The example below uses pandas and matplotlib to look at the price distribution:
import pymongo
import pandas as pd
import matplotlib.pyplot as plt

# Connect to MongoDB
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['jd_products']
collection = db['JdProductItem']

# Query the data
data = list(collection.find({}, {'price': 1, 'brand': 1}))
df = pd.DataFrame(data)

# Convert prices to numbers (the price API returns them as strings)
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df = df.dropna(subset=['price'])

# Price distribution
plt.figure(figsize=(12, 6))
plt.hist(df['price'], bins=50, edgecolor='black')
plt.title('JD product price distribution')
plt.xlabel('Price (CNY)')
plt.ylabel('Number of products')
plt.show()

# Average price by brand
brand_price = df.groupby('brand')['price'].mean().sort_values(ascending=False).head(10)
brand_price.plot(kind='bar', figsize=(12, 6))
plt.title('Average price by brand (top 10)')
plt.ylabel('Average price (CNY)')
plt.show()
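On a headless server plt.show() displays nothing; a common substitute is writing each chart to a file instead, e.g.:

plt.savefig('price_distribution.png', dpi=150, bbox_inches='tight')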