背景
最近在做电商数据分析项目时,需要大量采集亚马逊产品数据。从最初的简单爬虫到后来使用专业API,踩了不少坑,也积累了一些经验。这里分享一下技术实现的完整过程。
需求分析
项目需要采集的数据包括:
- 产品基础信息(标题、价格、品牌等)
- 销售数据(排名、评分、评论数)
- 库存状态
- 变体信息
- 历史价格趋势
数据量级:每天需要更新约10万个ASIN的数据。
技术方案演进
方案一:传统爬虫
最开始用的是经典的Python爬虫方案:
import requests
from bs4 import BeautifulSoup
import time
import random
from fake_useragent import UserAgent
class AmazonScraper:
    """Fetch and parse basic product data from Amazon product pages.

    Uses a shared requests.Session (connection reuse) with a randomized
    User-Agent per request and a random inter-request delay.
    Relies on third-party packages imported at module level
    (requests, bs4, fake_useragent).
    """

    def __init__(self):
        # One session so TCP connections are reused across requests.
        self.session = requests.Session()
        self.ua = UserAgent()

    def get_product_info(self, asin):
        """Fetch the product page for *asin*.

        Returns the parsed field dict, or None on a non-200 response or
        any error (network, parsing).
        """
        url = f"https://www.amazon.com/dp/{asin}"
        headers = {
            'User-Agent': self.ua.random,
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
        }
        try:
            # Random delay to reduce the chance of rate limiting.
            time.sleep(random.uniform(2, 5))
            response = self.session.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return self.parse_page(response.text, asin)
            print(f"HTTP {response.status_code} for {asin}")
            return None
        except Exception as e:
            # Broad on purpose: a single bad ASIN must not abort a batch.
            print(f"Error scraping {asin}: {e}")
            return None

    def parse_page(self, html, asin):
        """Parse title, price, rating and review count out of *html*."""
        soup = BeautifulSoup(html, 'html.parser')
        # Title
        title_elem = soup.find('span', {'id': 'productTitle'})
        title = title_elem.get_text().strip() if title_elem else None
        # Price (several layouts; see extract_price)
        price = self.extract_price(soup)
        # Rating, e.g. "4.5 out of 5 stars"
        rating = None
        rating_elem = soup.find('span', {'class': 'a-icon-alt'})
        if rating_elem:
            rating_text = rating_elem.get_text()
            if 'out of' in rating_text:
                rating = float(rating_text.split()[0])
        # Review count, e.g. "1,234 ratings"
        reviews_count = None
        reviews_elem = soup.find('span', {'id': 'acrCustomerReviewText'})
        if reviews_elem:
            reviews_text = reviews_elem.get_text()
            reviews_count = int(reviews_text.split()[0].replace(',', ''))
        return {
            'asin': asin,
            'title': title,
            'price': price,
            'rating': rating,
            'reviews_count': reviews_count,
            'timestamp': time.time(),
        }

    def extract_price(self, soup):
        """Try several known Amazon price selectors in order.

        Returns the first numeric price found, or None. Price extraction
        is messy because Amazon renders prices in multiple layouts.
        """
        # BUG FIX: the original executed `import re` inside the selector
        # loop on every iteration; hoisted above the loop.
        import re
        price_selectors = [
            '.a-price-whole',
            '.a-offscreen',
            '#priceblock_dealprice',
            '#priceblock_ourprice',
        ]
        for selector in price_selectors:
            price_elem = soup.select_one(selector)
            if price_elem:
                price_text = price_elem.get_text().strip()
                price_match = re.search(r'[\d,]+\.?\d*', price_text)
                if price_match:
                    return float(price_match.group().replace(',', ''))
        return None
遇到的问题:
- IP封禁:即使使用代理池,大量请求还是会被封
- 验证码:频繁出现CAPTCHA验证
- 页面结构变化:亚马逊经常调整页面结构,需要不断更新解析逻辑
- 数据不完整:某些字段经常抓取不到
- 效率低下:为了避免被封,请求间隔很长
方案二:Selenium + 代理池
为了解决反爬虫问题,尝试了Selenium方案:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import random
class SeleniumAmazonScraper:
    """Scrape Amazon product pages with headless Chrome.

    Each driver instance gets a random proxy from *proxy_list* and a
    random User-Agent. The driver is created lazily and torn down on
    error so the next call starts with a fresh browser.
    """

    def __init__(self, proxy_list):
        self.proxy_list = proxy_list
        # Created lazily by the first scrape_asin() call.
        self.driver = None

    def setup_driver(self):
        """Create a headless Chrome driver with a random proxy and UA."""
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        # Pick a proxy at random for this driver instance.
        proxy = random.choice(self.proxy_list)
        chrome_options.add_argument(f'--proxy-server={proxy}')
        # Randomize the User-Agent as well.
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
        ]
        chrome_options.add_argument(f'--user-agent={random.choice(user_agents)}')
        self.driver = webdriver.Chrome(options=chrome_options)

    def scrape_asin(self, asin):
        """Load the product page for *asin* and extract its data.

        Returns None on CAPTCHA, load timeout, or any driver error; the
        (possibly wedged) driver is discarded on error.
        """
        if not self.driver:
            self.setup_driver()
        url = f"https://www.amazon.com/dp/{asin}"
        try:
            self.driver.get(url)
            # Wait for the title element as a page-loaded signal.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "productTitle"))
            )
            # Amazon serves a CAPTCHA page when it suspects a bot.
            if "captcha" in self.driver.page_source.lower():
                print(f"CAPTCHA detected for {asin}")
                return None
            return self.extract_data(asin)
        except Exception as e:
            print(f"Error scraping {asin}: {e}")
            # BUG FIX: quit() itself can raise if the browser already
            # died, which would mask the original error — guard it.
            try:
                self.driver.quit()
            except Exception:
                pass
            self.driver = None
            return None

    def extract_data(self, asin):
        """Pull individual fields; a missing element yields None."""
        # BUG FIX: the original used bare `except:` clauses, which also
        # swallow KeyboardInterrupt/SystemExit; narrowed to Exception.
        try:
            title = self.driver.find_element(By.ID, "productTitle").text.strip()
        except Exception:
            title = None
        try:
            price_elem = self.driver.find_element(By.CSS_SELECTOR, ".a-price-whole")
            price = float(price_elem.text.replace(',', ''))
        except Exception:
            price = None
        # ... extraction logic for the remaining fields
        return {
            'asin': asin,
            'title': title,
            'price': price,
            # ... remaining fields
        }
改进效果:
- 能够处理JavaScript渲染的内容
- 一定程度上绕过了反爬虫检测
新问题:
- 资源消耗大,每个浏览器实例占用大量内存
- 速度更慢
- 代理成本高
- 仍然会遇到验证码
方案三:分布式爬虫
为了提高效率,搭建了基于Scrapy的分布式爬虫:
# scrapy_amazon/spiders/amazon_spider.py
import json
import re
import time

import scrapy
from scrapy_redis.spiders import RedisSpider
class AmazonSpider(RedisSpider):
    """Distributed Amazon product spider fed start URLs via Redis."""

    name = 'amazon'
    redis_key = 'amazon:start_urls'

    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'RANDOMIZE_DOWNLOAD_DELAY': 0.5,
        'CONCURRENT_REQUESTS': 16,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
        'RETRY_TIMES': 3,
        'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
    }

    def parse(self, response):
        """Entry point: skip bot-check redirects, else yield one item."""
        asin = response.meta['asin']
        # Amazon redirects suspected bots to a robot-check page.
        if 'robot_check' in response.url:
            self.logger.warning(f"Robot check detected for {asin}")
            return
        item = self.extract_product_data(response, asin)
        if item:
            yield item

    def extract_product_data(self, response, asin):
        """Extract title / price / rating from *response*."""
        title = response.css('#productTitle::text').get()
        if title:
            title = title.strip()
        # Amazon renders prices under several different selectors;
        # take the first one that yields a number.
        price_selectors = [
            '.a-price-whole::text',
            '.a-offscreen::text',
            '#priceblock_dealprice::text',
        ]
        price = None
        for selector in price_selectors:
            price_text = response.css(selector).get()
            if price_text:
                # BUG FIX: `import re` was executed inside this loop on
                # every iteration; `re` is now imported at module level.
                price_match = re.search(r'[\d,]+\.?\d*', price_text)
                if price_match:
                    price = float(price_match.group().replace(',', ''))
                    break
        # Rating, e.g. "4.5 out of 5 stars"
        rating_text = response.css('.a-icon-alt::text').get()
        rating = None
        if rating_text and 'out of' in rating_text:
            rating = float(rating_text.split()[0])
        return {
            'asin': asin,
            'title': title,
            'price': price,
            'rating': rating,
            'url': response.url,
            # BUG FIX: `time` was used here but never imported in this
            # module (NameError at runtime); now imported at module level.
            'scraped_at': time.time(),
        }
# settings.py
# Deliberately ignore robots.txt: Amazon's robots.txt disallows product
# pages, so obeying it would block the spider entirely.
ROBOTSTXT_OBEY = False
# Middleware chain: disable the stock UserAgentMiddleware in favor of a
# random-UA middleware, plus a rotating-proxy middleware.
# NOTE(review): 'scrapy_redis.middlewares.RedisPriorityQueue' does not
# look like a standard scrapy_redis downloader-middleware path — confirm
# it exists in the installed package version.
DOWNLOADER_MIDDLEWARES = {
'scrapy_redis.middlewares.RedisPriorityQueue': 110,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
'scrapy_proxy_middleware.middlewares.ProxyMiddleware': 350,
}
# Redis connection used by scrapy_redis for scheduling / deduplication.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB = 0
# Item pipelines: mirror items into Redis, then persist to PostgreSQL.
ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 300,
'scrapy_amazon.pipelines.PostgresPipeline': 400,
}
效果:
- 提高了并发处理能力
- 支持断点续传
- 数据去重
问题:
- 系统复杂度大幅增加
- 运维成本高
- 反爬虫对抗仍然激烈
方案四:API解决方案
经过前面的折腾,最终选择了API方案。对比了几个服务商后,选择了Pangolin的API:
import requests
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor
import logging
class AmazonAPIClient:
    """Thin client for the Pangolin Amazon-data API.

    Wraps single-ASIN lookups, threaded batch lookups, and persistence
    of the parsed records to a SQL database.
    """

    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.pangolinfo.com"
        # Shared session: connection reuse across API calls.
        self.session = requests.Session()
        # NOTE(review): basicConfig() here configures the *root* logger
        # globally — acceptable in a script, undesirable in a library.
        # Kept for backward compatibility.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def get_product_data(self, asin, marketplace='US'):
        """Fetch parsed data for a single ASIN.

        Returns the mapped record dict, or None on HTTP/network failure
        or a non-success API status (both are logged).
        """
        url = f"{self.base_url}/scrape"
        params = {
            'api_key': self.api_key,
            'asin': asin,
            'marketplace': marketplace,
            'parse': True
        }
        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            if data.get('status') == 'success':
                return self.parse_api_response(data['data'], asin)
            self.logger.error(f"API error for {asin}: {data.get('message')}")
            return None
        except requests.RequestException as e:
            self.logger.error(f"Request failed for {asin}: {e}")
            return None

    def parse_api_response(self, data, asin):
        """Map the raw API payload *data* to our flat record format.

        List-valued fields default to [] so downstream code can iterate
        without None checks; scalar fields default to None.
        """
        return {
            'asin': asin,
            'title': data.get('title'),
            'price': data.get('price'),
            'currency': data.get('currency'),
            'rating': data.get('rating'),
            'reviews_count': data.get('reviews_count'),
            'best_seller_rank': data.get('best_seller_rank'),
            'availability': data.get('availability'),
            'brand': data.get('brand'),
            'category': data.get('category'),
            'images': data.get('images', []),
            'features': data.get('features', []),
            'description': data.get('description'),
            'variations': data.get('variations', []),
            'scraped_at': time.time()
        }

    def batch_get_products(self, asin_list, max_workers=10):
        """Fetch many ASINs concurrently.

        Returns successful records only — failures are logged and
        skipped, so the result may be shorter than *asin_list*.
        """
        results = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit everything up front; map futures back to ASINs.
            future_to_asin = {
                executor.submit(self.get_product_data, asin): asin
                for asin in asin_list
            }
            for future in future_to_asin:
                asin = future_to_asin[future]
                try:
                    result = future.result()
                except Exception as e:
                    self.logger.error(f"Exception for {asin}: {e}")
                    continue
                if result:
                    results.append(result)
                    self.logger.info(f"Successfully scraped {asin}")
                else:
                    self.logger.warning(f"No data for {asin}")
        return results

    def save_to_database(self, data_list,
                         db_url='postgresql://user:password@localhost/amazon_data'):
        """Append *data_list* records to the `products` table.

        GENERALIZED: the connection string is now a parameter whose
        default preserves the previously hard-coded value.
        NOTE(review): real credentials should come from config or the
        environment, never from source code.
        """
        df = pd.DataFrame(data_list)
        # Local import keeps sqlalchemy optional for API-only use.
        from sqlalchemy import create_engine
        engine = create_engine(db_url)
        df.to_sql('products', engine, if_exists='append', index=False)
        self.logger.info(f"Saved {len(data_list)} records to database")
# 使用示例
def main():
    """Example usage: single lookup, batch lookup, then persistence."""
    # BUG FIX: json.dumps was used below but `json` was never imported
    # in this snippet (NameError at runtime).
    import json

    api_client = AmazonAPIClient('your_api_key')

    # Single-ASIN smoke test.
    asin = 'B08N5WRWNW'
    product_data = api_client.get_product_data(asin)
    print(json.dumps(product_data, indent=2))

    # Batch lookup.
    asin_list = ['B08N5WRWNW', 'B07XJ8C8F5', 'B09B8RDPX7']
    results = api_client.batch_get_products(asin_list)

    # Persist to the database.
    if results:
        api_client.save_to_database(results)


if __name__ == "__main__":
    main()
性能对比
| 方案 | 成功率 | 速度(ASIN/小时) | 维护成本 | 开发成本 |
|---|---|---|---|---|
| 基础爬虫 | 60% | 500 | 高 | 低 |
| Selenium | 70% | 200 | 高 | 中 |
| 分布式爬虫 | 75% | 2000 | 很高 | 高 |
| API方案 | 98% | 10000+ | 低 | 低 |
数据处理与存储
数据库设计
-- Product master table: one row per ASIN, holding the latest snapshot.
-- NOTE(review): updated_at's DEFAULT fires only on INSERT; PostgreSQL
-- needs an UPDATE trigger to keep it current on updates.
CREATE TABLE products (
id SERIAL PRIMARY KEY,
asin VARCHAR(20) UNIQUE NOT NULL,
title TEXT,
brand VARCHAR(255),
category VARCHAR(255),
price DECIMAL(10,2),
currency VARCHAR(3),
rating DECIMAL(3,2),
reviews_count INTEGER,
availability VARCHAR(50),
best_seller_rank INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Price history: append-only time series, one row per observation.
CREATE TABLE price_history (
id SERIAL PRIMARY KEY,
asin VARCHAR(20) REFERENCES products(asin),
price DECIMAL(10,2),
currency VARCHAR(3),
recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Review star-distribution snapshots per ASIN.
CREATE TABLE review_stats (
id SERIAL PRIMARY KEY,
asin VARCHAR(20) REFERENCES products(asin),
total_reviews INTEGER,
five_star INTEGER,
four_star INTEGER,
three_star INTEGER,
two_star INTEGER,
one_star INTEGER,
recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Indexes for the common lookup patterns.
-- NOTE(review): the UNIQUE constraint on products.asin already creates
-- an index in PostgreSQL, so idx_products_asin is redundant.
CREATE INDEX idx_products_asin ON products(asin);
CREATE INDEX idx_price_history_asin ON price_history(asin);
CREATE INDEX idx_price_history_recorded_at ON price_history(recorded_at);
数据清洗
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
class DataCleaner:
    """Post-scrape cleaning for price and review DataFrames."""

    def clean_price_data(self, df):
        """Drop impossible prices and flag abnormal price jumps.

        Expects columns 'asin' and 'price'. Returns a new DataFrame
        with added 'price_change' (per-ASIN pct change) and
        'is_anomaly' (|change| > 50%) columns.
        """
        # Keep plausible prices only; the 10000 ceiling is the original
        # assumption about the maximum realistic price.
        # BUG FIX: .copy() avoids pandas SettingWithCopyWarning when the
        # new columns are assigned onto the filtered slice below.
        df = df[(df['price'] > 0) & (df['price'] < 10000)].copy()
        df['price_change'] = df.groupby('asin')['price'].pct_change()
        # A >50% move between consecutive observations is suspicious.
        df['is_anomaly'] = df['price_change'].abs() > 0.5
        return df

    def clean_review_data(self, df):
        """Flag suspicious drops in review counts.

        Review counts normally only grow (unless a listing is relaunched),
        so a drop of more than 100 between snapshots is flagged. Expects
        columns 'asin', 'recorded_at' and 'reviews_count'.
        """
        # Sort so per-ASIN diffs compare consecutive snapshots in time.
        df = df.sort_values(['asin', 'recorded_at']).copy()
        df['review_change'] = df.groupby('asin')['reviews_count'].diff()
        df['review_decrease'] = df['review_change'] < -100
        return df

    def detect_outliers(self, df, column):
        """Flag IQR outliers of *column* in a new 'is_outlier' column.

        Uses the standard 1.5*IQR fences; mutates and returns *df*.
        """
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df['is_outlier'] = (df[column] < lower_bound) | (df[column] > upper_bound)
        return df
监控与告警
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
class MonitoringSystem:
    """Data-freshness / quota monitoring with e-mail alerting.

    *db_config* and *email_config* are plain dicts; email_config must
    provide 'from', 'to', 'smtp_server', 'username' and 'password'.
    """

    def __init__(self, db_config, email_config):
        self.db_config = db_config
        self.email_config = email_config

    def execute_query(self, query):
        """Run *query* against the monitoring DB and return result rows.

        BUG FIX: check_data_freshness() called this method but it was
        never defined, so it raised AttributeError at runtime. Wire this
        stub to your DB driver (e.g. psycopg2) using self.db_config.
        """
        raise NotImplementedError("connect execute_query to your database driver")

    def check_data_freshness(self):
        """Alert when too many products have gone stale (>1h unchanged)."""
        query = """
SELECT COUNT(*) as count
FROM products
WHERE updated_at < NOW() - INTERVAL '1 hour'
"""
        stale_count = self.execute_query(query)[0]['count']
        # More than 1000 stale products indicates a pipeline problem.
        if stale_count > 1000:
            self.send_alert(f"数据过期警告:{stale_count}个产品数据超过1小时未更新")

    def check_api_quota(self):
        """Check remaining API quota.

        Placeholder: query the API provider for remaining quota, or
        count today's API calls from the database. Intentionally a no-op.
        """
        pass

    def send_alert(self, message):
        """Send *message* as a plain-text alert e-mail via SMTP/STARTTLS."""
        msg = MIMEMultipart()
        msg['From'] = self.email_config['from']
        msg['To'] = self.email_config['to']
        msg['Subject'] = "Amazon数据采集系统告警"
        msg.attach(MIMEText(message, 'plain'))
        # BUG FIX: the original leaked the SMTP connection when login or
        # send raised; the context manager guarantees quit() runs.
        with smtplib.SMTP(self.email_config['smtp_server'], 587) as server:
            server.starttls()
            server.login(self.email_config['username'], self.email_config['password'])
            server.send_message(msg)
总结
从技术角度来看,亚马逊数据采集的演进过程反映了几个趋势:
- 反爬虫技术越来越复杂:传统爬虫方案的成本和难度不断增加
- 专业化分工:专业的API服务商能够提供更稳定、高效的解决方案
- 合规性要求:数据采集需要考虑法律法规和平台规则
技术选型建议:
- 小规模需求(<1000 ASIN/天):可以考虑简单爬虫
- 中等规模需求(1000-10000 ASIN/天):建议使用API方案
- 大规模需求(>10000 ASIN/天):必须使用专业API服务
成本分析:
虽然API方案需要付费,但考虑到开发成本、维护成本、机会成本,总体ROI是最高的。
最后,无论选择哪种方案,都要注意:
- 遵守robots.txt和平台规则
- 合理控制请求频率
- 做好数据备份和容错处理
- 建立监控和告警机制
代码仓库:
本文涉及的完整代码已上传至GitHub:amazon-scraping-evolution
本文仅供技术学习交流,请在使用时遵守相关法律法规和平台规则。
472

被折叠的评论
为什么被折叠?



