98% Collection Success Rate! A Technical Deep Dive into Pangolin's Amazon SP Ad Data Scraping, with Hands-On Examples

Preface

In e-commerce data collection, scraping Amazon Sponsored Ads (SP ad) data has long been a technical challenge. This article takes an in-depth look at how the Pangolin Scrape API achieves a 98% collection success rate and provides a complete technical implementation.

1. Technical Challenges

1.1 Amazon's Anti-Scraping Mechanisms

Amazon deploys a multi-layered anti-scraping system:

# Common anti-scraping detection points
ANTI_SCRAPING_CHECKS = {
    'ip_frequency': 'Per-IP request rate limiting',
    'user_agent': 'User-Agent inspection',
    'behavior_pattern': 'User behavior pattern analysis',
    'device_fingerprint': 'Device fingerprinting',
    'javascript_challenge': 'JavaScript challenge verification',
    'captcha': 'CAPTCHA verification'
}

1.2 Dynamically Loaded Content

SP ad data is loaded dynamically via JavaScript, so extraction requires precise timing control:

// Detect when ad data has finished loading
function waitForAdData() {
    return new Promise((resolve, reject) => {
        const checkInterval = setInterval(() => {
            const adElements = document.querySelectorAll('[data-component-type="s-search-result"]');
            const sponsoredAds = document.querySelectorAll('[data-component-type="sp-sponsored-result"]');
            
            if (adElements.length > 0 && sponsoredAds.length > 0) {
                clearInterval(checkInterval);
                resolve(true);
            }
        }, 100);
        
        // Timeout handling
        setTimeout(() => {
            clearInterval(checkInterval);
            reject(new Error('Ad data loading timeout'));
        }, 30000);
    });
}

2. Pangolin API Technical Architecture

2.1 Distributed Proxy Pool Design

import asyncio
import aiohttp
from typing import List, Dict
import random

class ProxyPool:
    def __init__(self):
        self.proxies: List[Dict] = []
        self.failed_proxies: set = set()
        self.proxy_stats: Dict = {}
    
    async def add_proxy(self, proxy_config: Dict):
        """添加代理到池中"""
        proxy_id = f"{proxy_config['host']}:{proxy_config['port']}"
        self.proxies.append({
            'id': proxy_id,
            'config': proxy_config,
            'success_rate': 1.0,
            'last_used': 0,
            'consecutive_failures': 0
        })
    
    async def get_best_proxy(self) -> Dict:
        """获取最佳代理"""
        available_proxies = [
            p for p in self.proxies 
            if p['id'] not in self.failed_proxies 
            and p['consecutive_failures'] < 3
        ]
        
        if not available_proxies:
            await self.refresh_proxy_pool()
            available_proxies = self.proxies
        
        # Pick a proxy based on success rate and recency of use
        best_proxy = max(available_proxies, key=lambda x: (
            x['success_rate'] * 0.7 + 
            (1 / (x['last_used'] + 1)) * 0.3
        ))
        
        return best_proxy
    
    async def update_proxy_stats(self, proxy_id: str, success: bool):
        """更新代理统计信息"""
        for proxy in self.proxies:
            if proxy['id'] == proxy_id:
                if success:
                    proxy['consecutive_failures'] = 0
                    proxy['success_rate'] = min(1.0, proxy['success_rate'] + 0.01)
                else:
                    proxy['consecutive_failures'] += 1
                    proxy['success_rate'] = max(0.1, proxy['success_rate'] - 0.05)
                
                proxy['last_used'] = asyncio.get_event_loop().time()
                break
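
`get_best_proxy` falls back to `refresh_proxy_pool` when every proxy has been marked as failed, but that method is not shown above. Below is a minimal sketch of what it might look like, assuming the pool simply resets its failure bookkeeping and can optionally reload configs from a provider callback (`proxy_source` is a hypothetical parameter, not part of the original design):

    # Hypothetical sketch of ProxyPool.refresh_proxy_pool, referenced by get_best_proxy above
    async def refresh_proxy_pool(self, proxy_source=None):
        """Reset failure state and optionally reload proxies from a provider."""
        # Give previously failed proxies another chance
        self.failed_proxies.clear()
        for proxy in self.proxies:
            proxy['consecutive_failures'] = 0

        # `proxy_source` is an assumed async callable returning a list of proxy configs
        if proxy_source is not None:
            for config in await proxy_source():
                await self.add_proxy(config)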

2.2 Intelligent Request Scheduler

import asyncio
import hashlib
import random
import time
from dataclasses import dataclass
from typing import Dict, Optional

@dataclass
class RequestConfig:
    url: str
    headers: Dict[str, str]
    proxy: Dict
    delay: float
    retry_count: int = 0

class IntelligentScheduler:
    def __init__(self):
        self.request_history: Dict = {}
        self.domain_limits: Dict = {}
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        ]
    
    async def schedule_request(self, config: RequestConfig) -> Optional[Dict]:
        """智能调度请求"""
        # 生成请求指纹
        request_fingerprint = self._generate_fingerprint(config)
        
        # Check the per-domain rate limit
        if not self._check_rate_limit(config.url):
            await asyncio.sleep(self._calculate_delay(config.url))
        
        # Dynamically adjust the request headers
        config.headers = self._generate_headers(config)
        
        # Execute the request
        try:
            result = await self._execute_request(config)
            self._update_success_stats(config.url)
            return result
        except Exception as e:
            self._update_failure_stats(config.url)
            if config.retry_count < 3:
                config.retry_count += 1
                config.delay *= 2  # exponential backoff
                return await self.schedule_request(config)
            raise e
    
    def _generate_fingerprint(self, config: RequestConfig) -> str:
        """生成请求指纹"""
        fingerprint_data = f"{config.url}_{config.proxy['id']}_{time.time()}"
        return hashlib.md5(fingerprint_data.encode()).hexdigest()
    
    def _generate_headers(self, config: RequestConfig) -> Dict[str, str]:
        """动态生成请求头"""
        base_headers = {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        
        # Add randomized browser traits
        if random.random() > 0.5:
            base_headers['DNT'] = '1'
        
        if random.random() > 0.3:
            base_headers['Cache-Control'] = 'max-age=0'
        
        return {**base_headers, **config.headers}
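
The scheduler above relies on several helper methods (`_check_rate_limit`, `_calculate_delay`, `_execute_request` and the stats updaters) that are not shown. The following is a hedged sketch of how they could be implemented, assuming a per-domain minimum interval stored in `domain_limits`, plus `import aiohttp` and `from urllib.parse import urlparse` added to the imports above; it is an illustration, not the Pangolin implementation:

    # Hypothetical sketches of the IntelligentScheduler helpers referenced above
    def _domain(self, url: str) -> str:
        return urlparse(url).netloc

    def _check_rate_limit(self, url: str) -> bool:
        """Allow the request if the domain's minimum interval has elapsed."""
        domain = self._domain(url)
        last = self.request_history.get(domain, 0.0)
        min_interval = self.domain_limits.get(domain, 1.0)  # seconds between requests
        return (time.time() - last) >= min_interval

    def _calculate_delay(self, url: str) -> float:
        """How long to sleep before the domain may be hit again."""
        domain = self._domain(url)
        last = self.request_history.get(domain, 0.0)
        min_interval = self.domain_limits.get(domain, 1.0)
        return max(0.0, min_interval - (time.time() - last))

    def _update_success_stats(self, url: str):
        self.request_history[self._domain(url)] = time.time()

    def _update_failure_stats(self, url: str):
        # Back off harder after a failure by widening the minimum interval
        domain = self._domain(url)
        self.request_history[domain] = time.time()
        self.domain_limits[domain] = min(30.0, self.domain_limits.get(domain, 1.0) * 2)

    async def _execute_request(self, config: RequestConfig) -> Dict:
        """Fetch the page through the assigned proxy (proxy layout as in ProxyPool above)."""
        proxy_url = f"http://{config.proxy['config']['host']}:{config.proxy['config']['port']}"
        async with aiohttp.ClientSession() as session:
            async with session.get(config.url, headers=config.headers, proxy=proxy_url,
                                   timeout=aiohttp.ClientTimeout(total=30)) as response:
                response.raise_for_status()
                return {'status': response.status, 'html': await response.text()}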

2.3 Data Extraction Engine

import asyncio
import json
import re
import time
from typing import Dict, List, Optional

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class SPAdDataExtractor:
    def __init__(self):
        self.driver = None
        self.wait = None
    
    async def setup_driver(self, proxy_config: Dict):
        """设置浏览器驱动"""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument(f'--proxy-server={proxy_config["host"]}:{proxy_config["port"]}')
        
        # Anti-detection settings
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        
        self.driver = webdriver.Chrome(options=options)
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        self.wait = WebDriverWait(self.driver, 30)
    
    async def extract_sp_ads(self, keyword: str, marketplace: str = 'amazon.com') -> List[Dict]:
        """提取SP广告数据"""
        search_url = f"https://{marketplace}/s?k={keyword.replace(' ', '+')}"
        
        try:
            # Open the search results page
            self.driver.get(search_url)
            
            # Wait for the page to finish loading
            await self._wait_for_page_load()
            
            # Extract the ad data
            ad_data = await self._extract_ad_elements()
            
            # Validate and clean the data
            validated_data = await self._validate_data(ad_data)
            
            return validated_data
            
        except Exception as e:
            print(f"Error extracting SP ads: {e}")
            return []
    
    async def _wait_for_page_load(self):
        """Wait for the page to finish loading."""
        # Wait for the search result container to appear
        self.wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[data-component-type="s-search-result"]'))
        )
        
        # Wait for sponsored-ad markers to load
        try:
            self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '[data-component-type="sp-sponsored-result"]'))
            )
        except Exception:
            pass  # some pages may contain no sponsored ads
        
        # Extra wait to make sure dynamic content has finished loading
        await asyncio.sleep(2)
    
    async def _extract_ad_elements(self) -> List[Dict]:
        """提取广告元素数据"""
        ad_data = []
        
        # Find all search results
        search_results = self.driver.find_elements(By.CSS_SELECTOR, '[data-component-type="s-search-result"]')
        
        for result in search_results:
            try:
                # Check whether this result is sponsored
                is_sponsored = self._is_sponsored_result(result)
                
                if is_sponsored:
                    ad_info = await self._extract_single_ad(result)
                    if ad_info:
                        ad_data.append(ad_info)
                        
            except Exception as e:
                print(f"Error extracting single ad: {e}")
                continue
        
        return ad_data
    
    def _is_sponsored_result(self, element) -> bool:
        """Determine whether a result is a sponsored ad."""
        # Check several different sponsored-ad markers
        sponsored_indicators = [
            '[data-component-type="sp-sponsored-result"]',
            '.s-sponsored-label-text',
            '[aria-label*="Sponsored"]',
            # Note: :contains() is not standard CSS, so this selector will simply
            # fail and be skipped by the exception handler below
            '.a-color-secondary:contains("Sponsored")'
        ]
        
        for indicator in sponsored_indicators:
            try:
                if element.find_element(By.CSS_SELECTOR, indicator):
                    return True
            except Exception:
                continue
        
        return False
    
    async def _extract_single_ad(self, element) -> Optional[Dict]:
        """Extract the details of a single ad."""
        try:
            ad_data = {
                'title': '',
                'price': '',
                'rating': '',
                'review_count': '',
                'image_url': '',
                'product_url': '',
                'seller': '',
                'ad_position': '',
                'sponsored_label': True
            }
            
            # Extract the title
            try:
                title_element = element.find_element(By.CSS_SELECTOR, 'h2 a span')
                ad_data['title'] = title_element.text.strip()
            except Exception:
                pass
            
            # Extract the price
            try:
                price_element = element.find_element(By.CSS_SELECTOR, '.a-price-whole')
                price_fraction = element.find_element(By.CSS_SELECTOR, '.a-price-fraction')
                ad_data['price'] = f"{price_element.text}.{price_fraction.text}"
            except Exception:
                try:
                    price_element = element.find_element(By.CSS_SELECTOR, '.a-price .a-offscreen')
                    ad_data['price'] = price_element.get_attribute('textContent')
                except Exception:
                    pass
            
            # Extract the rating
            try:
                rating_element = element.find_element(By.CSS_SELECTOR, '.a-icon-alt')
                rating_text = rating_element.get_attribute('textContent')
                rating_match = re.search(r'(\d+\.?\d*)', rating_text)
                if rating_match:
                    ad_data['rating'] = rating_match.group(1)
            except Exception:
                pass
            
            # Extract the review count
            try:
                review_element = element.find_element(By.CSS_SELECTOR, 'a[href*="#customerReviews"] span')
                review_text = review_element.text.strip()
                review_match = re.search(r'([\d,]+)', review_text)
                if review_match:
                    ad_data['review_count'] = review_match.group(1).replace(',', '')
            except Exception:
                pass
            
            # Extract the product URL
            try:
                link_element = element.find_element(By.CSS_SELECTOR, 'h2 a')
                ad_data['product_url'] = link_element.get_attribute('href')
            except Exception:
                pass
            
            # Extract the image URL
            try:
                img_element = element.find_element(By.CSS_SELECTOR, '.s-image')
                ad_data['image_url'] = img_element.get_attribute('src')
            except Exception:
                pass
            
            return ad_data
            
        except Exception as e:
            print(f"Error extracting single ad data: {e}")
            return None
    
    async def _validate_data(self, ad_data: List[Dict]) -> List[Dict]:
        """Validate and clean the data."""
        validated_data = []
        
        for ad in ad_data:
            # Basic validation
            if not ad.get('title') or not ad.get('product_url'):
                continue
            
            # Data cleaning
            ad['title'] = self._clean_text(ad['title'])
            ad['price'] = self._clean_price(ad['price'])
            ad['rating'] = self._clean_rating(ad['rating'])
            
            # Add a timestamp
            ad['extracted_at'] = time.time()
            
            validated_data.append(ad)
        
        return validated_data
    
    def _clean_text(self, text: str) -> str:
        """清洗文本数据"""
        if not text:
            return ''
        return re.sub(r'\s+', ' ', text.strip())
    
    def _clean_price(self, price: str) -> str:
        """清洗价格数据"""
        if not price:
            return ''
        # Extract digits and the decimal point
        price_match = re.search(r'[\d,]+\.?\d*', price.replace('$', ''))
        return price_match.group(0) if price_match else ''
    
    def _clean_rating(self, rating: str) -> str:
        """清洗评分数据"""
        if not rating:
            return ''
        rating_match = re.search(r'\d+\.?\d*', rating)
        return rating_match.group(0) if rating_match else ''
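
For completeness, a short hedged usage sketch of the extractor above when driving Selenium directly; the proxy host and port are placeholders, and a local ChromeDriver installation is assumed:

# Hypothetical usage of SPAdDataExtractor (placeholder proxy, local ChromeDriver assumed)
async def run_extractor():
    extractor = SPAdDataExtractor()
    await extractor.setup_driver({'host': '127.0.0.1', 'port': 8080})
    try:
        ads = await extractor.extract_sp_ads('wireless earbuds')
        for ad in ads:
            print(ad['title'], ad['price'])
    finally:
        extractor.driver.quit()  # always release the browser

if __name__ == "__main__":
    asyncio.run(run_extractor())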

3. API Integration

3.1 Pangolin API Client

import aiohttp
import asyncio
import json
import time
from typing import Dict, List, Optional

class PangolinAPIClient:
    def __init__(self, api_key: str, base_url: str = "https://api.pangolinfo.com"):
        self.api_key = api_key
        self.base_url = base_url
        self.session = None
    
    async def __aenter__(self):
        self.session = aiohttp.ClientSession(
            headers={'Authorization': f'Bearer {self.api_key}'},
            timeout=aiohttp.ClientTimeout(total=60)
        )
        return self
    
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()
    
    async def search_sponsored_ads(self, 
                                 keyword: str, 
                                 marketplace: str = 'amazon.com',
                                 page: int = 1,
                                 options: Optional[Dict] = None) -> Dict:
        """搜索SP广告数据"""
        endpoint = f"{self.base_url}/v1/amazon/sponsored-ads/search"
        
        params = {
            'keyword': keyword,
            'marketplace': marketplace,
            'page': page,
            'format': 'json'
        }
        
        if options:
            params.update(options)
        
        try:
            async with self.session.get(endpoint, params=params) as response:
                if response.status == 200:
                    data = await response.json()
                    return {
                        'success': True,
                        'data': data,
                        'metadata': {
                            'keyword': keyword,
                            'marketplace': marketplace,
                            'page': page,
                            'timestamp': time.time()
                        }
                    }
                else:
                    error_text = await response.text()
                    return {
                        'success': False,
                        'error': f"API Error {response.status}: {error_text}"
                    }
        
        except Exception as e:
            return {
                'success': False,
                'error': f"Request failed: {str(e)}"
            }
    
    async def get_product_details(self, asin: str, marketplace: str = 'amazon.com') -> Dict:
        """获取产品详情"""
        endpoint = f"{self.base_url}/v1/amazon/product/{asin}"
        
        params = {
            'marketplace': marketplace,
            'format': 'json'
        }
        
        try:
            async with self.session.get(endpoint, params=params) as response:
                if response.status == 200:
                    return {
                        'success': True,
                        'data': await response.json()
                    }
                else:
                    return {
                        'success': False,
                        'error': f"API Error {response.status}"
                    }
        
        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }
    
    async def batch_search(self, keywords: List[str], **kwargs) -> List[Dict]:
        """批量搜索"""
        tasks = []
        for keyword in keywords:
            task = self.search_sponsored_ads(keyword, **kwargs)
            tasks.append(task)
        
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results
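
`batch_search` returns one entry per keyword (either a result dict or an exception, since `return_exceptions=True` is used). A short hedged example of consuming it; the API key is a placeholder and the response layout follows the `data['ads']` shape used in section 4.1:

# Hypothetical usage of PangolinAPIClient.batch_search
async def batch_example():
    keywords = ['wireless earbuds', 'bluetooth headphones']
    async with PangolinAPIClient('your_api_key') as client:
        results = await client.batch_search(keywords, marketplace='amazon.com')

    for keyword, result in zip(keywords, results):
        if isinstance(result, Exception):
            print(f"{keyword}: request raised {result}")
        elif result.get('success'):
            print(f"{keyword}: {len(result['data'].get('ads', []))} ads")
        else:
            print(f"{keyword}: {result.get('error')}")

if __name__ == "__main__":
    asyncio.run(batch_example())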

3.2 Data Processing and Storage

import json
import re
import sqlite3
import time
from datetime import datetime
from typing import Dict, List

import pandas as pd

class DataProcessor:
    def __init__(self, db_path: str = "sp_ads_data.db"):
        self.db_path = db_path
        self.init_database()
    
    def init_database(self):
        """初始化数据库"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS sp_ads (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                keyword TEXT NOT NULL,
                marketplace TEXT NOT NULL,
                title TEXT,
                price REAL,
                rating REAL,
                review_count INTEGER,
                asin TEXT,
                seller TEXT,
                ad_position INTEGER,
                image_url TEXT,
                product_url TEXT,
                extracted_at TIMESTAMP,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_keyword_marketplace 
            ON sp_ads(keyword, marketplace)
        ''')
        
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_extracted_at 
            ON sp_ads(extracted_at)
        ''')
        
        conn.commit()
        conn.close()
    
    def save_ad_data(self, ad_data: List[Dict], keyword: str, marketplace: str):
        """保存广告数据"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        for ad in ad_data:
            cursor.execute('''
                INSERT INTO sp_ads (
                    keyword, marketplace, title, price, rating, 
                    review_count, asin, seller, ad_position, 
                    image_url, product_url, extracted_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                keyword,
                marketplace,
                ad.get('title', ''),
                float(ad.get('price', '0').replace(',', '')) if ad.get('price') else None,
                float(ad.get('rating', 0)) if ad.get('rating') else None,
                int(ad.get('review_count', 0)) if ad.get('review_count') else None,
                self._extract_asin(ad.get('product_url', '')),
                ad.get('seller', ''),
                ad.get('ad_position', 0),
                ad.get('image_url', ''),
                ad.get('product_url', ''),
                datetime.fromtimestamp(ad.get('extracted_at', time.time()))
            ))
        
        conn.commit()
        conn.close()
    
    def _extract_asin(self, product_url: str) -> str:
        """从产品URL中提取ASIN"""
        if not product_url:
            return ''
        
        asin_match = re.search(r'/dp/([A-Z0-9]{10})', product_url)
        if asin_match:
            return asin_match.group(1)
        
        asin_match = re.search(r'/gp/product/([A-Z0-9]{10})', product_url)
        if asin_match:
            return asin_match.group(1)
        
        return ''
    
    def get_competitor_analysis(self, keyword: str, days: int = 7) -> pd.DataFrame:
        """获取竞争对手分析数据"""
        conn = sqlite3.connect(self.db_path)
        
        query = '''
            SELECT 
                asin,
                title,
                AVG(price) as avg_price,
                AVG(rating) as avg_rating,
                AVG(review_count) as avg_reviews,
                COUNT(*) as appearance_count,
                AVG(ad_position) as avg_position
            FROM sp_ads 
            WHERE keyword = ? 
                AND extracted_at >= datetime('now', '-{} days')
            GROUP BY asin, title
            ORDER BY appearance_count DESC, avg_position ASC
        '''.format(days)
        
        df = pd.read_sql_query(query, conn, params=(keyword,))
        conn.close()
        
        return df
    
    def export_to_excel(self, keyword: str, output_file: str):
        """导出数据到Excel"""
        conn = sqlite3.connect(self.db_path)
        
        # Raw data
        df_basic = pd.read_sql_query('''
            SELECT * FROM sp_ads 
            WHERE keyword = ? 
            ORDER BY extracted_at DESC
        ''', conn, params=(keyword,))
        
        # Competitor analysis
        df_analysis = self.get_competitor_analysis(keyword)
        
        # Trend analysis
        df_trends = pd.read_sql_query('''
            SELECT 
                DATE(extracted_at) as date,
                COUNT(*) as total_ads,
                AVG(price) as avg_price,
                COUNT(DISTINCT asin) as unique_products
            FROM sp_ads 
            WHERE keyword = ?
            GROUP BY DATE(extracted_at)
            ORDER BY date DESC
        ''', conn, params=(keyword,))
        
        conn.close()
        
        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
            df_basic.to_excel(writer, sheet_name='Raw Data', index=False)
            df_analysis.to_excel(writer, sheet_name='Competitor Analysis', index=False)
            df_trends.to_excel(writer, sheet_name='Trend Analysis', index=False)

4. Complete Usage Examples

4.1 Basic Usage

async def main():
    # Initialize the API client
    async with PangolinAPIClient('your_api_key') as client:
        # Search for SP ad data
        result = await client.search_sponsored_ads(
            keyword='wireless earbuds',
            marketplace='amazon.com',
            options={
                'include_details': True,
                'max_results': 50
            }
        )
        
        if result['success']:
            print(f"找到 {len(result['data']['ads'])} 个广告")
            
            # Process and store the data
            processor = DataProcessor()
            processor.save_ad_data(
                result['data']['ads'], 
                'wireless earbuds', 
                'amazon.com'
            )
            
            # Generate an analysis report
            analysis = processor.get_competitor_analysis('wireless earbuds')
            print(analysis.head())
        else:
            print(f"API调用失败: {result['error']}")

# Run the example
if __name__ == "__main__":
    asyncio.run(main())

4.2 Batch Monitoring System

class SPAdMonitor:
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.processor = DataProcessor()
        self.keywords = []
        self.running = False
    
    def add_keyword(self, keyword: str, marketplace: str = 'amazon.com'):
        """添加监控关键词"""
        self.keywords.append({
            'keyword': keyword,
            'marketplace': marketplace
        })
    
    async def start_monitoring(self, interval_minutes: int = 60):
        """开始监控"""
        self.running = True
        
        while self.running:
            async with PangolinAPIClient(self.api_key) as client:
                for kw_config in self.keywords:
                    try:
                        result = await client.search_sponsored_ads(**kw_config)
                        
                        if result['success']:
                            self.processor.save_ad_data(
                                result['data']['ads'],
                                kw_config['keyword'],
                                kw_config['marketplace']
                            )
                            print(f"✅ {kw_config['keyword']}: {len(result['data']['ads'])} ads")
                        else:
                            print(f"❌ {kw_config['keyword']}: {result['error']}")
                    
                    except Exception as e:
                        print(f"❌ {kw_config['keyword']}: {e}")
                    
                    # Avoid sending requests too frequently
                    await asyncio.sleep(5)
            
            # Wait until the next monitoring cycle
            await asyncio.sleep(interval_minutes * 60)
    
    def stop_monitoring(self):
        """停止监控"""
        self.running = False
    
    def generate_report(self, keyword: str, output_file: str):
        """生成监控报告"""
        self.processor.export_to_excel(keyword, output_file)

# Usage example
async def run_monitor():
    monitor = SPAdMonitor('your_api_key')
    
    # Add keywords to monitor
    monitor.add_keyword('wireless earbuds')
    monitor.add_keyword('bluetooth headphones')
    monitor.add_keyword('gaming mouse')
    
    # Start monitoring (every 30 minutes)
    await monitor.start_monitoring(interval_minutes=30)

# Run the monitor
if __name__ == "__main__":
    asyncio.run(run_monitor())

5. Performance Optimization and Error Handling

5.1 Concurrency Control

import asyncio
from asyncio import Semaphore
from typing import List

class ConcurrencyController:
    def __init__(self, max_concurrent: int = 10):
        self.semaphore = Semaphore(max_concurrent)
        self.active_requests = 0
    
    async def execute_with_limit(self, coro):
        """限制并发执行"""
        async with self.semaphore:
            self.active_requests += 1
            try:
                result = await coro
                return result
            finally:
                self.active_requests -= 1

# Usage example
async def batch_search_with_concurrency(keywords: List[str]):
    controller = ConcurrencyController(max_concurrent=5)
    
    async with PangolinAPIClient('your_api_key') as client:
        tasks = []
        for keyword in keywords:
            task = controller.execute_with_limit(
                client.search_sponsored_ads(keyword)
            )
            tasks.append(task)
        
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results

5.2 Error Retry Mechanism

import asyncio
import random
from functools import wraps

def retry_with_backoff(max_retries: int = 3, base_delay: float = 1.0):
    """指数退避重试装饰器"""
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            last_exception = None
            
            for attempt in range(max_retries + 1):
                try:
                    return await func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
                    
                    if attempt == max_retries:
                        raise last_exception
                    
                    # Compute the delay (exponential backoff + random jitter)
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    print(f"Attempt {attempt + 1} failed, retrying in {delay:.2f}s...")
                    await asyncio.sleep(delay)
            
            raise last_exception
        return wrapper
    return decorator

# Usage example
@retry_with_backoff(max_retries=3, base_delay=2.0)
async def robust_search(client, keyword):
    """带重试的搜索函数"""
    return await client.search_sponsored_ads(keyword)

6. Monitoring and Alerting

6.1 Data Quality Monitoring

class DataQualityMonitor:
    def __init__(self, processor: DataProcessor):
        self.processor = processor
        self.quality_thresholds = {
            'min_ads_per_keyword': 5,
            'max_price_variance': 0.5,
            'min_data_completeness': 0.8
        }
    
    def check_data_quality(self, keyword: str) -> Dict:
        """检查数据质量"""
        conn = sqlite3.connect(self.processor.db_path)
        
        # Check the number of ads
        cursor = conn.cursor()
        cursor.execute('''
            SELECT COUNT(*) FROM sp_ads 
            WHERE keyword = ? AND DATE(extracted_at) = DATE('now')
        ''', (keyword,))
        
        ads_count = cursor.fetchone()[0]
        
        # Check data completeness
        cursor.execute('''
            SELECT 
                COUNT(*) as total,
                SUM(CASE WHEN title IS NOT NULL AND title != '' THEN 1 ELSE 0 END) as has_title,
                SUM(CASE WHEN price IS NOT NULL THEN 1 ELSE 0 END) as has_price,
                SUM(CASE WHEN asin IS NOT NULL AND asin != '' THEN 1 ELSE 0 END) as has_asin
            FROM sp_ads 
            WHERE keyword = ? AND DATE(extracted_at) = DATE('now')
        ''', (keyword,))
        
        completeness_data = cursor.fetchone()
        conn.close()
        
        if completeness_data[0] > 0:
            completeness_rate = (
                completeness_data[1] + completeness_data[2] + completeness_data[3]
            ) / (completeness_data[0] * 3)
        else:
            completeness_rate = 0
        
        quality_report = {
            'keyword': keyword,
            'ads_count': ads_count,
            'completeness_rate': completeness_rate,
            'quality_score': self._calculate_quality_score(ads_count, completeness_rate),
            'issues': []
        }
        
        # Check for quality issues
        if ads_count < self.quality_thresholds['min_ads_per_keyword']:
            quality_report['issues'].append(f"广告数量过少: {ads_count}")
        
        if completeness_rate < self.quality_thresholds['min_data_completeness']:
            quality_report['issues'].append(f"数据完整性不足: {completeness_rate:.2%}")
        
        return quality_report
    
    def _calculate_quality_score(self, ads_count: int, completeness_rate: float) -> float:
        """计算质量分数"""
        count_score = min(1.0, ads_count / self.quality_thresholds['min_ads_per_keyword'])
        completeness_score = completeness_rate
        
        return (count_score * 0.4 + completeness_score * 0.6) * 100
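
A brief hedged example of running the quality check after a collection pass and surfacing the reported issues; the alert here is just a print and would normally be swapped for your own notification channel:

# Hypothetical usage of DataQualityMonitor
def check_and_alert(keyword: str):
    processor = DataProcessor()
    monitor = DataQualityMonitor(processor)

    report = monitor.check_data_quality(keyword)
    print(f"{keyword}: quality score {report['quality_score']:.1f}")

    for issue in report['issues']:
        # Replace this print with email/Slack/webhook alerting as needed
        print(f"⚠️ {keyword}: {issue}")

if __name__ == "__main__":
    check_and_alert('wireless earbuds')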

7. Summary

From the technical deep dive in this article, we can see the key technical elements behind the Pangolin API's 98% collection success rate:

  1. Intelligent anti-detection: multi-layered disguise and behavior simulation
  2. Precise timing control: accurately capturing the moment data finishes loading
  3. Distributed architecture: high availability and scalability
  4. Data quality assurance: multiple validation and cleaning mechanisms

For developers, using the Pangolin API significantly reduces technical complexity and improves development efficiency while yielding higher-quality data. Combined with the code examples in this article, you can quickly build a complete SP ad monitoring system.

About the author: a senior e-commerce data engineer focused on designing large-scale data collection and analysis systems, with 5+ years of experience in e-commerce data processing.

Disclaimer: this article is for technical learning and exchange only; please comply with the relevant platforms' terms of use and applicable laws and regulations.
