import time
import random
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class TaobaoCrawler:
    def __init__(self):
        """Initialize the crawler in guest (not-logged-in) mode."""
        self.driver = None
        self.products_data = []
        self.setup_browser()
    def setup_browser(self):
        """Configure and launch the browser."""
        options = webdriver.ChromeOptions()
        # Anti-detection flags
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-extensions')
        # Note: this user agent deliberately presents as Edge ("Edg/141.0.0.0")
        options.add_argument(
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0')
        # Hide automation fingerprints
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        options.add_experimental_option('useAutomationExtension', False)
        # Optional: headless mode (for production use)
        # options.add_argument('--headless')
        try:
            # Use the system ChromeDriver directly, without webdriver_manager
            self.driver = webdriver.Chrome(options=options)
            # Run a script to hide the webdriver flag on the current document
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            # Re-apply the patch on every new document via the DevTools Protocol
            self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': '''
                Object.defineProperty(navigator, 'webdriver', {get: () => undefined})
                Object.defineProperty(navigator, 'languages', {get: () => ['zh-CN', 'zh']})
                '''
            })
            logger.info("Browser initialized successfully - guest mode")
        except Exception as e:
            logger.error(f"Browser initialization failed: {str(e)}")
            raise
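    # Since the user agent above already spoofs Edge, a natural variant is to
    # drive Edge itself. A minimal sketch of the Edge equivalent, assuming
    # Selenium 4 with msedgedriver on PATH - Chromium-based Edge accepts the
    # same flags, so only the options and driver classes change:
    #
    #   options = webdriver.EdgeOptions()
    #   options.add_argument('--disable-blink-features=AutomationControlled')
    #   options.add_experimental_option('excludeSwitches', ['enable-automation'])
    #   options.add_experimental_option('useAutomationExtension', False)
    #   self.driver = webdriver.Edge(options=options)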
    def search_products(self, keyword, pages=3, sort_by="sale-desc"):
        """Search for products and scrape multiple pages of results - improved version."""
        try:
            # Build the search URL with a sort parameter
            search_url = f"https://s.taobao.com/search?q={keyword}&sort={sort_by}"
            self.driver.get(search_url)
            logger.info(f"Search keyword: {keyword}, sort order: {sort_by}")
            # Wait for the results to load, using a more stable selector
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".CardV2--doubleCard--_OJ1T8j"))
            )
            # Scrape the requested number of pages
            for page in range(1, pages + 1):
                logger.info(f"Scraping page {page}")
                self._scrape_current_page()
                # If this is not the last page, try to move to the next one
                if page < pages:
                    if not self._go_to_next_page():
                        logger.warning("Cannot turn the page, stopping the crawl")
                        break
                    # Random delay to avoid getting blocked
                    time.sleep(random.uniform(3, 7))
            return True
        except Exception as e:
            logger.error(f"Product search failed: {str(e)}")
            return False
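    # Note: the keyword is interpolated into the search URL unescaped. The
    # browser will usually percent-encode it on its own, but quoting it
    # explicitly is safer, e.g. with the standard library:
    #
    #   from urllib.parse import quote
    #   search_url = f"https://s.taobao.com/search?q={quote(keyword)}&sort={sort_by}"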
    def _go_to_next_page(self):
        """Move to the next page of results."""
        try:
            # Try several pagination selectors in turn
            next_selectors = [
                ".next-btn.next-next",  # Taobao's next-page button
                ".next-btn",
                "button[aria-label*='下一页']",
                ".next-btn[aria-label*='下一页']"
            ]
            for selector in next_selectors:
                try:
                    next_btn = WebDriverWait(self.driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                    )
                    # Click via JavaScript to avoid interception by overlays
                    self.driver.execute_script("arguments[0].click();", next_btn)
                    # Wait for the new page to load
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, ".CardV2--doubleCard--_OJ1T8j"))
                    )
                    logger.info("Successfully moved to the next page")
                    return True
                except Exception:
                    continue
            return False
        except Exception as e:
            logger.error(f"Page turn failed: {str(e)}")
            return False
    def _scrape_current_page(self):
        """Scrape product data from the current page - improved version."""
        try:
            # Wait for the product cards, using the selector observed on the live page
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".CardV2--doubleCard--_OJ1T8j"))
            )
            # Collect the product elements
            items = self.driver.find_elements(By.CSS_SELECTOR, ".CardV2--doubleCard--_OJ1T8j")
            if not items:
                # Fallback selector
                items = self.driver.find_elements(By.CSS_SELECTOR, ".CardV2--mainPicAndDesc--Eb_BxDU")
            logger.info(f"Found {len(items)} products")
            successful_count = 0
            for index, item in enumerate(items):
                try:
                    # Scroll the element into view; the page lazy-loads card
                    # content, so this helps the text fields populate
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", item)
                    time.sleep(0.5)
                    product_data = self._extract_product_data(item)
                    if product_data:
                        self.products_data.append(product_data)
                        successful_count += 1
                        logger.info(f"Extracted product {successful_count}: {product_data['title'][:20]}...")
                    # Pause briefly after every 5 products
                    if (index + 1) % 5 == 0:
                        time.sleep(random.uniform(1, 3))
                except Exception as e:
                    logger.error(f"Failed to extract product {index + 1}: {str(e)}")
                    continue
            logger.info(f"Finished scraping this page: {successful_count}/{len(items)} products extracted")
        except Exception as e:
            logger.error(f"Failed to scrape the current page: {str(e)}")
    def _extract_product_data(self, item):
        """Extract data from a product element - improved version."""
        try:
            # Basic information, using the selectors observed on the live page
            title = self._safe_find_text(item, ".Title--title--wJY8TeA span")
            # Price (the integer and decimal parts are separate elements)
            price_int = self._safe_find_text(item, ".Price--priceInt--BXYeCOI")
            price_float = self._safe_find_text(item, ".Price--priceFloat--rI_BYho")
            price = f"{price_int}{price_float}" if price_int else "Unknown"
            # Sales figures
            sales = self._safe_find_text(item, ".Price--realSales--wnhaqVr")
            # Shop name
            shop_name = self._safe_find_text(item, ".ShopInfo--shopNameText--kxQC2cC")
            # Shipping origin
            location = self._safe_find_text(item, ".Price--procity--Na1DQVe")
            # Product link
            try:
                link_element = item.find_element(By.CSS_SELECTOR, ".Title--title--wJY8TeA a")
                product_url = link_element.get_attribute("href")
            except Exception:
                product_url = ""
            # Assemble the product record
            product_data = {
                "title": title or "Unknown",
                "price": price or "Unknown",
                "shop_name": shop_name or "Unknown",
                "sales": sales or "0",
                "location": location or "Unknown",
                "url": product_url,
                "crawl_time": time.strftime("%Y-%m-%d %H:%M:%S")
            }
            return product_data
        except Exception as e:
            logger.error(f"Failed to extract product data: {str(e)}")
            return None
    def _safe_find_text(self, parent, selector):
        """Safely look up an element's text, returning '' if it is missing."""
        try:
            element = parent.find_element(By.CSS_SELECTOR, selector)
            return element.text.strip()
        except Exception:
            return ""
    def save_to_csv(self, filename="taobao_products.csv"):
        """Save the scraped data to a CSV file."""
        if not self.products_data:
            logger.warning("No data to save")
            return False
        try:
            df = pd.DataFrame(self.products_data)
            # utf-8-sig writes a BOM so Excel renders Chinese text correctly
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            logger.info(f"Data saved to {filename}, {len(self.products_data)} records in total")
            return True
        except Exception as e:
            logger.error(f"Failed to save CSV: {str(e)}")
            return False
    def save_to_json(self, filename="taobao_products.json"):
        """Save the scraped data to a JSON file."""
        if not self.products_data:
            logger.warning("No data to save")
            return False
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(self.products_data, f, ensure_ascii=False, indent=4)
            logger.info(f"Data saved to {filename}, {len(self.products_data)} records in total")
            return True
        except Exception as e:
            logger.error(f"Failed to save JSON: {str(e)}")
            return False
    def get_statistics(self):
        """Summarize the scraped data."""
        if not self.products_data:
            return "No data"
        total = len(self.products_data)
        shops = len(set(item['shop_name'] for item in self.products_data))
        return f"Total: {total} products from {shops} shops"
    def close(self):
        """Close the browser."""
        if hasattr(self, 'driver') and self.driver:
            self.driver.quit()
            logger.info("Browser closed")
    def __del__(self):
        """Destructor: make sure the browser gets closed."""
        self.close()
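    # Relying on __del__ for cleanup is fragile, since finalization order
    # during interpreter shutdown is not guaranteed. A context-manager sketch,
    # if you prefer `with TaobaoCrawler() as crawler:` in the example below:
    #
    #   def __enter__(self):
    #       return self
    #
    #   def __exit__(self, exc_type, exc, tb):
    #       self.close()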
# Usage example
if __name__ == "__main__":
    crawler = TaobaoCrawler()
    try:
        # Search for products (keyword "女装" = women's clothing), sorted by sales
        crawler.search_products("女装", pages=60, sort_by="sale-desc")
        # Save the data
        crawler.save_to_csv("taobao_products.csv")
        crawler.save_to_json("taobao_products.json")
        # Print summary statistics
        print(crawler.get_statistics())
    finally:
        crawler.close()