import time
import random
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
import logging
import os
from urllib.parse import quote
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class TaobaoCrawler:
    def __init__(self):
        """Initialize the crawler in guest (not-logged-in) mode."""
        self.driver = None
        self.products_data = []
        self.setup_browser()
    def setup_browser(self):
        """Configure the browser, using the system-provided Edge driver."""
        # Kill any leftover Edge processes first.
        # Warning: this also closes the user's own open Edge windows.
        os.system('taskkill /f /im msedge.exe 2>nul')
        os.system('taskkill /f /im msedgedriver.exe 2>nul')
        time.sleep(2)
        options = Options()
        # Minimal anti-detection configuration
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument(
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0')
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        options.add_experimental_option('useAutomationExtension', False)
        try:
            # Option 1: no explicit Service; let Selenium locate the system EdgeDriver
            self.driver = webdriver.Edge(options=options)
            logger.info("System EdgeDriver started successfully (guest mode)")
        except Exception as e:
            logger.warning(f"System EdgeDriver failed: {e}")
            try:
                # Option 2: fall back to msedgedriver.exe in the current directory
                service = Service("./msedgedriver.exe")
                self.driver = webdriver.Edge(service=service, options=options)
                logger.info("EdgeDriver from current directory started successfully")
            except Exception as e2:
                logger.error(f"All Edge startup options failed: {e2}")
                raise
        # Hide webdriver traces. execute_script only affects the current page;
        # Page.addScriptToEvaluateOnNewDocument persists across navigations.
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': '''
                Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                Object.defineProperty(navigator, 'languages', {get: () => ['zh-CN', 'zh']});
            '''
        })
        # Set a common desktop window size
        self.driver.set_window_size(1366, 768)
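    # Environment note (hedged): Selenium 4.6+ ships with Selenium Manager, which
    # downloads a matching msedgedriver automatically when no Service is supplied,
    # so the ./msedgedriver.exe fallback above mainly matters on older Selenium
    # releases or machines without network access.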
    def search_products(self, keyword, pages=2, sort_by="sale-desc"):
        """Search for products and scrape multiple result pages (improved version).

        Note: sort_by is accepted but not currently applied to the URL.
        """
        try:
            # Use the Tmall search URL (no login required); URL-encode the keyword
            search_url = f"https://list.tmall.com/search_product.htm?q={quote(keyword)}"
            logger.info(f"Visiting Tmall: {search_url}")
            self.driver.get(search_url)
            logger.info(f"Search keyword: {keyword}")
            # Random wait to mimic human behavior
            time.sleep(random.uniform(2, 5))
            # Check whether a login wall appeared
            if self._check_login_required():
                logger.warning("Login requirement detected, trying to bypass...")
                if not self._bypass_login(keyword):
                    logger.error("Could not bypass login, aborting")
                    return False
            # Wait for the page to load, trying several selectors
            selectors_to_try = [
                ".product-iWrap",  # common Tmall selector
                ".product",
                ".item",
                ".productItem",
                "[data-id]"
            ]
            element_found = False
            for selector in selectors_to_try:
                try:
                    WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                    )
                    logger.info(f"Found product list with selector: {selector}")
                    element_found = True
                    break
                except Exception:
                    continue
            if not element_found:
                logger.warning("No product list element found, attempting to scrape anyway")
                # Continue even though no known element matched
            # Scrape the requested number of pages
            for page in range(1, pages + 1):
                logger.info(f"Scraping page {page}")
                self._scrape_current_page()
                # Move to the next page unless this is the last one
                if page < pages:
                    time.sleep(random.uniform(2, 4))
                    if not self._go_to_next_page():
                        logger.warning("Could not turn the page, stopping")
                        break
                    # Random delay to avoid being blocked
                    time.sleep(random.uniform(5, 10))
            return True
        except Exception as e:
            logger.error(f"Product search failed: {e}")
            return False
    def _check_login_required(self):
        """Check whether the page is asking for a login."""
        try:
            # Look for login-related elements (some of these may false-positive
            # on pages that merely embed a login widget)
            login_selectors = [
                ".login-box",
                ".sign-in",
                ".login-form",
                "#login",
                ".password-login",
                ".fm-button"
            ]
            for selector in login_selectors:
                elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    logger.info(f"Login element detected: {selector}")
                    return True
            return False
        except Exception:
            return False
    def _bypass_login(self, keyword):
        """Try to get past a login wall."""
        try:
            # Step 1: refresh the page
            self.driver.refresh()
            time.sleep(3)
            # Step 2: go back to the home page, then repeat the original search
            # (the original hardcoded a keyword here; re-use the caller's instead)
            self.driver.get("https://www.tmall.com")
            time.sleep(2)
            self.driver.get(f"https://list.tmall.com/search_product.htm?q={quote(keyword)}")
            time.sleep(3)
            return not self._check_login_required()
        except Exception as e:
            logger.error(f"Login bypass failed: {e}")
            return False
    def _go_to_next_page(self):
        """Advance to the next result page."""
        try:
            # Try several pagination selectors ('下一页' is the site's "next page" label)
            next_selectors = [
                "a[aria-label*='下一页']",
                "a[title*='下一页']",
                ".next-btn",
                ".next",
                ".page-next",
                "li.next a"
            ]
            for selector in next_selectors:
                try:
                    next_btn = WebDriverWait(self.driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                    )
                    # Scroll the button into view
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_btn)
                    time.sleep(1)
                    # Click via JavaScript to sidestep overlays and click interception
                    self.driver.execute_script("arguments[0].click();", next_btn)
                    # Wait for the next page to load
                    time.sleep(random.uniform(3, 6))
                    logger.info("Moved to the next page")
                    return True
                except Exception:
                    continue
            logger.warning("No pagination button found")
            return False
        except Exception as e:
            logger.error(f"Pagination failed: {e}")
            return False
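    # Alternative sketch (hypothetical): if the list page paginates via a URL
    # query parameter, building the URL directly is sturdier than clicking.
    # The 'page' parameter name below is an assumption, not a documented Tmall API.
    #
    # def _go_to_page(self, base_url, page_number):
    #     self.driver.get(f"{base_url}&page={page_number}")  # 'page' is hypothetical
    #     time.sleep(random.uniform(3, 6))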
    def _scrape_current_page(self):
        """Scrape product data from the current page."""
        try:
            # Mimic human scrolling
            self._simulate_human_scroll()
            # Try several product selectors (Tmall-specific first)
            item_selectors = [
                ".product-iWrap",  # classic Tmall selector
                ".product",
                ".item",
                ".productItem",
                "[data-id]"
            ]
            items = []
            for selector in item_selectors:
                found_items = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if found_items:
                    items = found_items
                    logger.info(f"Selector '{selector}' matched {len(items)} products")
                    break
            if not items:
                logger.warning("No product elements found")
                return
            successful_count = 0
            for index, item in enumerate(items[:20]):  # cap per page to reduce block risk
                try:
                    # Scroll each item into view
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", item)
                    time.sleep(random.uniform(0.5, 1.5))
                    product_data = self._extract_product_data(item)
                    if product_data:
                        self.products_data.append(product_data)
                        successful_count += 1
                        logger.info(f"Extracted product {successful_count}: {product_data['title'][:30]}...")
                    # Random pause every few items, mimicking reading
                    if (index + 1) % 3 == 0:
                        time.sleep(random.uniform(1, 3))
                except Exception as e:
                    logger.debug(f"Failed to extract item {index + 1}: {e}")
                    continue
            logger.info(f"Page done: extracted {successful_count}/{len(items)} products")
        except Exception as e:
            logger.error(f"Failed to scrape current page: {e}")
    def _simulate_human_scroll(self):
        """Scroll through the page the way a human would."""
        try:
            scroll_height = self.driver.execute_script("return document.body.scrollHeight")
            current_pos = 0
            while current_pos < scroll_height:
                scroll_distance = random.randint(200, 500)
                current_pos += scroll_distance
                self.driver.execute_script(f"window.scrollTo(0, {current_pos});")
                time.sleep(random.uniform(0.1, 0.3))
                # Occasionally scroll back up a little
                if random.random() > 0.5:
                    self.driver.execute_script("window.scrollTo(0, window.scrollY - 100);")
                    time.sleep(0.5)
        except Exception as e:
            logger.debug(f"Simulated scrolling failed: {e}")
    def _extract_product_data(self, item):
        """Extract fields from a single product element."""
        try:
            # Title (Tmall-specific selectors first)
            title_selectors = [
                ".productTitle",
                ".product-title",
                ".title",
                "a[title]"
            ]
            title = ""
            for selector in title_selectors:
                title = self._safe_find_text(item, selector)
                if title:
                    break
                # Fall back to the element's title attribute
                try:
                    title = item.find_element(By.CSS_SELECTOR, selector).get_attribute("title")
                    if title:
                        break
                except Exception:
                    pass
            # Price (Tmall-specific selectors)
            price_selectors = [
                ".productPrice",
                ".product-price",
                ".c-price",
                ".price"
            ]
            price = ""
            for selector in price_selectors:
                price_text = self._safe_find_text(item, selector)
                if price_text:
                    price = price_text
                    break
            # Sales count
            sales_selectors = [
                ".productStatus",
                ".product-status",
                ".sale",
                ".deal-cnt"
            ]
            sales = ""
            for selector in sales_selectors:
                sales_text = self._safe_find_text(item, selector)
                if sales_text:
                    sales = sales_text
                    break
            # Shop name
            shop_selectors = [
                ".productShop",
                ".product-shop",
                ".shopname",
                ".shop-name"
            ]
            shop_name = ""
            for selector in shop_selectors:
                shop_text = self._safe_find_text(item, selector)
                if shop_text:
                    shop_name = shop_text
                    break
            # Assemble the record
            product_data = {
                "title": title or "unknown",
                "price": price or "unknown",
                "shop_name": shop_name or "unknown",
                "sales": sales or "0",
                "location": "unknown",  # Tmall listings usually omit the ship-from location
                "crawl_time": time.strftime("%Y-%m-%d %H:%M:%S")
            }
            return product_data
        except Exception as e:
            logger.debug(f"Failed to extract product data: {e}")
            return None
    def _safe_find_text(self, parent, selector):
        """Find an element and return its stripped text, or "" if anything fails."""
        try:
            element = parent.find_element(By.CSS_SELECTOR, selector)
            return element.text.strip()
        except Exception:
            return ""
    def save_to_csv(self, filename="taobao_products.csv"):
        """Save the scraped data to a CSV file."""
        if not self.products_data:
            logger.warning("No data to save")
            return False
        try:
            df = pd.DataFrame(self.products_data)
            # utf-8-sig adds a BOM so Excel opens the UTF-8 file correctly
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            logger.info(f"Saved {len(self.products_data)} records to {filename}")
            return True
        except Exception as e:
            logger.error(f"Saving CSV failed: {e}")
            return False
    def save_to_json(self, filename="taobao_products.json"):
        """Save the scraped data to a JSON file."""
        if not self.products_data:
            logger.warning("No data to save")
            return False
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(self.products_data, f, ensure_ascii=False, indent=4)
            logger.info(f"Saved {len(self.products_data)} records to {filename}")
            return True
        except Exception as e:
            logger.error(f"Saving JSON failed: {e}")
            return False
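    # Quick check of the exported CSV (illustrative; run separately, not part
    # of the crawler itself):
    #
    # df = pd.read_csv("taobao_products.csv")
    # print(df["shop_name"].value_counts().head())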
    def get_statistics(self):
        """Return a short summary of the scraped data."""
        if not self.products_data:
            return "No data"
        total = len(self.products_data)
        shops = len(set(item['shop_name'] for item in self.products_data))
        return f"Total: {total} products from {shops} shops"
    def close(self):
        """Shut down the browser."""
        if hasattr(self, 'driver') and self.driver:
            self.driver.quit()
            self.driver = None  # avoid a second quit() from __del__
            logger.info("Browser closed")

    def __del__(self):
        """Destructor: make sure the browser gets closed."""
        self.close()
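# Cleanup sketch: relying on __del__ is fragile (interpreter shutdown order is
# unspecified), so contextlib.closing is a safer way to guarantee close():
#
# from contextlib import closing
# with closing(TaobaoCrawler()) as crawler:
#     if crawler.search_products("女装", pages=1):
#         crawler.save_to_csv()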
# Usage example
if __name__ == "__main__":
    crawler = TaobaoCrawler()
    try:
        # Search on Tmall (no login required); "女装" means "women's clothing"
        success = crawler.search_products("女装", pages=1)
        if success and crawler.products_data:
            # Save the results
            crawler.save_to_csv("taobao_products.csv")
            crawler.save_to_json("taobao_products.json")
            # Print summary statistics
            print(crawler.get_statistics())
        else:
            print("Scrape failed: no data collected")
    except Exception as e:
        logger.error(f"Program error: {e}")
    finally:
        crawler.close()