Hiding webdriver

The most obvious example of webdriver detection is the Taobao login page: if you simulate a login with selenium or pyppeteer without any hiding measures, the human-verification step during login returns an error. Many other sites' anti-crawling mechanisms check this flag as well, so whenever we use libraries like selenium or pyppeteer we need to hide the webdriver property.
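
You can observe the flag directly: in an unmodified selenium session navigator.webdriver evaluates to true, while a normal browser reports undefined or false. A minimal check (a sketch; the URL is only a placeholder):

from selenium.webdriver import Chrome

driver = Chrome()
driver.get('https://www.baidu.com')
# Prints True in a driver-controlled page; a regular browser would give undefined/false
print(driver.execute_script("return navigator.webdriver"))
driver.quit()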

Most solutions found online rely on JS injection, but that approach is self-deceiving: a little thought shows it cannot work, because the injection happens after the page has loaded, by which time the page has already detected that it is being visited through webdriver. The right place to hide webdriver is therefore after the browser object is created but before any page is opened. An earlier approach enabled Chrome's experimental excludeSwitches option, but after a Chrome security update that method stopped working. The currently viable method is to call Chrome's CDP command Page.addScriptToEvaluateOnNewDocument, which makes Chrome run a JS script we supply before each page starts executing its own scripts. A detailed explanation can be found in this article. The usage with selenium and pyppeteer is as follows:

selenium:

from selenium.webdriver import Chrome

driver = Chrome()
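# Register a script that runs before any page script on every new document,
# so navigator.webdriver is already hidden when the site first checks it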
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
  "source": """
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
  """
})
driver.get('https://www.baidu.com')

pyppeteer:

import asyncio
from pyppeteer import launch

async def main():
    browser = await launch(headless=False, userDataDir='F:\\python\\userdata', args=['--disable-infobars'])
    page = await browser.newPage()
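    # evaluateOnNewDocument is pyppeteer's wrapper around the same CDP command,
    # so the script runs before the page's own scripts on every navigation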
    await page.evaluateOnNewDocument('''() => {
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    }''')

    await page.goto('https://www.baidu.com')

# Drive the coroutine (pyppeteer is asyncio-based)
asyncio.get_event_loop().run_until_complete(main())

Of course, you can also hide webdriver by writing a Chrome extension. Compared with the CDP approach this is somewhat more cumbersome, but it has the advantage of being more stable, since it is not affected by Chrome updates.
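
As a rough illustration of the extension route, the sketch below writes a tiny unpacked extension to disk and loads it with Chrome's --load-extension startup flag. The directory and file names are made up for the example, the MV3 "world": "MAIN" content-script field needs a reasonably recent Chrome, and the --load-extension flag may be restricted in the very newest stable builds:

import json
import os
from selenium.webdriver import Chrome, ChromeOptions

ext_dir = 'hide_webdriver_ext'  # hypothetical directory for the unpacked extension
os.makedirs(ext_dir, exist_ok=True)

# Minimal MV3 manifest: run inject.js in the page's main JS world before any page script
manifest = {
    "manifest_version": 3,
    "name": "hide-webdriver",
    "version": "1.0",
    "content_scripts": [{
        "matches": ["<all_urls>"],
        "js": ["inject.js"],
        "run_at": "document_start",
        "world": "MAIN"
    }]
}
with open(os.path.join(ext_dir, 'manifest.json'), 'w', encoding='utf-8') as f:
    json.dump(manifest, f)

# The injected script is the same one-liner used with the CDP command
with open(os.path.join(ext_dir, 'inject.js'), 'w', encoding='utf-8') as f:
    f.write("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")

options = ChromeOptions()
options.add_argument('--load-extension=' + os.path.abspath(ext_dir))
driver = Chrome(options=options)
driver.get('https://www.baidu.com')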
