import time
import random
import requests
import json
import logging
import socket
from pathlib import Path
from openpyxl import load_workbook
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, WebDriverException
from fake_useragent import UserAgent, FakeUserAgentError
# -------------------------- 新增:手动指定Edge浏览器和驱动路径 --------------------------
EDGE_BINARY_PATH = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe" # 浏览器路径
EDGE_DRIVER_PATH = r"C:\Users\27570\Desktop\edgedriver_win32\msedgedriver.exe" # 驱动路径,需下载并指定
# -------------------------------------------------------------------------------------
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("ip_query.log"),
logging.StreamHandler()
]
)
# -------------------------- 优化:整合网络连接检查 --------------------------
def check_internet_connection():
"""检查网络连接是否正常,尝试多种连接方式提高可靠性"""
try:
# 尝试连接到Google的公共DNS服务器
socket.create_connection(("8.8.8.8", 53), timeout=5)
logging.info("网络连接测试通过 (DNS)")
return True
except OSError:
logging.warning("无法连接到DNS服务器,尝试HTTP请求...")
try:
# 尝试HTTP请求到百度
response = requests.get("https://www.baidu.com", timeout=5)
if response.status_code == 200:
logging.info("网络连接测试通过 (HTTP)")
return True
except requests.RequestException:
logging.warning("HTTP请求失败,尝试HTTPS请求...")
try:
# 尝试HTTPS请求到百度
response = requests.get("https://www.baidu.com", timeout=5)
if response.status_code == 200:
logging.info("网络连接测试通过 (HTTPS)")
return True
except requests.RequestException:
logging.error("HTTPS请求失败")
return False
# -------------------------------------------------------------------------------------
def build_query_url(base_url, ip_address, path_format="{ip}/", use_params=True, param_name="ip"):
"""构建IP查询URL,支持路径参数和查询参数两种格式"""
if not base_url.startswith(('http://', 'https://')):
base_url = 'https://' + base_url
base_url = base_url.rstrip('/')
try:
if '.' in ip_address: # IPv4
socket.inet_pton(socket.AF_INET, ip_address)
elif ':' in ip_address: # IPv6
socket.inet_pton(socket.AF_INET6, ip_address)
else:
raise ValueError("无效的IP地址")
if ':' in ip_address:
ip_address = f"[{ip_address}]"
if use_params:
# 使用查询参数的方式构造URL
from urllib.parse import urlencode
# 添加固定参数action=2
params = {param_name: ip_address, "action": 2}
return f"{base_url}?{urlencode(params)}"
else:
# 原有的路径参数方式
from urllib.parse import quote
encoded_ip = quote(ip_address)
return f"{base_url}/{path_format.format(ip=encoded_ip)}"
except socket.error:
logging.error(f"无效的IP地址格式: {ip_address}")
return None
except Exception as e:
logging.error(f"构建查询URL时出错: {str(e)}")
return None
# 手动配置的固定请求头
MANUAL_HEADERS = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-encoding': 'gzip, deflate, br, zstd',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'max-age=0',
'connection': 'keep-alive',
'cookie': '_c_WBKFRo=NWgPw1zeBaW3I2CtOcadfhJJw33TcEYmWMtyGzTE; Hm_lvt_f4f76646cd877e538aa1fbbdf351c548=1753560343,1753617545,1753793389,1753862286; HMACCOUNT=96B6BD9DE68EFF3B; PHPSESSID=o9fnnscr7sofru4b8r1khlde3f; Hm_lvt_f4f76646cd877e538aa1fbbdf351c548=1754123598; HMACCOUNT=96B6BD9DE68EFF3B; Hm_lpvt_f4f76646cd877e538aa1fbbdf351c548=1754611428; Hm_lpvt_f4f76646cd877e538aa1fbbdf351c548=1754613370',
'host': 'www.ip138.com',
'referer': 'https://www.ip138.com/iplookup.php?ip=27.154.214.154&action=2',
'sec-ch-ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Microsoft Edge";v="138"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0'
}
def configure_driver(max_retries=5):
"""配置Edge浏览器驱动"""
for attempt in range(max_retries):
try:
# 先检查网络连接
if not check_internet_connection():
raise Exception("网络连接不可用")
# 创建Edge选项
edge_options = Options()
# 1. 基础配置
edge_options.binary_location = EDGE_BINARY_PATH # 手动指定浏览器路径
edge_options.add_argument("--disable-blink-features=AutomationControlled") # 核心反检测
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
edge_options.add_experimental_option("useAutomationExtension", False)
edge_options.add_argument("--start-maximized") # 最大化窗口
# 2. 增强反检测
edge_options.add_argument("--disable-extensions")
edge_options.add_argument("--disable-plugins-discovery")
edge_options.add_argument("--disable-web-security")
# 3. 随机化配置
features_to_disable = [
"AutomationControlled", "InterestCohort", "BlinkGenPropertyTrees"
]
edge_options.add_argument(f"--disable-features={','.join(random.sample(features_to_disable, random.randint(2, 4)))}")
screen_sizes = [(1366, 768), (1920, 1080), (1536, 864)]
width, height = random.choice(screen_sizes)
edge_options.add_argument(f"--window-size={width},{height}")
if random.random() > 0.5:
edge_options.add_argument("--disable-gpu")
else:
edge_options.add_argument("--enable-gpu-rasterization")
# 4. 资源加载控制
prefs = {
"profile.managed_default_content_settings.images": 2,
"profile.managed_default_content_settings.stylesheets": 2,
}
edge_options.add_experimental_option("prefs", prefs)
edge_options.page_load_strategy = 'eager' # 只等待DOM加载
# 5. 使用手动指定的驱动路径
try:
service = Service(EDGE_DRIVER_PATH) # 手动指定驱动路径
logging.info(f"使用手动指定的驱动路径: {EDGE_DRIVER_PATH}")
except Exception as e:
logging.error(f"驱动路径配置错误: {str(e)}")
raise
# 6. 创建浏览器实例
driver = webdriver.Edge(service=service, options=edge_options)
# 7. 隐藏自动化特征
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
// 核心:隐藏webdriver标志
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
// 模拟Chrome特征
window.chrome = { runtime: {} };
// 模拟时区
Intl.DateTimeFormat().resolvedOptions().timeZone =
['Asia/Shanghai', 'Asia/Beijing'][Math.floor(Math.random() * 2)];
"""
})
# 8. 设置超时
driver.set_page_load_timeout(30)
driver.set_script_timeout(30)
# 9. 应用手动配置的请求头
logging.info(f"应用手动配置的请求头: {json.dumps(MANUAL_HEADERS, indent=2)[:100]}...")
driver.execute_cdp_cmd("Network.setUserAgentOverride", {
"userAgent": MANUAL_HEADERS["user-agent"],
"accept": MANUAL_HEADERS["accept"],
"acceptLanguage": MANUAL_HEADERS["accept-language"],
})
logging.info(f"浏览器驱动初始化成功 (尝试 {attempt+1}/{max_retries})")
return driver
except Exception as e:
logging.error(f"配置浏览器驱动失败 (尝试 {attempt+1}/{max_retries}): {str(e)}")
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(5, 10)
logging.info(f"将在 {wait_time:.2f} 秒后重试")
time.sleep(wait_time)
logging.critical("达到最大重试次数,无法初始化浏览器驱动")
return None
def change_user_agent(driver):
"""更换为手动配置的请求头"""
logging.info(f"应用手动配置的请求头: {json.dumps(MANUAL_HEADERS, indent=2)[:100]}...")
driver.execute_cdp_cmd("Network.setUserAgentOverride", {
"userAgent": MANUAL_HEADERS["user-agent"],
"accept": MANUAL_HEADERS["accept"],
"acceptLanguage": MANUAL_HEADERS["accept-language"],
})
driver.refresh()
time.sleep(random.uniform(2, 4))
def handle_cookies(driver):
"""处理和保存Cookie"""
cookies = driver.get_cookies()
logging.info(f"获取到 {len(cookies)} 个Cookie")
return cookies
def is_banned(driver):
"""检测是否被封禁"""
try:
banned_xpaths = [
'//div[contains(text(), "访问被阻止")]',
'//div[contains(text(), "验证码")]',
'//div[contains(text(), "您的IP已被封禁")]',
]
for xpath in banned_xpaths:
if WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.XPATH, xpath))
):
logging.warning("检测到封禁页面")
return True
return False
except:
return False
def check_dynamic_element(driver, xpaths):
"""检查网页上是否存在任一动态XPath的元素"""
for i, xpath in enumerate(xpaths, 1):
try:
WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.XPATH, xpath))
)
logging.info(f"使用动态元素XPath {i}: {xpath}")
return True
except:
continue
return False
def get_result_element(driver, xpaths):
"""尝试获取任一结果元素"""
for i, xpath in enumerate(xpaths, 1):
try:
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, xpath))
)
logging.info(f"使用结果元素XPath {i}: {xpath}")
return element
except:
continue
return None
def simulate_human_behavior(driver):
"""模拟人类浏览行为"""
try:
# 随机滚动页面
scroll_height = driver.execute_script("return document.body.scrollHeight")
scroll_steps = random.randint(3, 7)
for i in range(scroll_steps):
scroll_to = int(scroll_height * (i + 1) / scroll_steps)
driver.execute_script(f"window.scrollTo(0, {scroll_to})")
time.sleep(random.uniform(0.5, 1.5))
# 随机移动鼠标
actions = ActionChains(driver)
elements = driver.find_elements(By.TAG_NAME, "a")
if elements:
for _ in range(random.randint(1, 3)):
element = random.choice(elements)
actions.move_to_element(element).perform()
time.sleep(random.uniform(0.3, 0.8))
except Exception as e:
logging.warning(f"模拟人类行为时出错: {e}")
def query_ip(driver, ip_address, base_url, xpath_expressions, dynamic_xpaths, max_retries=5):
"""查询IP信息,添加封禁检测和处理"""
if not ip_address or not isinstance(ip_address, str) or ip_address.strip() == "":
logging.warning(f"无效的IP地址: {ip_address}")
return "无效IP"
ip_address = ip_address.strip()
for attempt in range(max_retries):
try:
if attempt > 0:
wait_time = 2 ** attempt + random.uniform(3, 7)
logging.info(f"第 {attempt+1} 次重试前等待 {wait_time:.2f} 秒...")
time.sleep(wait_time)
change_user_agent(driver)
query_url = build_query_url(base_url, ip_address)
if not query_url:
logging.error(f"无法构建有效的查询URL,IP: {ip_address}")
return "无效URL"
logging.info(f"访问查询URL (尝试 {attempt+1}/{max_retries}): {query_url}")
try:
driver.get(query_url)
time.sleep(random.uniform(8, 15))
if is_banned(driver):
logging.warning(f"IP {ip_address} 查询时被封禁")
driver.quit()
time.sleep(5)
driver = configure_driver()
time.sleep(5)
continue
current_url = driver.current_url
if current_url == "data:," or "about:blank" in current_url:
raise Exception("浏览器加载了空白页面")
simulate_human_behavior(driver)
time.sleep(random.uniform(2, 5))
handle_cookies(driver)
except TimeoutException:
logging.warning(f"页面加载超时,尝试重新加载")
driver.refresh()
time.sleep(15)
continue
if check_dynamic_element(driver, dynamic_xpaths):
logging.info(f"检测到动态元素,结果将设为'动态'")
return "动态"
result_element = get_result_element(driver, xpath_expressions)
if result_element:
return result_element.text.strip()
else:
raise Exception("无法找到结果元素")
except WebDriverException as e:
logging.error(f"WebDriver错误 (尝试 {attempt+1}/{max_retries}): {str(e)}")
if "ERR_EMPTY_RESPONSE" in str(e) or "ERR_CONNECTION_RESET" in str(e):
logging.warning("检测到连接错误,尝试重启浏览器...")
driver.quit()
time.sleep(15)
driver = configure_driver()
time.sleep(10)
else:
time.sleep(2 ** attempt + random.uniform(5, 10))
continue
except Exception as e:
logging.error(f"查询IP {ip_address} 失败 (尝试 {attempt+1}/{max_retries}): {str(e)}")
time.sleep(2 ** attempt + random.uniform(5, 10))
continue
logging.error(f"IP {ip_address} 查询失败,已达到最大重试次数")
driver.save_screenshot(f"error_{ip_address}.png")
return "查询失败"
def is_row_hidden(worksheet, row_idx):
"""检查Excel行是否被隐藏"""
return worksheet.row_dimensions[row_idx].hidden
def process_excel(input_file, base_url, xpath_expressions, dynamic_xpaths, ip_column='A', result_column='I', start_row=2):
"""处理Excel文件"""
wb = load_workbook(input_file)
ws = wb.active
has_filter = ws.auto_filter.ref is not None
logging.info(f"检测到筛选: {has_filter}")
driver = configure_driver()
if not driver:
logging.critical("无法初始化浏览器驱动,退出程序")
return
visible_rows = []
for row_idx in range(start_row, ws.max_row + 1):
row_dim = ws.row_dimensions.get(row_idx)
if not row_dim or not row_dim.hidden:
visible_rows.append(row_idx)
logging.info(f"可见行共{len(visible_rows)}行")
total_visible = len(visible_rows)
processed_count = 0
try:
for i, row in enumerate(visible_rows, 1):
ip_address = ws[f"{ip_column}{row}"].value
if not ip_address:
logging.info(f"第 {row} 行IP地址为空,跳过")
continue
logging.info(f"正在查询IP: {ip_address} ({i}/{total_visible})")
result = query_ip(driver, ip_address, base_url, xpath_expressions, dynamic_xpaths)
ws[f"{result_column}{row}"] = result
processed_count += 1
if i % 3 == 0 or i == total_visible:
wb.save(input_file)
logging.info(f"已保存进度: {i}/{total_visible} 到 {input_file}")
wait_time = random.uniform(20,40)
logging.info(f"等待 {wait_time:.2f} 秒后继续...")
time.sleep(wait_time)
if i % 10 == 0:
extra_wait = random.uniform(40,60)
logging.info(f"已处理 {i} 个IP,额外休息 {extra_wait:.2f} 秒...")
time.sleep(extra_wait)
if i % 20 == 0:
logging.info(f"已处理 {i} 个IP,重启浏览器以避免被检测...")
driver.quit()
time.sleep(15)
driver = configure_driver()
if not driver:
logging.critical("无法重新初始化浏览器驱动,退出程序")
return
except Exception as e:
logging.critical(f"处理过程中发生意外错误: {str(e)}")
finally:
if driver:
driver.quit()
wb.save(input_file)
logging.info(f"已保存最终结果到 {input_file}")
logging.info(f"处理完成!共处理 {processed_count}/{total_visible} 个可见IP地址")
if __name__ == "__main__":
INPUT_FILE = r"C:\Users\27570\Desktop\飞塔-福建-简版-更新版20250730.xlsx"
# 修改为新的基础URL
BASE_URL = "https://www.ip138.com/iplookup.php"
# 配置两种XPath表达式
XPATH_EXPRESSIONS = [
'/html/body/div/div[2]/div[1]/div/table/tbody/tr[2]/td[2]',
'/html/body/div/div[2]/div[2]/div/div[2]/div[1]/div/div[2]/div[2]/div[2]/table/tbody/tr[2]/td[2]'
]
DYNAMIC_XPATHS = [
'/html/body/div/div[2]/div[1]/div/p/a',
'/html/body/div/div[2]/div[2]/div/div[2]/div[1]/div/div[2]/div[2]/div[1]/p[1]/a'
]
max_main_retries = 3
for main_attempt in range(max_main_retries):
try:
logging.info(f"开始处理Excel文件 (尝试 {main_attempt+1}/{max_main_retries})")
process_excel(INPUT_FILE, BASE_URL, XPATH_EXPRESSIONS, DYNAMIC_XPATHS)
break
except Exception as e:
logging.critical(f"主程序执行失败 (尝试 {main_attempt+1}/{max_main_retries}): {str(e)}")
if main_attempt < max_main_retries - 1:
wait_time = 10 + random.uniform(10, 30)
logging.info(f"将在 {wait_time:.2f} 秒后重试")
time.sleep(wait_time)
else:
logging.critical("达到最大重试次数,程序终止")
以上代码在运行时出现了以下问题,解决问题并给我完整代码
Message: session not created: probably user data directory is already in use, please specify a unique value for --user-data-dir argument, or don't use --user-data-dir; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception
Stacktrace:
GetHandleVerifier [0x0xf6d593+37219]
(No symbol) [0x0xe1a716]
(No symbol) [0x0xbe51ce]
(No symbol) [0x0xc0d881]
(No symbol) [0x0xc08a43]
(No symbol) [0x0xc3d78b]
(No symbol) [0x0xc3d22a]
(No symbol) [0x0xc324f6]
(No symbol) [0x0xc13327]
(No symbol) [0x0xc12723]
(No symbol) [0x0xc13144]
sqlite3_dbdata_init [0x0x105a89c+518364]
sqlite3_dbdata_init [0x0x1141ab0+1465072]
sqlite3_dbdata_init [0x0x11413e5+1463333]
sqlite3_dbdata_init [0x0x11328ec+1403180]
sqlite3_dbdata_init [0x0x11422d2+1467154]
(No symbol) [0x0xe31d9d]
(No symbol) [0x0xe25108]
(No symbol) [0x0xe252fb]
(No symbol) [0x0xe0a649]
BaseThreadInitThunk [0x0x75525d49+25]
RtlInitializeExceptionChain [0x0x7706d1ab+107]
RtlGetAppContainerNamedObjectPath [0x0x7706d131+561]