Mozilla Location Service-8

本文深入探讨了geolocate定位服务的工作原理和技术细节。通过分析源代码,详细介绍了定位过程中涉及的关键步骤,包括基站信息查询及位置计算等。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

上次搞清楚了geosubmit是怎么回事,这次再来看看geolocate.

官方文档介绍

开启服务、如何访问,文档里都有介绍,这里就不啰嗦了。
直接来看view:
/ProgFile/ichnaea-for-liuqiao/ichnaea/lib/python2.7/site-packages/ichnaea-1.5-py2.7-linux-x86_64.egg/ichnaea/api/locate/views.py:

class LocateV1View(BasePositionView):
    """HTTP API view serving the ``v1/geolocate`` endpoint."""

    metric_path = 'v1.geolocate'  #:
    route = '/v1/geolocate'  #:
    schema = LOCATE_V1_SCHEMA  #:

    def prepare_response(self, result):
        """Convert a search result into the geolocate response body.

        :param result: Mapping providing ``lat``, ``lon``, ``accuracy``
                       and ``fallback`` entries.
        :returns: A dict holding a ``location`` (``lat``/``lng``) and an
                  ``accuracy``; a ``fallback`` entry is included only
                  when the result's fallback value is truthy.
        """
        location = {
            'lat': result['lat'],  # latitude
            'lng': result['lon'],  # longitude, exposed under the key "lng"
        }
        response = {'location': location, 'accuracy': result['accuracy']}

        fallback = result['fallback']
        if fallback:
            response['fallback'] = fallback

        return response
class BaseLocateView(BaseAPIView):
    """Common base class for all locate related views."""

    #: :exc:`ichnaea.api.exceptions.LocationNotFound`
    not_found = LocationNotFound
    searcher = None  #:

    def locate(self, api_key):
        """Build a query from the current request and run the searcher.

        :param api_key: Passed through unchanged as the query's ``api_key``.
        :returns: Whatever ``searcher.search(query)`` returns.
        """
        # Debug trace output.
        print 'api/locate/views/locate()......'
        # NOTE(review): `errors` is unpacked but not used in this method.
        request_data, errors = self.preprocess_request()

        query = Query(
            fallback=request_data.get('fallbacks'),
            ip=self.request.client_addr,
            blue=request_data.get('bluetoothBeacons'),
            cell=request_data.get('cellTowers'),
            wifi=request_data.get('wifiAccessPoints'),
            api_key=api_key,
            api_type=self.view_type,
            session=self.request.db_ro_session,
            http_session=self.request.registry.http_session,
            geoip_db=self.request.registry.geoip_db,
            stats_client=self.stats_client,
        )

        # `self.searcher` is used as an attribute name on the registry;
        # presumably subclasses replace the None default -- verify.
        searcher = getattr(self.request.registry, self.searcher)
        # Key step: the actual lookup is delegated to the searcher.
        return searcher.search(query)

    def prepare_response(self, response_data):  # pragma: no cover
        """Return ``response_data`` unchanged."""
        return response_data

    # Entry point: this is the first method that gets called.
    def view(self, api_key):
        """Locate the client and return the prepared response.

        :param api_key: Forwarded to :meth:`locate`.
        :returns: ``self.prepare_response(result)`` for a truthy result.
        :raises: The exception produced by
                 ``self.prepare_exception(self.not_found())`` when the
                 result is falsy.
        """
        # Debug trace output.
        print 'locate/views view(self,api_key)........'
        result = self.locate(api_key)
        if not result:
            raise self.prepare_exception(self.not_found())
        return self.prepare_response(result)

#重点:return searcher.search(query)
/ProgFile/ichnaea-for-liuqiao/ichnaea/lib/python2.7/site-packages/ichnaea-1.5-py2.7-linux-x86_64.egg/ichnaea/api/locate/searcher.py:

    def search(self, query):
        """
        Provide a type specific query result or return None.

        :param query: A query.
        :type query: :class:`~ichnaea.api.locate.query.Query`

        :returns: A result_type specific dict.
        """
        print 'start to search.....'
        query.emit_query_stats()
        # pdb.set_trace()
        重点:
        result = self._search(query)
        query.emit_result_stats(result)
        if result is not None:
            return self.format_result(result)

result=self._search(query):

 def _search(self, query):
        """Search every applicable source and return the best result.

        Each ``(name, source)`` pair in ``self.sources`` is consulted in
        order; a source is searched only if its ``should_search`` accepts
        the query and the results gathered so far.

        :param query: The query handed to each source.
        :returns: ``results.best()`` of the accumulated result list.
        """
        # Debug trace output.
        print '_search start......'
        results = self.result_list()
        #: :class:`ichnaea.api.locate.result.ResultList`
        for name, source in self.sources:
            if source.should_search(query, results):
                # Active debugger breakpoint: execution pauses here before
                # each source that passes should_search() is searched.
                pdb.set_trace()
                # Key step: delegate to the individual source.
                tmp=source.search(query)
                print 'tmp:', tmp,tmp.__module__
                # The bare string below appears to be sample output of the
                # print above, kept for reference; it has no runtime effect.
                """
                 RegionResultList: Region<region_code:FR, region_name:France, accuracy:570000.0, score:1.0,
                 fallback:None, source:DataSource.internal>, Region<region_code:YT, region_name:Mayotte,
                  accuracy:19000.0, score:1.0, fallback:None, source:DataSource.internal>
                  ichnaea.api.locate.result
                """
                results.add(tmp)

        return results.best()

tmp=source.search(query):
/ProgFile/ichnaea-for-liuqiao/ichnaea/lib/python2.7/site-packages/ichnaea-1.5-py2.7-linux-x86_64.egg/ichnaea/api/locate/internal.py:

    def search(self, query):
        """Run the bluetooth, wifi and cell searches that apply to the query.

        Each ``should_search_*`` predicate is given the query and the
        results collected so far; when it returns a truthy value the
        matching ``search_*`` method is called and its result is added.

        :param query: The query to search for.
        :returns: The accumulated result list (after source stats have
                  been emitted via ``query.emit_source_stats``).
        """
        results = self.result_list()

        for should, search in (
            # Search by most precise to least precise data type.
                (self.should_search_blue, self.search_blue),
                (self.should_search_wifi, self.search_wifi),
                (self.should_search_cell, self.search_cell)):

            if should(query, results):
                # pdb.set_trace()
                # Debug trace: module in which the search method is defined.
                print search.__module__
                # Key step: run the type specific search.
                tmp2=search(query)

                # Debug trace: the partial result and its module.
                print 'tmp2:', tmp2, tmp2.__module__
                results.add(tmp2)

        query.emit_source_stats(self.source, results)
        return results

tmp2=search(query):
/ProgFile/ichnaea-for-liuqiao/ichnaea/lib/python2.7/site-packages/ichnaea-1.5-py2.7-linux-x86_64.egg/ichnaea/api/locate/cell.py

    def search_cell(self, query):
        """Search by cell data first, then fall back to cell areas.

        Cells matching ``query.cell`` are clustered and each cluster is
        aggregated into one result. If that produced at least one
        result, it is returned right away. Otherwise the same procedure
        runs for ``query.cell_area``, and those results are tagged with
        ``fallback='lacf'``.

        :param query: The query providing ``cell`` and ``cell_area``.
        :returns: The result list, possibly empty.
        """
        found = self.result_list()

        if query.cell:
            matches = query_cells(
                query, query.cell, self.cell_model, self.raven_client)
            # Nothing to cluster when the database lookup came back empty.
            groups = cluster_cells(matches, query.cell) if matches else ()
            for group in groups:
                lat, lon, accuracy, score = aggregate_cell_position(
                    group, CELL_MIN_ACCURACY, CELL_MAX_ACCURACY)
                found.add(self.result_type(
                    lat=lat, lon=lon, accuracy=accuracy, score=score))

            if len(found):
                return found

        if not query.cell_area:
            return found

        matches = query_areas(
            query, query.cell_area, self.area_model, self.raven_client)
        groups = cluster_areas(matches, query.cell_area) if matches else ()
        for group in groups:
            lat, lon, accuracy, score = aggregate_cell_position(
                group, CELLAREA_MIN_ACCURACY, CELLAREA_MAX_ACCURACY)
            found.add(self.result_type(
                lat=lat, lon=lon, accuracy=accuracy, score=score,
                fallback='lacf'))

        return found

调用了N层终于要到最核心的做查询的地方了:

def query_cells(query, lookups, model, raven_client):
    """Query the database for the cells named by ``lookups``.

    Given a location query and a list of lookup instances, query the
    database and return a list of model objects.

    :param query: Location query; only ``query.session`` is used here.
    :param lookups: Lookup instances exposing ``cellid`` and ``radioType``.
    :param model: Object whose ``shard_model(radioType)`` selects the
                  shard model class to query.
    :param raven_client: Its ``captureException()`` is called when the
                         database access raises.
    :returns: Rows with non-NULL ``lat``/``lon`` for which
              ``row.blocked(today)`` is falsy. The list may be empty or
              partial if an exception occurred while querying.
    """
    # print 'query_cells param lookups:', lookups
    # Debug trace output.
    print 'query_cell.......'
    for lookup in lookups:
        print lookup.radioType
    cellids = [lookup.cellid for lookup in lookups]
    if not cellids:  # pragma: no cover
        return []

    # load all fields used in score calculation and those we
    # need for the position
    # load_fields is a tuple: it cannot be modified and is indexable by
    # position, e.g. load_fields[2] is 'radius'.
    load_fields = ('lat', 'lon', 'radius', 'region', 'samples',
                   'created', 'modified', 'last_seen',
                   'block_last', 'block_count')
    result = []
    today = util.utcnow().date()
    print 'today is:', today
    try:
        # pdb.set_trace()
        shards = defaultdict(list)  # maps shard model -> list of cellids
        for lookup in lookups:
            shards[model.shard_model(lookup.radioType)].append(lookup.cellid)
            # Example value: (<type 'list'>, {<class 'ichnaea.models.cell.CellShardWcdma'>: ['\x02\x00\xd0\x00\x01\x00\x02\x00\x12\xd6\x87']})
        # One query per shard model, restricted to that shard's cellids.
        for shard, shard_cellids in shards.items():
            rows = (
                query.session.query(shard)
                             .filter(shard.cellid.in_(shard_cellids),
                                     shard.lat.isnot(None),
                                     shard.lon.isnot(None))
                             .options(load_only(*load_fields))
            ).all()
            print 'rows are:', rows
            # Drop rows that report themselves as blocked for today.
            result.extend([row for row in rows if not row.blocked(today)])
    except Exception:
        # Report the failure and fall through with what was collected.
        raven_client.captureException()

    print 'result is:', result
    return result

到这里,我发现用了SQLAlchemy:D = query.session.query(A).filter(B).options(C).all()
A:要查询的对象,对应一个mysql表;(ORM)
B:查询条件
C:附加选项
D:all()的返回值,查到的多条记录以一个list返回

然后来找,这里到底在对哪张表做查询?
在pdb里面看看shard是个什么类,因为这个类定义的__tablename__属性就对应一张表

class CellShardWcdma(CellShard, _Model):
    """Shard for WCDMA cells."""

    # Name of the database table this shard model is mapped to.
    __tablename__ = 'cell_wcdma'

证明在对cell_wcdma表做查询
进入mysql客户端,发现cell_wcdma里一条数据也没有,难怪查不到了。

用 show create table cell_wcdma看看这张表都有哪些字段:

| cell_wcdma | CREATE TABLE `cell_wcdma` (
  `max_lat` double DEFAULT NULL,
  `min_lat` double DEFAULT NULL,
  `max_lon` double DEFAULT NULL,
  `min_lon` double DEFAULT NULL,
  `lat` double DEFAULT NULL,
  `lon` double DEFAULT NULL,
  `created` datetime DEFAULT NULL,
  `modified` datetime DEFAULT NULL,
  `radius` int(10) unsigned DEFAULT NULL,
  `region` varchar(2) DEFAULT NULL,
  `samples` int(10) unsigned DEFAULT NULL,
  `source` tinyint(4) DEFAULT NULL,
  `weight` double DEFAULT NULL,
  `last_seen` date DEFAULT NULL,
  `block_first` date DEFAULT NULL,
  `block_last` date DEFAULT NULL,
  `block_count` tinyint(3) unsigned DEFAULT NULL,
  `cellid` binary(11) NOT NULL,
  `radio` tinyint(4) NOT NULL,
  `mcc` smallint(6) NOT NULL,
  `mnc` smallint(6) NOT NULL,
  `lac` smallint(5) unsigned NOT NULL,
  `cid` int(10) unsigned NOT NULL,
  `psc` smallint(6) DEFAULT NULL,
  PRIMARY KEY (`cellid`),
  UNIQUE KEY `cell_wcdma_cellid_unique` (`radio`,`mcc`,`mnc`,`lac`,`cid`),
  KEY `cell_wcdma_latlon_idx` (`lat`,`lon`),
  KEY `cell_wcdma_modified_idx` (`modified`),
  KEY `cell_wcdma_region_idx` (`region`),
  KEY `cell_wcdma_created_idx` (`created`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 |

手动insert一条记录:

insert into cell_wcdma(cellid,radio,mcc,mnc,lac,cid) values(184640524,q,460,1,29448,184640524);

再测试还是无法定位,大部分程序没有运行直接返回异常了。

问题:
cell_wcdma里有经度和纬度两个字段,如果在这条记录里,这两个值不为空,岂不是可以直接靠一个当前基站信息就确定当前经纬度?

原始数据从哪里弄呢?如果用户连接到基站,也只能上传基站信息,不能上传经纬度(这是定位结果),那数据库里一直没有经纬度又如何做定位?

import time import random import requests import json import logging import socket from pathlib import Path from openpyxl import load_workbook from selenium import webdriver from selenium.webdriver.edge.service import Service from selenium.webdriver.edge.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.action_chains import ActionChains from selenium.common.exceptions import TimeoutException, WebDriverException from fake_useragent import UserAgent, FakeUserAgentError # -------------------------- 新增:手动指定Edge浏览器和驱动路径 -------------------------- EDGE_BINARY_PATH = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe" # 浏览器路径 EDGE_DRIVER_PATH = r"C:\Users\27570\Desktop\edgedriver_win32\msedgedriver.exe" # 驱动路径,需下载并指定 # ------------------------------------------------------------------------------------- # 配置日志 logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler("ip_query.log"), logging.StreamHandler() ] ) # -------------------------- 优化:整合网络连接检查 -------------------------- def check_internet_connection(): """检查网络连接是否正常,尝试多种连接方式提高可靠性""" try: # 尝试连接到Google的公共DNS服务器 socket.create_connection(("8.8.8.8", 53), timeout=5) logging.info("网络连接测试通过 (DNS)") return True except OSError: logging.warning("无法连接到DNS服务器,尝试HTTP请求...") try: # 尝试HTTP请求到百度 response = requests.get("https://www.baidu.com", timeout=5) if response.status_code == 200: logging.info("网络连接测试通过 (HTTP)") return True except requests.RequestException: logging.warning("HTTP请求失败,尝试HTTPS请求...") try: # 尝试HTTPS请求到百度 response = requests.get("https://www.baidu.com", timeout=5) if response.status_code == 200: logging.info("网络连接测试通过 (HTTPS)") return True except requests.RequestException: logging.error("HTTPS请求失败") return False # 
------------------------------------------------------------------------------------- def build_query_url(base_url, ip_address, path_format="{ip}/", use_params=True, param_name="ip"): """构建IP查询URL,支持路径参数和查询参数两种格式""" if not base_url.startswith(('http://', 'https://')): base_url = 'https://' + base_url base_url = base_url.rstrip('/') try: if '.' in ip_address: # IPv4 socket.inet_pton(socket.AF_INET, ip_address) elif ':' in ip_address: # IPv6 socket.inet_pton(socket.AF_INET6, ip_address) else: raise ValueError("无效的IP地址") if ':' in ip_address: ip_address = f"[{ip_address}]" if use_params: # 使用查询参数的方式构造URL from urllib.parse import urlencode # 添加固定参数action=2 params = {param_name: ip_address, "action": 2} return f"{base_url}?{urlencode(params)}" else: # 原有的路径参数方式 from urllib.parse import quote encoded_ip = quote(ip_address) return f"{base_url}/{path_format.format(ip=encoded_ip)}" except socket.error: logging.error(f"无效的IP地址格式: {ip_address}") return None except Exception as e: logging.error(f"构建查询URL时出错: {str(e)}") return None # 手动配置的固定请求头 MANUAL_HEADERS = { 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'accept-encoding': 'gzip, deflate, br, zstd', 'accept-language': 'zh-CN,zh;q=0.9', 'cache-control': 'max-age=0', 'connection': 'keep-alive', 'cookie': '_c_WBKFRo=NWgPw1zeBaW3I2CtOcadfhJJw33TcEYmWMtyGzTE; Hm_lvt_f4f76646cd877e538aa1fbbdf351c548=1753560343,1753617545,1753793389,1753862286; HMACCOUNT=96B6BD9DE68EFF3B; PHPSESSID=o9fnnscr7sofru4b8r1khlde3f; Hm_lvt_f4f76646cd877e538aa1fbbdf351c548=1754123598; HMACCOUNT=96B6BD9DE68EFF3B; Hm_lpvt_f4f76646cd877e538aa1fbbdf351c548=1754611428; Hm_lpvt_f4f76646cd877e538aa1fbbdf351c548=1754613370', 'host': 'www.ip138.com', 'referer': 'https://www.ip138.com/iplookup.php?ip=27.154.214.154&action=2', 'sec-ch-ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Microsoft Edge";v="138"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': 
'"Windows"', 'sec-fetch-dest': 'document', 'sec-fetch-mode': 'navigate', 'sec-fetch-site': 'same-origin', 'sec-fetch-user': '?1', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0' } def configure_driver(max_retries=5): """配置Edge浏览器驱动""" for attempt in range(max_retries): try: # 先检查网络连接 if not check_internet_connection(): raise Exception("网络连接不可用") # 创建Edge选项 edge_options = Options() # 1. 基础配置 edge_options.binary_location = EDGE_BINARY_PATH # 手动指定浏览器路径 edge_options.add_argument("--disable-blink-features=AutomationControlled") # 核心反检测 edge_options.add_experimental_option("excludeSwitches", ["enable-automation"]) edge_options.add_experimental_option("useAutomationExtension", False) edge_options.add_argument("--start-maximized") # 最大化窗口 # 2. 增强反检测 edge_options.add_argument("--disable-extensions") edge_options.add_argument("--disable-plugins-discovery") edge_options.add_argument("--disable-web-security") # 3. 随机化配置 features_to_disable = [ "AutomationControlled", "InterestCohort", "BlinkGenPropertyTrees" ] edge_options.add_argument(f"--disable-features={','.join(random.sample(features_to_disable, random.randint(2, 4)))}") screen_sizes = [(1366, 768), (1920, 1080), (1536, 864)] width, height = random.choice(screen_sizes) edge_options.add_argument(f"--window-size={width},{height}") if random.random() > 0.5: edge_options.add_argument("--disable-gpu") else: edge_options.add_argument("--enable-gpu-rasterization") # 4. 资源加载控制 prefs = { "profile.managed_default_content_settings.images": 2, "profile.managed_default_content_settings.stylesheets": 2, } edge_options.add_experimental_option("prefs", prefs) edge_options.page_load_strategy = 'eager' # 只等待DOM加载 # 5. 使用手动指定的驱动路径 try: service = Service(EDGE_DRIVER_PATH) # 手动指定驱动路径 logging.info(f"使用手动指定的驱动路径: {EDGE_DRIVER_PATH}") except Exception as e: logging.error(f"驱动路径配置错误: {str(e)}") raise # 6. 
创建浏览器实例 driver = webdriver.Edge(service=service, options=edge_options) # 7. 隐藏自动化特征 driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { "source": """ // 核心:隐藏webdriver标志 Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); // 模拟Chrome特征 window.chrome = { runtime: {} }; // 模拟时区 Intl.DateTimeFormat().resolvedOptions().timeZone = ['Asia/Shanghai', 'Asia/Beijing'][Math.floor(Math.random() * 2)]; """ }) # 8. 设置超时 driver.set_page_load_timeout(30) driver.set_script_timeout(30) # 9. 应用手动配置的请求头 logging.info(f"应用手动配置的请求头: {json.dumps(MANUAL_HEADERS, indent=2)[:100]}...") driver.execute_cdp_cmd("Network.setUserAgentOverride", { "userAgent": MANUAL_HEADERS["user-agent"], "accept": MANUAL_HEADERS["accept"], "acceptLanguage": MANUAL_HEADERS["accept-language"], }) logging.info(f"浏览器驱动初始化成功 (尝试 {attempt+1}/{max_retries})") return driver except Exception as e: logging.error(f"配置浏览器驱动失败 (尝试 {attempt+1}/{max_retries}): {str(e)}") if attempt < max_retries - 1: wait_time = 2 ** attempt + random.uniform(5, 10) logging.info(f"将在 {wait_time:.2f} 秒后重试") time.sleep(wait_time) logging.critical("达到最大重试次数,无法初始化浏览器驱动") return None def change_user_agent(driver): """更换为手动配置的请求头""" logging.info(f"应用手动配置的请求头: {json.dumps(MANUAL_HEADERS, indent=2)[:100]}...") driver.execute_cdp_cmd("Network.setUserAgentOverride", { "userAgent": MANUAL_HEADERS["user-agent"], "accept": MANUAL_HEADERS["accept"], "acceptLanguage": MANUAL_HEADERS["accept-language"], }) driver.refresh() time.sleep(random.uniform(2, 4)) def handle_cookies(driver): """处理和保存Cookie""" cookies = driver.get_cookies() logging.info(f"获取到 {len(cookies)} 个Cookie") return cookies def is_banned(driver): """检测是否被封禁""" try: banned_xpaths = [ '//div[contains(text(), "访问被阻止")]', '//div[contains(text(), "验证码")]', '//div[contains(text(), "您的IP已被封禁")]', ] for xpath in banned_xpaths: if WebDriverWait(driver, 5).until( EC.presence_of_element_located((By.XPATH, xpath)) ): logging.warning("检测到封禁页面") return True return False 
except: return False def check_dynamic_element(driver, xpaths): """检查网页上是否存在任一动态XPath的元素""" for i, xpath in enumerate(xpaths, 1): try: WebDriverWait(driver, 5).until( EC.presence_of_element_located((By.XPATH, xpath)) ) logging.info(f"使用动态元素XPath {i}: {xpath}") return True except: continue return False def get_result_element(driver, xpaths): """尝试获取任一结果元素""" for i, xpath in enumerate(xpaths, 1): try: element = WebDriverWait(driver, 10).until( EC.presence_of_element_located((By.XPATH, xpath)) ) logging.info(f"使用结果元素XPath {i}: {xpath}") return element except: continue return None def simulate_human_behavior(driver): """模拟人类浏览行为""" try: # 随机滚动页面 scroll_height = driver.execute_script("return document.body.scrollHeight") scroll_steps = random.randint(3, 7) for i in range(scroll_steps): scroll_to = int(scroll_height * (i + 1) / scroll_steps) driver.execute_script(f"window.scrollTo(0, {scroll_to})") time.sleep(random.uniform(0.5, 1.5)) # 随机移动鼠标 actions = ActionChains(driver) elements = driver.find_elements(By.TAG_NAME, "a") if elements: for _ in range(random.randint(1, 3)): element = random.choice(elements) actions.move_to_element(element).perform() time.sleep(random.uniform(0.3, 0.8)) except Exception as e: logging.warning(f"模拟人类行为时出错: {e}") def query_ip(driver, ip_address, base_url, xpath_expressions, dynamic_xpaths, max_retries=5): """查询IP信息,添加封禁检测和处理""" if not ip_address or not isinstance(ip_address, str) or ip_address.strip() == "": logging.warning(f"无效的IP地址: {ip_address}") return "无效IP" ip_address = ip_address.strip() for attempt in range(max_retries): try: if attempt > 0: wait_time = 2 ** attempt + random.uniform(3, 7) logging.info(f"第 {attempt+1} 次重试前等待 {wait_time:.2f} 秒...") time.sleep(wait_time) change_user_agent(driver) query_url = build_query_url(base_url, ip_address) if not query_url: logging.error(f"无法构建有效的查询URL,IP: {ip_address}") return "无效URL" logging.info(f"访问查询URL (尝试 {attempt+1}/{max_retries}): {query_url}") try: driver.get(query_url) 
time.sleep(random.uniform(8, 15)) if is_banned(driver): logging.warning(f"IP {ip_address} 查询时被封禁") driver.quit() time.sleep(5) driver = configure_driver() time.sleep(5) continue current_url = driver.current_url if current_url == "data:," or "about:blank" in current_url: raise Exception("浏览器加载了空白页面") simulate_human_behavior(driver) time.sleep(random.uniform(2, 5)) handle_cookies(driver) except TimeoutException: logging.warning(f"页面加载超时,尝试重新加载") driver.refresh() time.sleep(15) continue if check_dynamic_element(driver, dynamic_xpaths): logging.info(f"检测到动态元素,结果将设为'动态'") return "动态" result_element = get_result_element(driver, xpath_expressions) if result_element: return result_element.text.strip() else: raise Exception("无法找到结果元素") except WebDriverException as e: logging.error(f"WebDriver错误 (尝试 {attempt+1}/{max_retries}): {str(e)}") if "ERR_EMPTY_RESPONSE" in str(e) or "ERR_CONNECTION_RESET" in str(e): logging.warning("检测到连接错误,尝试重启浏览器...") driver.quit() time.sleep(15) driver = configure_driver() time.sleep(10) else: time.sleep(2 ** attempt + random.uniform(5, 10)) continue except Exception as e: logging.error(f"查询IP {ip_address} 失败 (尝试 {attempt+1}/{max_retries}): {str(e)}") time.sleep(2 ** attempt + random.uniform(5, 10)) continue logging.error(f"IP {ip_address} 查询失败,已达到最大重试次数") driver.save_screenshot(f"error_{ip_address}.png") return "查询失败" def is_row_hidden(worksheet, row_idx): """检查Excel行是否被隐藏""" return worksheet.row_dimensions[row_idx].hidden def process_excel(input_file, base_url, xpath_expressions, dynamic_xpaths, ip_column='A', result_column='I', start_row=2): """处理Excel文件""" wb = load_workbook(input_file) ws = wb.active has_filter = ws.auto_filter.ref is not None logging.info(f"检测到筛选: {has_filter}") driver = configure_driver() if not driver: logging.critical("无法初始化浏览器驱动,退出程序") return visible_rows = [] for row_idx in range(start_row, ws.max_row + 1): row_dim = ws.row_dimensions.get(row_idx) if not row_dim or not row_dim.hidden: visible_rows.append(row_idx) 
logging.info(f"可见行共{len(visible_rows)}行") total_visible = len(visible_rows) processed_count = 0 try: for i, row in enumerate(visible_rows, 1): ip_address = ws[f"{ip_column}{row}"].value if not ip_address: logging.info(f"第 {row} 行IP地址为空,跳过") continue logging.info(f"正在查询IP: {ip_address} ({i}/{total_visible})") result = query_ip(driver, ip_address, base_url, xpath_expressions, dynamic_xpaths) ws[f"{result_column}{row}"] = result processed_count += 1 if i % 3 == 0 or i == total_visible: wb.save(input_file) logging.info(f"已保存进度: {i}/{total_visible} 到 {input_file}") wait_time = random.uniform(20,40) logging.info(f"等待 {wait_time:.2f} 秒后继续...") time.sleep(wait_time) if i % 10 == 0: extra_wait = random.uniform(40,60) logging.info(f"已处理 {i} 个IP,额外休息 {extra_wait:.2f} 秒...") time.sleep(extra_wait) if i % 20 == 0: logging.info(f"已处理 {i} 个IP,重启浏览器以避免被检测...") driver.quit() time.sleep(15) driver = configure_driver() if not driver: logging.critical("无法重新初始化浏览器驱动,退出程序") return except Exception as e: logging.critical(f"处理过程中发生意外错误: {str(e)}") finally: if driver: driver.quit() wb.save(input_file) logging.info(f"已保存最终结果到 {input_file}") logging.info(f"处理完成!共处理 {processed_count}/{total_visible} 个可见IP地址") if __name__ == "__main__": INPUT_FILE = r"C:\Users\27570\Desktop\飞塔-福建-简版-更新版20250730.xlsx" # 修改为新的基础URL BASE_URL = "https://www.ip138.com/iplookup.php" # 配置两种XPath表达式 XPATH_EXPRESSIONS = [ '/html/body/div/div[2]/div[1]/div/table/tbody/tr[2]/td[2]', '/html/body/div/div[2]/div[2]/div/div[2]/div[1]/div/div[2]/div[2]/div[2]/table/tbody/tr[2]/td[2]' ] DYNAMIC_XPATHS = [ '/html/body/div/div[2]/div[1]/div/p/a', '/html/body/div/div[2]/div[2]/div/div[2]/div[1]/div/div[2]/div[2]/div[1]/p[1]/a' ] max_main_retries = 3 for main_attempt in range(max_main_retries): try: logging.info(f"开始处理Excel文件 (尝试 {main_attempt+1}/{max_main_retries})") process_excel(INPUT_FILE, BASE_URL, XPATH_EXPRESSIONS, DYNAMIC_XPATHS) break except Exception as e: logging.critical(f"主程序执行失败 (尝试 
{main_attempt+1}/{max_main_retries}): {str(e)}") if main_attempt < max_main_retries - 1: wait_time = 10 + random.uniform(10, 30) logging.info(f"将在 {wait_time:.2f} 秒后重试") time.sleep(wait_time) else: logging.critical("达到最大重试次数,程序终止") 以上代码在运行时出现了以下问题,解决问题并给我完整代码 Message: session not created: probably user data directory is already in use, please specify a unique value for --user-data-dir argument, or don't use --user-data-dir; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception Stacktrace: GetHandleVerifier [0x0xf6d593+37219] (No symbol) [0x0xe1a716] (No symbol) [0x0xbe51ce] (No symbol) [0x0xc0d881] (No symbol) [0x0xc08a43] (No symbol) [0x0xc3d78b] (No symbol) [0x0xc3d22a] (No symbol) [0x0xc324f6] (No symbol) [0x0xc13327] (No symbol) [0x0xc12723] (No symbol) [0x0xc13144] sqlite3_dbdata_init [0x0x105a89c+518364] sqlite3_dbdata_init [0x0x1141ab0+1465072] sqlite3_dbdata_init [0x0x11413e5+1463333] sqlite3_dbdata_init [0x0x11328ec+1403180] sqlite3_dbdata_init [0x0x11422d2+1467154] (No symbol) [0x0xe31d9d] (No symbol) [0x0xe25108] (No symbol) [0x0xe252fb] (No symbol) [0x0xe0a649] BaseThreadInitThunk [0x0x75525d49+25] RtlInitializeExceptionChain [0x0x7706d1ab+107] RtlGetAppContainerNamedObjectPath [0x0x7706d131+561]
最新发布
08-09
--------------------------------------------------------------------------- WebDriverException Traceback (most recent call last) Cell In[9], line 8 5 options.binary_location = "/usr/bin/firefox" # 火狐浏览器位置 7 service = Service(executable_path="/home/drjizhu/.local/bin/geckodriver") ----> 8 driver = webdriver.Firefox(service=service, options=options) 9 driver.get("https://filehelper.weixin.qq.com/?from=windows&type=recommend") 10 print(driver.title) File ~/.local/lib/python3.12/site-packages/selenium/webdriver/firefox/webdriver.py:71, in WebDriver.__init__(self, options, service, keep_alive) 64 executor = FirefoxRemoteConnection( 65 remote_server_addr=self.service.service_url, 66 keep_alive=keep_alive, 67 ignore_proxy=options._ignore_local_proxy, 68 ) 70 try: ---> 71 super().__init__(command_executor=executor, options=options) 72 except Exception: 73 self.quit() File ~/.local/lib/python3.12/site-packages/selenium/webdriver/remote/webdriver.py:260, in WebDriver.__init__(self, command_executor, keep_alive, file_detector, options, locator_converter, web_element_cls, client_config) 258 self._authenticator_id = None 259 self.start_client() --> 260 self.start_session(capabilities) 261 self._fedcm = FedCM(self) 263 self._websocket_connection = None File ~/.local/lib/python3.12/site-packages/selenium/webdriver/remote/webdriver.py:357, in WebDriver.start_session(self, capabilities) 355 caps = _create_caps(capabilities) 356 try: --> 357 response = self.execute(Command.NEW_SESSION, caps)["value"] 358 self.session_id = response.get("sessionId") 359 self.caps = response.get("capabilities") File ~/.local/lib/python3.12/site-packages/selenium/webdriver/remote/webdriver.py:448, in WebDriver.execute(self, driver_command, params) 446 response = self.command_executor.execute(driver_command, params) 447 if response: --> 448 self.error_handler.check_response(response) 449 response["value"] = self._unwrap_value(response.get("value", None)) 450 return response File 
~/.local/lib/python3.12/site-packages/selenium/webdriver/remote/errorhandler.py:232, in ErrorHandler.check_response(self, response) 230 alert_text = value["alert"].get("text") 231 raise exception_class(message, screen, stacktrace, alert_text) # type: ignore[call-arg] # mypy is not smart enough here --> 232 raise exception_class(message, screen, stacktrace) WebDriverException: Message: Process unexpectedly closed with status 1
05-14
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值