BrowserManager 解析 URL

本文介绍了一种使用Flex解析URL的方法,并展示了如何获取完整的URL、基础URL及URL片段。通过实例代码,读者可以了解到如何利用BrowserManager来初始化并读取这些URL部分。

 

 

公共属性


  属性 — 定义方
    base : String — [只读] 当前 URL 显示在浏览器地址栏中时,“#”之前的部分。 (IBrowserManager)
    fragment : String — [只读] 当前 URL 显示在浏览器地址栏中时,“#”之后的部分。 (IBrowserManager)
    title : String — [只读] 应用程序在浏览器历史记录中应显示的标题。 (IBrowserManager)
    url : String — [只读] 当前显示在浏览器地址栏中的 URL。 (IBrowserManager)

 

公共方法
 
    init(value:String = null, title:String = null):void
初始化 BrowserManager。 IBrowserManager
    initForHistoryManager():void
初始化 BrowserManager。 IBrowserManager
   removeEventListener(type:String, listener:Function, useCapture:Boolean = false):void
从 EventDispatcher 对象中删除侦听器。 IEventDispatcher
    setFragment(value:String):void
更改浏览器中“#”后面的 URL 片段。 IBrowserManager
    setTitle(value:String):void
更改浏览器标题栏中的文本。

# demo_scrapy_playwright_split.py
# Split version: a Scrapy spider whose browser life cycle (singleton start,
# shutdown, optional monitoring) is managed separately.
# BrowserManager owns the browser; the task manager owns contexts and timeouts.
# Scrapyd friendly; works on Windows/Linux.
import os
import asyncio
import logging
import time
from pathlib import Path

import scrapy
from scrapy import signals
from playwright.async_api import async_playwright, TimeoutError as PWTimeout

'''
Browser management has been extracted into a dedicated BrowserManager class:
  BrowserManager        -> starts, closes and monitors the browser;
  PlaywrightTaskManager -> runs tasks with timeout control;
  DemoSpider            -> only schedules work and binds life-cycle signals.
This keeps the structure clear; future resource monitoring (CPU, memory,
context counts, ...) can be implemented directly inside BrowserManager.
'''

logger = logging.getLogger(__name__)

SCREENSHOT_DIR = Path("screenshots")
SCREENSHOT_DIR.mkdir(exist_ok=True)


class BrowserManager:
    """Owns a singleton Playwright Chromium browser instance."""

    def __init__(self):
        self._pw = None          # playwright driver handle
        self.browser = None      # launched Chromium browser
        self._started = False

    async def start(self):
        """Start Playwright and launch headless Chromium once (idempotent)."""
        if self._started:
            return
        self._pw = await async_playwright().start()
        self.browser = await self._pw.chromium.launch(headless=True)
        self._started = True
        logger.info("Browser singleton started")

    async def close(self):
        """Close the browser and stop Playwright; no-op when not started."""
        if not self._started:
            return
        try:
            await self.browser.close()
        finally:
            # Always stop the driver even if closing the browser raised,
            # otherwise the Playwright node subprocess leaks.
            await self._pw.stop()
            self._started = False
            logger.info("Browser singleton closed")

    def is_running(self):
        """Return True while the browser is started and not yet closed."""
        return self._started and self.browser is not None


class PlaywrightTaskManager:
    """Runs per-URL browser tasks in fresh contexts, under a timeout."""

    def __init__(self, browser_manager: BrowserManager, timeout: int = 20):
        self.browser_manager = browser_manager
        self.timeout = timeout  # seconds; applies to the whole task

    async def _task_logic(self, url: str):
        """Open *url* in a new context, take a full-page screenshot,
        and return {"url", "title", "screenshot_path"}."""
        context = await self.browser_manager.browser.new_context()
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="load")
            title = await page.title()
            # Non-blocking pause (time.sleep would stall the event loop).
            await asyncio.sleep(5)
            # FIX: name the file from wall-clock time. The original used
            # asyncio.get_event_loop().time(), which is deprecated inside a
            # coroutine and loop-relative, so names could collide.
            screenshot_path = str(SCREENSHOT_DIR / f"{int(time.time() * 1000)}.png")
            await page.screenshot(path=screenshot_path, full_page=True)
            return {"url": url, "title": title, "screenshot_path": screenshot_path}
        finally:
            try:
                await context.close()  # also closes the page
            except Exception:
                pass  # best-effort cleanup; the shared browser stays alive

    async def run_task_with_timeout(self, url: str):
        """Run the task for *url*; map timeouts/errors to an error dict."""
        try:
            return await asyncio.wait_for(self._task_logic(url), timeout=self.timeout)
        except asyncio.TimeoutError:
            return {"url": url, "error": f"timeout after {self.timeout}s"}
        except PWTimeout:
            return {"url": url, "error": f"Playwright timeout after {self.timeout}s"}
        except Exception as e:
            return {"url": url, "error": str(e)}


class DemoSpider(scrapy.Spider):
    """Demo spider driving Playwright tasks from Scrapy life-cycle signals."""

    name = "demo_playwright_task_manager1"
    start_urls = [
        "https://www.hao123.com/",
        "http://www.people.com.cn/",
        "http://renshi.people.com.cn/",
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.timeout = 3
        self._loop = None
        self.browser_manager = BrowserManager()
        self.task_manager = None

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Bind browser start/stop to spider_opened/spider_closed."""
        spider = super(DemoSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_opened(self):
        # Dedicated event loop avoids "no running event loop" under Twisted.
        # NOTE(review): run_until_complete blocks the reactor thread while a
        # task runs — acceptable for this demo, not for high concurrency.
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)
        self._loop.run_until_complete(self.browser_manager.start())
        self.task_manager = PlaywrightTaskManager(self.browser_manager, timeout=self.timeout)

    def spider_closed(self):
        if self.browser_manager.is_running():
            self._loop.run_until_complete(self.browser_manager.close())
        # FIX: guard against spider_opened never having run.
        if self._loop is not None:
            self._loop.close()

    def start_requests(self):
        # NOTE(review): yielding plain dicts here relies on Scrapy accepting
        # items from start_requests; recent Scrapy versions require Request
        # objects — confirm against the deployed Scrapy version.
        for url in self.start_urls:
            self.logger.info("Dispatching task for %s", url)
            result = self._loop.run_until_complete(
                self.task_manager.run_task_with_timeout(url)
            )
            print(result)
            yield {
                "url": result.get("url"),
                "title": result.get("title"),
                "screenshot": result.get("screenshot_path"),
                "error": result.get("error"),
            }
10-12
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值