(() => {
// ---------------- 1. Global storage and state ----------------
// Everything is published on `window` so code outside this script can read it.
Object.assign(window, {
  // Action records appended by the scan.
  __collected_actions: [],
  // Unique IDs of elements that already produced an action (de-duplication).
  __element_identity_map: new Map(),
  // Unique ID -> Set of event types registered through addEventListener.
  __event_map: new Map(),
  // Progress/completion flags for the analysis run.
  __analysis_state: { completed: false, error: null, progress: 0 },
});
// ---------------- 2. Element unique ID ----------------
// Last timestamp suffix handed out. Keeps suffixes strictly increasing so two
// IDs minted within the same millisecond can never be identical.
let lastUniqueIdStamp = 0;
// Cache for elements that expose no `dataset`, so they still get a stable ID.
const uniqueIdFallbackCache = new WeakMap();

/**
 * Return a stable identifier for an element, minting one on first use.
 *
 * The ID is built from the element's `id`, the first present attribute among
 * data-id / data-key / data-uuid / data-component-id, or (when neither exists)
 * its XPath, followed by a `|ts:<number>` suffix. The result is cached in
 * `el.dataset._uniqueId`, or in a WeakMap when the element has no `dataset`.
 *
 * @param {Element} el - Element to identify.
 * @returns {string|null} The identifier, or null when `el` is not an element node.
 */
function getElementUniqueId(el) {
  if (!el || el.nodeType !== 1) return null;

  const store = el.dataset;
  if (store && store._uniqueId) return store._uniqueId;
  if (!store && uniqueIdFallbackCache.has(el)) return uniqueIdFallbackCache.get(el);

  const parts = [];
  if (el.id) parts.push(`id:${el.id}`);
  const dataAttrs = ['data-id', 'data-key', 'data-uuid', 'data-component-id'];
  for (const attr of dataAttrs) {
    if (el.hasAttribute(attr)) {
      parts.push(`${attr}:${el.getAttribute(attr)}`);
      break;
    }
  }
  if (parts.length === 0) parts.push(`xpath:${getPreciseXPath(el)}`);

  // A bare Date.now() repeats for every element handled in the same
  // millisecond, which made elements sharing an id/data key collide.
  const ts = Math.max(Date.now(), lastUniqueIdStamp + 1);
  lastUniqueIdStamp = ts;

  const uid = `${parts.join('|')}|ts:${ts}`;
  if (store) {
    store._uniqueId = uid;
  } else {
    uniqueIdFallbackCache.set(el, uid);
  }
  return uid;
}
// ---------------- 3. Precise XPath ----------------
/**
 * Build an indexed XPath for an element by walking up through its parents.
 *
 * Each step is `/<tag>[n]`, where n is the 1-based position among preceding
 * siblings with the same tag name. The walk stops at `document.documentElement`
 * (or at the first non-element parent), and the result is prefixed with `/html`.
 *
 * @param {Element} el - Element to describe.
 * @returns {string} The XPath, or '' when `el` is not an element node.
 */
function getPreciseXPath(el) {
  if (!el || el.nodeType !== 1) return '';

  const steps = [];
  for (
    let node = el;
    node && node.nodeType === 1 && node !== document.documentElement;
    node = node.parentNode
  ) {
    let position = 1;
    for (let prev = node.previousSibling; prev; prev = prev.previousSibling) {
      if (prev.nodeType === 1 && prev.tagName === node.tagName) position += 1;
    }
    steps.push(`/${node.tagName.toLowerCase()}[${position}]`);
  }

  // Steps were gathered leaf-first; the path reads root-first.
  return `/html${steps.reverse().join('')}`;
}
// ---------------- 4. Precise CSS selector ----------------
/**
 * Build a `parent > child` CSS selector chain for an element.
 *
 * Walks up at most 10 levels. An ancestor (or the element itself) with an `id`
 * terminates the chain as `tag#id`. Every other step is the tag name plus its
 * classes, any of the attributes name / data-id / data-name / role / type, and
 * a `:nth-of-type(n)` index.
 *
 * @param {Element} el - Element to describe.
 * @returns {string} The selector, or '' when `el` is not an element node.
 */
function getPreciseCssSelector(el) {
  if (!el || el.nodeType !== 1) return '';

  const keyAttrs = ['name', 'data-id', 'data-name', 'role', 'type'];
  const parts = [];
  let current = el;
  let level = 0;

  while (current && current.nodeType === 1 && level < 10) {
    let part = current.tagName.toLowerCase();

    if (current.id) {
      parts.unshift(`${part}#${CSS.escape(current.id)}`);
      break;
    }

    // Read the attribute rather than `className`: on SVG elements `className`
    // is an SVGAnimatedString object with no string methods. Dropping empty
    // tokens also prevents emitting an invalid trailing "." for class="  ".
    const classNames = (current.getAttribute('class') || '').split(/\s+/).filter(Boolean);
    if (classNames.length > 0) {
      part += `.${classNames.map((c) => CSS.escape(c)).join('.')}`;
    }

    for (const attr of keyAttrs) {
      if (current.hasAttribute(attr)) {
        part += `[${attr}="${CSS.escape(current.getAttribute(attr))}"]`;
      }
    }

    // The index counts only preceding siblings with the same tag, which is
    // what :nth-of-type means; :nth-child would count siblings of every tag.
    let idx = 1;
    for (let sib = current.previousSibling; sib; sib = sib.previousSibling) {
      if (sib.nodeType === 1 && sib.tagName === current.tagName) idx++;
    }
    part += `:nth-of-type(${idx})`;

    parts.unshift(part);
    current = current.parentNode;
    level++;
  }

  return parts.join(' > ');
}
// ---------------- 5. Element position info ----------------
/**
 * Collect geometry for an element.
 *
 * @param {Element} el - Element to measure.
 * @returns {{viewport: object, document: object, depth: number, scrollOffset: {x: number, y: number}}}
 *   `viewport` is the bounding client rect, `document` is that rect shifted by
 *   the window scroll position, `depth` is the number of ancestor elements, and
 *   `scrollOffset` is the sum of scrollLeft/scrollTop over the element and all
 *   of its ancestors.
 */
function getPrecisePosition(el) {
  const box = el.getBoundingClientRect();

  // One walk up the ancestor chain yields both the summed scroll offsets and
  // the depth (nodes visited minus the element itself).
  const scrollOffset = { x: 0, y: 0 };
  let visited = 0;
  for (let node = el; node; node = node.parentElement) {
    scrollOffset.x += node.scrollLeft || 0;
    scrollOffset.y += node.scrollTop || 0;
    visited += 1;
  }
  const depth = visited - 1;

  const viewport = {
    x: box.x,
    y: box.y,
    width: box.width,
    height: box.height,
    top: box.top,
    right: box.right,
    bottom: box.bottom,
    left: box.left,
  };

  const pageX = window.scrollX;
  const pageY = window.scrollY;
  const documentRect = {
    x: box.x + pageX,
    y: box.y + pageY,
    width: viewport.width,
    height: viewport.height,
    top: box.top + pageY,
    right: box.right + pageX,
    bottom: box.bottom + pageY,
    left: box.left + pageX,
  };

  return { viewport, document: documentRect, depth, scrollOffset };
}
// ---------------- 6. Scroll-container detection ----------------
/**
 * Decide whether an element should be treated as a scroll container.
 *
 * - For <html>/<body>: true when the document is more than 5px taller than
 *   the window.
 * - For other elements: true when the content overflows the client box by more
 *   than 5px on either axis AND the computed overflow is `auto`, `scroll`,
 *   `overlay`, or `hidden`. `hidden` is kept as a "likely scrollable" case
 *   because such boxes can still be scrolled from script.
 *
 * @param {Element} el - Element to test.
 * @returns {boolean} True when the element is considered scrollable.
 */
function isScrollableContainer(el) {
  if (!el || el.nodeType !== 1) return false;

  // HTML / BODY special case. No Math.abs here: a document *shorter* than the
  // window (e.g. when a horizontal scrollbar eats viewport height) is not
  // scrollable.
  if (el === document.documentElement || el === document.body) {
    return document.documentElement.scrollHeight - window.innerHeight > 5;
  }

  try {
    const style = window.getComputedStyle(el);
    const scrollCapable = ['auto', 'scroll', 'overlay', 'hidden'];
    const overflowAllowsScroll =
      scrollCapable.includes(style.overflowY) || scrollCapable.includes(style.overflowX);
    // overflow: visible (or clip) boxes merely paint outside their bounds and
    // have no scroll position, so size overflow alone is not enough.
    if (!overflowAllowsScroll) return false;

    return el.scrollHeight - el.clientHeight > 5 || el.scrollWidth - el.clientWidth > 5;
  } catch {
    // Any failure while inspecting the element is treated as "not scrollable".
    return false;
  }
}
// ---------------- 7. Action object creation ----------------
/**
 * Build the action record for an element, or return null when the element was
 * already recorded, cannot be identified, or is smaller than 1px in either
 * dimension.
 *
 * The action `type` is chosen in this order: scroll container, input/textarea,
 * select, inline hover handler, submit (submit type or form-associated),
 * otherwise click. `target_url` comes from an anchor's href, `data-url`, or the
 * first http(s) URL found in an inline `onclick` attribute.
 *
 * @param {Element} el - Candidate element.
 * @returns {object|null} The action record, or null when nothing is recorded.
 */
function createActionObject(el) {
  const uid = getElementUniqueId(el);
  if (!uid) return null;
  if (window.__element_identity_map.has(uid)) return null;

  const tag = el.tagName.toLowerCase();
  const pos = getPrecisePosition(el);
  if (pos.viewport.width < 1 || pos.viewport.height < 1) return null;

  // Mark the element as seen only once it actually yields an action. Marking
  // it before the size check permanently excluded elements that were hidden
  // during one scan and visible during a later one.
  window.__element_identity_map.set(uid, true);

  let type = 'click';
  if (isScrollableContainer(el)) type = 'scroll';
  else if (tag === 'input' || tag === 'textarea') type = 'input';
  else if (tag === 'select') type = 'select';
  else if (el.hasAttribute('onmouseover') || el.hasAttribute('onmouseenter')) type = 'hover';
  else if (el.type === 'submit' || el.form) type = 'submit';

  let targetUrl = null;
  if (tag === 'a' && el.href) {
    targetUrl = el.href;
  } else if (el.dataset && el.dataset.url) {
    targetUrl = el.dataset.url;
  } else if (el.hasAttribute('onclick')) {
    const m = el.getAttribute('onclick').match(/https?:\/\/[^\s'"]+/);
    if (m) targetUrl = m[0];
  }

  const isVisible = pos.viewport.width > 0 && pos.viewport.height > 0;

  return {
    unique_id: uid,
    type,
    full_xpath: getPreciseXPath(el),
    css_selector: getPreciseCssSelector(el),
    text: el.innerText ? el.innerText.trim() : '',
    attributes: Array.from(el.attributes).reduce((acc, a) => {
      acc[a.name] = a.value;
      return acc;
    }, {}),
    position: pos,
    visible: isVisible,
    // Filled in later from the recorded addEventListener calls.
    event_listeners: [],
    target_url: targetUrl,
    features: {
      tag_name: tag,
      is_scrollable: type === 'scroll',
      is_visible: isVisible,
      dom_depth: pos.depth,
    },
    timestamp: new Date().toISOString(),
  };
}
// ---------------- 8. Event-listener hook ----------------
/**
 * Wrap `Element.prototype.addEventListener` so that every registration also
 * records the event type in `window.__event_map`, keyed by the element's
 * unique ID. The original method is always invoked with the same arguments.
 *
 * Calling this function more than once installs the wrapper only once.
 */
function hookEventListeners() {
  const orig = Element.prototype.addEventListener;
  // Already wrapped: wrapping again would only add another layer.
  if (orig.__actionAnalysisHooked) return;

  let warned = false;
  const hooked = function (type, listener, options) {
    try {
      const uid = getElementUniqueId(this);
      if (uid) {
        if (!window.__event_map.has(uid)) window.__event_map.set(uid, new Set());
        window.__event_map.get(uid).add(type);
      }
    } catch (err) {
      // Bookkeeping must never stop the page's own listener from being
      // registered. Report the first failure only, to avoid flooding the console.
      if (!warned) {
        warned = true;
        console.warn('[action-analysis] failed to record event listener:', err);
      }
    }
    return orig.call(this, type, listener, options);
  };
  hooked.__actionAnalysisHooked = true;
  Element.prototype.addEventListener = hooked;
}
// ---------------- 9. Interactive-element scan ----------------
/**
 * Collect candidate elements and append their action records to
 * `window.__collected_actions`.
 *
 * Candidates are everything matched by the selector groups below plus every
 * element that `isScrollableContainer` accepts. Progress is reported in
 * `window.__analysis_state.progress` on a 0-80 scale. When the document is
 * taller than the window, one synthetic `window-scroll` action is added (once).
 */
function scanInteractiveElements() {
  const selectors = [
    'a,button,[role="button"],[onclick]',
    'input,textarea,select,[contenteditable]',
    'div,section,main,article,nav,ul,ol',
    '[data-action],[data-click],[data-scroll]',
    '[onmouseover],[onmousedown],[onmouseup]',
  ];

  const elementsSet = new Set();
  selectors.forEach((sel) => document.querySelectorAll(sel).forEach((el) => elementsSet.add(el)));
  document.querySelectorAll('*').forEach((el) => {
    if (isScrollableContainer(el)) elementsSet.add(el);
  });

  const total = elementsSet.size;
  let index = 0;
  for (const el of elementsSet) {
    window.__analysis_state.progress = Math.round((index / total) * 80);
    index++;
    try {
      const action = createActionObject(el);
      if (action) window.__collected_actions.push(action);
    } catch (err) {
      // One problematic element must not abort the scan of all the others.
      console.warn('[action-analysis] skipped element after error:', err);
    }
  }

  // Whole-window scrolling is represented by a single synthetic action.
  if (document.documentElement.scrollHeight > window.innerHeight + 1) {
    const uid = 'window-scroll';
    if (!window.__element_identity_map.has(uid)) {
      window.__element_identity_map.set(uid, true);
      window.__collected_actions.push({
        unique_id: uid,
        type: 'scroll',
        full_xpath: '/html',
        css_selector: 'html, body',
        text: '',
        attributes: {},
        position: getPrecisePosition(document.documentElement),
        visible: true,
        event_listeners: [],
        target_url: null,
        features: { tag_name: 'html', is_scrollable: true, is_visible: true, dom_depth: 0 },
        timestamp: new Date().toISOString(),
      });
    }
  }
}
// ---------------- 10. Event merge ----------------
/**
 * Copy the event types recorded in `window.__event_map` onto the matching
 * action records (`event_listeners`), matching by `unique_id`.
 *
 * Progress is advanced on the 80-100 scale, refreshed every tenth record.
 */
function mergeEventListeners() {
  const actions = window.__collected_actions;
  const recorded = window.__event_map;

  for (let position = 0; position < actions.length; position += 1) {
    const action = actions[position];
    if (recorded.has(action.unique_id)) {
      action.event_listeners = Array.from(recorded.get(action.unique_id));
    }
    if (position % 10 === 0) {
      const share = Math.round((position / actions.length) * 20);
      window.__analysis_state.progress = 80 + share;
    }
  }
}
// ---------------- 11. Main analysis entry point ----------------
/**
 * Install the addEventListener hook and schedule the analysis.
 *
 * The analysis runs right away when the document is already complete,
 * otherwise 500ms after the window `load` event, and in either case once more
 * after 8s. Each run scans and merges (unless fewer than 5s remain of the 60s
 * budget), sorts the collected actions by DOM depth, sets
 * `window.__analysis_state.completed`, and logs the completion message.
 *
 * A failure during a run is stored in `window.__analysis_state.error` instead
 * of escaping, so the run still finishes and partial results stay readable.
 */
function main() {
  const start = performance.now();
  const maxTime = 60000;
  hookEventListeners();

  const executeAnalysis = () => {
    try {
      if (performance.now() - start < maxTime - 5000) {
        scanInteractiveElements();
        mergeEventListeners();
      } else {
        console.warn('分析时间紧张,优先处理已收集元素');
      }
      window.__collected_actions.sort((a, b) => a.position.depth - b.position.depth);
    } catch (err) {
      // Record the failure for whoever reads the state object.
      window.__analysis_state.error = err && err.message ? err.message : String(err);
      console.error('Action analysis failed:', err);
    } finally {
      // Always signal completion so consumers are not left waiting.
      window.__analysis_state.completed = true;
      console.log(`Action analysis completed. Found ${window.__collected_actions.length} unique actions.`);
    }
  };

  if (document.readyState === 'complete') executeAnalysis();
  else window.addEventListener('load', () => setTimeout(executeAnalysis, 500));
  // Second pass, scheduled regardless of how the first one was triggered.
  setTimeout(executeAnalysis, 8000);
}
// Kick off the analysis: installs the listener hook and schedules the scans.
main();
})();
import json
import time
import re
from urllib.parse import urlparse
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
def load_analysis_script(file_path: Path) -> str:
    """Read the JavaScript analysis script that gets injected into the browser.

    Args:
        file_path: Location of the script file; it is decoded as UTF-8.

    Returns:
        The full text of the script.

    Raises:
        Exception: Whatever the read raised; a message is printed first and
            the original exception is then re-raised unchanged.
    """
    try:
        script_source = file_path.read_text(encoding="utf-8")
    except Exception as exc:
        print(f"[-] 加载分析脚本失败: {str(exc)}")
        raise
    return script_source
def safe_folder_name(url: str) -> str:
    """Turn a URL into a name that is safe to use as a folder name.

    The name has the form ``<host[_port]>_<path>_<unix timestamp>``. Any
    ``user:password@`` prefix is dropped so credentials never end up in a
    directory name. Characters outside ``[a-zA-Z0-9_-]`` (plus ``.`` in the
    host part) become ``_``. An empty path becomes ``root``. The part before
    the timestamp is capped at 200 characters to stay below common
    file-name length limits.

    Args:
        url: The URL to convert.

    Returns:
        The sanitised folder name.
    """
    parsed = urlparse(url)
    # Keep only host[:port]; everything before the last "@" is user info.
    host = parsed.netloc.rpartition("@")[2]
    domain = re.sub(r"[^a-zA-Z0-9._-]", "_", host)
    path = re.sub(r"[^a-zA-Z0-9_-]", "_", parsed.path.strip("/"))
    if not path:
        path = "root"
    prefix = f"{domain}_{path}"[:200]
    return f"{prefix}_{int(time.time())}"
def ensure_dir(path: Path) -> None:
    """Create ``path`` (and any missing parents) unless it already is a directory."""
    if not path.is_dir():
        path.mkdir(parents=True, exist_ok=True)
def main(target_url: str = "https://www.ccidgroup.com/info/2110/44774.htm") -> None:
    """Analyse one page for interactive elements and archive the results.

    Steps: launch Chromium, inject ``analysis_script.js`` as an init script,
    open ``target_url``, scroll to the bottom and back, wait for the injected
    script to flag completion, then write into ``./results/<folder>/``:
    ``url.txt``, ``page_screenshot.png``, ``page_snapshot.mhtml`` and
    ``all_actions_result.json``.

    Args:
        target_url: Page to analyse. Defaults to the URL this script was
            originally written for.
    """
    max_analysis_timeout = 100 * 1000
    load_wait_time = 4000
    post_analysis_delay = 1500
    output_root = Path("./results")
    ensure_dir(output_root)

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-features=IsolateOrigins,site-per-process",
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-gpu",
            ],
        )
        try:
            page = browser.new_page()
            page.set_default_timeout(60000)
            page.set_default_navigation_timeout(60000)

            print("[*] 加载修复版分析脚本...")
            analysis_code = load_analysis_script(Path("analysis_script.js"))
            page.add_init_script(script=analysis_code)

            print(f"[*] 导航至: {target_url}")
            page.goto(target_url)
            page.wait_for_load_state("networkidle", timeout=60000)
            print("[*] 初始页面加载完成")

            print("[*] 滚动到页面底部...")
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            print(f"[*] 等待 {load_wait_time / 1000} 秒,确保动态内容加载...")
            # page.wait_for_timeout keeps Playwright's event processing alive,
            # which a blocking time.sleep does not in the sync API.
            page.wait_for_timeout(load_wait_time)

            print("[*] 滚动回页面顶部...")
            page.evaluate("window.scrollTo(0, 0)")
            page.wait_for_timeout(1000)

            print("[*] 等待 JS 分析可交互元素...")
            try:
                # Poll the state flag instead of waiting for the console
                # message: the message is normally logged before this point is
                # reached, and an event that already fired is never delivered.
                page.wait_for_function(
                    "() => Boolean(window.__analysis_state && window.__analysis_state.completed)",
                    timeout=max_analysis_timeout,
                )
                page.wait_for_timeout(post_analysis_delay)
            except PlaywrightTimeoutError:
                print(f"[!] 分析超时(超过 {max_analysis_timeout / 1000} 秒),提取现有结果...")
            except Exception as e:
                print(f"[!] 等待分析出错: {str(e)},提取现有结果...")

            print("[*] 提取分析结果...")
            all_actions = page.evaluate("() => window.__collected_actions || []")
            if not isinstance(all_actions, list):
                all_actions = []

            if len(all_actions) == 0:
                print("\n[!] 警告:未收集到任何可交互元素!")
                print("    可能原因:1. JS脚本未正常执行 2. 页面加载失败 3. 浏览器环境限制")
            else:
                print(f"\n[+] 分析完成!共发现 {len(all_actions)} 个可交互元素。")
                action_types = {}
                for action in all_actions:
                    action_type = action.get("type", "unknown")
                    action_types[action_type] = action_types.get(action_type, 0) + 1
                print("[+] 动作类型分布:")
                for type_name, count in action_types.items():
                    print(f"    {type_name}: {count} 个")

            # ========== Create the output directory ==========
            folder_name = safe_folder_name(target_url)
            save_dir = output_root / folder_name
            ensure_dir(save_dir)
            print(f"[*] 创建输出目录: {save_dir}")

            # ========== Save the URL ==========
            url_path = save_dir / "url.txt"
            url_path.write_text(target_url, encoding="utf-8")
            print(f"[+] URL 已保存: {url_path}")

            # ========== Save a screenshot ==========
            screenshot_path = save_dir / "page_screenshot.png"
            try:
                page.screenshot(path=str(screenshot_path), full_page=True)
                print(f"[+] 页面截图已保存: {screenshot_path}")
            except Exception as e:
                print(f"[!] 保存截图失败: {e}")

            # ========== Save the MHTML snapshot ==========
            mhtml_path = save_dir / "page_snapshot.mhtml"
            try:
                cdp = page.context.new_cdp_session(page)
                mhtml_data = cdp.send("Page.captureSnapshot", {"format": "mhtml"})
                # Write bytes: text mode would translate line endings on
                # Windows and alter the archive's line terminators.
                mhtml_path.write_bytes(mhtml_data.get("data", "").encode("utf-8"))
                print(f"[+] 页面 MHTML 存档已保存: {mhtml_path}")
            except Exception as e:
                print(f"[!] 保存 MHTML 失败: {e}")

            # ========== Save the analysis result ==========
            json_path = save_dir / "all_actions_result.json"
            json_path.write_text(
                json.dumps(all_actions, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
            print(f"[+] 交互元素结果已保存: {json_path}")
        except Exception as e:
            print(f"[-] 执行错误: {str(e)}")
        finally:
            print("\n[*] 5 秒后关闭浏览器...")
            time.sleep(5)
            browser.close()


if __name__ == "__main__":
    main()