ExtractorJS extension source code in Heritrix

This post describes an approach to processing JavaScript files in order to extract strings that are likely to be crawlable URLs. A regular expression pulls quoted strings out of the JS code, each candidate is checked for URL-likeness, and a corresponding link is created. The implementation details and the handling of a few special cases are covered below.

Below is Heritrix's extractor for JavaScript. When writing your own extractor, you can use this source code as a reference, or look at the source of the HTML or CSS extractors.
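Before the full listing, here is a minimal, self-contained sketch of the idea described above: the same string-extraction regular expression pulls whitespace-free quoted strings out of JavaScript source, and a simple heuristic keeps only the ones that look like URLs. The class name JsStringSketch, the hard-coded length cap of 2083, and the LIKELY_URI pattern are simplified stand-ins of my own, not Heritrix code; the real extractor delegates those decisions to UURI.MAX_URL_LENGTH and UriUtils, as the listing below shows.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class JsStringSketch {

    // Same shape as JAVASCRIPT_STRING_EXTRACTOR in ExtractorJS:
    // group 1 = the opening quote (possibly backslash-escaped),
    // group 2 = the whitespace-free string between the paired quotes.
    private static final Pattern JS_STRING = Pattern.compile(
        "(\\\\{0,8}+(?:\"|\'))(\\S{0,2083}?)(?:\\1)");

    // Crude URI heuristic: absolute URLs, relative paths, or bare names
    // ending in a common web extension. Heritrix's real check is richer.
    private static final Pattern LIKELY_URI = Pattern.compile(
        "^(?:https?://\\S+|\\.{0,2}/\\S+|\\S+\\.(?:html?|js|css|php|png|gif|jpe?g)(?:\\?\\S*)?)$",
        Pattern.CASE_INSENSITIVE);

    public static List<String> extractLikelyUris(CharSequence js) {
        List<String> found = new ArrayList<String>();
        Matcher m = JS_STRING.matcher(js);
        while (m.find()) {
            String candidate = m.group(2);
            if (LIKELY_URI.matcher(candidate).matches()) {
                found.add(candidate);
            }
        }
        return found;
    }

    public static void main(String[] args) {
        String js = "var img = new Image();\n"
            + "img.src = '/images/pixel.gif';\n"
            + "window.location = \"http://example.com/page.html\";\n"
            + "var label = 'not-a-url';";
        // Expected output: [/images/pixel.gif, http://example.com/page.html]
        System.out.println(extractLikelyUris(js));
    }
}

The possessive quantifier \\{0,8}+ in the string pattern allows the opening quote to carry leading backslashes (as it would inside another quoted string), while the \1 backreference requires the closing delimiter to match the opening one exactly.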


/* Copyright (C) 2003 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Created on Nov 17, 2003
*
* To change the template for this generated file go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/
package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;

/**
 * Processes Javascript files for strings that are likely to be
 * crawlable URIs.
 *
 * @contributor gojomo
 * @contributor szznax
 *
 */
public class ExtractorJS extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = -2231962381454717720L;

    private static Logger LOGGER =
        Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");

    // finds whitespace-free strings in Javascript
    // (areas between paired ' or " characters, possibly backslash-quoted
    // on the ends, but not in the middle)
    static final String JAVASCRIPT_STRING_EXTRACTOR =
        "(\\\\{0,8}+(?:\"|\'))(\\S{0,"+UURI.MAX_URL_LENGTH+"}?)(?:\\1)";
    // GROUPS:
    // (G1) ' or " with optional leading backslashes
    // (G2) whitespace-free string delimited on both ends by G1

    protected long numberOfCURIsHandled = 0;
    protected static long numberOfLinksExtracted = 0;

    // URIs known to produce false-positives with the current JS extractor.
    // e.g. currently (2.0.3) the JS extractor produces 13 false-positive
    // URIs from http://www.google-analytics.com/urchin.js and only 2
    // good URIs, which are merely one pixel images.
    // TODO: remove this blacklist when JS extractor is improved
    protected final static String[] EXTRACTOR_URI_EXCEPTIONS = {
        "http://www.google-analytics.com/urchin.js"
    };

    /**
     * @param name
     */
    public ExtractorJS(String name) {
        super(name, "JavaScript extractor. Link extraction on JavaScript" +
            " files (.js).");
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Processor#process(org.archive.crawler.datamodel.CrawlURI)
     */
    public void extract(CrawlURI curi) {
        // special-cases, for when we know our current JS extractor does poorly.
        // TODO: remove this test when JS extractor is improved
        for (String s: EXTRACTOR_URI_EXCEPTIONS) {
            if (curi.toString().equals(s))
                return;
        }

        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        String contentType = curi.getContentType();
        if ((contentType == null)) {
            return;
        }
        // If content type is not js and if the viaContext
        // does not begin with 'script', return.
        if ((contentType.indexOf("javascript") < 0) &&
                (contentType.indexOf("jscript") < 0) &&
                (contentType.indexOf("ecmascript") < 0) &&
                (!curi.toString().toLowerCase().endsWith(".js")) &&
                (curi.getViaContext() == null || !curi.getViaContext().
                    toString().toLowerCase().startsWith("script"))) {
            return;
        }

        this.numberOfCURIsHandled++;

        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence.");
        }
        if (cs == null) {
            LOGGER.warning("Failed getting ReplayCharSequence: " +
                curi.toString());
            return;
        }

        try {
            try {
                numberOfLinksExtracted += considerStrings(curi, cs,
                    getController(), true);
            } catch (StackOverflowError e) {
                DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
            }
            // Set flag to indicate that link extraction is completed.
            curi.linkExtractorFinished();
        } finally {
            // Done w/ the ReplayCharSequence. Close it.
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    LOGGER.warning(TextUtils.exceptionToString(
                        "Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }

    public static long considerStrings(CrawlURI curi, CharSequence cs,
            CrawlController controller, boolean handlingJSFile) {
        long foundLinks = 0;
        Matcher strings =
            TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
        while (strings.find()) {
            CharSequence subsequence =
                cs.subSequence(strings.start(2), strings.end(2));

            if (UriUtils.isLikelyUriJavascriptContextLegacy(subsequence)) {
                String string = subsequence.toString();
                string = UriUtils.speculativeFixup(string, curi.getUURI());
                foundLinks++;
                try {
                    if (handlingJSFile) {
                        curi.createAndAddLinkRelativeToVia(string,
                            Link.JS_MISC, Link.SPECULATIVE_HOP);
                    } else {
                        curi.createAndAddLinkRelativeToBase(string,
                            Link.JS_MISC, Link.SPECULATIVE_HOP);
                    }
                } catch (URIException e) {
                    // There may not be a controller (e.g. If we're being run
                    // by the extractor tool).
                    if (controller != null) {
                        controller.logUriError(e, curi.getUURI(), string);
                    } else {
                        LOGGER.info(curi + ", " + string + ": " +
                            e.getMessage());
                    }
                }
            } else {
                foundLinks += considerStrings(curi, subsequence,
                    controller, handlingJSFile);
            }
        }
        TextUtils.recycleMatcher(strings);
        return foundLinks;
    }
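    // Note on the recursion above (explanatory comment added for this post,
    // not part of the Heritrix source): when a captured string is not itself
    // judged URI-like, considerStrings() is re-run on its contents, so that
    // quoted strings nested inside other quoted strings -- for example a JS
    // string holding a fragment of JSON or HTML -- still have their inner
    // values examined for crawlable URIs.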

    /*
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorJS\n");
        ret.append("  Function: Link extraction on JavaScript code\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted: " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}
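If you want to write your own extractor, as suggested at the top of this post, the overall shape is the same as in the listing: subclass Extractor, decide in extract() whether the CrawlURI is worth processing, obtain a ReplayCharSequence of the downloaded content, run a matcher over it, add the discovered links, and release the resources. The skeleton below is only a sketch against the same (Heritrix 1.x-era) API used by ExtractorJS above; the class name ExtractorPlainUrls, its URL pattern, and the reuse of Link.JS_MISC / Link.SPECULATIVE_HOP as the link context and hop type are illustrative assumptions, not Heritrix source.

package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.io.ReplayCharSequence;
import org.archive.util.TextUtils;

/**
 * Skeleton extractor modeled on ExtractorJS. Scans fetched content for
 * absolute http(s) URLs and adds them as speculative links.
 */
public class ExtractorPlainUrls extends Extractor {

    private static final long serialVersionUID = 1L;

    private static Logger LOGGER =
        Logger.getLogger(ExtractorPlainUrls.class.getName());

    // Naive pattern for absolute URLs; tighten as needed.
    static final String ABSOLUTE_URL = "https?://[^\\s\"'<>]+";

    protected long numberOfCURIsHandled = 0;
    protected long numberOfLinksExtracted = 0;

    public ExtractorPlainUrls(String name) {
        super(name, "Plain-URL extractor. Finds absolute http(s) URLs" +
            " in fetched content.");
    }

    public void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        this.numberOfCURIsHandled++;

        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence.");
        }
        if (cs == null) {
            LOGGER.warning("Failed getting ReplayCharSequence: " + curi);
            return;
        }

        try {
            Matcher urls = TextUtils.getMatcher(ABSOLUTE_URL, cs);
            while (urls.find()) {
                String url = urls.group();
                try {
                    // Reusing the link context/hop constants shown in
                    // ExtractorJS; pick ones appropriate to your extractor.
                    curi.createAndAddLinkRelativeToBase(url,
                        Link.JS_MISC, Link.SPECULATIVE_HOP);
                    numberOfLinksExtracted++;
                } catch (URIException e) {
                    LOGGER.info(curi + ", " + url + ": " + e.getMessage());
                }
            }
            TextUtils.recycleMatcher(urls);
            // Mark link extraction on this URI as complete.
            curi.linkExtractorFinished();
        } finally {
            try {
                cs.close();
            } catch (IOException ioe) {
                LOGGER.warning(TextUtils.exceptionToString(
                    "Failed close of ReplayCharSequence.", ioe));
            }
        }
    }

    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorPlainUrls\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted: " + numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
}

As in ExtractorJS, the matcher is obtained from TextUtils.getMatcher() and returned with TextUtils.recycleMatcher(), and the ReplayCharSequence is always closed in a finally block.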

