/**
 * Configuration holder for a crawl. All values have usable defaults except
 * {@link #setCrawlStorageFolder(String)}, which must be set before
 * {@link #validate()} will pass.
 */
public class CrawlConfig {

    /**
     * The folder which will be used by crawler for storing the intermediate
     * crawl data. The content of this folder should not be modified manually.
     */
    private String crawlStorageFolder;

    /**
     * If this feature is enabled, you would be able to resume a previously
     * stopped/crashed crawl. However, it makes crawling slightly slower.
     */
    private boolean resumableCrawling = false;

    /**
     * Maximum depth of crawling. For unlimited depth this parameter should be
     * set to -1.
     */
    private int maxDepthOfCrawling = -1;

    /**
     * Maximum number of pages to fetch. For unlimited number of pages, this
     * parameter should be set to -1.
     */
    private int maxPagesToFetch = -1;

    /**
     * user-agent string that is used for representing your crawler to web
     * servers. See http://en.wikipedia.org/wiki/User_agent for more details
     */
    private String userAgentString = "crawler4j (http://code.google.com/p/crawler4j/)";

    /**
     * Politeness delay in milliseconds (delay between sending two requests to
     * the same host).
     */
    private int politenessDelay = 200;

    /**
     * Should we also crawl https pages (in addition to http)?
     */
    private boolean includeHttpsPages = false;

    /**
     * Should we fetch binary content such as images, audio, ...?
     */
    private boolean includeBinaryContentInCrawling = false;

    /**
     * Maximum connections per host.
     */
    private int maxConnectionsPerHost = 100;

    /**
     * Maximum total connections.
     */
    private int maxTotalConnections = 100;

    /**
     * Socket timeout in milliseconds.
     */
    private int socketTimeout = 20000;

    /**
     * Connection timeout in milliseconds.
     */
    private int connectionTimeout = 30000;

    /**
     * Max number of outgoing links which are processed from a page.
     */
    private int maxOutgoingLinksToFollow = 5000;

    /**
     * Max allowed size of a page, in bytes. Pages larger than this size will
     * not be fetched.
     */
    private int maxDownloadSize = 1048576;

    /**
     * Should we follow redirects?
     */
    private boolean followRedirects = true;

    /**
     * If crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy host.
     */
    private String proxyHost = null;

    /**
     * If crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy port.
     */
    private int proxyPort = 80;

    /**
     * If crawler should run behind a proxy and user/pass is needed for
     * authentication in proxy, this parameter can be used for specifying the
     * username.
     */
    private String proxyUsername = null;

    /**
     * If crawler should run behind a proxy and user/pass is needed for
     * authentication in proxy, this parameter can be used for specifying the
     * password.
     */
    private String proxyPassword = null;

    public CrawlConfig() {
    }

    /**
     * Validates the configs specified by this instance.
     *
     * @throws Exception if the storage folder is unset, the politeness delay
     *                   is negative, or the crawl depth is out of range
     */
    public void validate() throws Exception {
        if (crawlStorageFolder == null) {
            throw new Exception("Crawl storage folder is not set in the CrawlConfig.");
        }
        if (politenessDelay < 0) {
            throw new Exception("Invalid value for politeness delay: " + politenessDelay);
        }
        if (maxDepthOfCrawling < -1) {
            throw new Exception("Maximum crawl depth should be either a positive number or -1 for unlimited depth.");
        }
        if (maxDepthOfCrawling > Short.MAX_VALUE) {
            throw new Exception("Maximum value for crawl depth is " + Short.MAX_VALUE);
        }
    }

    // Accessors: the fields above were previously private with no way to read
    // or write them, so crawlStorageFolder could never be set and validate()
    // could never pass. Standard getters/setters make the config usable while
    // keeping the existing public interface (constructor, validate) intact.

    public String getCrawlStorageFolder() {
        return crawlStorageFolder;
    }

    /** Sets the (mandatory) folder used for intermediate crawl data. */
    public void setCrawlStorageFolder(String crawlStorageFolder) {
        this.crawlStorageFolder = crawlStorageFolder;
    }

    public boolean isResumableCrawling() {
        return resumableCrawling;
    }

    public void setResumableCrawling(boolean resumableCrawling) {
        this.resumableCrawling = resumableCrawling;
    }

    public int getMaxDepthOfCrawling() {
        return maxDepthOfCrawling;
    }

    /** Use -1 for unlimited depth. */
    public void setMaxDepthOfCrawling(int maxDepthOfCrawling) {
        this.maxDepthOfCrawling = maxDepthOfCrawling;
    }

    public int getMaxPagesToFetch() {
        return maxPagesToFetch;
    }

    /** Use -1 for an unlimited number of pages. */
    public void setMaxPagesToFetch(int maxPagesToFetch) {
        this.maxPagesToFetch = maxPagesToFetch;
    }

    public String getUserAgentString() {
        return userAgentString;
    }

    public void setUserAgentString(String userAgentString) {
        this.userAgentString = userAgentString;
    }

    public int getPolitenessDelay() {
        return politenessDelay;
    }

    public void setPolitenessDelay(int politenessDelay) {
        this.politenessDelay = politenessDelay;
    }

    public boolean isIncludeHttpsPages() {
        return includeHttpsPages;
    }

    public void setIncludeHttpsPages(boolean includeHttpsPages) {
        this.includeHttpsPages = includeHttpsPages;
    }

    public boolean isIncludeBinaryContentInCrawling() {
        return includeBinaryContentInCrawling;
    }

    public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling) {
        this.includeBinaryContentInCrawling = includeBinaryContentInCrawling;
    }

    public int getMaxConnectionsPerHost() {
        return maxConnectionsPerHost;
    }

    public void setMaxConnectionsPerHost(int maxConnectionsPerHost) {
        this.maxConnectionsPerHost = maxConnectionsPerHost;
    }

    public int getMaxTotalConnections() {
        return maxTotalConnections;
    }

    public void setMaxTotalConnections(int maxTotalConnections) {
        this.maxTotalConnections = maxTotalConnections;
    }

    public int getSocketTimeout() {
        return socketTimeout;
    }

    public void setSocketTimeout(int socketTimeout) {
        this.socketTimeout = socketTimeout;
    }

    public int getConnectionTimeout() {
        return connectionTimeout;
    }

    public void setConnectionTimeout(int connectionTimeout) {
        this.connectionTimeout = connectionTimeout;
    }

    public int getMaxOutgoingLinksToFollow() {
        return maxOutgoingLinksToFollow;
    }

    public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow) {
        this.maxOutgoingLinksToFollow = maxOutgoingLinksToFollow;
    }

    public int getMaxDownloadSize() {
        return maxDownloadSize;
    }

    public void setMaxDownloadSize(int maxDownloadSize) {
        this.maxDownloadSize = maxDownloadSize;
    }

    public boolean isFollowRedirects() {
        return followRedirects;
    }

    public void setFollowRedirects(boolean followRedirects) {
        this.followRedirects = followRedirects;
    }

    public String getProxyHost() {
        return proxyHost;
    }

    public void setProxyHost(String proxyHost) {
        this.proxyHost = proxyHost;
    }

    public int getProxyPort() {
        return proxyPort;
    }

    public void setProxyPort(int proxyPort) {
        this.proxyPort = proxyPort;
    }

    public String getProxyUsername() {
        return proxyUsername;
    }

    public void setProxyUsername(String proxyUsername) {
        this.proxyUsername = proxyUsername;
    }

    public String getProxyPassword() {
        return proxyPassword;
    }

    public void setProxyPassword(String proxyPassword) {
        this.proxyPassword = proxyPassword;
    }
}
// NOTE(review): removed an accidental duplicate of the CrawlConfig class body
// that followed the class's closing brace. The duplicated fields, constructor,
// and validate() method sat outside any class declaration and made the file
// uncompilable; no behavior is lost by deleting it.