crawler4j Source Code Walkthrough: The Configuration Class (CrawlConfig)

This post walks through the key parameters of the crawler configuration and what each one does, including the crawl storage folder, maximum depth, maximum page count, user agent, politeness delay, HTTPS pages, binary/multimedia fetching, concurrent connection limits, timeouts, the outgoing-link limit, maximum page size, redirect following, and proxy settings, and shows how the configuration is validated.

public class CrawlConfig {

    /**
     * The folder which will be used by crawler for storing the intermediate
     * crawl data. The content of this folder should not be modified manually.
     */
    // Folder used to store intermediate crawl data
    private String crawlStorageFolder;

    /**
     * If this feature is enabled, you would be able to resume a previously
     * stopped/crashed crawl. However, it makes crawling slightly slower
     */
    // If true, a previously stopped/crashed crawl can be resumed (at a slight speed cost)
    private boolean resumableCrawling = false;

    /**
     * Maximum depth of crawling For unlimited depth this parameter should be
     * set to -1
     */
    // Maximum crawl depth; -1 means unlimited
    private int maxDepthOfCrawling = -1;

    /**
     * Maximum number of pages to fetch For unlimited number of pages, this
     * parameter should be set to -1
     */
    // Maximum number of pages to fetch; -1 means unlimited
    private int maxPagesToFetch = -1;

    /**
     * user-agent string that is used for representing your crawler to web
     * servers. See http://en.wikipedia.org/wiki/User_agent for more details
     */
    // User-agent string presented to web servers
    private String userAgentString = "crawler4j (http://code.google.com/p/crawler4j/)";

    /**
     * Politeness delay in milliseconds (delay between sending two requests to
     * the same host).
     */
    // Delay, in milliseconds, between two requests to the same host
    private int politenessDelay = 200;

    /**
     * Should we also crawl https pages?
     */
    // Whether HTTPS pages should also be crawled (in addition to HTTP)
    private boolean includeHttpsPages = false;

    /**
     * Should we fetch binary content such as images, audio, ...?
     */
    // Whether to fetch binary content such as images, audio, etc.
    private boolean includeBinaryContentInCrawling = false;

    /**
     * Maximum Connections per host
     */
    // Maximum number of connections per host
    private int maxConnectionsPerHost = 100;

    /**
     * Maximum total connections
     */
    private int maxTotalConnections = 100;

    /**
     * Socket timeout in milliseconds
     */
    private int socketTimeout = 20000;

    /**
     * Connection timeout in milliseconds
     */
    private int connectionTimeout = 30000;

    /**
     * Max number of outgoing links which are processed from a page
     */
    // Maximum number of outgoing links processed from a page
    private int maxOutgoingLinksToFollow = 5000;

    /**
     * Max allowed size of a page. Pages larger than this size will not be
     * fetched.
     */
    // Maximum allowed page size in bytes; larger pages are not fetched
    private int maxDownloadSize = 1048576;

    /**
     * Should we follow redirects?
     */
    private boolean followRedirects = true;

    /**
     * If crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy host.
     */
    // Proxy host, if the crawler runs behind a proxy
    private String proxyHost = null;

    /**
     * If crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy port.
     */
    // Proxy port
    private int proxyPort = 80;

    /**
     * If crawler should run behind a proxy and user/pass is needed for
     * authentication in proxy, this parameter can be used for specifying the
     * username.
     */
    // Proxy username, if the proxy requires authentication
    private String proxyUsername = null;

    /**
     * If crawler should run behind a proxy and user/pass is needed for
     * authentication in proxy, this parameter can be used for specifying the
     * password.
     */
    // Proxy password, if the proxy requires authentication
    private String proxyPassword = null;

    public CrawlConfig() {
    }

    /**
     * Validates the configs specified by this instance.
     *
     * @throws Exception
     */
    public void validate() throws Exception {
        if (crawlStorageFolder == null) {
            throw new Exception("Crawl storage folder is not set in the CrawlConfig.");
        }
        if (politenessDelay < 0) {
            throw new Exception("Invalid value for politeness delay: " + politenessDelay);
        }
        if (maxDepthOfCrawling < -1) {
            throw new Exception("Maximum crawl depth should be either a positive number or -1 for unlimited depth.");
        }
        if (maxDepthOfCrawling > Short.MAX_VALUE) {
            throw new Exception("Maximum value for crawl depth is " + Short.MAX_VALUE);
        }

    }

}
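
To see how these options fit together, here is a minimal usage sketch. It assumes the setter methods that pair with these fields in crawler4j (setCrawlStorageFolder, setMaxDepthOfCrawling, setPolitenessDelay, and so on) and the edu.uci.ics.crawler4j.crawler package; the storage path and numeric values are arbitrary examples, not recommendations.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;

public class CrawlConfigExample {

    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();

        // Required: validate() throws if no storage folder is set.
        config.setCrawlStorageFolder("/tmp/crawler4j-data"); // example path

        // Bound the crawl: depth 3, at most 1000 pages (-1 would mean unlimited).
        config.setMaxDepthOfCrawling(3);
        config.setMaxPagesToFetch(1000);

        // Politeness: wait 1000 ms between two requests to the same host.
        config.setPolitenessDelay(1000);

        // Also crawl HTTPS pages in addition to HTTP.
        config.setIncludeHttpsPages(true);

        // Throws an Exception for out-of-range values, e.g. a negative
        // politeness delay or a crawl depth larger than Short.MAX_VALUE.
        config.validate();
    }
}

Calling validate() before handing the config to a crawl controller fails fast on mistakes such as a missing storage folder or a negative politeness delay, instead of surfacing them mid-crawl.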