1. What is crawler4j?
crawler4j is an open-source Java crawler library that you can use to build multi-threaded web crawlers and fetch page content.
2. How do I get crawler4j?
The official crawler4j project page is here; the current version is 4.1. If you use Maven, declare the dependency in your pom as shown below; to download the jar directly, click here.
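For reference, the dependency declaration for the 4.1 release looks roughly like this (coordinates as published on Maven Central; adjust the version if you use a newer release):

<dependency>
    <groupId>edu.uci.ics</groupId>
    <artifactId>crawler4j</artifactId>
    <version>4.1</version>
</dependency>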
3. How do I use crawler4j?
Using crawler4j takes two steps: first, implement a crawler class that extends edu.uci.ics.crawler4j.crawler.WebCrawler; second, run that crawler class through a CrawlController.
package com.favccxx.favsoft.favcrawler;

import java.util.Set;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class FavWebCrawler extends WebCrawler {

    private static final Logger logger = LoggerFactory.getLogger(WebCrawler.class);

    // Skip URLs that point to static resources such as stylesheets, scripts and media files.
    private final static Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|zip|gz))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.startsWith("http://www.oschina.net/");
    }

    /**
     * Called when a fetched page is ready to be processed.
     */
    @Override
    public void visit(Page page) {
        int docid = page.getWebURL().getDocid();
        String url = page.getWebURL().getURL();
        String domain = page.getWebURL().getDomain();
        String path = page.getWebURL().getPath();
        String subDomain = page.getWebURL().getSubDomain();
        String parentUrl = page.getWebURL().getParentUrl();
        String anchor = page.getWebURL().getAnchor();

        logger.debug("Docid: {}", docid);
        logger.info("URL: {}", url);
        logger.debug("Domain: '{}'", domain);
        logger.debug("Sub-domain: '{}'", subDomain);
        logger.debug("Path: '{}'", path);
        logger.debug("Parent page: {}", parentUrl);
        logger.debug("Anchor text: {}", anchor);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();

            logger.debug("Text length: " + text.length());
            logger.debug("Html length: " + html.length());
            logger.debug("Number of outgoing links: " + links.size());
        }
    }
}
package com.favccxx.favsoft.favcrawler;

import java.util.Set;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    private static final Logger logger = LoggerFactory.getLogger(WebCrawler.class);

    // Skip URLs that point to static resources such as stylesheets, scripts and media files.
    private final static Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|zip|gz))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.startsWith("http://www.oschina.net/");
    }

    /**
     * This function is called when a page is fetched and ready
     * to be processed by your program.
     */
    @Override
    public void visit(Page page) {
        int docid = page.getWebURL().getDocid();
        String url = page.getWebURL().getURL();
        String domain = page.getWebURL().getDomain();
        String path = page.getWebURL().getPath();
        String subDomain = page.getWebURL().getSubDomain();
        String parentUrl = page.getWebURL().getParentUrl();
        String anchor = page.getWebURL().getAnchor();

        logger.debug("Docid: {}", docid);
        logger.info("URL: {}", url);
        logger.debug("Domain: '{}'", domain);
        logger.debug("Sub-domain: '{}'", subDomain);
        logger.debug("Path: '{}'", path);
        logger.debug("Parent page: {}", parentUrl);
        logger.debug("Anchor text: {}", anchor);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();

            logger.debug("Text length: " + text.length());
            logger.debug("Html length: " + html.length());
            logger.debug("Number of outgoing links: " + links.size());
        }
    }

    public static void main(String[] args) throws Exception {
        String crawlStorageFolder = "/data/crawl/root";
        int numberOfCrawlers = 7;

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        /*
         * For each crawl, you need to add some seed URLs. These are the first
         * URLs that are fetched; the crawler then follows the links found in
         * those pages.
         */
        controller.addSeed("http://www.oschina.net/");

        /*
         * Start the crawl. This is a blocking operation: the code after this
         * line runs only after crawling has finished.
         */
        controller.start(MyCrawler.class, numberOfCrawlers);
    }
}
4. Common crawler4j configuration
All crawler4j configuration options live in edu.uci.ics.crawler4j.crawler.CrawlConfig. The individual properties are described below, and a short configuration sketch follows the list.

crawlStorageFolder: intermediate storage folder for crawled data; effectively a staging area for fetched files.
resumableCrawling: whether to resume a crawl that previously stopped abnormally or whose data was corrupted. Off by default; enabling it inevitably reduces crawl throughput.
maxDepthOfCrawling: maximum crawl depth. Default -1, i.e. unlimited depth.
maxPagesToFetch: maximum number of pages to fetch. Default -1, i.e. unlimited.
userAgentString: the user agent string presented to web servers. Default "crawler4j (http://code.google.com/p/crawler4j/)".
politenessDelay: delay in milliseconds between two requests to the same host. Default 200.
includeHttpsPages: whether to include HTTPS pages. Included by default.
includeBinaryContentInCrawling: whether to fetch binary content such as images and audio. Default false.
maxConnectionsPerHost: maximum number of connections per host. Default 100.
maxTotalConnections: maximum total number of connections. Default 100.
socketTimeout: socket timeout in milliseconds. Default 20000.
connectionTimeout: connection timeout in milliseconds. Default 30000.
maxOutgoingLinksToFollow: maximum number of outgoing links to follow per page. Default 5000.
maxDownloadSize: maximum download size per page, in bytes. Default 1048576 (1 MB); content beyond this limit is not downloaded.
followRedirects: whether to follow redirected pages. Default true.
proxyHost: proxy host address, only needed when crawling through a proxy.
proxyPort: proxy port.
proxyUsername: proxy username.
proxyPassword: proxy password.
authInfos: authentication information for sites that require login.
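As a quick illustration, here is a minimal sketch of setting a few of these properties on a CrawlConfig before handing it to a CrawlController; the concrete values (storage folder, delay, depth, page limit, proxy host) are placeholders chosen for the example, not recommendations.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;

public class CrawlConfigExample {

    public static void main(String[] args) {
        CrawlConfig config = new CrawlConfig();

        // Intermediate storage for crawl data (placeholder path).
        config.setCrawlStorageFolder("/data/crawl/root");

        // Wait 1000 ms between two requests to the same host.
        config.setPolitenessDelay(1000);

        // Follow links at most 3 hops from the seeds, and stop after 1000 pages.
        config.setMaxDepthOfCrawling(3);
        config.setMaxPagesToFetch(1000);

        // Skip binary content such as images and audio.
        config.setIncludeBinaryContentInCrawling(false);

        // Allow the crawl to be resumed after an abnormal shutdown.
        config.setResumableCrawling(true);

        // Only needed when crawling through a proxy (hypothetical host and port).
        // config.setProxyHost("proxy.example.com");
        // config.setProxyPort(8080);

        // The configured instance would then be passed to a CrawlController,
        // as in the MyCrawler example above.
        System.out.println(config);
    }
}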