Nutch1.2增加插件例子-CSDN博客

本文详细介绍了如何在Nutch 1.2中开发一个插件来实现推荐网站功能，包括创建插件结构、编写代码、配置插件、测试等关键步骤。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

今尝试下给nutch1.2增加一个插件，于是到官网找了个例子，链接如下：

http://wiki.apache.org/nutch/WritingPluginExample-0.9

这个例子实现的是推荐网站功能：在网页 meta 标签的 content 属性里写上关键字，当别人搜索这个关键字时，你推荐的网站会在搜索结果中排在前面。要实现推荐，必须在你的网页上加上

[xhtml]view plaincopy 
   
 <meta name="recommended" content="plugins" />  

这条属性才能被插件识别。

由于这个例子是基于nutch 0.9编写的，而1.2和0.9的API有些区别，所以需要修改部分代码。步骤如下：

1.插件开发

1.1 在src/plugin中新建一个文件夹recommended

1.2.在recommended目录下新建plugin.xml和build.xml文件（文件名请使用小写，在区分大小写的文件系统上大小写不一致会导致文件找不到），内容如下：

plugin.xml

[xhtml]view plaincopy 
   
 <?xml version="1.0" encoding="UTF-8"?>
 <plugin
    id="recommended"
    name="Recommended Parser/Filter"
    version="0.0.1"
    provider-name="nutch.org">
   
    <runtime>
       <!-- As defined in build.xml this plugin will end up bundled as recommended.jar -->
       <library name="recommended.jar">
          <export name="*"/>
       </library>
    </runtime>
   
    <!-- The RecommendedParser implements HtmlParseFilter to grab the contents of
         any "recommended" meta tag found in a parsed HTML page. -->
    <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
               name="Recommended Parser"
               point="org.apache.nutch.parse.HtmlParseFilter">
       <implementation id="RecommendedParser"
                       class="org.apache.nutch.parse.recommended.RecommendedParser"/>
    </extension>
   
    <!-- The RecommendedIndexer implements IndexingFilter in order to add the contents
         of the recommended meta tags (as found by the RecommendedParser) to the lucene
         index. -->
    <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
               name="Recommended identifier filter"
               point="org.apache.nutch.indexer.IndexingFilter">
       <implementation id="RecommendedIndexer"
                       class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
    </extension>
   
    <!-- The RecommendedQueryFilter gets called when you perform a search. It runs a
         search for the user's query against the recommended fields. To add this
         filter to the list of filters that run by default, you have to use
         "fields=DEFAULT"; here it is registered for the "recommended" field only. -->
    <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
               name="Recommended Search Query Filter"
               point="org.apache.nutch.searcher.QueryFilter">
       <implementation id="RecommendedQueryFilter"
                       class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
         <parameter name="fields" value="recommended"/>
         </implementation>
    </extension>
   
 </plugin>

build.xml

[xhtml]view plaincopy 
   
 <?xml version="1.0"?>
   
 <!-- Ant build for the "recommended" plugin; the default target is jar-core. -->
 <project name="recommended" default="jar-core">
   
   <!-- Pull in the build definitions from ../build-plugin.xml -->
   <import file="../build-plugin.xml"/>
     
  <!-- Build compilation dependencies: runs the "jar" target of ../lib-xml -->
  <target name="deps-jar">
    <ant target="jar" inheritall="false" dir="../lib-xml"/>
  </target>
   
   <!-- Add compilation dependencies to classpath: every lib-xml jar found
        under ${nutch.root}/build -->
  <path id="plugin.deps">
    <fileset dir="${nutch.root}/build">
      <include name="**/lib-xml/*.jar" />
    </fileset>
  </path>
   
   <!-- Deploy unit test dependencies: runs the "deploy" target of lib-xml,
        nutch-extensionpoints and protocol-file -->
  <target name="deps-test">
    <ant target="deploy" inheritall="false" dir="../lib-xml"/>
    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
  </target>
   
    
   <!-- For the JUnit test: create ${build.test}/data and copy the
        data/recommended.html fixture into it. These two tasks sit directly
        under the project element, outside of any target. -->
   <mkdir dir="${build.test}/data"/>
   <copy file="data/recommended.html" todir="${build.test}/data"/>
 </project>

1.3.在recommended目录下建立/src/java/org/apache/nutch/parse/recommended目录。

1.4.增加RecommendedIndexer.java,RecommendedParser.java,RecommendedQueryFilter.java三个类，内容如下：

RecommendedIndexer.java

[java]view plaincopy 
   
 package org.apache.nutch.parse.recommended;  
   
 // JDK import  
 import java.util.logging.Logger;  
   
 // Commons imports  
 import org.apache.commons.logging.Log;  
 import org.apache.commons.logging.LogFactory;  
   
   
 // Nutch imports  
 import org.apache.nutch.util.LogUtil;  
 import org.apache.nutch.fetcher.FetcherOutput;  
 import org.apache.nutch.indexer.IndexingFilter;  
 import org.apache.nutch.indexer.IndexingException;  
 import org.apache.nutch.indexer.NutchDocument;  
 import org.apache.nutch.parse.Parse;  
   
 import org.apache.hadoop.conf.Configuration;  
 import org.apache.hadoop.io.Text;  
 import org.apache.nutch.crawl.CrawlDatum;  
 import org.apache.nutch.crawl.Inlinks;  
   
 // Lucene imports  
 import org.apache.lucene.document.Field;  
 import org.apache.lucene.document.Document;  
   
 public class RecommendedIndexer implements IndexingFilter {  
       
   public static final Log LOG = LogFactory.getLog(RecommendedIndexer.class.getName());  
     
   private Configuration conf;  
     
   public RecommendedIndexer() {  
   }  
   @Override  
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,   
     CrawlDatum datum, Inlinks inlinks)  
     throws IndexingException {  
   
     String recommendation = parse.getData().getMeta("recommended");  
   
         if (recommendation != null) {  
             Field recommendedField =   
                 new Field("recommended", recommendation,   
                     Field.Store.YES, Field.Index.NOT_ANALYZED);  
             recommendedField.setBoost(5.0f);  
             doc.add("recommended",recommendedField);  
             LOG.info("Added " + recommendation + " to the recommended Field");  
         }  
   
     return doc;  
   }  
     
   public void setConf(Configuration conf) {  
     this.conf = conf;  
   }  
   
   public Configuration getConf() {  
     return this.conf;  
   }  
   
 @Override  
 public void addIndexBackendOptions(Configuration conf) {  
     // TODO Auto-generated method stub  
 }  
 }  

RecommendedParser.java

[java]view plaincopy 
   
 package org.apache.nutch.parse.recommended;  
   
 // JDK imports  
 import java.util.Enumeration;  
 import java.util.Properties;  
 import java.util.logging.Logger;  
   
 // Nutch imports  
 import org.apache.hadoop.conf.Configuration;  
 import org.apache.nutch.metadata.Metadata;  
 import org.apache.nutch.parse.HTMLMetaTags;  
 import org.apache.nutch.parse.Parse;  
 import org.apache.nutch.parse.HtmlParseFilter;  
 import org.apache.nutch.parse.ParseResult;  
 import org.apache.nutch.protocol.Content;  
   
 // Commons imports  
 import org.apache.commons.logging.Log;  
 import org.apache.commons.logging.LogFactory;  
   
 // W3C imports  
 import org.w3c.dom.DocumentFragment;  
   
 public class RecommendedParser implements HtmlParseFilter {  
   
   private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());  
     
   private Configuration conf;  
   
   /** The Recommended meta data attribute name */  
   public static final String META_RECOMMENDED_NAME="recommended";  
   
   /** 
    * Scan the HTML document looking for a recommended meta tag. 
    */  
     
   @Override  
   public ParseResult filter(Content content, ParseResult parseResult,  
     HTMLMetaTags metaTags, DocumentFragment doc) {  
     // Trying to find the document's recommended term  
     String recommendation = null;  
   
     Properties generalMetaTags = metaTags.getGeneralTags();  
   
     for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames.hasMoreElements(); ) {  
         if (tagNames.nextElement().equals("recommended")) {  
             System.out.println(generalMetaTags.getProperty("recommended"));  
             recommendation = generalMetaTags.getProperty("recommended");  
            LOG.info("Found a Recommendation for " + recommendation);  
         }  
     }  
   
     if (recommendation == null) {  
         LOG.info("No Recommendation");  
     } else {  
         LOG.info("Adding Recommendation for " + recommendation);  
         Parse parse = parseResult.get(content.getUrl());  
           
         parse.getData().getContentMeta().set(META_RECOMMENDED_NAME, recommendation);  
     }  
   
     return parseResult;  
   }  
     
   public void setConf(Configuration conf) {  
     this.conf = conf;  
   }  
   
   public Configuration getConf() {  
     return this.conf;  
   }  
   
   
   
 }  

RecommendedQueryFilter.java

[java]view plaincopy 
   
 package org.apache.nutch.parse.recommended;  
   
 import org.apache.nutch.searcher.FieldQueryFilter;  
   
 import java.util.logging.Logger;  
   
 // Commons imports  
 import org.apache.commons.logging.Log;  
 import org.apache.commons.logging.LogFactory;  
   
   
 public class RecommendedQueryFilter extends FieldQueryFilter {  
     private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());  
   
     public RecommendedQueryFilter() {  
         super("recommended", 5f);  
         LOG.info("Added a recommended query");  
     }  
     
 }  

1.5.在 src/plugin/build.xml 中的<target name="deploy"></target>中增加一行：

[xhtml]view plaincopy 
   
 <ant dir="recommended" target="deploy" />  

1.6.运行cmd，切换到recommended目录，运行ant命令编译，插件开发完成。

1.7 让nutch识别你的插件

在conf/nutch-site.xml 中增加以下属性

[c-sharp]view plaincopy 
   
 <!-- Puts the "recommended" plugin id at the front of the plugin.includes
      alternation so that the new plugin is matched alongside the others. -->
 <property>
   <name>plugin.includes</name>
   <value>recommended|protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>  <description>Regular expression naming plugin id names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By
   default Nutch includes crawling just HTML and plain text via HTTP,
   and basic indexing and search plugins.
   </description>
 </property>

2.编写插件测试类

2.1 在src/plugin/recommended目录下新建一个data目录，在data目录下新建一个html文件recommended.html，内容如下：

[xhtml]view plaincopy 
   
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">  
   
 <html lang="en">  
 <head>  
     <meta http-equiv="Content-Type" content="text/html; charset=utf-8">  
     <title>recommended</title>  
     <meta name="generator" content="TextMate http://macromates.com/">  
     <meta name="author" content="Ricardo J. Méndez">  
     <meta name="recommended" content="recommended-content"/>  
     <!-- Date: 2007-02-12 -->  
 </head>  
 <body>  
     Recommended meta tag test.  
 </body>  
 </html>  

2.2 在src/plugin/recommended目录下新建src/test/org/apache/nutch/parse/recommended目录，增加TestRecommendedParser.java类，内容如下：

[xhtml]view plaincopy 
   
 package org.apache.nutch.parse.recommended;  
   
   
 import org.apache.nutch.metadata.Metadata;  
 import org.apache.nutch.parse.Parse;  
 import org.apache.nutch.parse.ParseResult;  
 import org.apache.nutch.parse.ParseUtil;  
 import org.apache.nutch.protocol.Content;  
 import org.apache.hadoop.conf.Configuration;  
 import org.apache.nutch.util.NutchConfiguration;  
   
 import java.util.Properties;  
 import java.io.*;  
 import java.net.URL;  
   
 import junit.framework.TestCase;  
   
 /*  
  * Loads test page recommended.html and verifies that the recommended   
  * meta tag has recommended-content as its value.  
  *  
  */  
 public class TestRecommendedParser extends TestCase {  
   
   private static final File testDir =  
     new File("H:/project/SearchEngine/Nutch1.2/src/plugin/recommended/data");  
   
   public void testPages() throws Exception {  
     pageTest(new File(testDir, "recommended.html"), "http://foo.com/",  
              "recommended-content");  
   
   }  
   
   
   public void pageTest(File file, String url, String recommendation)  
     throws Exception {  
   
     String contentType = "text/html";  
     InputStream in = new FileInputStream(file);  
       
     ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());  
     byte[] buffer = new byte[1024];  
     int i;  
     while ((i = in.read(buffer)) != -1) {  
       out.write(buffer, 0, i);  
     }  
     in.close();  
     byte[] bytes = out.toByteArray();  
     Configuration conf = NutchConfiguration.create();  
   
     Content content =  
       new Content(url, url, bytes, contentType, new Metadata(), conf);  
       
     Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content).get(content.getUrl());  
       
     Metadata metadata = parse.getData().getContentMeta();  
     
     assertEquals(recommendation, metadata.get("recommended"));  
     assertTrue("somesillycontent" != metadata.get("recommended"));  
   }  
     
 }