今天尝试给 Nutch 1.2 增加一个插件,于是到官网找了个例子,链接如下:
http://wiki.apache.org/nutch/WritingPluginExample-0.9
这个例子实现的是网站推荐功能:把关键字写在 meta 标签的 content 属性里,当别人搜索这个关键字时,你推荐的网站会在搜索结果中排在前面。要实现推荐,必须在你的网页上加上
<meta name="recommended" content="plugins" />
这条属性才能被插件识别。
由于这个例子是针对 Nutch 0.9 编写的,而 1.2 和 0.9 的 API 有些区别,所以需要修改一些代码。步骤如下:
1.插件开发
1.1 在src/plugin中新建一个文件夹recommended
1.2 在recommended目录下新建plugin.xml和build.xml文件(文件名必须小写),内容如下:
plugin.xml
<?xml version="1.0" encoding="UTF-8"?>
<plugin
   id="recommended"
   name="Recommended Parser/Filter"
   version="0.0.1"
   provider-name="nutch.org">

   <runtime>
      <!-- As defined in build.xml this plugin will end up bundled as recommended.jar -->
      <library name="recommended.jar">
         <export name="*"/>
      </library>
   </runtime>

   <!-- Declare the dependency on the plugin that defines the extension points
        referenced below, instead of relying on some other activated plugin
        to pull it in. -->
   <requires>
      <import plugin="nutch-extensionpoints"/>
   </requires>

   <!-- The RecommendedParser extends the HtmlParseFilter to grab the contents of
        any recommended meta tags -->
   <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
              name="Recommended Parser"
              point="org.apache.nutch.parse.HtmlParseFilter">
      <implementation id="RecommendedParser"
                      class="org.apache.nutch.parse.recommended.RecommendedParser"/>
   </extension>

   <!-- The RecommendedIndexer extends the IndexingFilter in order to add the contents
        of the recommended meta tags (as found by the RecommendedParser) to the lucene
        index. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
              name="Recommended identifier filter"
              point="org.apache.nutch.indexer.IndexingFilter">
      <implementation id="RecommendedIndexer"
                      class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
   </extension>

   <!-- The RecommendedQueryFilter gets called when you perform a search. It runs a
        search for the user's query against the recommended fields. In order to add
        this to the list of filters that gets run by default, you have to use
        "fields=DEFAULT". -->
   <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
              name="Recommended Search Query Filter"
              point="org.apache.nutch.searcher.QueryFilter">
      <implementation id="RecommendedQueryFilter"
                      class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
         <parameter name="fields" value="recommended"/>
      </implementation>
   </extension>

</plugin>
build.xml
<?xml version="1.0"?>
<project name="recommended" default="jar-core">

  <import file="../build-plugin.xml"/>

  <!-- Build compilation dependencies -->
  <target name="deps-jar">
    <ant target="jar" inheritall="false" dir="../lib-xml"/>
  </target>

  <!-- Add compilation dependencies to classpath -->
  <path id="plugin.deps">
    <fileset dir="${nutch.root}/build">
      <include name="**/lib-xml/*.jar" />
    </fileset>
  </path>

  <!-- Deploy Unit test dependencies -->
  <target name="deps-test">
    <ant target="deploy" inheritall="false" dir="../lib-xml"/>
    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
  </target>

  <!-- for junit test -->
  <!-- These tasks sit outside any target, so they run on every Ant invocation.
       failonerror="false" lets the plugin itself still build when the test
       fixture data/recommended.html has not been created yet. -->
  <mkdir dir="${build.test}/data"/>
  <copy file="data/recommended.html" todir="${build.test}/data" failonerror="false"/>

</project>
1.3.在recommended目录下建立/src/java/org/apache/nutch/parse/recommended目录。
1.4.增加RecommendedIndexer.java,RecommendedParser.java,RecommendedQueryFilter.java三个类,内容如下:
RecommendedIndexer.java
package org.apache.nutch.parse.recommended; // JDK import import java.util.logging.Logger; // Commons imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; // Nutch imports import org.apache.nutch.util.LogUtil; import org.apache.nutch.fetcher.FetcherOutput; import org.apache.nutch.indexer.IndexingFilter; import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.parse.Parse; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; // Lucene imports import org.apache.lucene.document.Field; import org.apache.lucene.document.Document; public class RecommendedIndexer implements IndexingFilter { public static final Log LOG = LogFactory.getLog(RecommendedIndexer.class.getName()); private Configuration conf; public RecommendedIndexer() { } @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { String recommendation = parse.getData().getMeta("recommended"); if (recommendation != null) { Field recommendedField = new Field("recommended", recommendation, Field.Store.YES, Field.Index.NOT_ANALYZED); recommendedField.setBoost(5.0f); doc.add("recommended",recommendedField); LOG.info("Added " + recommendation + " to the recommended Field"); } return doc; } public void setConf(Configuration conf) { this.conf = conf; } public Configuration getConf() { return this.conf; } @Override public void addIndexBackendOptions(Configuration conf) { // TODO Auto-generated method stub } }
RecommendedParser.java
package org.apache.nutch.parse.recommended; // JDK imports import java.util.Enumeration; import java.util.Properties; import java.util.logging.Logger; // Nutch imports import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.HTMLMetaTags; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.HtmlParseFilter; import org.apache.nutch.parse.ParseResult; import org.apache.nutch.protocol.Content; // Commons imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; // W3C imports import org.w3c.dom.DocumentFragment; public class RecommendedParser implements HtmlParseFilter { private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName()); private Configuration conf; /** The Recommended meta data attribute name */ public static final String META_RECOMMENDED_NAME="recommended"; /** * Scan the HTML document looking for a recommended meta tag. */ @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { // Trying to find the document's recommended term String recommendation = null; Properties generalMetaTags = metaTags.getGeneralTags(); for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames.hasMoreElements(); ) { if (tagNames.nextElement().equals("recommended")) { System.out.println(generalMetaTags.getProperty("recommended")); recommendation = generalMetaTags.getProperty("recommended"); LOG.info("Found a Recommendation for " + recommendation); } } if (recommendation == null) { LOG.info("No Recommendation"); } else { LOG.info("Adding Recommendation for " + recommendation); Parse parse = parseResult.get(content.getUrl()); parse.getData().getContentMeta().set(META_RECOMMENDED_NAME, recommendation); } return parseResult; } public void setConf(Configuration conf) { this.conf = conf; } public Configuration getConf() { return this.conf; } }
RecommendedQueryFilter.java
package org.apache.nutch.parse.recommended; import org.apache.nutch.searcher.FieldQueryFilter; import java.util.logging.Logger; // Commons imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; public class RecommendedQueryFilter extends FieldQueryFilter { private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName()); public RecommendedQueryFilter() { super("recommended", 5f); LOG.info("Added a recommended query"); } }
1.5.在 src/plugin/build.xml 中的<target name="deploy"></target>中增加一行:
<ant dir="recommended" target="deploy" />
1.6 运行cmd,切换到recommended目录,运行ant命令编译,插件开发完成。
1.7 让nutch识别你的插件
在conf/nutch-site.xml 中增加以下属性
<!-- Activates the "recommended" plugin alongside the other listed plugins. -->
<property>
  <name>plugin.includes</name>
  <value>recommended|protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
  <description>Regular expression naming plugin id names to include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. By default Nutch includes crawling just HTML and plain text via HTTP, and basic indexing and search plugins. </description>
</property>
2.编写插件测试类
2.1 在src/plugin/recommended目录下新建一个data目录,在data目录下新建一个html文件recommended.html,内容如下:
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html lang="en">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  <title>recommended</title>
  <meta name="generator" content="TextMate http://macromates.com/">
  <meta name="author" content="Ricardo J. Méndez">
  <meta name="recommended" content="recommended-content"/>
  <!-- Date: 2007-02-12 -->
</head>
<body>
  Recommended meta tag test.
</body>
</html>
2.2 在src/plugin/recommended目录下新建src/test/org/apache/nutch/parse/recommended目录,增加TestRecommendedParser.java类,内容如下:
package org.apache.nutch.parse.recommended; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.Content; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; import java.util.Properties; import java.io.*; import java.net.URL; import junit.framework.TestCase; /* * Loads test page recommended.html and verifies that the recommended * meta tag has recommended-content as its value. * */ public class TestRecommendedParser extends TestCase { private static final File testDir = new File("H:/project/SearchEngine/Nutch1.2/src/plugin/recommended/data"); public void testPages() throws Exception { pageTest(new File(testDir, "recommended.html"), "http://foo.com/", "recommended-content"); } public void pageTest(File file, String url, String recommendation) throws Exception { String contentType = "text/html"; InputStream in = new FileInputStream(file); ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length()); byte[] buffer = new byte[1024]; int i; while ((i = in.read(buffer)) != -1) { out.write(buffer, 0, i); } in.close(); byte[] bytes = out.toByteArray(); Configuration conf = NutchConfiguration.create(); Content content = new Content(url, url, bytes, contentType, new Metadata(), conf); Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content).get(content.getUrl()); Metadata metadata = parse.getData().getContentMeta(); assertEquals(recommendation, metadata.get("recommended")); assertTrue("somesillycontent" != metadata.get("recommended")); } }
2.3 用junit运行TestRecommendedParser.java测试。
2010-10-09