今尝试下给nutch1.2增加一个插件,于是到官网找了个例子,链接如下:
http://wiki.apache.org/nutch/WritingPluginExample-0.9
这个例子实现的是推荐网站:把关键字写在meta标签的content属性里,当别人搜索这个关键字时,你推荐的网站会在搜索结果中排在前面。要实现推荐,必须在你的网页上加上
- <meta name="recommended" content="plugins" />
这条属性才能被插件识别。
由于这个例子是针对nutch0.9写的,而1.2和0.9有些区别,所以需要修改一些代码。步骤如下:
1.插件开发
1.1 在src/plugin中新建一个文件夹recommended
1.2 在recommended目录下新建plugin.xml和build.xml文件(文件名为小写),内容如下:
plugin.xml
- <?xml version="1.0" encoding="UTF-8"?>
- <!-- Plugin manifest for the "recommended" plugin: declares one runtime library and
- three extensions (an HtmlParseFilter, an IndexingFilter and a QueryFilter). -->
- <plugin
- id="recommended"
- name="Recommended Parser/Filter"
- version="0.0.1"
- provider-name="nutch.org">
- <runtime>
- <!-- As defined in build.xml this plugin will end up bundled as recommended.jar -->
- <library name="recommended.jar">
- <export name="*"/>
- </library>
- </runtime>
- <!-- The RecommendedParser plugs into the HtmlParseFilter extension point to grab
- the contents of any recommended meta tags. -->
- <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
- name="Recommended Parser"
- point="org.apache.nutch.parse.HtmlParseFilter">
- <implementation id="RecommendedParser"
- class="org.apache.nutch.parse.recommended.RecommendedParser"/>
- </extension>
- <!-- The RecommendedIndexer plugs into the IndexingFilter extension point in order to
- add the contents of the recommended meta tags (as found by the RecommendedParser)
- to the Lucene index. -->
- <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
- name="Recommended identifier filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="RecommendedIndexer"
- class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
- </extension>
- <!-- The RecommendedQueryFilter gets called when you perform a search. It runs a
- search for the user's query against the recommended field. To add it to the
- list of filters that run by default, the upstream example says to use
- "fields=DEFAULT".
- NOTE(review): the parameter below is set to "recommended", not "DEFAULT";
- confirm which value is intended for this Nutch version. -->
- <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
- name="Recommended Search Query Filter"
- point="org.apache.nutch.searcher.QueryFilter">
- <implementation id="RecommendedQueryFilter"
- class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
- <parameter name="fields" value="recommended"/>
- </implementation>
- </extension>
- </plugin>
build.xml
- <?xml version="1.0"?>
- <!-- Ant build file for the "recommended" plugin; it imports the shared
- ../build-plugin.xml and defaults to the jar-core target. -->
- <project name="recommended" default="jar-core">
- <import file="../build-plugin.xml"/>
- <!-- Build compilation dependencies: runs the "jar" target of ../lib-xml -->
- <target name="deps-jar">
- <ant target="jar" inheritall="false" dir="../lib-xml"/>
- </target>
- <!-- Add compilation dependencies (the lib-xml jars under the build directory) to the classpath -->
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/lib-xml/*.jar" />
- </fileset>
- </path>
- <!-- Deploy unit test dependencies: lib-xml, nutch-extensionpoints and protocol-file -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../lib-xml"/>
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- <ant target="deploy" inheritall="false" dir="../protocol-file"/>
- </target>
- <!-- For the JUnit test: create the test data directory and copy the
- data/recommended.html fixture into it. -->
- <mkdir dir="${build.test}/data"/>
- <copy file="data/recommended.html" todir="${build.test}/data"/>
- </project>
1.3.在recommended目录下建立/src/java/org/apache/nutch/parse/recommended目录。
1.4.增加RecommendedIndexer.java,RecommendedParser.java,RecommendedQueryFilter.java三个类,内容如下:
RecommendedIndexer.java
- package org.apache.nutch.parse.recommended;
- // JDK import
- import java.util.logging.Logger;
- // Commons imports
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- // Nutch imports
- import org.apache.nutch.util.LogUtil;
- import org.apache.nutch.fetcher.FetcherOutput;
- import org.apache.nutch.indexer.IndexingFilter;
- import org.apache.nutch.indexer.IndexingException;
- import org.apache.nutch.indexer.NutchDocument;
- import org.apache.nutch.parse.Parse;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.io.Text;
- import org.apache.nutch.crawl.CrawlDatum;
- import org.apache.nutch.crawl.Inlinks;
- // Lucene imports
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.Document;
- /**
-  * Indexing filter that copies a page's "recommended" metadata value into a
-  * stored, non-analyzed, boosted index field named "recommended".
-  */
- public class RecommendedIndexer implements IndexingFilter {
-   public static final Log LOG = LogFactory.getLog(RecommendedIndexer.class.getName());
-   private Configuration conf;
-   /** Creates the filter; no state is initialised here. */
-   public RecommendedIndexer() {
-   }
-   /**
-    * Adds the "recommended" field to the document when the parse data carries a
-    * "recommended" metadata entry; otherwise returns the document untouched.
-    *
-    * @param doc     document being built for the index
-    * @param parse   parse output whose data is queried for the "recommended" entry
-    * @param url     not used by this filter
-    * @param datum   not used by this filter
-    * @param inlinks not used by this filter
-    * @return the same {@code doc} instance
-    * @throws IndexingException declared by the interface; not thrown by this body
-    */
-   @Override
-   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-       CrawlDatum datum, Inlinks inlinks)
-       throws IndexingException {
-     final String recommendedValue = parse.getData().getMeta("recommended");
-     if (recommendedValue == null) {
-       // Nothing to index for pages without the meta tag.
-       return doc;
-     }
-     // NOTE(review): a Lucene Field is handed to NutchDocument.add here; confirm that
-     // the Nutch version in use accepts a Field rather than a plain String value.
-     final Field boostedField =
-         new Field("recommended", recommendedValue,
-             Field.Store.YES, Field.Index.NOT_ANALYZED);
-     boostedField.setBoost(5.0f);
-     doc.add("recommended", boostedField);
-     LOG.info("Added " + recommendedValue + " to the recommended Field");
-     return doc;
-   }
-   /** Stores the configuration handed to this filter. */
-   public void setConf(Configuration conf) {
-     this.conf = conf;
-   }
-   /** Returns the configuration previously stored by {@link #setConf}. */
-   public Configuration getConf() {
-     return this.conf;
-   }
-   /** Intentionally empty: this filter registers no index backend options. */
-   @Override
-   public void addIndexBackendOptions(Configuration conf) {
-   }
- }
RecommendedParser.java
- package org.apache.nutch.parse.recommended;
- // JDK imports
- import java.util.Enumeration;
- import java.util.Properties;
- import java.util.logging.Logger;
- // Nutch imports
- import org.apache.hadoop.conf.Configuration;
- import org.apache.nutch.metadata.Metadata;
- import org.apache.nutch.parse.HTMLMetaTags;
- import org.apache.nutch.parse.Parse;
- import org.apache.nutch.parse.HtmlParseFilter;
- import org.apache.nutch.parse.ParseResult;
- import org.apache.nutch.protocol.Content;
- // Commons imports
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- // W3C imports
- import org.w3c.dom.DocumentFragment;
- /**
-  * HTML parse filter that looks for a {@code <meta name="recommended" ...>} tag and,
-  * when present, stores its content in the content metadata of the parsed page.
-  */
- public class RecommendedParser implements HtmlParseFilter {
-   private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());
-   private Configuration conf;
-   /** The Recommended meta data attribute name */
-   public static final String META_RECOMMENDED_NAME = "recommended";
-   /**
-    * Scans the general meta tags of the HTML document for a recommended meta tag.
-    *
-    * @param content     fetched content; its URL selects the parse to update
-    * @param parseResult parse results to annotate
-    * @param metaTags    meta tags extracted from the page
-    * @param doc         not used by this filter
-    * @return the same {@code parseResult} instance, annotated when a tag was found
-    */
-   @Override
-   public ParseResult filter(Content content, ParseResult parseResult,
-       HTMLMetaTags metaTags, DocumentFragment doc) {
-     // Direct lookup replaces the raw-typed Enumeration scan over every tag name;
-     // the stray System.out debug print is dropped in favour of the logger.
-     String recommendation =
-         metaTags.getGeneralTags().getProperty(META_RECOMMENDED_NAME);
-     if (recommendation == null) {
-       LOG.info("No Recommendation");
-       return parseResult;
-     }
-     LOG.info("Found a Recommendation for " + recommendation);
-     Parse parse = parseResult.get(content.getUrl());
-     if (parse == null) {
-       // Avoid a NullPointerException when the result holds no parse for this URL.
-       LOG.warn("No parse found for " + content.getUrl()
-           + ", skipping Recommendation " + recommendation);
-       return parseResult;
-     }
-     LOG.info("Adding Recommendation for " + recommendation);
-     parse.getData().getContentMeta().set(META_RECOMMENDED_NAME, recommendation);
-     return parseResult;
-   }
-   /** Stores the configuration handed to this filter. */
-   public void setConf(Configuration conf) {
-     this.conf = conf;
-   }
-   /** Returns the configuration previously stored by {@link #setConf}. */
-   public Configuration getConf() {
-     return this.conf;
-   }
- }
RecommendedQueryFilter.java
- package org.apache.nutch.parse.recommended;
- import org.apache.nutch.searcher.FieldQueryFilter;
- import java.util.logging.Logger;
- // Commons imports
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- /**
-  * Query filter for the "recommended" field; delegates to {@code FieldQueryFilter}
-  * with the field name "recommended" and a boost of 5.
-  */
- public class RecommendedQueryFilter extends FieldQueryFilter {
-   // Logger is named after this class so its messages are attributed to the
-   // query filter rather than to RecommendedParser.
-   private static final Log LOG = LogFactory.getLog(RecommendedQueryFilter.class.getName());
-   /** Registers the filter for the "recommended" field with a boost of 5. */
-   public RecommendedQueryFilter() {
-     super("recommended", 5f);
-     LOG.info("Added a recommended query");
-   }
- }
1.5.在 src/plugin/build.xml 中的<target name="deploy"></target>中增加一行:
- <ant dir="recommended" target="deploy" />
1.6.运行cmd,切换到recommended目录,运行ant命令编译,插件开发完成。
1.7 让nutch识别你的插件
在conf/nutch-site.xml 中增加以下属性
- <!-- The value below lists "recommended" (this plugin's id) first, alongside the other plugin ids. -->
- <property>
- <name>plugin.includes</name>
- <value>recommended|protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> <description>Regular expression naming plugin id names to
- include. Any plugin not matching this expression is excluded.
- In any case you need at least include the nutch-extensionpoints plugin. By
- default Nutch includes crawling just HTML and plain text via HTTP,
- and basic indexing and search plugins.
- </description>
- </property>
2.编写插件测试类
2.1 在src/plugin/recommended目录下新建一个data目录,在data目录下新建一个html文件recommended.html,内容如下:
- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">
- <html lang="en">
- <head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>recommended</title>
- <meta name="generator" content="TextMate http://macromates.com/">
- <meta name="author" content="Ricardo J. Méndez">
- <meta name="recommended" content="recommended-content"/>
- <!-- Date: 2007-02-12 -->
- </head>
- <body>
- Recommended meta tag test.
- </body>
- </html>
2.2 在src/plugin/recommended目录下新建src/test/org/apache/nutch/parse/recommended目录,增加TestRecommendedParser.java类,内容如下:
- package org.apache.nutch.parse.recommended;
- import org.apache.nutch.metadata.Metadata;
- import org.apache.nutch.parse.Parse;
- import org.apache.nutch.parse.ParseResult;
- import org.apache.nutch.parse.ParseUtil;
- import org.apache.nutch.protocol.Content;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.nutch.util.NutchConfiguration;
- import java.util.Properties;
- import java.io.*;
- import java.net.URL;
- import junit.framework.TestCase;
- /**
-  * Loads the test page recommended.html and verifies that the recommended
-  * meta tag has recommended-content as its value.
-  */
- public class TestRecommendedParser extends TestCase {
-   // Directory containing recommended.html. Taken from the "test.data" system
-   // property (falling back to the working directory) instead of a machine-specific
-   // absolute path, so the test is not tied to one developer's disk layout.
-   // NOTE(review): presumably the shared plugin build passes test.data to JUnit;
-   // when running from an IDE, set -Dtest.data=<plugin dir>/data.
-   private static final File testDir =
-       new File(System.getProperty("test.data", "."));
-   /** Parses recommended.html and expects the value "recommended-content". */
-   public void testPages() throws Exception {
-     pageTest(new File(testDir, "recommended.html"), "http://foo.com/",
-         "recommended-content");
-   }
-   /**
-    * Parses one HTML file with the parse-html extension and checks the
-    * "recommended" entry of the resulting content metadata.
-    *
-    * @param file           HTML file to read
-    * @param url            URL to associate with the content
-    * @param recommendation expected value of the "recommended" metadata entry
-    * @throws Exception if the file cannot be read or parsing fails
-    */
-   public void pageTest(File file, String url, String recommendation)
-       throws Exception {
-     String contentType = "text/html";
-     ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
-     InputStream in = new FileInputStream(file);
-     try {
-       byte[] buffer = new byte[1024];
-       int read;
-       while ((read = in.read(buffer)) != -1) {
-         out.write(buffer, 0, read);
-       }
-     } finally {
-       // Release the file handle even when reading fails.
-       in.close();
-     }
-     byte[] bytes = out.toByteArray();
-     Configuration conf = NutchConfiguration.create();
-     Content content =
-         new Content(url, url, bytes, contentType, new Metadata(), conf);
-     Parse parse = new ParseUtil(conf)
-         .parseByExtensionId("parse-html", content).get(content.getUrl());
-     Metadata metadata = parse.getData().getContentMeta();
-     assertEquals(recommendation, metadata.get("recommended"));
-     // Compare by value; a reference comparison (!=) on strings would pass
-     // regardless of the metadata content.
-     assertFalse("somesillycontent".equals(metadata.get("recommended")));
-   }
- }
2.3 用junit运行TestRecommendedParser.java测试。
转自http://blog.youkuaiyun.com/laigood/article/details/5929388