Nutch needs to analyze the pages it crawls, and writing a lexical analyzer by hand is tedious, so Nutch uses the handy JavaCC tool to generate the analyzer's Java source (JavaCC produces .java files from a .jj grammar; from NutchAnalysis.jj it emits NutchAnalysis.java, NutchAnalysisTokenManager.java, NutchAnalysisConstants.java and friends). If you are not familiar with JavaCC's syntax, it is worth reading up on it first; the source below is annotated in some detail. Anyone who wants to add Chinese word segmentation to Nutch can start from this file.
NutchAnalysis.jj:
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** JavaCC code for the Nutch lexical analyzer. */
options {// JavaCC options; they can also be supplied on the javacc command line
STATIC = false;// JavaCC generates static members by default, which permits only a single
// parser instance; that is slightly faster but does not fit Nutch's multi-threaded use,
// so static generation is turned off
USER_CHAR_STREAM = true;// default is false, in which case JavaCC generates its own character
// stream class; true makes the lexer read through the user-supplied CharStream interface
// (implemented by FastCharStream.java)
OPTIMIZE_TOKEN_MANAGER = true;// optimize the generated token manager
UNICODE_INPUT = true;// treat the input stream as Unicode
//DEBUG_TOKEN_MANAGER = true;// default is false; enable it to make the generated token
// manager emit a debugging trace
}
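With USER_CHAR_STREAM = true, the generated token manager is compiled against a CharStream interface that the caller must implement; in Nutch that implementation is FastCharStream. As a rough sketch of the contract (method names follow the interface JavaCC normally emits; this excerpt is illustrative, not exhaustive):

// Illustrative excerpt of the CharStream contract that FastCharStream implements.
public interface CharStream {
char readChar() throws java.io.IOException;   // next input character
char BeginToken() throws java.io.IOException; // mark the start of a new token
void backup(int amount);                      // push back characters after over-reading
String GetImage();                            // text matched since BeginToken()
int getEndColumn();                           // column bookkeeping (feeds token.endColumn)
void Done();                                  // release buffers
}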
PARSER_BEGIN(NutchAnalysis)// names the generated parser class
package org.apache.nutch.analysis;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.*;
import java.util.*;
/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis {
private static final String[] STOP_WORDS = {
"a", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "s", "such",
"t", "that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);
private Analyzer analyzer = null;
private String queryString;
private QueryFilters queryFilters;
/** Constructs a nutch analysis. */
public NutchAnalysis(String query, Analyzer analyzer) {
this(new FastCharStream(new StringReader(query)));
this.analyzer = analyzer;
}
/** True iff word is a stop word. Stop words are only removed from queries.
* Every word is indexed. */
public static boolean isStopWord(String word) {
return STOP_SET.contains(word);
}
/** Construct a query parser for the text in a reader. */
public static Query parseQuery(String queryString, Configuration conf) throws IOException {
return parseQuery(queryString, null, conf);
}
/** Construct a query parser for the text in a reader. */
public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf)
throws IOException {
NutchAnalysis parser = new NutchAnalysis(
queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf));
parser.queryString = queryString;
parser.queryFilters = new QueryFilters(conf);
return parser.parse(conf);// parse() is the grammar production defined below in JavaCC syntax
}
/** For debugging. */
public static void main(String[] args) throws Exception {
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
while (true) {
System.out.print("Query: ");
String line = in.readLine();
System.out.println(parseQuery(line, NutchConfiguration.create()));
}
}
}
PARSER_END(NutchAnalysis)
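Once JavaCC has generated the parser and it is compiled, queries are parsed exactly as main() does above. A minimal usage sketch (the query string is made up for illustration):

// Minimal usage sketch; the query string is made up for illustration.
Configuration conf = NutchConfiguration.create();
Query query = NutchAnalysis.parseQuery("+nutch -spam site:apache.org", conf);
System.out.println(query); // prints the parsed clauses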
TOKEN_MGR_DECLS : {// declares fields and methods shared by all lexer actions; these are
// copied into the generated NutchAnalysisTokenManager.java
/** Constructs a token manager for the provided Reader. */
public NutchAnalysisTokenManager(Reader reader) {
this(new FastCharStream(reader));
}
}
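Thanks to this extra constructor, the lexer can also be driven on its own from any Reader. A rough sketch (getNextToken(), Token.kind/Token.image and the EOF constant are standard members of every JavaCC-generated lexer):

// Drive the generated lexer directly, without the query parser.
NutchAnalysisTokenManager tm =
new NutchAnalysisTokenManager(new StringReader("U.S.A. loves C++"));
Token t;
while ((t = tm.getNextToken()).kind != NutchAnalysisConstants.EOF) {
System.out.println(t.kind + "\t" + t.image); // e.g. the ACRONYM "usa", the WORD "c++"
}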
TOKEN : { // token regular expressions
// basic word -- lowercase it
<WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>// a WORD is one or more
// LETTER, DIGIT or WORD_PUNCT characters, or an IRREGULAR_WORD (defined below; a special
// case so that C++ and C# are not split apart)
{ matchedToken.image = matchedToken.image.toLowerCase(); }// this action runs for every matched WORD
// special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ >
{ // strip the dots (see the standalone trace after this TOKEN block)
for (int i = 0; i < image.length(); i++) {
if (image.charAt(i) == '.')
image.deleteCharAt(i--);
}
matchedToken.image = image.toString().toLowerCase();
}
// chinese, japanese and korean characters: each CJK character is matched as its own
// single-character token (a unigram); Chinese segmentation work therefore starts here
| <SIGRAM: <CJK> >
// irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >
// query syntax characters
| <PLUS: "+" >
| <MINUS: "-" >
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >
| <WHITE: ~[] > // treat unrecognized chars
// as whitespace
// primitive, non-token patterns
| <#WORD_PUNCT: ("_"|"&")> // allowed anywhere in words
| < #LETTER: // alphabets
[
"/u0041"-"/u005a",
"/u0061"-"/u007a",
"/u00c0"-"/u00d6",
"/u00d8"-"/u00f6",
"/u00f8"-"/u00ff",
"/u0100"-"/u1fff"
]
>
| <#CJK: // non-alphabets
[
"/u3040"-"/u318f",
"/u3300"-"/u337f",
"/u3400"-"/u3d2d",
"/u4e00"-"/u9fff",
"/uf900"-"/ufaff"
]
>
| < #DIGIT: // unicode digits
[
"/u0030"-"/u0039",
"/u0660"-"/u0669",
"/u06f0"-"/u06f9",
"/u0966"-"/u096f",
"/u09e6"-"/u09ef",
"/u0a66"-"/u0a6f",
"/u0ae6"-"/u0aef",
"/u0b66"-"/u0b6f",
"/u0be7"-"/u0bef",
"/u0c66"-"/u0c6f",
"/u0ce6"-"/u0cef",
"/u0d66"-"/u0d6f",
"/u0e50"-"/u0e59",
"/u0ed0"-"/u0ed9",
"/u1040"-"/u1049"
]
>
}
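The ACRONYM action above edits the token image in place; the i-- compensates for the leftward shift after each deleteCharAt, so no character is skipped. A standalone trace of the same loop (a hypothetical rewrite, not part of the grammar):

// Standalone trace of the ACRONYM dot-stripping loop (hypothetical rewrite).
StringBuffer image = new StringBuffer("I.B.M.");
for (int i = 0; i < image.length(); i++) {
if (image.charAt(i) == '.')
image.deleteCharAt(i--); // step back so the character shifted left is re-checked
}
System.out.println(image.toString().toLowerCase()); // prints: ibm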
/** Parse a query. */
Query parse(Configuration conf) :// a JavaCC production: the first block declares locals, the second is the grammar body
{
Query query = new Query(conf);
ArrayList terms;
Token token;
String field;
boolean stop;
boolean prohibited;
}
{
nonOpOrTerm() // skip noise: leading operators and infix characters that do not start a
// clause. Given input such as ": +that are @ a good way / to do that", the stray
// punctuation is skipped and the loop below handles "that are", "a good way" and
// "to do that" as separate clauses. For example, with the input "+this" the <PLUS>
// branch below sets stop=false, so the stop word "this" is kept as a required phrase.
(
{ stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }
// optional + or - operator
( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?
// optional field spec.
( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
token=<WORD> <COLON> { field = token.image; } )?
( terms=phrase(field) {stop=false;} | // quoted terms or
terms=compound(field)) // single or compound term
nonOpOrTerm() // skip noise
{
String[] array = (String[])terms.toArray(new String[terms.size()]);
if (stop
&& field == Clause.DEFAULT_FIELD
&& terms.size()==1
&& isStopWord(array[0])) {
// ignore stop words only when single, unadorned terms in default field
} else {
if (prohibited)
query.addProhibitedPhrase(array, field);
else
query.addRequiredPhrase(array, field);
}
}
)*
{ return query; }
}
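In terms of the Query API, each iteration of the clause loop boils down to one of the calls below. A hand-traced sketch for the input "+web -spam" (the terms are made up):

// Hand-traced: for the input "+web -spam", parse() amounts to
Query query = new Query(conf);
query.addRequiredPhrase(new String[] { "web" }, Clause.DEFAULT_FIELD);    // +web
query.addProhibitedPhrase(new String[] { "spam" }, Clause.DEFAULT_FIELD); // -spam
return query;
// whereas a bare stop word such as "this" (no operator, default field, single
// term) leaves stop == true and is silently dropped.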
/** Parse an explicitly quoted phrase query. Note that this may return a single
* term, a trivial phrase. *//* The input looks like: " that is a dog ". QueryFilters
decides whether the field's text must be analyzed: a raw field is used verbatim with no
analysis. The filters were built from the configuration earlier:
parser.queryFilters = new QueryFilters(conf); */
ArrayList phrase(String field) :
{
int start;
int end;
ArrayList result = new ArrayList();
String term;
}
{
<QUOTE>
{ start = token.endColumn; }
(nonTerm())* // skip noise
( term = term() { result.add(term); } // parse a term
(nonTerm())*)* // skip noise
{ end = token.endColumn; }
(<QUOTE>|<EOF>)
{
if (this.queryFilters.isRawField(field)) {
result.clear();
result.add(queryString.substring(start, end));
}
return result;
}
}
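A note on the column bookkeeping above: JavaCC columns are 1-based, so token.endColumn equals the 0-based string index just past the token, and substring(start, end) captures exactly the text between the opening quote and the last term. A hand-traced sketch, assuming a hypothetical raw field named "lang":

// Hypothetical trace of the raw-field branch for the query: lang:"en"
String queryString = "lang:\"en\"";
int start = 6; // endColumn of the opening <QUOTE>
int end = 8;   // endColumn of the last term token, "en"
System.out.println(queryString.substring(start, end)); // prints: en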
/** Parse a compound term that is interpreted as an implicit phrase query.
* Compounds are a sequence of terms separated by infix characters. Note that
* this may return a single term, a trivial compound. */
ArrayList compound(String field) :
{
int start;
ArrayList result = new ArrayList();
String term;
StringBuffer terms = new StringBuffer();
}
{
{ start = token.endColumn; }
term = term() {
terms.append(term).append(" ");
//result.add(term);
}
( LOOKAHEAD( (infix())+ term() )// syntactic lookahead: take this alternative only when
// infix characters followed by another term lie ahead
(infix())+
term = term() {
terms.append(term).append(" ");
//result.add(term);
})*// the loop above strips the irregular characters and flattens the input into the
// form "word1 word2 word3 word4 ..."
{
if (this.queryFilters.isRawField(field)) {
// result.clear();
result.add(queryString.substring(start, token.endColumn));
} else {
org.apache.lucene.analysis.Token token;// a Lucene Token is the basic unit of analysis:
// an indivisible word such as "take"
TokenStream tokens = analyzer.tokenStream(
field, new StringReader(terms.toString()));// a TokenStream is a Lucene stream of Tokens;
// e.g. "take a cake" yields three of them. This re-analyzes the flattened terms built above
while (true) {
try {
token = tokens.next();
} catch (IOException e) {
token = null;
}
if (token == null) { break; }
result.add(token.termText());
}
try {
tokens.close();
} catch (IOException e) {
// ignore
}
}
return result;
}
}
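The loop above uses the old (pre-Lucene 2.9) TokenStream API, in which next() returns the next Token or null at end of stream, and termText() yields its text. A self-contained sketch of the same consumption pattern, assuming an old Lucene jar with WhitespaceAnalyzer on the classpath:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
// Consume a TokenStream the way compound() does (old next()/termText() API).
public class TokenLoopDemo {
public static void main(String[] args) throws IOException {
TokenStream tokens = new WhitespaceAnalyzer()
.tokenStream("content", new StringReader("take a cake")); // field name is arbitrary here
Token token;
while ((token = tokens.next()) != null) {
System.out.println(token.termText()); // take / a / cake
}
tokens.close();
}
}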
/** Parse a single term. */// the productions below are simple and need no further commentary
String term() :
{
Token token;
}
{
( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)
{ return token.image; }
}
/** Parse anything but a term or a quote. */
void nonTerm() :
{}
{
<WHITE> | infix()
}
void nonTermOrEOF() :
{}
{
nonTerm() | <EOF>
}
/** Parse anything but a term or an operator (plus or minus or quote). */
void nonOpOrTerm() :
{}
{
(LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}
/** Characters which can be used to form compound terms. */
void infix() :
{}
{
<PLUS> | <MINUS> | nonOpInfix()
}
/** Parse infix characters except plus and minus. */
void nonOpInfix() :
{}
{
<COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}