1、自定义TokenFilter过滤器
package com.lkt.analyzer;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Synonym-injecting TokenFilter: when a token has an entry in the synonym
 * table, it is emitted as-is and then each of its synonyms is emitted at the
 * same position (position increment 0), so a query on a synonym matches the
 * original text.
 *
 * @author lkt
 */
public class MyMmsegFilter extends TokenFilter {
    // Synonym table: surface form -> its synonyms.
    private final Map<String, String[]> sameMap = new HashMap<String, String[]>();
    // Synonyms of the current token that are still waiting to be emitted.
    private final Stack<String> sameStack = new Stack<String>();
    // Stream state captured at the original token; restored for every queued
    // synonym so that offsets/type attributes match the original token.
    private AttributeSource.State currState;
    private final CharTermAttribute cta;
    private final PositionIncrementAttribute pia;

    protected MyMmsegFilter(TokenStream input) {
        super(input);
        sameMap.put("中国", new String[]{"兲朝", "大陸", "China"});
        sameMap.put("北京", new String[]{"首都", "燕京", "Beijing"});
        sameMap.put("南京", new String[]{"六朝古都", "建業", "Nanjing"});
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
    }

    @Override
    public boolean incrementToken() throws IOException {
        // Drain pending synonyms first, one per call. (The original used a
        // while loop here, but it always returned on the first iteration.)
        if (!sameStack.isEmpty()) {
            String syn = sameStack.pop();
            // Restore the original token's captured state, then overwrite
            // only the term text with the synonym.
            restoreState(currState);
            cta.setEmpty();
            cta.append(syn);
            // Increment 0: the synonym occupies the same position as the
            // original token it was derived from.
            pia.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        if (getSameWord(cta.toString())) {
            // Capture the state now so each queued synonym can restore it
            // on the following calls.
            currState = captureState();
        }
        return true;
    }

    /**
     * Queues the synonyms of {@code word}, if any, onto {@link #sameStack}.
     *
     * @param word the current token's term text
     * @return true if the word has synonyms (the caller must capture state)
     */
    private boolean getSameWord(String word) {
        String[] sm = sameMap.get(word);
        if (sm != null && sm.length > 0) {
            for (String s : sm) {
                sameStack.push(s);
            }
            return true;
        }
        return false;
    }
}
2、自定义分词器
package com.lkt.analyzer;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.store.Directory;
import com.chenlb.mmseg4j.Chunk;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Sentence;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
/**
 * Analyzer that tokenizes with mmseg4j's max-word segmentation and then runs
 * the stream through {@link MyMmsegFilter} to inject synonyms.
 */
public class MyMmsegAnalyzer extends Analyzer {

    /** Dictionary location used by the no-arg constructor (original hard-coded path). */
    private static final String DEFAULT_DIC_PATH = "F:\\学习资料\\Lucene\\mmseg4j-1.8.5\\data";

    // Directory containing the mmseg4j dictionary files for this instance.
    private final String dicPath;

    /** Uses the default dictionary location; kept for backward compatibility. */
    public MyMmsegAnalyzer() {
        this(DEFAULT_DIC_PATH);
    }

    /**
     * @param dicPath directory containing the mmseg4j dictionary data files
     */
    public MyMmsegAnalyzer(String dicPath) {
        this.dicPath = dicPath;
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // NOTE(review): called once per field; assumes Dictionary.getInstance
        // is cheap on repeated calls (caches per path) — confirm against the
        // mmseg4j version in use.
        Dictionary dic = Dictionary.getInstance(dicPath);
        return new MyMmsegFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}
package com.lkt.lucene;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
import com.lkt.analyzer.MyMmsegAnalyzer;
import com.lkt.analyzer.MyStopAnalyzer;
import com.lkt.util.AnalyzerUtil;
/**
 * Demonstrates the custom synonym analyzer: indexes a sample sentence with
 * {@link MyMmsegAnalyzer}, then searches for "首都" — a synonym of "北京"
 * that never appears literally in the text. A hit proves the synonyms were
 * indexed at the same positions as the original tokens.
 */
public class TestAnalyzerUtil {

    @Test
    public void testDisplayAnalyzer() {
        String str = "北京上海南京江苏南京北京中国重庆天津";
        Directory dir = null;
        IndexSearcher searcher = null;
        try {
            dir = new RAMDirectory();
            // Index the sample text with the synonym-injecting analyzer.
            IndexWriter writer = new IndexWriter(dir,
                    new IndexWriterConfig(Version.LUCENE_35, new MyMmsegAnalyzer()));
            try {
                Document doc = new Document();
                doc.add(new Field("content", str, Store.YES, Index.ANALYZED));
                writer.addDocument(doc);
            } finally {
                // Close even if addDocument throws (original leaked on error).
                writer.close();
            }
            // Query on the synonym, not on the original surface form.
            searcher = new IndexSearcher(IndexReader.open(dir));
            TermQuery tq = new TermQuery(new Term("content", "首都"));
            TopDocs td = searcher.search(tq, 10);
            for (ScoreDoc sd : td.scoreDocs) {
                Document dd = searcher.doc(sd.doc);
                System.out.println(dd.get("content"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the searcher and the directory (the original never
            // closed either of them).
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException ignored) {
                    // best-effort cleanup in a demo test
                }
            }
            if (dir != null) {
                try {
                    dir.close();
                } catch (IOException ignored) {
                    // best-effort cleanup in a demo test
                }
            }
        }
    }
}