Compiled from: http://blog.youkuaiyun.com/wxwzy738/article/details/8799656
1.
2. The structure of a token
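In Lucene 3.5 a token is not handed to the caller as an object. Instead you register attributes on the TokenStream, and each call to incrementToken() refills them: CharTermAttribute holds the term text, OffsetAttribute the character offsets into the original input, PositionIncrementAttribute the distance from the previous token's position, and TypeAttribute the token type. A minimal sketch of reading them (the class name TokenStructureDemo is mine; AnalyzerUtils in section 8 below does the same in full):

package org.lucene.test;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class TokenStructureDemo {
    public static void main(String[] args) throws IOException {
        TokenStream stream = new StandardAnalyzer(Version.LUCENE_35)
                .tokenStream("content", new StringReader("how are you"));
        // register the four attributes that together describe a token
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        while (stream.incrementToken()) {
            // the attributes now hold the values for the current token
            System.out.println(term + " " + offset.startOffset() + "-" + offset.endOffset()
                    + " +" + posIncr.getPositionIncrement() + " " + type.type());
        }
    }
}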
3. The design approach for synonyms
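The design, implemented in full in section 7 below: when the filter meets a word that has synonyms, it captures the stream state and pushes the synonyms onto a stack; on the following calls it restores that state, swaps one synonym into the term attribute, and sets the position increment to 0, so each synonym occupies the same position as the original word and either spelling matches at search time.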
4. Comparing and testing analyzers
package org.lucene.test;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.lucene.util.AnalyzerUtils;
import org.lucene.util.MySameAnalyzer;
import org.lucene.util.MyStopAnalyzer;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

public class TestAnalyzer {

    /**
     * Comparing several analyzers on English text.
     */
    @Test
    public void test01() {
        // standard analyzer
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        // stop-word analyzer
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        // simple analyzer
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        // whitespace analyzer
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
        String txt = "this is my house,I am come from yunnang zhaotong," +
                "My email is ynkonghao@gmail.com,My QQ is 707807876";
        AnalyzerUtils.displayToken(txt, a1);
        //[my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail.com][my][qq][707807876]
        AnalyzerUtils.displayToken(txt, a2);
        //[my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail][com][my][qq]
        AnalyzerUtils.displayToken(txt, a3);
        //[this][is][my][house][i][am][come][from][yunnang][zhaotong][my][email][is][ynkonghao][gmail][com][my][qq][is]
        AnalyzerUtils.displayToken(txt, a4);
        //[this][is][my][house,I][am][come][from][yunnang][zhaotong,My][email][is][ynkonghao@gmail.com,My][QQ][is][707807876]
    }

    /**
     * Comparing several analyzers on Chinese text.
     */
    @Test
    public void test02() {
        // standard analyzer
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        // stop-word analyzer
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        // simple analyzer
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        // whitespace analyzer
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
        String txt = "我来自云南昭通昭阳区师专";
        AnalyzerUtils.displayToken(txt, a1);
        //[我][来][自][云][南][昭][通][昭][阳][区][师][专]
        AnalyzerUtils.displayToken(txt, a2);
        //[我来自云南昭通昭阳区师专]
        AnalyzerUtils.displayToken(txt, a3);
        //[我来自云南昭通昭阳区师专]
        AnalyzerUtils.displayToken(txt, a4);
        //[我来自云南昭通昭阳区师专]
    }

    /**
     * Printing detailed token information.
     */
    @Test
    public void test03() {
        // standard analyzer
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        // stop-word analyzer
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        // simple analyzer
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        // whitespace analyzer
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
        String txt = "how are you thank you";
        AnalyzerUtils.displayAllToken(txt, a1);
        AnalyzerUtils.displayAllToken(txt, a2);
        AnalyzerUtils.displayAllToken(txt, a3);
        AnalyzerUtils.displayAllToken(txt, a4);
    }

    /**
     * Stop-word test.
     */
    @Test
    public void test04() {
        Analyzer a1 = new MyStopAnalyzer(new String[]{"I", "you", "hate"});
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        String txt = "how are You thAnk's you I hate you";
        AnalyzerUtils.displayToken(txt, a1);
        AnalyzerUtils.displayToken(txt, a2);
    }

    /**
     * Chinese analysis test.
     * Dictionary-based segmentation; you can extend the dictionary yourself.
     */
    @Test
    public void test05() {
        //Analyzer a1 = new MMSegAnalyzer(); // without the analyzer's bundled dictionary:
        //[我][来][自][云][南][昭][通][昭][阳][区][师][专]
        // loading the dictionary directory enables dictionary-based segmentation
        Analyzer a1 = new MMSegAnalyzer(new File("D:\\Workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data"));
        //[我][来自][云南][昭][通][昭][阳][区][师专]
        // you can add your own entries to words-my.dic under the data directory;
        // after adding 昭通, for example, the result becomes:
        //[我][来自][云南][昭通][昭][阳][区][师专]
        String txt = "我来自云南昭通昭阳区师专";
        AnalyzerUtils.displayToken(txt, a1);
    }

    /**
     * Synonym test.
     * @throws IOException
     * @throws CorruptIndexException
     */
    @Test
    public void test06() throws CorruptIndexException, IOException {
        Analyzer a1 = new MySameAnalyzer();
        String txt = "我来自中国云南昭通昭阳区师专";
        AnalyzerUtils.displayAllToken(txt, a1);
        String keyword = "俺";
        Directory dire = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dire, new IndexWriterConfig(Version.LUCENE_35, a1));
        Document doc = new Document();
        doc.add(new Field("content", txt, Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(doc);
        indexWriter.close();
        IndexSearcher search = new IndexSearcher(IndexReader.open(dire));
        TopDocs topDoc = search.search(new TermQuery(new Term("content", keyword)), 10);
        ScoreDoc[] scoreDoc = topDoc.scoreDocs;
        // the document is found via the synonym 俺, which was indexed at the
        // same position as 我
        for (ScoreDoc score : scoreDoc) {
            Document doc1 = search.doc(score.doc);
            System.out.println(doc1.get("content"));
        }
    }
}
5. Extending your own stop-word analyzer
package org.lucene.util;

import java.io.Reader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * A custom stop-word analyzer.
 */
public class MyStopAnalyzer extends Analyzer {
    private Set stops;

    public MyStopAnalyzer(String[] sws) {
        // makeStopSet converts the string array into a Set (ignoring case)
        stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
        // add the built-in English stop words on top of the custom ones
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    public MyStopAnalyzer() {
        // use only the built-in English stop words; the set must be created
        // first (calling addAll on the uninitialized field would throw a
        // NullPointerException)
        stops = new HashSet(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Set up the Tokenizer and the filter chain for this analyzer:
        // split on letters, lower-case, then drop the stop words.
        // (Do not pre-read tokens from the Reader here for debugging: a
        // Reader can only be consumed once, so draining it would leave
        // nothing for the chain below to tokenize.)
        return new StopFilter(Version.LUCENE_35,
                new LowerCaseFilter(Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)),
                stops);
    }
}
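A quick way to exercise the analyzer outside JUnit, reusing AnalyzerUtils from section 8 (the driver class name is mine, equivalent to test04 above). With a matchVersion of LUCENE_35, StopFilter preserves position increments, so displayAllToken should show an increment greater than 1 on tokens that follow a removed stop word:

package org.lucene.test;

import org.apache.lucene.analysis.Analyzer;
import org.lucene.util.AnalyzerUtils;
import org.lucene.util.MyStopAnalyzer;

public class MyStopAnalyzerDemo {
    public static void main(String[] args) {
        Analyzer a = new MyStopAnalyzer(new String[]{"I", "you", "hate"});
        // LetterTokenizer splits on non-letters ("thAnk's" -> thAnk, s) and
        // LowerCaseFilter folds case, so the custom stop words are removed
        // regardless of capitalization; built-in ones like "are" go too
        AnalyzerUtils.displayAllToken("how are You thAnk's you I hate you", a);
    }
}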
6. Extending the synonym analyzer

package org.lucene.util;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * Analyzer extension: a synonym analyzer.
 */
public class MySameAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // load the mmseg4j dictionary and wrap the MMSeg tokenizer with the
        // synonym filter defined in section 7
        Dictionary dic = Dictionary.getInstance("D:\\Workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data");
        return new MySameTokenFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}
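The synonym filter is independent of the tokenizer it wraps, so if the mmseg4j dictionary is not available you can still try it on pre-segmented text. A minimal sketch (the class name MySameWhitespaceAnalyzer is a hypothetical, not part of the original project); with the synonym map from section 7, analyzing "我 来自 中国" should emit 咱/俺 at the position of 我 and 大陆/天朝 at the position of 中国:

package org.lucene.util;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical variant: exercises MySameTokenFilter without mmseg4j by
// tokenizing on whitespace, e.g. for the input "我 来自 中国".
public class MySameWhitespaceAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new MySameTokenFilter(
                new WhitespaceTokenizer(Version.LUCENE_35, reader));
    }
}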
7. Extending the synonym token filter
package org.lucene.util;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * A custom synonym token filter.
 */
public class MySameTokenFilter extends TokenFilter {
    private CharTermAttribute cta = null;
    private PositionIncrementAttribute pia = null;
    private AttributeSource.State current = null;
    private Stack<String> sames = null;

    protected MySameTokenFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sames = new Stack<String>();
    }

    /**
     * The idea:
     * Each synonym must go into the CharTermAttribute of its own token. If you
     * simply call cta.append("大陆"), the synonym gets concatenated onto the
     * original word inside the same token, giving [中国大陆]; what we want is
     * [中国][大陆].
     * So when a word with synonyms is encountered, capture the current state
     * and push the synonyms onto a stack. On the next call, if the stack is
     * not empty, restore the saved state, clear the term with cta.setEmpty(),
     * append the synonym with cta.append("大陆"), and set the position
     * increment to 0 via pia.setPositionIncrement(0), which marks the token
     * as a synonym occupying the same position as the original word. Then
     * return that synonym token.
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (sames.size() > 0) {
            // pop a synonym off the stack
            String str = sames.pop();
            // restore the state captured at the original word
            restoreState(current);
            cta.setEmpty();
            cta.append(str);
            // position increment 0: same position as the original word
            pia.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) return false;
        if (getSameWords(cta.toString())) {
            // the current word has synonyms, so save the state first
            current = captureState();
        }
        return true;
    }

    /*
     * This approach does not work: it turns [中国] into [大陆], replacing
     * the word instead of producing [中国][大陆].
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;
        if (cta.toString().equals("中国")) {
            cta.setEmpty();
            cta.append("大陆");
        }
        return true;
    }
    */

    private boolean getSameWords(String name) {
        Map<String, String[]> maps = new HashMap<String, String[]>();
        maps.put("中国", new String[]{"大陆", "天朝"});
        maps.put("我", new String[]{"咱", "俺"});
        String[] sws = maps.get(name);
        if (sws != null) {
            for (String s : sws) {
                sames.push(s);
            }
            return true;
        }
        return false;
    }
}
8. Printing token information
package org.lucene.util;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * Utilities for printing token information.
 */
public class AnalyzerUtils {

    public static void displayToken(String str, Analyzer a) {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        /*
         * Think of the TokenStream as a stream and the CharTermAttribute as
         * a bowl dropped into it: once the bowl has picked up one element,
         * it automatically flows on to the next one.
         * This is a design pattern: you register an attribute on the stream,
         * and it is refilled as the TokenStream advances.
         */
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        try {
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
                //System.out.println(stream);
                // printing the stream itself calls toString, which looks like:
                // (来,startOffset=1,endOffset=2,positionIncrement=1,type=<IDEOGRAPHIC>)
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Print each token with its full details.
     * @param str the text to analyze
     * @param a the analyzer to use
     */
    public static void displayAllToken(String str, Analyzer a) {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        // position increment
        PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
        // character offsets
        OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
        // term text
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        // token type
        TypeAttribute ta = stream.addAttribute(TypeAttribute.class);
        try {
            while (stream.incrementToken()) {
                System.out.print(pia.getPositionIncrement() + ":");
                System.out.print(cta + "[" + oa.startOffset() + "-" +
                        oa.endOffset() + "-" + ta.type() + "]");
                System.out.println();
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
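With this format, each output line of displayAllToken reads positionIncrement:term[startOffset-endOffset-type]; for instance, the first token of "how are you thank you" under the StandardAnalyzer should print as 1:how[0-3-<ALPHANUM>].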
Project download: http://download.youkuaiyun.com/detail/wxwzy738/5284705