Lucene5中编写自定义同义词分词器(基于IK中文分词器)

本文介绍了一个基于Lucene的同义词搜索实现方案,包括自定义同义词引擎、同义词过滤器及分析器,并通过测试验证了同义词检索的有效性。

编写一个专门获取同义词的引擎:

package com.daelly.sample.lucene.analyzer.custom;

import java.io.IOException;

/**
 * Source of synonyms for a single term.
 */
public interface SynonymEngine {
	/**
	 * Looks up the synonyms of the given term.
	 *
	 * @param s the term to look up
	 * @return the synonyms of {@code s}; may be {@code null} when the term has
	 *         no synonyms (the callers in this package check for {@code null})
	 * @throws IOException if the implementation cannot perform the lookup
	 */
	String[] getSynonyms(String s) throws IOException;
}

package com.daelly.sample.lucene.analyzer.custom;

import java.io.IOException;
import java.util.HashMap;

/**
 * In-memory {@link SynonymEngine} backed by a small hard-coded synonym table.
 */
public class SimpleSynonymEngine implements SynonymEngine {

	/** Shared table mapping a term to its synonyms; filled once at class load. */
	private static final HashMap<String, String[]> map = new HashMap<>();

	// Static initializer on purpose: the table is static, so it must be
	// populated exactly once when the class is loaded. An instance initializer
	// would leave it empty until the first instance is created and would
	// rewrite the shared map on every construction.
	static {
		map.put("我", new String[]{"俺","咱"});
		map.put("中国", new String[]{"天朝"});
		map.put("广州", new String[]{"五羊城"});
	}

	/**
	 * Returns the synonyms registered for the given term.
	 *
	 * @param s the term to look up
	 * @return a fresh array holding the synonyms of {@code s}, or {@code null}
	 *         when the term has no entry in the table
	 * @throws IOException never thrown by this implementation; declared by the
	 *         interface
	 */
	@Override
	public String[] getSynonyms(String s) throws IOException {
		String[] synonyms = map.get(s);
		// Hand out a copy so callers cannot modify the shared table.
		return synonyms == null ? null : synonyms.clone();
	}

}


编写同义词filter:

package com.daelly.sample.lucene.analyzer.custom;

import java.io.IOException;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Token filter that injects synonyms into the token stream.
 *
 * <p>For every token of the wrapped stream that has synonyms according to the
 * {@link SynonymEngine}, the original token is emitted first and each synonym
 * is then emitted as an extra token with a position increment of 0, i.e. at
 * the same position as the original token.
 */
public class SynonymFilter extends TokenFilter {
	
	/** Token type label for synonym tokens; not referenced elsewhere in this class. */
	public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";
	
	/** Synonyms of the most recently emitted original token that are still to be emitted. */
	private final Stack<String> synonymStack;
	private final SynonymEngine synonymEngine;
	/** Attribute state of the original token whose synonyms are pending. */
	private AttributeSource.State current;
	
	private final CharTermAttribute termAttr;
	private final PositionIncrementAttribute posIncrAttr;

	/**
	 * Creates a filter over the given stream.
	 *
	 * @param input  the token stream to read original tokens from
	 * @param engine the engine consulted for the synonyms of each token
	 */
	protected SynonymFilter(TokenStream input, SynonymEngine engine) {
		super(input);
		synonymStack = new Stack<>();
		this.synonymEngine = engine;
		
		this.termAttr = addAttribute(CharTermAttribute.class);
		this.posIncrAttr = addAttribute(PositionIncrementAttribute.class);
	}

	/**
	 * Advances to the next token: a pending synonym if there is one, otherwise
	 * the next token of the wrapped stream.
	 *
	 * @return {@code true} if a token was produced, {@code false} at end of stream
	 * @throws IOException if the wrapped stream or the synonym engine fails
	 */
	@Override
	public boolean incrementToken() throws IOException {
		if(!synonymStack.isEmpty()) {
			String syn = synonymStack.pop();
			// Start from the original token's attributes, then overwrite only
			// the term text and the position increment.
			restoreState(current);
			termAttr.setEmpty().append(syn);
			// Increment 0 places the synonym at the same position as the original token.
			posIncrAttr.setPositionIncrement(0);
			return true;
		}
		
		if(!input.incrementToken()) {
			return false;
		}
		
		// Each call can emit only one token, so the original token is returned
		// now and its synonyms are emitted by the following calls. The state
		// captured here is what those calls restore before replacing the term.
		if(addAliasesToStack()) {
			current = captureState();
		}
		return true;
	}

	/**
	 * Discards pending synonyms and the captured state so that nothing left
	 * over from a previous, possibly partially consumed, stream leaks into the
	 * next use of this filter.
	 *
	 * @throws IOException if resetting the wrapped stream fails
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		synonymStack.clear();
		current = null;
	}

	/**
	 * Pushes the synonyms of the current term, if any, onto the pending stack.
	 *
	 * @return {@code true} if the engine returned a non-null synonym array for
	 *         the current term, {@code false} otherwise
	 * @throws IOException if the synonym engine fails
	 */
	private boolean addAliasesToStack() throws IOException {
		String termVal = termAttr.toString();
		String[] synonyms = synonymEngine.getSynonyms(termVal);
		
		if(synonyms == null) {
			return false;
		}
		
		for (String synonym : synonyms) {
			synonymStack.push(synonym);
		}
		return true;
	}
}


编写analyzer:

package com.daelly.sample.lucene.analyzer.custom;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * Analyzer that tokenizes with {@link IKTokenizer} and then passes the tokens
 * through a {@link SynonymFilter} driven by the supplied {@link SynonymEngine}.
 */
public class SynonymAnalyzer extends Analyzer {

	/** Engine handed to every {@link SynonymFilter} this analyzer creates. */
	private final SynonymEngine engine;

	/**
	 * @param engine the synonym source used by the filters of this analyzer
	 */
	public SynonymAnalyzer(SynonymEngine engine) {
		this.engine = engine;
	}

	/**
	 * Builds the analysis chain: IK tokenizer followed by the synonym filter.
	 *
	 * @param fieldName the field being analyzed; not used by this analyzer
	 * @return the tokenizer together with the synonym-filtered stream
	 */
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		final Tokenizer source = new IKTokenizer(true);
		return new TokenStreamComponents(source, new SynonymFilter(source, engine));
	}

}

编写测试:

package com.daelly.sample.lucene;

import static org.junit.Assert.assertEquals;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Before;
import org.junit.Test;

import com.daelly.sample.lucene.analyzer.custom.SimpleSynonymEngine;
import com.daelly.sample.lucene.analyzer.custom.SynonymAnalyzer;

/**
 * Checks that a document indexed with {@link SynonymAnalyzer} can be found by
 * searching for the synonyms of the terms it contains.
 */
public class CommonAnalyzerTest {

	Directory dir;

	/**
	 * Indexes a single document into a fresh in-memory directory using the
	 * synonym analyzer.
	 */
	@Before
	public void setUp() throws Exception {
		dir = new RAMDirectory();
		IndexWriterConfig conf = new IndexWriterConfig(new SynonymAnalyzer(new SimpleSynonymEngine()));
		// try-with-resources closes the writer even if adding the document fails.
		try (IndexWriter writer = new IndexWriter(dir, conf)) {
			Document doc = new Document();
			doc.add(new TextField("content", "我来自中国广州", Field.Store.YES));
			writer.addDocument(doc);
		}
	}

	@Test
	public void test1() throws IOException {
		assertSingleHit("天朝");
	}
	
	@Test
	public void test2() throws IOException {
		assertSingleHit("俺");
	}
	
	@Test
	public void test3() throws IOException {
		assertSingleHit("五羊城");
	}

	/**
	 * Asserts that a term query for the given text on the "content" field
	 * matches exactly one document.
	 *
	 * @param termText the exact term to search for
	 * @throws IOException if the index cannot be opened or searched
	 */
	private void assertSingleHit(String termText) throws IOException {
		Query query = new TermQuery(new Term("content", termText));
		// The reader is closed on exit so each test releases what it opens.
		try (IndexReader reader = DirectoryReader.open(dir)) {
			IndexSearcher searcher = new IndexSearcher(reader);
			TopDocs docs = searcher.search(query, 10);
			assertEquals(1, docs.totalHits);
		}
	}

}


测试结果,同义词分词器生效。

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值