Lucene_demo02_分词-CSDN博客

本文链接：https://blog.csdn.net/mohaiyong/article/details/84440551

本文介绍了使用Lucene进行文本分词的方法，并通过具体示例展示了如何针对英文和中文文本应用不同的分词器，包括标准分词器、单字分词器、二分法分词器及IK分词器。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

[size=xx-large][color=orange][b]Lucene_demo02_分词[/b][/color][/size]


/**
 * Demonstrates how different analyzers tokenize English and Chinese text.
 *
 * <p>Each test builds an analyzer, feeds it a sample sentence and prints the
 * resulting tokens to standard output, one token per line.
 */
public class AnalyzerTest {

	/**
	 * English tokenization with {@code StandardAnalyzer} (shipped with Lucene).
	 *
	 * @throws Exception if the token stream cannot be read
	 */
	@Test
	public void testEN() throws Exception {
		String text = "Creates a searcher searching the index in the named directory";
		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
		this.testAnalyzer(analyzer, text);
	}

	/**
	 * Chinese tokenization with the single-character analyzer
	 * {@code ChineseAnalyzer} (shipped with Lucene).
	 *
	 * @throws Exception if the token stream cannot be read
	 */
	@Test
	public void testCH1() throws Exception {
		String text = "LBJ和韦德能带领热火在2013赛季拿到NBA总冠军吗？";
		Analyzer analyzer = new ChineseAnalyzer();
		this.testAnalyzer(analyzer, text);
	}

	/**
	 * Chinese tokenization with the bigram analyzer {@code CJKAnalyzer}
	 * (shipped with Lucene).
	 *
	 * @throws Exception if the token stream cannot be read
	 */
	@Test
	public void testCH2() throws Exception {
		String text = "LBJ和韦德能带领热火在2013赛季拿到NBA总冠军吗";
		Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
		this.testAnalyzer(analyzer, text);
	}

	/**
	 * Chinese tokenization with {@code IKAnalyzer}.
	 *
	 * <p>Unlike the analyzers used in the other tests, IK Analyzer is a
	 * third-party library and is not part of the Lucene distribution; its jar
	 * must be added to the classpath separately.
	 *
	 * @throws Exception if the token stream cannot be read
	 */
	@Test
	public void testCH3() throws Exception {
		// NOTE(review): "fasd" looks like a placeholder; a Chinese sentence such as
		// the one used in testCH1/testCH2 would presumably demonstrate IK better.
		String text = "fasd";
		Analyzer analyzer = new IKAnalyzer();
		this.testAnalyzer(analyzer, text);
	}

	/**
	 * Tokenizes {@code text} with {@code analyzer} and prints every token on its
	 * own line to standard output.
	 *
	 * <p>The stream is consumed following the TokenStream workflow
	 * (reset, incrementToken until exhausted, end, close) and is always closed,
	 * even when tokenization fails.
	 *
	 * @param analyzer the analyzer that produces the token stream
	 * @param text     the text to tokenize
	 * @throws Exception if the token stream cannot be read or closed
	 */
	private void testAnalyzer(Analyzer analyzer, String text) throws Exception {
		TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
		try {
			// addAttribute returns the attribute instance, which the stream updates
			// in place on every incrementToken(), so it is fetched once up front.
			TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				System.out.println(termAttribute.term());
			}
			// Signals end-of-stream so the stream can finish its end-of-stream work.
			tokenStream.end();
		} finally {
			// Releases the underlying reader regardless of how the loop exits.
			tokenStream.close();
		}
	}
}