Lucene from Beginner to Proficient (3): Tokenization

Tokenization

Tokenization is a technique for filtering and grouping text algorithmically, according to the features of the language.
The object of tokenization is text, not images, animations, scripts, and so on.
It works in two steps: filtering and grouping.
Filtering removes the characters or words that carry no real meaning (stop words).
Grouping matches the remaining text against the entries already present in the tokenization dictionary.

Analyzers shipped with Lucene (core and contrib):
// Analyzer analyzer = new StandardAnalyzer();   // grammar-based tokenizer; splits CJK text into single characters
// Analyzer analyzer = new SimpleAnalyzer();     // splits on non-letter characters and lowercases
// Analyzer analyzer = new WhitespaceAnalyzer(); // splits on whitespace only
// Analyzer analyzer = new ChineseAnalyzer();    // contrib; one Chinese character per token
// Analyzer analyzer = new CJKAnalyzer();        // contrib; groups Chinese characters two at a time (bigrams)
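For intuition, here is approximately what these analyzers emit for the same input (illustrative output assuming the default stop-word set; exact results vary by Lucene version):

// StandardAnalyzer   "welcome to use lucene!"  ->  welcome | use | lucene       ("to" is a stop word)
// WhitespaceAnalyzer "welcome to use lucene!"  ->  welcome | to | use | lucene!  (punctuation kept)
// CJKAnalyzer        "明天会更美好"             ->  明天 | 天会 | 会更 | 更美 | 美好  (overlapping bigrams)
// ChineseAnalyzer    "明天会更美好"             ->  明 | 天 | 会 | 更 | 美 | 好    (single characters)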


package com.lucene.test.T03;


import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TestAnalyzer {

	/**
	 * @param args
	 * @throws IOException 
	 */
	public static void main(String[] args) throws IOException {
		
//		Analyzer   analyzer=new StandardAnalyzer();
//		Analyzer   analyzer=new SimpleAnalyzer();
//		Analyzer   analyzer=new WhitespaceAnalyzer();
//		Analyzer   analyzer=new ChineseAnalyzer();
//		Analyzer   analyzer=new CJKAnalyzer();
		Analyzer   analyzer=new IKAnalyzer();
		TokenStream tokenStream=analyzer.tokenStream("", new StringReader("welcome to use lucene! ?"));
//		TokenStream tokenStream=analyzer.tokenStream("", new StringReader("明天会更美好!"));

		// Lucene 2.4-style iteration: next(Token) may return a different instance
		// than the one passed in, so print the returned token
		Token token = new Token();
		Token t;
		while ((t = tokenStream.next(token)) != null) {
			System.out.println(t.term());
		}
	}

}
	}

}
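The Token-based loop above is the old, pre-2.9 API. On Lucene 2.9/3.x the same iteration is written against the attribute API; a minimal sketch, assuming Lucene 2.9+:

// import org.apache.lucene.analysis.tokenattributes.TermAttribute;
Analyzer analyzer = new IKAnalyzer();
TokenStream ts = analyzer.tokenStream("", new StringReader("welcome to use lucene!"));
TermAttribute term = ts.addAttribute(TermAttribute.class); // one attribute instance, refilled per token
while (ts.incrementToken()) {
	System.out.println(term.term());
}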


Paoding, an open-source Chinese analyzer
package com.lucene.test.T03;

import java.io.IOException;
import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AnalyzerPaoding {
	private static Logger logger = LoggerFactory
			.getLogger(AnalyzerPaoding.class);

	public static void main(String[] args) throws IOException {
		Analyzer analyzer = new PaodingAnalyzer();

		TokenStream ts = analyzer.tokenStream("", new StringReader("法律实践奥利弗论文集饿哦土建类士大夫接待来访将阿隆索"));
		Token token; // ts.next() allocates a new Token per call (old Lucene 2.x API)
		while ((token = ts.next()) != null) {
			logger.debug("read result from token");
			System.out.println(token.term());
		}
	}
}
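One setup detail: Paoding refuses to start until it can find its dictionary directory. As far as I recall, this is configured either through the PAODING_DIC_HOME environment variable or a paoding-dic-home.properties file on the classpath; treat the exact key below as an assumption to verify against your paoding-analysis version:

# paoding-dic-home.properties (on the classpath)
# assumed key name for paoding-analysis 2.x
paoding.dic.home=c:/temp/paoding/dic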

Using QueryParser with Paoding: first build an index, then search it
package com.lucene.test.T03;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestIndexPaoding {

	/**
	 * @param args
	 * @throws IOException
	 */
	public static void main(String[] args) throws IOException {
		String[] ids = { "1", "2", "3", "4" };
		String[] names = { "张三", "李四", "王五", "赵六" };
		// String[] names = { "zhangsan", "zhangsun", "zhangson", "zhaoliu" };
		String[] address = { "居住北京", "南京", "北京海淀", "dalian" };
		String[] birthday = { "19880101", "19860105", "19760205", "19550719" };
		Analyzer analyzer = new PaodingAnalyzer();
		String indexDir = "c:/temp/luceneindex";
		Directory dir = FSDirectory.getDirectory(indexDir);
		// true: create (or overwrite) the index; false: append to an existing index
		IndexWriter writer = new IndexWriter(dir, analyzer, true,
				IndexWriter.MaxFieldLength.LIMITED);
		for (int i = 0; i < ids.length; i++) {
			Document document = new Document();
			document.add(new Field("id", ids[i], Field.Store.YES,
					Field.Index.ANALYZED));
			document.add(new Field("name", names[i], Field.Store.YES,
					Field.Index.ANALYZED)); // Field.Index.NO表示不建立索引
			document.add(new Field("address", address[i], Field.Store.YES,
					Field.Index.ANALYZED));
			document.add(new Field("birthday", birthday[i], Field.Store.YES,
					Field.Index.ANALYZED));
			writer.addDocument(document);
		}
		writer.optimize();
		writer.close();
		System.out.println("index created ....");

	}
}
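To add more documents later without wiping what is already there, the same constructor takes create=false (a sketch under the same Lucene 2.x API as above):

IndexWriter writer = new IndexWriter(dir, analyzer, false,
		IndexWriter.MaxFieldLength.LIMITED); // false = append to the existing index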

package com.lucene.test.T03;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestSearcherPaoding {
	public static void main(String[] args) throws IOException, ParseException {
		String indexDir = "c:/temp/luceneindex";
		Directory dir = FSDirectory.getDirectory(indexDir);
		IndexSearcher searcher = new IndexSearcher(dir);
		Analyzer analyzer = new PaodingAnalyzer();
		ScoreDoc[] hits = null;

		QueryParser parser = new QueryParser("name", analyzer);
		Query qury = parser.parse("address:北京 AND NOT name:张三");// 高级查询(adress包含北京,但name不为张三的)

		TopDocs topDocs = searcher.search(qury, 10);
		hits = topDocs.scoreDocs;
		for (int i = 0; i < hits.length; i++) {
			Document doc = searcher.doc(hits[i].doc);
			System.out.print(hits[i].score + " ");
			System.out.print(doc.get("id") + " ");
			System.out.print(doc.get("name") + " ");
			System.out.print(doc.get("address") + " ");
			System.out.println(doc.get("birthday") + " ");
		}

		searcher.close();
		dir.close();
	}
}
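The same condition can also be built programmatically instead of parsing a query string. A sketch using the Lucene 2.x Boolean/Term query classes; note that TermQuery matches exact indexed terms, so the values must match what the analyzer actually produced at index time:

// imports: org.apache.lucene.index.Term, org.apache.lucene.search.BooleanQuery,
//          org.apache.lucene.search.BooleanClause, org.apache.lucene.search.TermQuery
BooleanQuery bq = new BooleanQuery();
bq.add(new TermQuery(new Term("address", "北京")), BooleanClause.Occur.MUST);   // address must contain 北京
bq.add(new TermQuery(new Term("name", "张三")), BooleanClause.Occur.MUST_NOT);  // name must not contain 张三
TopDocs topDocs = searcher.search(bq, 10);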


Reading files, building an index, and searching it:
package com.lucene.test.T04;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TestFileIndex {
	private static Logger logger = LoggerFactory.getLogger(TestFileIndex.class);

	public static void main(String[] args) throws FileNotFoundException,
			IOException {
		String indexDir = "c:/temp/lucene/index";
		Analyzer analyzer = new PaodingAnalyzer();
		Directory dir = FSDirectory.getDirectory(indexDir);
		// pass the Directory we just opened (the original passed the String path,
		// which opens a second FSDirectory internally)
		IndexWriter writer = new IndexWriter(dir, analyzer, true,
				IndexWriter.MaxFieldLength.LIMITED);

		// read data from dataDir and create index
		String dataDir = "c:/temp/lucene/data";
		File[] files = new File(dataDir).listFiles();
		System.out.println("file numbers:" + files.length);
		for (int i = 0; i < files.length; i++) {
			// read file content
			StringBuffer strBuff = new StringBuffer();
			String line = "";
			FileInputStream is = new FileInputStream(files[i].getPath());
			BufferedReader br = new BufferedReader(new InputStreamReader(is));
			line = br.readLine();
			while (line != null) {
				strBuff.append(line);
				strBuff.append("\n");
				line = br.readLine();
			}

			// create index
			Document document = new Document();
			document.add(new Field("title", files[i].getName(),
					Field.Store.YES, Field.Index.ANALYZED));
			document.add(new Field("content", strBuff.toString(),
					Field.Store.YES, Field.Index.ANALYZED));

			// write index
			writer.addDocument(document);

			br.close(); // also closes the underlying FileInputStream
		}

		writer.close();
		dir.close();
	}
}
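One thing to watch: the InputStreamReader above falls back to the platform default charset, which often mangles Chinese text on Windows. A sketch pinning the encoding, assuming the data files are saved as UTF-8:

BufferedReader br = new BufferedReader(
		new InputStreamReader(new FileInputStream(files[i].getPath()), "UTF-8"));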

package com.lucene.test.T04;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestFileSearch {
	public static void main(String[] args) throws IOException, ParseException {
		String indexDir = "c:/temp/lucene/index";
		Directory dir = FSDirectory.getDirectory(indexDir);
		IndexSearcher searcher = new IndexSearcher(dir);
		Analyzer analyzer = new PaodingAnalyzer();
		ScoreDoc[] hits = null;

		QueryParser parser = new QueryParser("content", analyzer);
		Query qury = parser.parse("软件");

		TopDocs topDocs = searcher.search(qury, 10);
		hits = topDocs.scoreDocs;
		for (int i = 0; i < hits.length; i++) {
			Document doc = searcher.doc(hits[i].doc);
			// System.out.print(hits[i].score + " ");
			System.out.print(doc.get("title") + " ");
			System.out.print(doc.get("content") + " ");
		}

		searcher.close();
		dir.close();
	}
}

Highlighting

package com.lucene.test.T05;

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class TestHighlight {

	public static void main(String[] args) throws Exception {

		Searcher searcher = new IndexSearcher("c:/temp/lucene/index");
		Analyzer analyzer = new PaodingAnalyzer();

		String field = "content";
		String queryStr = "分词"; // the term to highlight: 分词 ("tokenization")

		QueryParser parser = new QueryParser(field, analyzer);
		Query query = parser.parse(queryStr);

		TopDocCollector collector = new TopDocCollector(10);
		searcher.search(query, collector);
		ScoreDoc[] hits = collector.topDocs().scoreDocs;

		// highlight setup
		Highlighter highlight = null;

		SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(
				"<font color='red'>", "</font>");

		highlight = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
		highlight.setTextFragmenter(new SimpleFragmenter(200));

		Document doc;
		for (int i = 0; i < hits.length; i++) {
			System.out.println(hits[i].doc);
			System.out.println("---------------------------------------1");
			System.out.println(hits[i].score);
			System.out.println("---------------------------------------2");
			doc = searcher.doc(hits[i].doc);
//			System.out.println(doc.toString());
			System.out.println("---------------------------------------3");

			// highlight view: re-tokenize the stored content so the
			// highlighter can locate the matching terms
			TokenStream tokenStream = analyzer.tokenStream("content",
					new StringReader(doc.get("content")));
			System.out.println(highlight.getBestFragment(tokenStream,
					doc.get("content")));

		}
	}
}
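Note that Highlighter.getBestFragment returns null when the query terms do not occur in the text, so a guard is worth adding; a minimal sketch falling back to the raw field value:

String fragment = highlight.getBestFragment(tokenStream, doc.get("content"));
System.out.println(fragment != null ? fragment : doc.get("content"));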

