import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
/**
 * Demonstrates several Lucene query types against a small sample index
 * that is built from the in-memory sample arrays declared below.
 */
public class SearcherUtil {
// Index directory used by index() and by the no-argument getSearcher().
private Directory directory;
// Creating an IndexReader is very expensive; a project should ideally share one instance.
private static IndexReader reader = null;
// IndexSearcher is thread-safe, so one shared instance is enough.
private static IndexSearcher searcher = null;
// Sample data: element i of each array below describes sample document i.
private String[] ids = {"1","2","3","4","5","6"};
private String[] emails = {"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
private String[] contents = {
"welcome to visited the space,I like book",
"hello boy, I like pingpeng ball",
"my name is cc I like game",
"I like football",
"I like dojo piky plewk saas hsga football and I like basketball too",
"I like movie and swim"
};
// Filled by setDates(); stays null until that method has run.
private Date[] dates = null;
private int[] attachs = {2,3,1,4,5,5};
private String[] names = {"zhangsan","lisi","john","jetty","mike","jake"};
// Maps an e-mail domain to the boost applied to the email field in index().
private Map<String,Float> scores = new HashMap<String,Float>();
// NOTE(review): the backslash separator is Windows-specific; on other platforms
// this names a single directory called "indexs\index03" — confirm this is intended.
private static String Index_File_Path="indexs\\index03";
public static final FieldType TYPE_STORED = new FieldType();
static {
TYPE_STORED.setIndexed(true);
TYPE_STORED.setOmitNorms(false);
TYPE_STORED.setIndexOptions(IndexOptions.DOCS_ONLY);
TYPE_STORED.setStored(true);
TYPE_STORED.setTokenized(false);
TYPE_STORED.freeze();
}
public SearcherUtil() {
// directory = new RAMDirectory();
try {
directory = FSDirectory.open(new File(Index_File_Path));
setDates();
scores.put("itat.org",2.0f);
scores.put("zttc.edu", 1.5f);
index();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Fills {@code dates} with one parsed "yyyy-MM-dd" date per sample document.
 * A parse failure is printed and leaves the remaining entries unset.
 */
private void setDates() {
    final String[] rawDates = {
        "2010-02-19", "2012-01-11", "2011-09-19",
        "2010-12-22", "2012-01-01", "2011-05-19"
    };
    final SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    try {
        dates = new Date[ids.length];
        for (int slot = 0; slot < rawDates.length; slot++) {
            dates[slot] = dayFormat.parse(rawDates[slot]);
        }
    } catch (ParseException parseFailure) {
        parseFailure.printStackTrace();
    }
}
/**
 * Rebuilds the sample index: removes every existing document and then adds
 * one document per entry of the sample arrays.
 * I/O failures are printed; the writer is closed in all cases.
 */
public void index() {
    IndexWriter indexWriter = null;
    try {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
        indexWriter = new IndexWriter(directory, writerConfig);
        indexWriter.deleteAll();
        for (int row = 0; row < ids.length; row++) {
            Document sample = new Document();
            // String field: indexed as a single token and stored.
            sample.add(new StringField("id", ids[row], Field.Store.YES));
            // Text field: analyzed, not stored.
            sample.add(new TextField("content", contents[row], Field.Store.NO));
            sample.add(new StringField("name", names[row], Field.Store.YES));
            // Numeric field.
            sample.add(new IntField("attach", attachs[row], Field.Store.YES));
            // Date kept as epoch milliseconds in a numeric field.
            sample.add(new LongField("date", dates[row].getTime(), Field.Store.YES));

            String address = emails[row];
            Field emailField = new Field("email", address, TYPE_STORED);
            String domain = address.substring(address.lastIndexOf("@") + 1);
            // Domains listed in the boost table use their configured boost;
            // every other domain is boosted by 0.5.
            float boost = scores.containsKey(domain) ? scores.get(domain) : 0.5f;
            emailField.setBoost(boost);
            sample.add(emailField);

            indexWriter.addDocument(sample);
        }
    } catch (IOException writeFailure) {
        // Also covers CorruptIndexException and LockObtainFailedException,
        // both of which are IOException subclasses.
        writeFailure.printStackTrace();
    } finally {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}
/**
 * Returns the shared IndexSearcher for this instance's own index directory.
 * The work is done by {@link #getSearcher(Directory)}, called with the
 * {@code directory} field.
 *
 * @return the shared searcher, or {@code null} if the index could not be opened
 */
public IndexSearcher getSearcher() {
    return getSearcher(this.directory);
}
/**
 * Returns the shared IndexSearcher, reading from the given directory.
 * <p>
 * The reader and searcher are cached in static fields. The cached reader is
 * replaced when it belongs to a different directory than the one requested,
 * or when the index has changed since the reader was opened. The previous
 * reader is closed only after the new reader and searcher are in place.
 *
 * @param directory the index directory to search; when {@code null} and a
 *                  reader is already cached, the cached reader is kept
 * @return the shared searcher, or {@code null} if an I/O error occurred
 *         (the error is printed)
 */
public IndexSearcher getSearcher(Directory directory) {
    try {
        if (reader == null) {
            reader = DirectoryReader.open(directory);
            searcher = new IndexSearcher(reader);
        } else {
            DirectoryReader current = (DirectoryReader) reader;
            boolean otherIndex = directory != null && !directory.equals(current.directory());
            // A reader cached for another directory must not be reused:
            // it would silently answer queries from the wrong index.
            IndexReader replacement = otherIndex
                    ? DirectoryReader.open(directory)
                    : DirectoryReader.openIfChanged(current);
            if (replacement != null) {
                IndexReader stale = reader;
                reader = replacement;
                searcher = new IndexSearcher(reader);
                // Close last so that a failing close cannot leave the cached
                // searcher bound to a closed reader.
                stale.close();
            }
        }
        if (searcher == null) {
            searcher = new IndexSearcher(reader);
        }
        return searcher;
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
    return null;
}
/**
 * Term query: matches documents that contain exactly the given term in the
 * given field, and prints the hits.
 *
 * @param field the field to search
 * @param name  the exact term to look for
 * @param num   the maximum number of hits to fetch
 */
public void searchByTerm(String field,String name,int num) {
    IndexSearcher searcher = getSearcher();
    if (searcher == null) {
        // getSearcher() returns null after printing the I/O error; nothing to search.
        return;
    }
    try {
        Query query = new TermQuery(new Term(field, name));
        TopDocs tds = searcher.search(query, num);
        System.out.println("查询到的文档总数:"+tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id")+"---->"+
                    doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                    doc.get("attach")+","+doc.get("date"));
        }
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
}
/**
 * Term range query: matches documents whose term in the given field lies
 * between {@code start} and {@code end} (both inclusive), and prints the hits.
 *
 * @param field the field to search
 * @param start the lower bound term, inclusive
 * @param end   the upper bound term, inclusive
 * @param num   the maximum number of hits to fetch
 */
public void searchByTermRange(String field,String start,String end,int num) {
    IndexSearcher searcher = getSearcher();
    if (searcher == null) {
        // getSearcher() returns null after printing the I/O error; nothing to search.
        return;
    }
    try {
        Query query = TermRangeQuery.newStringRange(field, start, end, true, true);
        TopDocs tds = searcher.search(query, num);
        System.out.println("查询到的文档总数:"+tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id")+"---->"+
                    doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                    doc.get("attach")+","+doc.get("date"));
        }
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
}
/**
 * Numeric range query: matches documents whose int value in the given field
 * is between {@code start} and {@code end} (both inclusive), and prints the hits.
 *
 * @param field the numeric field to search
 * @param start the lower bound, inclusive
 * @param end   the upper bound, inclusive
 * @param num   the maximum number of hits to fetch
 */
public void searchByNumricRange(String field,int start,int end,int num) {
    IndexSearcher searcher = getSearcher();
    if (searcher == null) {
        // getSearcher() returns null after printing the I/O error; nothing to search.
        return;
    }
    try {
        Query query = NumericRangeQuery.newIntRange(field, start, end, true, true);
        TopDocs tds = searcher.search(query, num);
        System.out.println("查询到的文档总数:"+tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id")+"---->"+
                    doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                    doc.get("attach")+","+doc.get("date"));
        }
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
}
/**
 * Prefix query: matches documents that have a term starting with
 * {@code value} in the given field, and prints the hits.
 *
 * @param field the field to search
 * @param value the term prefix
 * @param num   the maximum number of hits to fetch
 */
public void searchByPrefix(String field,String value,int num) {
    IndexSearcher searcher = getSearcher();
    if (searcher == null) {
        // getSearcher() returns null after printing the I/O error; nothing to search.
        return;
    }
    try {
        Query query = new PrefixQuery(new Term(field, value));
        TopDocs tds = searcher.search(query, num);
        System.out.println("查询到的文档总数:"+tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id")+"---->"+
                    doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                    doc.get("attach")+","+doc.get("date"));
        }
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
}
/**
 * Wildcard query: matches documents that have a term matching the wildcard
 * pattern in the given field, and prints the hits.
 *
 * @param field the field to search
 * @param value the pattern; {@code *} stands for any number of characters
 *              and {@code ?} for exactly one character
 * @param num   the maximum number of hits to fetch
 */
public void searchByWildcard(String field,String value,int num) {
    IndexSearcher searcher = getSearcher();
    if (searcher == null) {
        // getSearcher() returns null after printing the I/O error; nothing to search.
        return;
    }
    try {
        Query query = new WildcardQuery(new Term(field, value));
        TopDocs tds = searcher.search(query, num);
        System.out.println("查询到的文档总数:"+tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id")+"---->"+
                    doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                    doc.get("attach")+","+doc.get("date"));
        }
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
}
/**
 * Boolean query: combines several sub-queries and prints the hits.
 * <p>
 * Each clause carries an occurrence rule: {@code Occur.MUST} (required),
 * {@code Occur.SHOULD} (optional) or {@code Occur.MUST_NOT} (excluded).
 *
 * @param num the maximum number of hits to fetch
 */
public void searchByBoolean(int num) {
    IndexSearcher searcher = getSearcher();
    if (searcher == null) {
        // getSearcher() returns null after printing the I/O error; nothing to search.
        return;
    }
    try {
        BooleanQuery query = new BooleanQuery();
        // Exclude documents whose name field holds the term "zhangsan".
        query.add(new TermQuery(new Term("name", "zhangsan")), Occur.MUST_NOT);
        // The term "game" in the content field is an optional (SHOULD) clause.
        query.add(new TermQuery(new Term("content", "game")), Occur.SHOULD);
        TopDocs tds = searcher.search(query, num);
        System.out.println("查询到的文档总数:"+tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id")+"---->"+
                    doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                    doc.get("attach")+","+doc.get("date"));
        }
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
}
/**
 * Phrase query: matches documents whose content field contains the terms
 * "pingpeng" and "i" within a slop of 3, and prints the hits.
 *
 * @param num the maximum number of hits to fetch
 */
public void searchByPhrase(int num) {
    IndexSearcher searcher = getSearcher();
    if (searcher == null) {
        // getSearcher() returns null after printing the I/O error; nothing to search.
        return;
    }
    try {
        PhraseQuery query = new PhraseQuery();
        // Slop: how far the terms may be moved apart and still match.
        query.setSlop(3);
        // First term of the phrase.
        query.add(new Term("content", "pingpeng"));
        // Second term of the phrase.
        query.add(new Term("content", "i"));
        TopDocs tds = searcher.search(query, num);
        System.out.println("查询到的文档总数:"+tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id")+"---->"+
                    doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                    doc.get("attach")+","+doc.get("date"));
        }
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
}
/**
 * Fuzzy query: matches documents whose name field has a term similar to
 * "mase". Prints the query's prefix length and maximum edit distance, then
 * the hits.
 *
 * @param num the maximum number of hits to fetch
 */
public void searchByFuzzy(int num) {
    IndexSearcher searcher = getSearcher();
    if (searcher == null) {
        // getSearcher() returns null after printing the I/O error; nothing to search.
        return;
    }
    try {
        FuzzyQuery query = new FuzzyQuery(new Term("name", "mase"));
        System.out.println(query.getPrefixLength());
        System.out.println(query.getMaxEdits());
        TopDocs tds = searcher.search(query, num);
        System.out.println("查询到的文档总数:"+tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id")+"---->"+
                    doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                    doc.get("attach")+","+doc.get("date"));
        }
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
}
/**
 * Runs an already-built query (typically produced by a QueryParser) and
 * prints each hit together with its score.
 *
 * @param query the query to execute
 * @param num   the maximum number of hits to fetch
 */
public void searchByQueryParse(Query query,int num) {
    IndexSearcher searcher = getSearcher();
    if (searcher == null) {
        // getSearcher() returns null after printing the I/O error; nothing to search.
        return;
    }
    try {
        TopDocs tds = searcher.search(query, num);
        System.out.println("查询到的文档总数:"+tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("id")+"---->"+
                    doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                    doc.get("attach")+","+doc.get("date")+"=="+sd.score);
        }
    } catch (IOException e) {
        // Also covers CorruptIndexException, an IOException subclass.
        e.printStackTrace();
    }
}
/**
 * Paged search over the file index: fetches up to 500 hits for the parsed
 * query and prints only those that fall on the requested page.
 * <p>
 * Fetching all hits on every call is memory-hungry; see
 * {@code searchPageByAfter} for the searchAfter-based alternative.
 * A page that lies partly or wholly beyond the available hits prints only
 * the hits that exist (possibly none).
 *
 * @param query     the query string, parsed against the "content" field
 * @param pageIndex the 1-based page number
 * @param pageSize  the number of hits per page
 */
public void searchPage(String query,int pageIndex,int pageSize) {
    try {
        Directory dir = FileIndexUtils.getDirectory();
        IndexSearcher searcher = getSearcher(dir);
        if (searcher == null) {
            // getSearcher(dir) returns null after printing the I/O error.
            return;
        }
        QueryParser parser = new QueryParser(Version.LUCENE_4_9, "content", new StandardAnalyzer(Version.LUCENE_4_9));
        Query q = parser.parse(query);
        TopDocs tds = searcher.search(q, 500);
        ScoreDoc[] sds = tds.scoreDocs;
        // Computed in long arithmetic and clamped to the hits actually returned,
        // so a short result list or an out-of-range page cannot index outside sds.
        long firstWanted = ((long) pageIndex - 1) * pageSize;
        long lastWanted = (long) pageIndex * pageSize;
        int start = (int) Math.max(0L, Math.min(firstWanted, sds.length));
        int end = (int) Math.max(0L, Math.min(lastWanted, sds.length));
        for (int i = start; i < end; i++) {
            Document doc = searcher.doc(sds[i].doc);
            System.out.println(sds[i].doc+":"+doc.get("path")+"-->"+doc.get("filename"));
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.apache.lucene.queryparser.classic.ParseException e) {
        e.printStackTrace();
    }
}
/**
 * Returns the ScoreDoc of the last hit on the page preceding
 * {@code pageIndex}, for use as the anchor of a searchAfter call.
 *
 * @param pageIndex the 1-based page number being requested
 * @param pageSize  the number of hits per page
 * @param query     the query being paged
 * @param searcher  the searcher to run the query on
 * @return {@code null} for the first page (or any page index below 1) and
 *         when the query has no hits; otherwise the last hit of the previous
 *         page, or the very last hit when fewer hits exist than the previous
 *         pages would hold
 * @throws IOException if the search fails
 */
private ScoreDoc getLastScoreDoc(int pageIndex,int pageSize,Query query,IndexSearcher searcher) throws IOException {
    if (pageIndex <= 1) {
        // No previous page: the caller starts from the top of the result list.
        return null;
    }
    // Number of hits contained in all previous pages.
    int num = pageSize * (pageIndex - 1);
    TopDocs tds = searcher.search(query, num);
    ScoreDoc[] previousHits = tds.scoreDocs;
    if (previousHits.length == 0) {
        return null;
    }
    // The search may return fewer than num hits; anchoring on the last
    // available hit makes the follow-up searchAfter return an empty page.
    return previousHits[Math.min(num, previousHits.length) - 1];
}
/**
 * Paged search over the file index using searchAfter: looks up the last hit
 * of the previous page and fetches only {@code pageSize} hits after it,
 * then prints them.
 *
 * @param query     the query string, parsed against the "content" field
 * @param pageIndex the 1-based page number
 * @param pageSize  the number of hits per page
 */
public void searchPageByAfter(String query,int pageIndex,int pageSize) {
    try {
        Directory dir = FileIndexUtils.getDirectory();
        IndexSearcher searcher = getSearcher(dir);
        if (searcher == null) {
            // getSearcher(dir) returns null after printing the I/O error.
            return;
        }
        QueryParser parser = new QueryParser(Version.LUCENE_4_9, "content", new StandardAnalyzer(Version.LUCENE_4_9));
        Query q = parser.parse(query);
        // Last hit of the previous page (null for the first page).
        ScoreDoc lastSd = getLastScoreDoc(pageIndex, pageSize, q, searcher);
        // Continue after that hit, so only one page of results is collected.
        TopDocs tds = searcher.searchAfter(lastSd, q, pageSize);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(sd.doc+":"+doc.get("path")+"-->"+doc.get("filename"));
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.apache.lucene.queryparser.classic.ParseException e) {
        e.printStackTrace();
    }
}
/**
 * Unpaged search over the file index: fetches up to 500 hits for the parsed
 * query and prints all of them.
 *
 * @param query the query string, parsed against the "content" field
 */
public void searchNoPage(String query) {
    try {
        Directory dir = FileIndexUtils.getDirectory();
        IndexSearcher searcher = getSearcher(dir);
        if (searcher == null) {
            // getSearcher(dir) returns null after printing the I/O error.
            return;
        }
        QueryParser parser = new QueryParser(Version.LUCENE_4_9, "content", new StandardAnalyzer(Version.LUCENE_4_9));
        Query q = parser.parse(query);
        TopDocs tds = searcher.search(q, 500);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(sd.doc+":"+doc.get("path")+"-->"+doc.get("filename"));
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.apache.lucene.queryparser.classic.ParseException e) {
        e.printStackTrace();
    }
}
}
package org.test.test;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
import org.test.searcher.FileIndexUtils;
import org.test.searcher.SearcherUtil;
/**
 * JUnit tests that drive the query demonstrations of SearcherUtil.
 */
public class TestSearch {
// Object under test; replaced with a new instance before every test by init().
private SearcherUtil su;
// Creates a new SearcherUtil before each test method.
@Before
public void init() {
su = new SearcherUtil();
}
/**
 * Copies every regular file in the "docs" directory to a sibling file with
 * the same base name and the extension ".kvb".
 * Does nothing when "docs" cannot be listed. Sub-directories and files that
 * would be copied onto themselves are skipped so that one unusable entry
 * does not abort the whole run.
 */
@Test
public void testCopyFiles() {
    File file = new File("docs");
    File[] entries = file.listFiles();
    if (entries == null) {
        // listFiles() returns null when "docs" is missing or is not a directory.
        return;
    }
    try {
        for (File f : entries) {
            if (!f.isFile()) {
                continue;
            }
            String destFileName = FilenameUtils.getFullPath(f.getAbsolutePath())+
                    FilenameUtils.getBaseName(f.getName())+".kvb";
            File dest = new File(destFileName);
            if (dest.equals(f.getAbsoluteFile())) {
                // Already a .kvb file: source and destination would be the same.
                continue;
            }
            FileUtils.copyFile(f, dest);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/** Looks up documents whose content field contains the term "i". */
@Test
public void searchByTerm() {
    final String field = "content";
    final String term = "i";
    final int maxHits = 3;
    su.searchByTerm(field, term, maxHits);
}
/**
 * Runs a term range query on the attach field.
 * Because attach is a numeric field, a term range query cannot match it.
 * A string field works instead, e.g. name between "a" and "s".
 */
@Test
public void searchByTermRange() {
    final String lowerTerm = "2";
    final String upperTerm = "10";
    su.searchByTermRange("attach", lowerTerm, upperTerm, 5);
}
/** Looks up documents whose numeric attach field is between 2 and 10, inclusive. */
@Test
public void searchByNumRange() {
    final int lowest = 2;
    final int highest = 10;
    final int maxHits = 5;
    su.searchByNumricRange("attach", lowest, highest, maxHits);
}
/** Looks up documents whose content field has a term starting with "s". */
@Test
public void searchByPrefix() {
    final String field = "content";
    final String prefix = "s";
    su.searchByPrefix(field, prefix, 10);
}
/** Runs two wildcard queries: one on the email field and one on the name field. */
@Test
public void searchByWildcard() {
    final int maxHits = 10;
    // email terms that end with "@itat.org"
    final String emailPattern = "*@itat.org";
    su.searchByWildcard("email", emailPattern, maxHits);
    // name terms made of "j" followed by exactly three characters
    final String namePattern = "j???";
    su.searchByWildcard("name", namePattern, maxHits);
}
/** Runs the combined-condition (boolean) query demo. */
@Test
public void searchByBoolean() {
    final int maxHits = 10;
    su.searchByBoolean(maxHits);
}
/** Runs the phrase query demo. */
@Test
public void searchByPhrase() {
    final int maxHits = 10;
    su.searchByPhrase(maxHits);
}
/** Runs the fuzzy query demo. */
@Test
public void searchByFuzzy() {
    final int maxHits = 10;
    su.searchByFuzzy(maxHits);
}
/**
 * Walks through the QueryParser syntax. Every expression below is parsed in
 * order; only the query built from the last one is executed.
 * Parentheses or double quotes group several terms into one unit, and
 * AND, OR and NOT act as boolean connectors.
 *
 * @throws ParseException if any of the expressions cannot be parsed
 */
@Test
public void searchByQueryParse() throws ParseException {
    // The parser searches the content field unless an expression names another field.
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
    QueryParser parser = new QueryParser(Version.LUCENE_4_9, "content", analyzer);
    // A space between terms means OR by default;
    // parser.setDefaultOperator(Operator.AND) would switch that to AND.
    // Leading wildcards are off by default because they are slow; enable them here.
    parser.setAllowLeadingWildcard(true);

    final String[] expressions = {
        // content contains the term like
        "like",
        // content contains basketball or football (space is OR by default)
        "basketball football",
        // content contains both basketball and football
        "basketball AND football",
        // explicit field prefix: content contains like
        "content:like",
        // wildcards * and ?: name has a term starting with j
        "name:j*",
        // leading wildcard (needs setAllowLeadingWildcard(true)): email ends with @itat.org
        "email:*@itat.org",
        // name must not be mike and content must contain like; + and - go before the field
        "-name:mike +like",
        "NOT name:mike AND like",
        // closed range on id; TO must be upper case
        "id:[1 TO 6]",
        // open range on id; here only 2 can match
        "id:{1 TO 3}",
        // exact phrase in content
        "\"I like football\"",
        // I and football at most 3 positions apart in content
        "\"I football\"~3",
        // fuzzy match on name against make
        "name:make~",
        // numeric ranges cannot be matched this way; that needs a customised QueryParser
        "attach:[2 TO 10]"
    };

    Query query = null;
    for (String expression : expressions) {
        query = parser.parse(expression);
    }
    su.searchByQueryParse(query, 10);
}
/** Triggers FileIndexUtils.index with the argument {@code true}. */
@Test
public void indexFile() {
    // NOTE(review): the meaning of this flag is defined by FileIndexUtils.index — confirm there.
    final boolean indexFlag = true;
    FileIndexUtils.index(indexFlag);
}
/**
 * Prints page 1 via searchPage, then the unpaged result list, then page 2
 * via searchPageByAfter, all for the query "localhost" and a page size of 5.
 */
@Test
public void testSearchPage01() {
    final String keyword = "localhost";
    final int pageSize = 5;
    final String divider = "-------------------------------";
    su.searchPage(keyword, 1, pageSize);
    System.out.println(divider);
    su.searchNoPage(keyword);
    System.out.println(divider);
    su.searchPageByAfter(keyword, 2, pageSize);
}
/**
 * Prints the unpaged result list and then page 3 via searchPageByAfter,
 * both for the query "localhost" and a page size of 5.
 */
@Test
public void testSearchPage02() {
    final String keyword = "localhost";
    final int pageSize = 5;
    su.searchNoPage(keyword);
    System.out.println("-------------------------------");
    su.searchPageByAfter(keyword, 3, pageSize);
}
}