出自:http://blog.youkuaiyun.com/wxwzy738/article/details/8799184 的整理
1、工程结构
2、索引创建时的属性:
Field.Store.YES或者NO(存储域选项)
设置为YES表示会把这个域中的内容完全存储到文件中,方便进行文本的还原
设置为NO表示把这个域的内容不存储到文件中,但是可以被索引,此时内容无法完全还原(doc.get)
Field.Index(索引选项)
Index.ANALYZED:进行分词和索引,适用于标题、内容等
Index.NOT_ANALYZED:进行索引,但是不进行分词,如身份证号、姓名、ID等,适用于精确搜索
Index.ANALYZED_NO_NORMS:进行分词但是不存储norms信息,这个norms中包括了创建索引的时间和权值等信息
norms中存储了很多排序的信息,
Index.NOT_ANALYZED_NO_NORMS:既不进行分词也不存储norms信息
Index.NO:不进行索引
3、lucene的增删改查类
- packageorg.itat.index;
- importjava.io.IOException;
- importjava.text.ParseException;
- importjava.text.SimpleDateFormat;
- importjava.util.Date;
- importjava.util.HashMap;
- importjava.util.Map;
- importorg.apache.lucene.analysis.standard.StandardAnalyzer;
- importorg.apache.lucene.document.Document;
- importorg.apache.lucene.document.Field;
- importorg.apache.lucene.document.NumericField;
- importorg.apache.lucene.index.CorruptIndexException;
- importorg.apache.lucene.index.IndexReader;
- importorg.apache.lucene.index.IndexWriter;
- importorg.apache.lucene.index.IndexWriterConfig;
- importorg.apache.lucene.index.StaleReaderException;
- importorg.apache.lucene.index.Term;
- importorg.apache.lucene.search.IndexSearcher;
- importorg.apache.lucene.search.ScoreDoc;
- importorg.apache.lucene.search.TermQuery;
- importorg.apache.lucene.search.TopDocs;
- importorg.apache.lucene.store.Directory;
- importorg.apache.lucene.store.LockObtainFailedException;
- importorg.apache.lucene.store.RAMDirectory;
- importorg.apache.lucene.util.Version;
- publicclassIndexUtil{
- privateString[]ids={"1","2","3","4","5","6"};
- privateString[]emails={"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
- privateString[]contents={
- "welcometovisitedthespace,Ilikebook",
- "helloboy,Ilikepingpengball",
- "mynameisccIlikegame",
- "Ilikefootball",
- "IlikefootballandIlikebasketballtoo",
- "Ilikemovieandswim"
- };
- privateDate[]dates=null;
- privateint[]attachs={2,3,1,4,5,5};
- privateString[]names={"zhangsan","lisi","john","jetty","mike","jake"};
- privateDirectorydirectory=null;
- privateMap<String,Float>scores=newHashMap<String,Float>();
- privatestaticIndexReaderreader=null;
- publicIndexUtil(){
- try{
- setDates();
- scores.put("itat.org",2.0f);
- scores.put("zttc.edu",1.5f);
- //directory=FSDirectory.open(newFile("d:/lucene/index02"));
- directory=newRAMDirectory();
- index();
- reader=IndexReader.open(directory,false);
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *对于IndexReader而言,反复使用Index.open打开会有很大的开销,所以一般在整个程序的生命周期中
- *只会打开一个IndexReader,通过这个IndexReader来创建不同的IndexSearcher,如果使用单例模式,
- *可能出现的问题有:
- *1、当使用Writer修改了索引之后不会更新信息,所以需要使用IndexReader.openIfChange方法操作
- *如果IndexWriter在创建完成之后,没有关闭,需要进行commit操作之后才能提交
- *@return
- */
- publicIndexSearchergetSearcher(){
- try{
- if(reader==null){
- reader=IndexReader.open(directory,false);
- }else{
- IndexReadertr=IndexReader.openIfChanged(reader);
- //如果原来的reader没改变,返回null
- //如果原来的reader改变,则更新为新的索引
- if(tr!=null){
- reader.close();
- reader=tr;
- }
- }
- returnnewIndexSearcher(reader);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- returnnull;
- }
- privatevoidsetDates(){
- SimpleDateFormatsdf=newSimpleDateFormat("yyyy-MM-dd");
- try{
- dates=newDate[ids.length];
- dates[0]=sdf.parse("2010-02-19");
- dates[1]=sdf.parse("2012-01-11");
- dates[2]=sdf.parse("2011-09-19");
- dates[3]=sdf.parse("2010-12-22");
- dates[4]=sdf.parse("2012-01-01");
- dates[5]=sdf.parse("2011-05-19");
- }catch(ParseExceptione){
- e.printStackTrace();
- }
- }
- /**
- *把之前删除的索引数据进行恢复
- */
- publicvoidundelete(){
- //使用IndexReader进行恢复
- try{
- IndexReaderreader=IndexReader.open(directory,false);
- //恢复时,必须把IndexReader的只读(readOnly)设置为false
- reader.undeleteAll();
- reader.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(StaleReaderExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *forceMerge是lucene3.5之前替代optimize方法的,其实只是改了个名称,因为优化的使效率变低
- *因为一到优化它就会全部更新索引,这个所涉及到的负载是很大的
- *所以改了个名称,不推荐使用,在做优化的时候会把索引回收站中的数据文件全部删除
- *lucene会在你写索引的时候根据你的索引的段越来越多会自动帮忙优化的,force是强制优化
- */
- publicvoidmerge(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- //会将索引合并为2段,这两段中的被删除的数据会被清空
- //特别注意:此处Lucene在3.5之后不建议使用,因为会消耗大量的开销,
- //Lucene会根据情况自动处理的
- writer.forceMerge(2);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- /**
- *假如你想要强制删除回收站的信息可以调用writer.forceMergeDeletes()这个方法,
- *但是这个方法不推荐使用,比较消耗内存,lucene会自动根据容量的大小删除所删除的文件
- */
- publicvoidforceDelete(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- writer.forceMergeDeletes();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- /**
- *删除索引数据,默认不会完全删除,被放入索引回收站
- */
- publicvoiddelete(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- //参数是一个选项,可以是一个Query,也可以是一个term,term是一个精确查找的值
- //此时删除的文档并不会被完全删除,而是存储在一个回收站中的,可以恢复
- //执行完这个操作,索引文件夹下就会多出一个名叫_0_1.del的文件,也就是删除的文件在这个文件中记录了
- writer.deleteDocuments(newTerm("id","1"));
- writer.commit();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- /**
- *使用reader删除,其实里面也会调用writer删除,
- *优点是使用reader删除马上会更新索引信息
- *现在一般还是使用writer来删除,reader.getWriter这个方法被过时了
- */
- publicvoiddelete02(){
- try{
- reader.deleteDocuments(newTerm("id","1"));
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *更新操作
- *Lucene并没有提供更新,这里的更新操作其实是如下两个操作的合集
- *先删除之后再添加
- */
- publicvoidupdate(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- Documentdoc=newDocument();
- doc.add(newField("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(newField("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(newField("content",contents[0],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(newField("name",names[0],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- writer.updateDocument(newTerm("id","1"),doc);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidquery(){
- try{
- IndexReaderreader=IndexReader.open(directory);
- //通过reader可以有效的获取到文档的数量
- System.out.println("numDocs:"+reader.numDocs());//存储的文档数//不包括被删除的
- System.out.println("maxDocs:"+reader.maxDoc());//总存储量,包括在回收站中的索引
- System.out.println("deleteDocs:"+reader.numDeletedDocs());
- reader.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *索引文件后缀为.fmn为保存的是域的名称等
- *.fdt和.fdx保存的是Store.YES的信息,保存域里面存储的数据
- *.frq表示这里的域哪些出现多少次,哪些单词出现多少次,
- *.nrm存储一些评分信息
- *.prx存储一些偏移量等
- *.tii和.tis专门存储索引里面的所有内容信息
- */
- publicvoidindex(){
- IndexWriterwriter=null;
- try{
- //在2.9版本之后,lucene的就不是全部的索引格式都兼容的了,所以在使用的时候必须写明版本号
- writer=newIndexWriter(directory,newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- writer.deleteAll();//清空索引
- Documentdoc=null;
- for(inti=0;i<ids.length;i++){
- doc=newDocument();
- doc.add(newField("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(newField("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(newField("email","test"+i+"@test.com",Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(newField("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(newField("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- //存储数字
- //NumberTools.stringToLong("");已经被标记为过时了
- doc.add(newNumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));
- //存储日期
- doc.add(newNumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
- Stringet=emails[i].substring(emails[i].lastIndexOf("@")+1);
- System.out.println(et);
- if(scores.containsKey(et)){
- doc.setBoost(scores.get(et));
- }else{
- doc.setBoost(0.5f);//默认是1.0f
- }
- writer.addDocument(doc);
- }
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidsearch01(){
- try{
- IndexReaderreader=IndexReader.open(directory);
- IndexSearchersearcher=newIndexSearcher(reader);
- TermQueryquery=newTermQuery(newTerm("email","test0@test.com"));
- TopDocstds=searcher.search(query,10);
- for(ScoreDocsd:tds.scoreDocs){
- Documentdoc=searcher.doc(sd.doc);
- System.out.println("("+sd.doc+"-"+doc.getBoost()+"-"+sd.score+")"+
- doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
- doc.get("attach")+","+doc.get("date")+","+doc.getValues("email")[1]);
- }
- reader.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearch02(){
- try{
- IndexSearchersearcher=getSearcher();
- TermQueryquery=newTermQuery(newTerm("content","like"));
- TopDocstds=searcher.search(query,10);
- for(ScoreDocsd:tds.scoreDocs){
- Documentdoc=searcher.doc(sd.doc);
- System.out.println(doc.get("id")+"---->"+
- doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
- doc.get("attach")+","+doc.get("date")+","+doc.getValues("email")[1]);
- }
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- packageorg.itat.test;
- importorg.itat.index.IndexUtil;
- importorg.junit.Test;
- publicclassTestIndex{
- @Test
- publicvoidtestIndex(){
- IndexUtiliu=newIndexUtil();
- iu.index();
- }
- @Test
- publicvoidtestQuery(){
- IndexUtiliu=newIndexUtil();
- iu.query();
- }
- @Test
- publicvoidtestDelete(){
- IndexUtiliu=newIndexUtil();
- iu.delete();
- }
- @Test
- publicvoidtestDelete02(){
- IndexUtiliu=newIndexUtil();
- iu.delete02();
- }
- @Test
- publicvoidtestUnDelete(){
- IndexUtiliu=newIndexUtil();
- iu.undelete();
- }
- @Test
- publicvoidtestForceDelete(){
- IndexUtiliu=newIndexUtil();
- iu.forceDelete();
- }
- @Test
- publicvoidtestMerge(){
- IndexUtiliu=newIndexUtil();
- iu.merge();
- }
- @Test
- publicvoidtestUpdate(){
- IndexUtiliu=newIndexUtil();
- iu.update();
- }
- @Test
- publicvoidtestSearch01(){
- IndexUtiliu=newIndexUtil();
- iu.search01();
- }
- @Test
- publicvoidtestSearch02(){
- IndexUtiliu=newIndexUtil();
- for(inti=0;i<5;i++){
- iu.search02();
- System.out.println("-----------------------------");
- try{
- Thread.sleep(10000);
- }catch(InterruptedExceptione){
- e.printStackTrace();
- }
- }
- }
- }