Lucene 3.6 中文分词、分页查询、高亮显示等

最新推荐文章于 2022-04-25 11:10:12 发布

原创最新推荐文章于 2022-04-25 11:10:12 发布 · 289 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#数据库 #java

JAVA 专栏收录该内容

228 篇文章

订阅专栏

本文详细介绍如何使用Lucene进行全文检索，包括中文分词、索引建立、单字段及多字段查询，并介绍了高亮显示、通配符查询、嵌套查询等高级特性。

1、准备工作

下载lucene 3.6.1 ： http://lucene.apache.org/

下载中文分词IK Analyzer： http://code.google.com/p/ik-analyzer/downloads/list （注意下载的是IK Analyzer 2012_u5_source.zip，其他版本有bug）

下载solr 3.6.1： http://lucene.apache.org/solr/（编译IK Analyzer时需引用包）

OK，将lucene 、solr 相关包（lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar）拷贝到项目lib下，IK源码置于项目src下。

2、从Oracle数据库中取数据创建索引（使用IK分词）

 
package lucene.util; 

   002 

import org.apache.lucene.index.IndexWriter; 

import org.apache.lucene.index.IndexWriterConfig; 

import org.apache.lucene.index.CorruptIndexException; 

import org.apache.lucene.store.FSDirectory; 

import org.apache.lucene.store.Directory; 

import org.apache.lucene.analysis.Analyzer; 

import org.apache.lucene.analysis.standard.StandardAnalyzer; 

import org.apache.lucene.util.Version; 

import org.apache.lucene.document.Document; 

import org.apache.lucene.document.Field; 

import org.wltea.analyzer.lucene.IKAnalyzer; 

   014 

import java.sql.Connection; 

import java.io.File; 

import java.io.IOException; 

import java.util.ArrayList; 

import java.util.Date; 

   020 

import modules.gk.Gk_info; 

import modules.gk.Gk_infoSub; 

import web.sys.Globals; 

import web.db.DBConnector; 

import web.db.ObjectCtl; 

import web.util.StringUtil; 

   027//Wizzer.cn

public class LuceneIndex { 

    IndexWriter writer = null; 

    FSDirectory dir = null; 

    boolean create = true; 

   032 

    public void init() { 

        long a1 = System.currentTimeMillis(); 

        System.out.println("[Lucene 开始执行：" + new Date() + "]"); 

        Connection con = DBConnector.getconecttion(); //取得一个数据库连接 

        try { 

            final File docDir = newFile(Globals.SYS_COM_CONFIG.get("sys.index.path").toString());//E:\lucene 

            if (!docDir.exists()) { 

                docDir.mkdirs(); 

            } 

            String cr = Globals.SYS_COM_CONFIG.get("sys.index.create").toString();//true or false 

            if ("false".equals(cr.toLowerCase())) { 

                create = false; 

            } 

            Directory dir = FSDirectory.open(docDir); 

   047//            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

            Analyzer analyzer = new IKAnalyzer(true); 

            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer); 

            if (create) { 

                // Create a new index in the directory, removing any 

                // previously indexed documents: 

                iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); 

            } else { 

                // Add new documents to an existing index: 

                iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); 

            } 

            IndexWriter writer = new IndexWriter(dir, iwc); 

            String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 "; 

            int rowCount = ObjectCtl.getRowCount(con, sql); 

            int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get("sys.index.size").toString());   //每页记录数 

            int pages = (rowCount - 1) / pageSize + 1; //计算总页数 

            ArrayList list = null; 

            Gk_infoSub gk = null; 

            for (int i = 1; i < pages+1; i++) { 

                long a = System.currentTimeMillis(); 

                list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub()); 

                for (int j = 0; j < list.size(); j++) { 

                    gk = (Gk_infoSub) list.get(j); 

                    Document doc = new Document(); 

                    doc.add(new Field("indexno", StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));//主键不分词 

                    doc.add(new Field("title", StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED)); 

                    doc.add(new Field("describes", StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED)); 

                    doc.add(new Field("pdate", StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));//日期不分词 

                    doc.add(new Field("keywords", StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED)); 

                    writer.addDocument(doc); 

                    ObjectCtl.executeUpdateBySql(con,"UPDATE TABLEA SET SSTAG=1 WHERE indexno='"+gk.getIndexno()+"'");//更新已索引状态 

                } 

   079 

                long b = System.currentTimeMillis(); 

                long c = b - a; 

                System.out.println("[Lucene " + rowCount + "条，" + pages + "页，第" + i + "页花费时间：" + c + "毫秒]"); 

            } 

            writer.commit(); 

   085 

        } catch (Exception e) { 

            e.printStackTrace(); 

        } finally { 

            DBConnector.freecon(con); //释放数据库连接 

            try { 

                if (writer != null) { 

                    writer.close(); 

                } 

            } catch (CorruptIndexException e) { 

                e.printStackTrace(); 

            } catch (IOException e) { 

                e.printStackTrace(); 

            } finally { 

                try { 

                    if (dir != null && IndexWriter.isLocked(dir)) { 

                        IndexWriter.unlock(dir);//注意解锁 

                    } 

                } catch (IOException e) { 

                    e.printStackTrace(); 

                } 

            } 

        } 

        long b1 = System.currentTimeMillis(); 

        long c1 = b1 - a1; 

        System.out.println("[Lucene 执行完毕，花费时间：" + c1 + "毫秒，完成时间：" + newDate() + "]"); 

    } 

   112}

3、单字段查询以及多字段分页查询高亮显示

 
package lucene.util; 

   002 

import org.apache.lucene.store.FSDirectory; 

import org.apache.lucene.store.Directory; 

import org.apache.lucene.search.*; 

import org.apache.lucene.search.highlight.SimpleHTMLFormatter; 

import org.apache.lucene.search.highlight.Highlighter; 

import org.apache.lucene.search.highlight.SimpleFragmenter; 

import org.apache.lucene.search.highlight.QueryScorer; 

import org.apache.lucene.queryParser.QueryParser; 

import org.apache.lucene.queryParser.MultiFieldQueryParser; 

import org.apache.lucene.analysis.TokenStream; 

import org.apache.lucene.analysis.Analyzer; 

import org.apache.lucene.analysis.KeywordAnalyzer; 

import org.apache.lucene.document.Document; 

import org.apache.lucene.index.IndexReader; 

import org.apache.lucene.index.Term; 

import org.apache.lucene.util.Version; 

import modules.gk.Gk_infoSub; 

   020 

import java.util.ArrayList; 

import java.io.File; 

import java.io.StringReader; 

import java.lang.reflect.Constructor; 

   025 

import web.util.StringUtil; 

import web.sys.Globals; 

import org.wltea.analyzer.lucene.IKAnalyzer; 

   029//Wizzer.cn

public class LuceneQuery { 

    private static String indexPath;// 索引生成的目录 

    private int rowCount;// 记录数 

    private int pages;// 总页数 

    private int currentPage;// 当前页数 

    private int pageSize;   //每页记录数 

   036 

    public LuceneQuery() { 

        this.indexPath = Globals.SYS_COM_CONFIG.get("sys.index.path").toString(); 

    } 

   040 

    public int getRowCount() { 

        return rowCount; 

    } 

   044 

    public int getPages() { 

        return pages; 

    } 

   048 

    public int getPageSize() { 

        return pageSize; 

    } 

   052 

    public int getCurrentPage() { 

        return currentPage; 

    } 

   056 

    /** 

     * 函数功能:根据字段查询索引 

     */ 

    public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) { 

        ArrayList list = new ArrayList(); 

        try { 

            if (curpage <= 0) { 

                curpage = 1; 

            } 

            if (pageSize <= 0) { 

                pageSize = 20; 

            } 

            this.pageSize = pageSize;   //每页记录数 

            this.currentPage = curpage;   //当前页 

            int start = (curpage - 1) * pageSize; 

            Directory dir = FSDirectory.open(new File(indexPath)); 

            IndexReader reader = IndexReader.open(dir); 

            IndexSearcher searcher = new IndexSearcher(reader); 

            Analyzer analyzer = new IKAnalyzer(true); 

            QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title", analyzer); 

            queryParser.setDefaultOperator(QueryParser.AND_OPERATOR); 

            Query query = queryParser.parse(keyWord); 

            int hm = start + pageSize; 

            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); 

            searcher.search(query, res); 

   082 

            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>"); 

            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, newQueryScorer(query)); 

            this.rowCount = res.getTotalHits(); 

            this.pages = (rowCount - 1) / pageSize + 1; //计算总页数 

            TopDocs tds = res.topDocs(start, pageSize); 

            ScoreDoc[] sd = tds.scoreDocs; 

            for (int i = 0; i < sd.length; i++) { 

                Document hitDoc = reader.document(sd[i].doc); 

                list.add(createObj(hitDoc, analyzer, highlighter)); 

            } 

   093 

        } catch (Exception e) { 

            e.printStackTrace(); 

        } 

   097 

        return list; 

   099 

    } 

    /** 

     * 函数功能:根据字段查询索引 

     */ 

    public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) { 

        ArrayList list = new ArrayList(); 

        try { 

            if (curpage <= 0) { 

                curpage = 1; 

            } 

            if (pageSize <= 0) { 

                pageSize = 20; 

            } 

            this.pageSize = pageSize;   //每页记录数 

            this.currentPage = curpage;   //当前页 

            int start = (curpage - 1) * pageSize; 

            Directory dir = FSDirectory.open(new File(indexPath)); 

            IndexReader reader = IndexReader.open(dir); 

            IndexSearcher searcher = new IndexSearcher(reader); 

            BooleanQuery bQuery = new BooleanQuery();  //组合查询 

            if (!"".equals(allkeyword)) {//包含全部关键词 

                KeywordAnalyzer analyzer = new KeywordAnalyzer(); 

                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//AND 

                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer); 

                bQuery.add(query, BooleanClause.Occur.MUST);  //AND 

            } 

            if (!"".equals(onekeyword)) { //包含任意关键词 

                Analyzer analyzer = new IKAnalyzer(true); 

                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//OR 

                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer); 

                bQuery.add(query, BooleanClause.Occur.MUST);  //AND 

            } 

            if (!"".equals(nokeyword)) { //排除关键词 

                Analyzer analyzer = new IKAnalyzer(true); 

                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//NOT 

                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer); 

                bQuery.add(query, BooleanClause.Occur.MUST_NOT);  //AND 

   137 

            } 

            int hm = start + pageSize; 

            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); 

            searcher.search(bQuery, res); 

            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>"); 

            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, newQueryScorer(bQuery)); 

            this.rowCount = res.getTotalHits(); 

            this.pages = (rowCount - 1) / pageSize + 1; //计算总页数 

            System.out.println("rowCount:" + rowCount); 

            TopDocs tds = res.topDocs(start, pageSize); 

            ScoreDoc[] sd = tds.scoreDocs; 

            Analyzer analyzer = new IKAnalyzer(); 

            for (int i = 0; i < sd.length; i++) { 

                Document hitDoc = reader.document(sd[i].doc); 

                list.add(createObj(hitDoc, analyzer, highlighter)); 

            } 

   154 

        } catch (Exception e) { 

            e.printStackTrace(); 

        } 

   158 

        return list; 

   160 

    } 

   162 

    /** 

     * 创建返回对象（高亮） 

     */ 

   166 

    private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) { 

   168 

        Gk_infoSub gk = new Gk_infoSub(); 

        try { 

   171 

            if (doc != null) { 

                gk.setIndexno(StringUtil.null2String(doc.get("indexno"))); 

                gk.setPdate(StringUtil.null2String(doc.get("pdate"))); 

                String title = StringUtil.null2String(doc.get("title")); 

                gk.setTitle(title); 

                if (!"".equals(title)) { 

                    highlighter.setTextFragmenter(newSimpleFragmenter(title.length())); 

                    TokenStream tk = analyzer.tokenStream("title", newStringReader(title)); 

                    String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title)); 

                    if (!"".equals(htext)) { 

                        gk.setTitle(htext); 

                    } 

                } 

                String keywords = StringUtil.null2String(doc.get("keywords")); 

                gk.setKeywords(keywords); 

                if (!"".equals(keywords)) { 

                    highlighter.setTextFragmenter(newSimpleFragmenter(keywords.length())); 

                    TokenStream tk = analyzer.tokenStream("keywords", newStringReader(keywords)); 

                    String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords)); 

                    if (!"".equals(htext)) { 

                        gk.setKeywords(htext); 

                    } 

                } 

                String describes = StringUtil.null2String(doc.get("describes")); 

                gk.setDescribes(describes); 

                if (!"".equals(describes)) { 

                    highlighter.setTextFragmenter(newSimpleFragmenter(describes.length())); 

                    TokenStream tk = analyzer.tokenStream("keywords", newStringReader(describes)); 

                    String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes)); 

                    if (!"".equals(htext)) { 

                        gk.setDescribes(htext); 

                    } 

                } 

   205 

            } 

            return gk; 

        } 

        catch (Exception e) { 

   210 

            e.printStackTrace(); 

            return null; 

        } 

        finally { 

            gk = null; 

        } 

   217 

    } 

   219 

    private synchronized static Object createObj(Document doc) { 

   221 

        Gk_infoSub gk = new Gk_infoSub(); 

        try { 

   224 

            if (doc != null) { 

                gk.setIndexno(StringUtil.null2String(doc.get("indexno"))); 

                gk.setPdate(StringUtil.null2String(doc.get("pdate"))); 

                gk.setTitle(StringUtil.null2String(doc.get("title"))); 

                gk.setKeywords(StringUtil.null2String(doc.get("keywords"))); 

                gk.setDescribes(StringUtil.null2String(doc.get("describes"))); 

            } 

            return gk; 

        } 

        catch (Exception e) { 

   235 

            e.printStackTrace(); 

            return null; 

        } 

        finally { 

            gk = null; 

        } 

   242 

    } 

   244}

单字段查询：

 
long a = System.currentTimeMillis(); 

try { 

    int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage"))); 

    int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize"))); 

    String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("title"))); 

    LuceneQuery lu = new LuceneQuery(); 

    form.addResult("list", lu.queryIndexTitle(title, curpage, pagesize)); 

    form.addResult("curPage", lu.getCurrentPage()); 

    form.addResult("pageSize", lu.getPageSize()); 

    form.addResult("rowCount", lu.getRowCount()); 

    form.addResult("pageCount", lu.getPages()); 

} catch (Exception e) { 

    e.printStackTrace(); 

   14}

long b = System.currentTimeMillis(); 

long c = b - a; 

System.out.println("[搜索信息花费时间：" + c + "毫秒]");

多字段查询：

 
long a = System.currentTimeMillis(); 

try { 

    int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage"))); 

    int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize"))); 

    String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("allkeyword"))); 

    String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("onekeyword"))); 

    String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("nokeyword"))); 

    LuceneQuery lu = new LuceneQuery(); 

    form.addResult("list", lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize)); 

    form.addResult("curPage", lu.getCurrentPage()); 

    form.addResult("pageSize", lu.getPageSize()); 

    form.addResult("rowCount", lu.getRowCount()); 

    form.addResult("pageCount", lu.getPages()); 

} catch (Exception e) { 

    e.printStackTrace(); 

   16}

long b = System.currentTimeMillis(); 

long c = b - a; 

System.out.println("[高级检索花费时间：" + c + "毫秒]");

4、Lucene通配符查询

 
   1 BooleanQuery bQuery = new BooleanQuery();  //组合查询 

   2 if (!"".equals(title)) { 

   3     WildcardQuery w1 = new WildcardQuery(new Term("title", title+ "*")); 

   4 

   5     bQuery.add(w1, BooleanClause.Occur.MUST);  //AND 

   6}

   7 int hm = start + pageSize; 

   8 TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); 

   9searcher.search(bQuery, res);

5、Lucene嵌套查询

实现SQL：(unitid like 'unitid%' and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)

 
BooleanQuery bQuery = new BooleanQuery(); 

BooleanQuery b1 = new BooleanQuery(); 

WildcardQuery w1 = new WildcardQuery(new Term("unitid", unitid + "*")); 

WildcardQuery w2 = new WildcardQuery(new Term("idml", id2 + "*")); 

b1.add(w1, BooleanClause.Occur.MUST);//AND 

b1.add(w2, BooleanClause.Occur.MUST);//AND 

bQuery.add(b1, BooleanClause.Occur.SHOULD);//OR 

BooleanQuery b2 = new BooleanQuery(); 

WildcardQuery w3 = new WildcardQuery(new Term("tounitid", unitid + "*")); 

WildcardQuery w4 = new WildcardQuery(new Term("tomlid", id2 + "*")); 

WildcardQuery w5 = new WildcardQuery(new Term("tostate", "1")); 

b2.add(w3, BooleanClause.Occur.MUST);//AND 

b2.add(w4, BooleanClause.Occur.MUST);//AND 

b2.add(w5, BooleanClause.Occur.MUST);//AND 

bQuery.add(b2, BooleanClause.Occur.SHOULD);//OR

6、Lucene先根据时间排序后分页

 
int hm = start + pageSize; 

Sort sort = new Sort(new SortField("pdate", SortField.STRING, true)); 

TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false); 

   04searcher.search(bQuery, res);

this.rowCount = res.getTotalHits(); 

this.pages = (rowCount - 1) / pageSize + 1; //计算总页数 

TopDocs tds =searcher.search(bQuery,rowCount,sort);// res.topDocs(start, pageSize); 

   08ScoreDoc[] sd = tds.scoreDocs;

System.out.println("rowCount:" + rowCount); 

int i=0; 

for (ScoreDoc scoreDoc : sd) { 

    i++; 

    if(i<start){ 

        continue; 

    } 

    if(i>hm){ 

        break; 

    } 

    Document doc = searcher.doc(scoreDoc.doc); 

    list.add(createObj(doc)); 

   21}