因为工作的需要,不得不研究这一块。我首先考虑过在字符串里通过LIKE来模糊匹配,这样当然可以,可是如果数据量非常大的时候效率会降到多低你也是可以想到的,那么其他的办法就只有Lucene.net了。全文索引和数据库索引基本上都是两步走:第一步,建立索引;第二步,去搜索。
建立索引:
/// <summary>
/// Builds the full-text index at c:/index/ from the rows of an open data reader.
/// Column 0 is read as an int key; columns 1 and 2 are indexed as the value,
/// so each row produces two documents sharing the same key.
/// </summary>
/// <param name="myred">An open SqlDataReader positioned before the first row.</param>
public void CreateIndex(SqlDataReader myred)
{
    // PanGu analyzer is used because the analyzers bundled with Lucene.Net
    // are too limited for Chinese text segmentation.
    Analyzer analyzer = new PanGuAnalyzer();
    IndexWriter writer = null;
    try
    {
        // 'true' recreates the index from scratch on every call.
        writer = new IndexWriter("c:/index/", analyzer, true);
        while (myred.Read())
        {
            // Convert the key once per row instead of once per document.
            string key = Convert.ToString(myred.GetInt32(0));
            AddDocument(ref writer, key, myred.GetString(1));
            AddDocument(ref writer, key, myred.GetString(2));
        }
        writer.Optimize();
    }
    catch (Exception e)
    {
        MessageBox.Show(e.Message);
    }
    finally
    {
        // Always release the index lock, even when indexing fails part-way;
        // the original only closed the writer on the success path.
        if (writer != null)
        {
            writer.Close();
        }
    }
}
/// <summary>
/// Adds one document with two stored, tokenized fields ("key" and "value") to the index.
/// </summary>
/// <param name="writer">Open IndexWriter that receives the document.</param>
/// <param name="_key">Record identifier stored in the "key" field.</param>
/// <param name="_value">Text content stored in the "value" field.</param>
void AddDocument(ref IndexWriter writer, string _key, string _value)
{
try
{
Document document = new Document();
// NOTE(review): the key is TOKENIZED like free text; for exact-id lookups
// UN_TOKENIZED would normally be expected — confirm intended behavior.
document.Add(new Field("key", _key, Field.Store.YES, Field.Index.TOKENIZED));
document.Add(new Field("value", _value, Field.Store.YES, Field.Index.TOKENIZED));
writer.AddDocument(document);
}
catch(Exception ex)
{
// Per-document failures are reported to the user and swallowed so the
// remaining rows still get indexed.
MessageBox.Show(ex.Message.ToString());
}
}搜索: /// <summary>
/// <summary>
/// Searches the index at c:/index/ for the given query string and walks the hits,
/// reading back the stored "key" and "value" fields of each matching document.
/// </summary>
/// <param name="queryString">User query, parsed with the PanGu analyzer.</param>
public void seacher(String queryString)
{
    IndexSearcher mysea = null;
    try
    {
        mysea = new IndexSearcher("c:/index/");
        // Documents are indexed under the "value" field (see AddDocument);
        // the original searched a non-existent "content" field and could
        // therefore never return a hit.
        QueryParser parser = new QueryParser("value", new PanGuAnalyzer());
        Query query = parser.Parse(queryString);
        // The original wrote "Hits=..." without declaring 'hits', which does
        // not compile; declare the result properly.
        Hits hits = mysea.Search(query);
        for (int i = 0; i < hits.Length(); i++)
        {
            int num = hits.Id(i);
            float result = hits.Score(i);
            Document doc = hits.Doc(i);
            string str = doc.Get("key");
            string body = doc.Get("value");
        }
    }
    catch (Exception e)
    {
        MessageBox.Show(e.Message);
        // 'return null;' is illegal in a void method; simply fall through.
    }
    finally
    {
        // Release the searcher's file handles on both success and failure.
        if (mysea != null)
        {
            mysea.Close();
        }
    }
}
盘古分词器:
#region 盘古分词
/// <summary>
/// Lucene analyzer that segments text with the PanGu Chinese tokenizer and
/// then lower-cases every produced token.
/// </summary>
public class PanGuAnalyzer : Analyzer
{
    public PanGuAnalyzer()
    {
    }

    /// <summary>
    /// Builds the token pipeline for a field: PanGu segmentation followed by
    /// a lower-case filter.
    /// </summary>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new LowerCaseFilter(new PanGuTokenizer(reader));
    }
}
/// <summary>
/// Lucene tokenizer backed by the PanGu Chinese word segmenter. The entire
/// input is read and segmented eagerly in the constructor; Next() then replays
/// the pre-computed word list one token at a time.
/// </summary>
public class PanGuTokenizer : Tokenizer
{
// Guards the one-time, process-wide initialization of the PanGu segmenter.
static object _LockObj = new object();
static bool _Inited = false;
// Words produced by segmenting the input; filled by the TextReader ctor.
WordInfo[] _WordList;
// Index of the word most recently returned by Next(); -1 before the first call.
int _Position = -1;
string _InputText;
// Initializes the PanGu segmenter exactly once per process. Callers must hold
// _LockObj; _Inited is only ever read and written while that lock is held.
static private void InitPanGuSegment()
{
if (!_Inited)
{
PanGu.Segment.Init();
_Inited = true;
}
}
// NOTE(review): this ctor leaves _InputText and _WordList null, so calling
// Next() on an instance built this way would throw — confirm it is only
// used where Lucene never pulls tokens from it.
public PanGuTokenizer()
{
lock (_LockObj)
{
InitPanGuSegment();
}
}
// Reads the whole input and segments it up front with PanGu.
public PanGuTokenizer(TextReader input): base(input)
{
lock (_LockObj)
{
InitPanGuSegment();
}
_InputText = base.input.ReadToEnd();
if (string.IsNullOrEmpty(_InputText))
{
_WordList = new WordInfo[0];
}
else
{
PanGu.Segment segment = new Segment();
ICollection<WordInfo> wordInfos = segment.DoSegment(_InputText);
_WordList = new WordInfo[wordInfos.Count];
wordInfos.CopyTo(_WordList, 0);
}
}
/// <summary>
/// Returns the next token from the pre-segmented word list, skipping null
/// entries, or null when the list is exhausted.
/// </summary>
public override Lucene.Net.Analysis.Token Next()
{
int length = 0; // length of the current word
int start = 0; // start offset of the current word in the input
while (true)
{
_Position++;
if (_Position < _WordList.Length)
{
if (_WordList[_Position] != null)
{
length = _WordList[_Position].Word.Length;
start = _WordList[_Position].Position;
return new Lucene.Net.Analysis.Token(_WordList[_Position].Word, start, start + length);
}
}
else
{
break;
}
}
// Exhausted: drop the buffered text and signal end-of-stream to Lucene.
_InputText = null;
return null;
}
}
#endregion
另外需要加的命名空间:
using Lucene.Net;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Documents;
using Lucene.Net.Util;
using Lucene.Net.Store;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using PanGu;
关于盘古分词器,大家需要到官网下载,然后同时把它提供的词库拿过来,放到Debug文件夹下,不然它会提示错误,找不到文件 Dict/Dict.dct。这样,分词就基本上完成了。
为了提高大量数据的检索效率,文章介绍了如何利用Lucene.net代替传统的LIKE模糊匹配方法来建立全文索引。索引过程包括分词等步骤,从而实现快速搜索。
276

被折叠的 条评论
为什么被折叠?



