.net lucene 实战搜索(二)----- 基本之索引

本文介绍Lucene的索引原理及实现方法,包括索引文件结构、文档库模型、索引参数配置、索引器功能及使用示例。
 

也许很多朋友还没有接触过 Lucene,但没关系,你可以把它简单地看作一个“数据库”,或者称为“文档库”更合适。

这也是为什么我们网站有一次数据库服务器不知被谁踢掉了网线,网站却仍然正常运行了 1 天都没人发现,-_-!

既然 Lucene 是文档库结构模型(不知道这样称呼是否合适),那我们先来了解 Lucene 的一些基本组成:

整个索引文件可以认为是一张大表,实际上也确实如此;Lucene 目前的版本是以倒排索引的方式来存储索引的。

Hits :命中文档集合,可以认为是行集

Document:文档,一行数据。

Field:一个单元数据包括名称和它的值

了解了上面这些之后,让我们从程序中看看是怎么回事儿:
(注意:以下代码都是基于 Lucene 2.0 版本的)

    public class Formater
    {
        /// <summary>
        /// 时间格式化
        /// </summary>
        /// <param name="time"></param>
        /// <returns></returns>
        public static string FormatTime(DateTime time)
        {
            return time.ToString("yyyyMMddhhmmss");
        }


        /// <summary>
        /// 时间格式化
        /// </summary>
        /// <param name="time"></param>
        /// <returns></returns>
        public static string FormatTime(string  str)
        {
            DateTime time = new DateTime(2000, 1, 1);
            return FormatTime(time);
        }

        // 格式化文字类型
        public static string FormatNum(string s)
        {
            if (s.Length > 9) return s;
            return s.PadLeft(9, '0');
        }

        /// <summary>
        /// 格式化搜索参数
        /// </summary>
        /// <param name="parm"></param>
        /// <returns></returns>
        public static Query FormatSearchItem(SearchParameter parm)
        {
            List<Query> list = new List<Query>();
            Query squery = null;

            foreach (QueryItem item in parm.QueryItems)
            {
                item.FieldName = item.FieldName.ToUpper();
                if (item.IsToLower)
                {
                    item.Value = item.Value.ToLower();
                    item.Value_1 = item.Value_1.ToLower();
                }
                QueryParser queryparser;
                Query query;
                switch (item.Type)
                {
                    case QueryItemType.Or:
                        {
                            queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                            queryparser.SetDefaultOperator(QueryParser.Operator.OR);
                            query = queryparser.Parse(item.Value);
                            break;
                        }
                    case QueryItemType.Range:
                        {
                            queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                            queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                            query = queryparser.Parse(string.Format("{0}:[{1} TO {2}]", item.FieldName, Format(item.Value)));
                            break;
                        }
                    case QueryItemType.Fuzzy:
                        {
                            queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                            queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                            query = queryparser.Parse(item.Value + "~");
                            break;
                        }
                    case QueryItemType.Wildcard:
                        {
                            queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                            queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                            query = queryparser.Parse(item.Value + "*");
                            break;
                        }
                    case QueryItemType.TimeRange:
                        {
                            queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                            queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                            query = queryparser.Parse(string.Format("{0}:[{1} TO {2}]", item.FieldName, item.Value, item.Value_1));
                            break;
                        }
                    default:
                        {
                            queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                            queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                            query = queryparser.Parse(item.Value);
                            break;
                        }

                }
                if (squery == null)
                {
                    squery = query;
                }
                list.Add(query);
            }
            squery = squery.Combine(list.ToArray());
            return squery;
        }

        /// <summary>
        /// 获取分词器
        /// </summary>
        /// <param name="type"></param>
        /// <returns></returns>
        public static Analyzer GetAnalyzer(AnalyzerEnum type)
        {
            switch (type)
            {
                case AnalyzerEnum.ChineseAnalyzer:
                    {
                        return new ChineseAnalyzer();
                    }
                case AnalyzerEnum.DoubleWordAnalyzer:
                    {
                        return new DoubleWordAnalyzer();
                    }
                case AnalyzerEnum.ChineseWordAnalyzer:
                    {
                        return new ChineseWordAnalyzer();
                    }
                case AnalyzerEnum.CustomAnalyzer:
                    {
                        return new CustomAnalyzer();
                    }
                case AnalyzerEnum.SmartSegmentAnalyzer:
                    {
                        return new SmartSegmentAnalyzer();
                    }
                default:
                    {
                        return new StandardAnalyzer();
                    }
            }
        }
        // 格式化文字类型
        public static string Format(string s)
        {
            int temp = 0;
            int.TryParse(s, out temp);
            temp = temp * 100;
            s = temp.ToString();

            if (s.Length > 8) return s;
            return s.PadLeft(8, '0');
        }
    }


由于有时候我们需要对结果进行排序,但 Lucene 只能按字符来排序,所以我们必须将数字和时间转化成可排序的字符串。

索引参数:

 public enum AnalyzerEnum : int
    {
        StandardAnalyzer,
        ChineseAnalyzer,
        DoubleWordAnalyzer,
        ChineseWordAnalyzer,
        CustomAnalyzer,
        SmartSegmentAnalyzer

    }
    [Serializable]
    public class IndexParameter
    {
        private AnalyzerEnum _analyzertype;
        private string _indexdir;
        private bool _rebuildIndex;

        private int maxFieldLength = 999999; // 字段最大长度
        private int mergeFactor = 999999;
        private int minMergeDocs = 1000;
        private int maxMergeDocs = 99999999;

        public int MaxMergeDocs
        {
            get { return maxMergeDocs; }
            set { maxMergeDocs = value; }
        }


        public int MaxFieldLength
        {
            get { return maxFieldLength; }
            set { maxFieldLength = value; }
        }

        public int MergeFactor
        {
            get { return mergeFactor; }
            set { mergeFactor = value; }
        }

        public int MinMergeDocs
        {
            get { return minMergeDocs; }
            set { minMergeDocs = value; }
        }

        public AnalyzerEnum AnalyzerType
        {
            set { _analyzertype = value; }
            get { return _analyzertype; }
        }

        /// <summary>
        /// 索引目录
        /// </summary>
        public string IndexDir
        {
            get { return _indexdir; }
            set { _indexdir = value; }
        }

        /// <summary>
        /// 是否重新,还是增量索引
        /// </summary>
        public bool RebuildIndex
        {
            get { return _rebuildIndex; }
            set { _rebuildIndex = value; }
        }


    }


索引器:

    public class Index
    {


        private IndexParameter param;

        public IndexParameter Param
        {
            get
            {
                return param;
            }
            set
            {
                param = value;
            }
        }

        public Index(IndexParameter mParam)
        {
            this.param = mParam;
        }

        /// <summary>
        /// 删除索引
        /// </summary>
        /// <param name="slist"></param>
        /// <returns></returns>
        public bool DelelteIndex(List<int> slist)
        {
            if (!IndexReader.IndexExists(this.param.IndexDir)) return false;
            IndexReader reader = IndexReader.Open(this.param.IndexDir);

            try
            {
                foreach (int item in slist)
                {
                    reader.DeleteDocument(item);
                }

            }

            finally
            {
                reader.Close();
            }

            return true;
        }

        /// <summary>
        /// 删除索引
        /// </summary>
        /// <param name="sfield"></param>
        /// <param name="svalue"></param>
        /// <returns></returns>
        public bool DelelteIndex(string sfield, string svalue)
        {
            if (!IndexReader.IndexExists(this.param.IndexDir)) return false;

            IndexReader reader = IndexReader.Open(this.param.IndexDir);

            try
            {
                reader.DeleteDocuments(new Term(sfield, svalue));
            }
            catch
            {
                return true;
            }
            finally
            {
                reader.Close();
            }

            return true;
        }

        /// <summary>
        /// 删除
        /// </summary>
        public void EnableChanged()
        {
            IndexWriter indexwriter = null;
            try
            {
                if (!IndexReader.IndexExists(this.param.IndexDir)) return;
                indexwriter = new IndexWriter(this.param.IndexDir, Formater.GetAnalyzer(this.param.AnalyzerType), false);
                indexwriter.Optimize();
            }
            finally
            {
                if (indexwriter != null)
                {
                    indexwriter.Close();
                }
            }
        }

        /// <summary>
        /// 建立索引
        /// </summary>
        /// <param name="items"></param>
        public void BuildIndex(List<List<IndexItem>> items)
        {
            lock (this)
            {
                IndexWriter indexwriter = null;

                try
                {
                    if (this.param == null)
                    {
                        throw new Exception("缺少建立索引参数,param 为空!");
                    }
                    if (!IndexReader.IndexExists(this.param.IndexDir) || this.param.RebuildIndex)
                    {
                        indexwriter = new IndexWriter(this.param.IndexDir, Formater.GetAnalyzer(this.param.AnalyzerType), true);
                    }
                    else
                    {
                        indexwriter = new IndexWriter(this.param.IndexDir, Formater.GetAnalyzer(this.param.AnalyzerType), false);
                    }

                    RAMDirectory _ramDir = new RAMDirectory();
                    IndexWriter _ramWriter = new IndexWriter(_ramDir, Formater.GetAnalyzer(this.param.AnalyzerType), true);

                    _ramWriter.SetMaxFieldLength(this.param.MaxFieldLength);
                    _ramWriter.SetMergeFactor(this.param.MergeFactor);
                    _ramWriter.SetMaxMergeDocs(this.param.MinMergeDocs);
                    _ramWriter.SetMaxMergeDocs(this.param.MaxMergeDocs);

                    indexwriter.SetMaxFieldLength(this.param.MaxFieldLength);
                    indexwriter.SetMergeFactor(this.param.MergeFactor);
                    indexwriter.SetMaxMergeDocs(this.param.MinMergeDocs);
                    indexwriter.SetMaxMergeDocs(this.param.MaxMergeDocs);

                    foreach (List<IndexItem> item in items)
                    {
                        _ramWriter.AddDocument(Doc(item));
                    }
                    _ramWriter.Close();
                    indexwriter.AddIndexes(new Directory[] { _ramDir });
                }
                catch (Exception err)
                {
                    new Log().WriteLog(err.ToString());
                }
                finally
                {
                    if (indexwriter != null)
                    {
                        indexwriter.Optimize();
                        indexwriter.Close();
                    }
                }
            }
        }

        private Document Doc(List<IndexItem> items)
        {
            Document doc = new Document();

 

            for (int i = 0; i < items.Count; i++)
            {
                items[i].FieldName = items[i].FieldName.ToUpper();
                if (items[i].IsToLower)
                {
                    items[i].FieldVale = items[i].FieldVale.ToLower();
                }
                Field file;
                switch (items[i].Type)
                {
                    case IndexType.KeyWord:
                        {
                            doc.Add(new Field(items[i].FieldName, items[i].FieldVale, Field.Store.YES, Field.Index.UN_TOKENIZED));
                            break;
                        }
                    case IndexType.Text:
                        {
                            doc.Add(new Field(items[i].FieldName, items[i].FieldVale, Field.Store.YES, Field.Index.TOKENIZED));
                            break;
                        }
                    case IndexType.UnIndexed:
                        {
                            doc.Add(new Field(items[i].FieldName, items[i].FieldVale, Field.Store.NO, Field.Index.UN_TOKENIZED));
                            break;
                        }
                    case IndexType.UnStored:
                        {
                            doc.Add(new Field(items[i].FieldName, items[i].FieldVale, Field.Store.NO, Field.Index.TOKENIZED));
                            break;
                        }
                    default:
                        {
                            doc.Add(new Field(items[i].FieldName, items[i].FieldVale, Field.Store.YES, Field.Index.TOKENIZED));
                            break;
                        }
                }
            }
            return doc;
        }
    }

索引封装参数:

/// <summary>
/// How a field item is to be added to a document. Index.Doc switches on this
/// value to choose the Field.Store / Field.Index flags for the field.
/// NOTE(review): the member names look like the Lucene 1.4 Field factory
/// methods (Keyword, UnIndexed, UnStored, Text) — confirm the intended meaning.
/// </summary>
public enum IndexType
    {
        // Zero value, so it is what an unassigned IndexType field holds.
        KeyWord = 0,
        UnIndexed = 1,
        UnStored = 2,
        Text = 3
    }
    [Serializable]
    public class IndexItem
    {
        private string fieldName = string.Empty;
        private string fieldValue = string.Empty;
        private IndexType type;
        private bool isToLower = true;

        /// <summary>
        /// 索引名称
        /// </summary>
        public string FieldName
        {
            get
            {
                return fieldName;
            }
            set
            {
                fieldName = value;
            }
        }

        /// <summary>
        /// 索引值
        /// </summary>
        public string FieldVale
        {
            get
            {
                return fieldValue;
            }
            set
            {
                fieldValue = value;
            }
        }

        /// <summary>
        /// 索引类型
        /// </summary>
        public IndexType Type
        {
            get
            {
                return type;
            }
            set
            {
                type = value;
            }
        }

        /// <summary>
        /// 是否允许转成小写
        /// </summary>
        public bool IsToLower
        {
            get { return isToLower; }
            set { isToLower = value; }
        }
    }


注意:Lucene 不支持更新文档的功能,所以只能先删除、再重新添加;另外,被删除的文档必须在 Optimize 之后才会真正被删掉。
以上是通过 BuildIndex 方法来索引数据;Lucene 支持增量索引数据,RebuildIndex 参数就是用来控制是重建索引还是增量索引的。
StandardAnalyzer,
        ChineseAnalyzer,
        DoubleWordAnalyzer,
        ChineseWordAnalyzer,
        CustomAnalyzer,
        SmartSegmentAnalyzer

是分词器,以后将讲述到。

应用举例:

/// <summary>
/// Usage example: indexes every keyword row as one document with the fields
/// "sword", "scount" and "rcount", rebuilding the index in the temp directory.
/// </summary>
public void BuildIndex()
        {
            List<List<IndexItem>> list = new List<List<IndexItem>>();
            DataTable mdt = GetKeywords();

            foreach (DataRow dr in mdt.Rows)
            {
                List<IndexItem> slist = new List<IndexItem>();
                IndexItem item;

                // "sword" is added once per document; the earlier version repeated
                // this block verbatim, which put the same field into every
                // document twice.
                item = new IndexItem();
                item.FieldName = "sword";
                item.FieldVale = dr["SWord"].ToString();
                item.Type = IndexType.Text;
                slist.Add(item);

                // NOTE(review): the counts are indexed as raw strings; if they are
                // meant to be sorted on, they probably need zero-padding first
                // (e.g. Formater.FormatNum) — confirm against the search side.
                item = new IndexItem();
                item.FieldName = "scount";
                item.FieldVale = dr["scount"].ToString();
                item.Type = IndexType.KeyWord;
                slist.Add(item);

                item = new IndexItem();
                item.FieldName = "rcount";
                item.FieldVale = dr["rcount"].ToString();
                item.Type = IndexType.KeyWord;
                slist.Add(item);

                list.Add(slist);
            }

            IndexParameter parm = new IndexParameter();
            parm.AnalyzerType = AnalyzerEnum.ChineseAnalyzer;
            parm.IndexDir = this.Item.TempDir;
            parm.RebuildIndex = true;
            new Index(parm).BuildIndex(list);
        }



以上就是 Lucene 索引文档的方法。实际上,这些内容很多地方都有介绍,笔者只是将它们重新封装了一下,并侧重说明其中值得注意的地方。
不明白的地方,可以自己看看 Lucene 的帮助文档和 demo,不过 demo 写得太业余了,就将就着看吧。

http://lucene.apache.org/这里可以下载lucene .net,和索引察看工具nukeall
.net lucene 实战搜索(三)----- 基本之搜索 : http://www.cnblogs.com/xuwenzhuo/archive/2007/10/11/918246.html

转载于:https://www.cnblogs.com/xuwenzhuo/archive/2007/10/06/915280.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值