因为工作的需要,不得不研究这一块。我首先考虑过在字符串里通过LIKE来模糊匹配,这样当然可以,可是如果数据量非常大的时候效率会降到多低你也是可以想到的,那么其他的办法就只有Lucene.net了。全文索引和数据库索引基本上都是两步走:第一步,建立索引;第二步,去搜索。
建立索引:
/// <summary>
/// Builds the full-text index at c:/index/ from the rows of an open data reader.
/// Column 0 is read as an int key; columns 1 and 2 are indexed as the value,
/// so each row produces two documents sharing the same key.
/// </summary>
/// <param name="myred">An open SqlDataReader positioned before the first row.</param>
public void CreateIndex(SqlDataReader myred)
{
    // PanGu analyzer is used because the analyzers bundled with Lucene.Net
    // are too limited for Chinese text segmentation.
    Analyzer analyzer = new PanGuAnalyzer();
    IndexWriter writer = null;
    try
    {
        // 'true' recreates the index from scratch on every call.
        writer = new IndexWriter("c:/index/", analyzer, true);
        while (myred.Read())
        {
            // Convert the key once per row instead of once per document.
            string key = Convert.ToString(myred.GetInt32(0));
            AddDocument(ref writer, key, myred.GetString(1));
            AddDocument(ref writer, key, myred.GetString(2));
        }
        writer.Optimize();
    }
    catch (Exception e)
    {
        MessageBox.Show(e.Message);
    }
    finally
    {
        // Always release the index lock, even when indexing fails part-way;
        // the original only closed the writer on the success path.
        if (writer != null)
        {
            writer.Close();
        }
    }
}
/// <summary>
/// Adds one document with two stored, tokenized fields ("key" and "value") to the index.
/// </summary>
/// <param name="writer">Open IndexWriter that receives the document.</param>
/// <param name="_key">Record identifier stored in the "key" field.</param>
/// <param name="_value">Text content stored in the "value" field.</param>
void AddDocument(ref IndexWriter writer, string _key, string _value)
{
try
{
Document document = new Document();
// NOTE(review): the key is TOKENIZED like free text; for exact-id lookups
// UN_TOKENIZED would normally be expected — confirm intended behavior.
document.Add(new Field("key", _key, Field.Store.YES, Field.Index.TOKENIZED));
document.Add(new Field("value", _value, Field.Store.YES, Field.Index.TOKENIZED));
writer.AddDocument(document);
}
catch(Exception ex)
{
// Per-document failures are reported to the user and swallowed so the
// remaining rows still get indexed.
MessageBox.Show(ex.Message.ToString());
}
}搜索: /// <summary>
/// <summary>
/// Searches the index at c:/index/ for the given query string and walks the hits,
/// reading back the stored "key" and "value" fields of each matching document.
/// </summary>
/// <param name="queryString">User query, parsed with the PanGu analyzer.</param>
public void seacher(String queryString)
{
    IndexSearcher mysea = null;
    try
    {
        mysea = new IndexSearcher("c:/index/");
        // Documents are indexed under the "value" field (see AddDocument);
        // the original searched a non-existent "content" field and could
        // therefore never return a hit.
        QueryParser parser = new QueryParser("value", new PanGuAnalyzer());
        Query query = parser.Parse(queryString);
        // The original wrote "Hits=..." without declaring 'hits', which does
        // not compile; declare the result properly.
        Hits hits = mysea.Search(query);
        for (int i = 0; i < hits.Length(); i++)
        {
            int num = hits.Id(i);
            float result = hits.Score(i);
            Document doc = hits.Doc(i);
            string str = doc.Get("key");
            string body = doc.Get("value");
        }
    }
    catch (Exception e)
    {
        MessageBox.Show(e.Message);
        // 'return null;' is illegal in a void method; simply fall through.
    }
    finally
    {
        // Release the searcher's file handles on both success and failure.
        if (mysea != null)
        {
            mysea.Close();
        }
    }
}
盘古分词器:
#region 盘古分词
/// <summary>
/// Lucene analyzer that segments text with the PanGu Chinese tokenizer and
/// then lower-cases every produced token.
/// </summary>
public class PanGuAnalyzer : Analyzer
{
    public PanGuAnalyzer()
    {
    }

    /// <summary>
    /// Builds the token pipeline for a field: PanGu segmentation followed by
    /// a lower-case filter.
    /// </summary>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new LowerCaseFilter(new PanGuTokenizer(reader));
    }
}
/// <summary>
/// Lucene tokenizer backed by the PanGu Chinese word segmenter. The entire
/// input is read and segmented eagerly in the constructor; Next() then replays
/// the pre-computed word list one token at a time.
/// </summary>
public class PanGuTokenizer : Tokenizer
{
// Guards the one-time, process-wide initialization of the PanGu segmenter.
static object _LockObj = new object();
static bool _Inited = false;
// Words produced by segmenting the input; filled by the TextReader ctor.
WordInfo[] _WordList;
// Index of the word most recently returned by Next(); -1 before the first call.
int _Position = -1;
string _InputText;
// Initializes the PanGu segmenter exactly once per process. Callers must hold
// _LockObj; _Inited is only ever read and written while that lock is held.
static private void InitPanGuSegment()
{
if (!_Inited)
{
PanGu.Segment.Init();
_Inited = true;
}
}
// NOTE(review): this ctor leaves _InputText and _WordList null, so calling
// Next() on an instance built this way would throw — confirm it is only
// used where Lucene never pulls tokens from it.
public PanGuTokenizer()
{
lock (_LockObj)
{
InitPanGuSegment();
}
}
// Reads the whole input and segments it up front with PanGu.
public PanGuTokenizer(TextReader input): base(input)
{
lock (_LockObj)
{
InitPanGuSegment();
}
_InputText = base.input.ReadToEnd();
if (string.IsNullOrEmpty(_InputText))
{
_WordList = new WordInfo[0];
}
else
{
PanGu.Segment segment = new Segment();
ICollection<WordInfo> wordInfos = segment.DoSegment(_InputText);
_WordList = new WordInfo[wordInfos.Count];
wordInfos.CopyTo(_WordList, 0);
}
}
/// <summary>
/// Returns the next token from the pre-segmented word list, skipping null
/// entries, or null when the list is exhausted.
/// </summary>
public override Lucene.Net.Analysis.Token Next()
{
int length = 0; // length of the current word
int start = 0; // start offset of the current word in the input
while (true)
{
_Position++;
if (_Position < _WordList.Length)
{
if (_WordList[_Position] != null)
{
length = _WordList[_Position].Word.Length;
start = _WordList[_Position].Position;
return new Lucene.Net.Analysis.Token(_WordList[_Position].Word, start, start + length);
}
}
else
{
break;
}
}
// Exhausted: drop the buffered text and signal end-of-stream to Lucene.
_InputText = null;
return null;
}
}
#endregion
另外需要加的命名空间:
using Lucene.Net;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Documents;
using Lucene.Net.Util;
using Lucene.Net.Store;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using PanGu;
关于盘古分词器,大家需要到官网下载,然后同时把它提供的词库拿过来,放到Debug文件夹下,不然它会提示错误,找不到文件 Dict/Dict.dct。这样,分词就基本上完成了。
为了提高大量数据的检索效率,文章介绍了如何利用Lucene.net代替传统的LIKE模糊匹配方法来建立全文索引。索引过程包括分词等步骤,从而实现快速搜索。
276

被折叠的 条评论
为什么被折叠?



