注意:Lucene.Net 的版本必须低于 3.0(例如 2.9.x),因为下面的代码使用了 3.0 中已被移除的旧 API(如 TokenStream.Next()、Token.TermText())。
下载地址:Lucene.Net:http://lucenenet.apache.org/ ;PanGu4Lucene(盘古分词):http://pangusegment.codeplex.com/
分词代码:
/// <summary>
/// Splits the given text into terms by running it through a <c>PanGuAnalyzer</c>.
/// </summary>
/// <param name="word">Text to segment. May be null or empty.</param>
/// <returns>
/// The term text of every token the analyzer produced, in stream order.
/// An empty sequence is returned when <paramref name="word"/> is null or empty.
/// </returns>
private IEnumerable<string> SpliteWord(string word)
{
    List<string> terms = new List<string>();
    if (string.IsNullOrEmpty(word))
    {
        // Nothing to segment; this also keeps StringReader from throwing on null.
        return terms;
    }

    Analyzer analyzer = new PanGuAnalyzer();
    // The field name is irrelevant for plain segmentation, so an empty name is passed.
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(word));
    try
    {
        // Pre-3.0 token API: Next() yields one token per call and null once the stream is exhausted.
        Lucene.Net.Analysis.Token token;
        while ((token = tokenStream.Next()) != null)
        {
            terms.Add(token.TermText());
        }
    }
    finally
    {
        // Release the stream (and the reader it wraps) even if tokenization fails.
        tokenStream.Close();
    }

    return terms;
}
建立索引代码:
/// <summary>
/// Writes <paramref name="model"/> into the on-disk Lucene index, replacing any
/// document that was previously indexed under the same id.
/// </summary>
/// <param name="model">Record to index; its Id, Title and Msg are stored as fields.</param>
private void BuildIndex(Model model)
{
    // Build the document before touching the index, so that an invalid field value
    // fails before the index write lock is acquired.
    string idText = model.Id.ToString();
    Document document = new Document();
    // "id" is stored verbatim (not analyzed) so it can be matched exactly when replacing.
    document.Add(new Field("id", idText, Field.Store.YES, Field.Index.NOT_ANALYZED));
    // "title" and "msg" are analyzed (segmented) and keep term positions/offsets.
    document.Add(new Field("title", model.Title, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
    document.Add(new Field("msg", model.Msg, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));

    // Original author's note: the path casing must match the folder on disk, otherwise an error is raised.
    string indexPath = @"C:\Users\Administrator\Desktop\asp.net\articleManager\articleManagerWeb\article";
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        bool indexExists = IndexReader.IndexExists(directory);

        // A stale lock can be left behind when a previous indexing run crashed.
        // Forcing it open is only safe when no other writer is active, so this
        // method must not be run concurrently.
        if (indexExists && IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory);
        }

        // Create a new index only when none exists yet; otherwise append to it.
        IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !indexExists, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            // Delete-then-add in one atomic call; deletes nothing when the id is not indexed yet.
            writer.UpdateDocument(new Term("id", idText), document);
        }
        finally
        {
            // Closing the writer releases the write lock, even when indexing failed.
            writer.Close();
        }
    }
    finally
    {
        directory.Close();
    }
}
查找代码:
/// <summary>
/// Search button handler: segments the text of <c>txtSearch</c>, runs a phrase query
/// against the "msg" field of the on-disk index and binds the hits to <c>Repeater1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void btnSearch_Click(object sender, EventArgs e)
{
    string indexPath = @"C:\Users\Administrator\Desktop\asp.net\articleManager\articleManagerWeb\article";
    string kw = txtSearch.Text;

    // One phrase term per segmented keyword, all matched against the "msg" field.
    PhraseQuery query = new PhraseQuery();
    foreach (var ky in SpliteWord(kw))
    {
        query.Add(new Term("msg", ky));
    }
    // Terms further apart than 100 positions (an empirical value) are not treated as a match.
    query.SetSlop(100);

    List<Model> list = new List<Model>();

    // The index is only read here, so no lock factory is needed.
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    try
    {
        IndexReader reader = IndexReader.Open(directory, true);
        try
        {
            IndexSearcher searcher = new IndexSearcher(reader);
            try
            {
                // The collector is created with a capacity of 1000 hits.
                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);

                ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
                for (int i = 0; i < docs.Length; i++)
                {
                    // A hit only carries the internal document number; the stored
                    // fields have to be loaded with a second lookup.
                    Document doc = searcher.Doc(docs[i].doc);

                    // Only fields indexed with Field.Store.YES can be read back through Get().
                    Model result = new Model();
                    result.Id = Convert.ToInt64(doc.Get("id"));
                    result.Title = doc.Get("title");
                    result.Msg = doc.Get("msg");
                    list.Add(result);
                }
            }
            finally
            {
                searcher.Close();
            }
        }
        finally
        {
            // The reader was opened here, not by the searcher, so close it explicitly.
            reader.Close();
        }
    }
    finally
    {
        directory.Close();
    }

    Repeater1.DataSource = list;
    Repeater1.DataBind();
}