Source: http://www.cnblogs.com/suyuan/archive/2008/03/25/1120827.html
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using Lucene.Net;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.KTDictSeg
{
    public class KTDictSegAnalyzer : Analyzer
    {
        public KTDictSegAnalyzer()
        {
        }

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new KTDictSegTokenizer(reader);
            result = new LowerCaseFilter(result);
            return result;
        }
    }
}
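To check what the analyzer actually emits, a small driver like the one below can feed it a string and print every token with its offsets. This is a minimal sketch, assuming the same early Lucene.Net API as the code above (where TokenStream.Next() returns a Token or null); the sample text and the AnalyzerDemo class are only for illustration.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.KTDictSeg;

class AnalyzerDemo
{
    static void Main()
    {
        Analyzer analyzer = new KTDictSegAnalyzer();
        // The field name is arbitrary; KTDictSegAnalyzer ignores it.
        TokenStream stream = analyzer.TokenStream("prodname", new StringReader("Lucene.Net中文分词测试"));
        Token token;
        while ((token = stream.Next()) != null)
        {
            // TermText/StartOffset/EndOffset are the old-style Token accessors.
            Console.WriteLine("{0} [{1},{2})", token.TermText(), token.StartOffset(), token.EndOffset());
        }
    }
}

The tokenizer that produces these tokens, and that does the actual Chinese segmentation, is implemented as follows.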
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Collections;
using Lucene.Net;
using Lucene.Net.Analysis;
using KTDictSeg;

namespace Lucene.Net.Analysis.KTDictSeg
{
    public class KTDictSegTokenizer : Tokenizer
    {
        public static CSimpleDictSeg m_SimpleDictSeg;
        private ArrayList ioBuffer;
        private int offSet = 0;    // Current offset into the input.
        private int position = -1; // Position of the term in the buffer.
        private int length = 0;    // Length of the current term.
        private int start = 0;     // Start offset of the current term.

        public KTDictSegTokenizer(System.IO.TextReader input) : base(input)
        {
            // A third-party Chinese word-segmentation component is used here.
            // ioBuffer = Sj110.Com.Chinese.Tokenizer.Tokenize(input.ReadToEnd());
            if (m_SimpleDictSeg == null)
            {
                try
                {
                    m_SimpleDictSeg = new CSimpleDictSeg();
                    m_SimpleDictSeg.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                    m_SimpleDictSeg.LoadDict();
                }
                catch
                {
                    m_SimpleDictSeg = null;
                    throw;
                }
            }

            m_SimpleDictSeg.FilterStopWords = true;
            m_SimpleDictSeg.MatchName = true;
            ioBuffer = m_SimpleDictSeg.Segment(input.ReadToEnd());
        }

        // Simply put, a DotLucene tokenizer implements Tokenizer's Next method and wraps each
        // segmented word in a Token, because the Token is the basic unit of DotLucene analysis.
        public override Token Next()
        {
            position++;
            if (position < ioBuffer.Count)
            {
                length = ioBuffer[position].ToString().Length;
                start = offSet;
                offSet += length;
                return new Token(ioBuffer[position].ToString(), start, start + length);
            }
            return null;
        }
    }
}
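Note that m_SimpleDictSeg is a static field initialized lazily in the constructor, so two tokenizers created concurrently could both try to load the dictionary. If the indexer is multithreaded, guarding the initialization with a lock is a sensible precaution. The following is a sketch of how the constructor could be reworked; the _lockObj field and the double-checked pattern are additions for illustration, not part of the original component.

private static readonly object _lockObj = new object();

public KTDictSegTokenizer(System.IO.TextReader input) : base(input)
{
    if (m_SimpleDictSeg == null)
    {
        lock (_lockObj)
        {
            // Re-check inside the lock so only one thread loads the dictionary.
            if (m_SimpleDictSeg == null)
            {
                CSimpleDictSeg seg = new CSimpleDictSeg();
                seg.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                seg.LoadDict();
                seg.FilterStopWords = true;
                seg.MatchName = true;
                m_SimpleDictSeg = seg;
            }
        }
    }
    ioBuffer = m_SimpleDictSeg.Segment(input.ReadToEnd());
}

Setting FilterStopWords and MatchName once during initialization, rather than on every construction, also avoids mutating shared state from multiple threads. With the analyzer and tokenizer in place, building the index looks like this: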
private void mackIndex()
{
    Analyzer analyzer = new KTDictSegAnalyzer();
    // Lucene.Net's default analyzer:
    // Analyzer analyzer = new StandardAnalyzer();
    FSDirectory fsDir = FSDirectory.GetDirectory(Index_Store_Path, true);
    IndexWriter fswriter = new IndexWriter(fsDir, analyzer, true);
    ProductDao productDao = new ProductDao();
    // Fetch the data source.
    IList<Product> PList = productDao.GetProduct();
    IEnumerator<Product> _p = PList.GetEnumerator();
    // Build a Document for each record in the data source.
    while (_p.MoveNext())
    {
        Document Doc = new Document();
        Field prodname = new Field("prodname", _p.Current.Proname, Field.Store.YES, Field.Index.TOKENIZED);
        if (_p.Current.Proshuoming == null)
        {
            _p.Current.Proshuoming = "null";
        }
        Field profunction = new Field("profunction", _p.Current.Proshuoming, Field.Store.YES, Field.Index.UN_TOKENIZED);
        Doc.Add(prodname);
        Doc.Add(profunction);
        fswriter.AddDocument(Doc);
    }
    fswriter.Close();
}
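Indexing is only half the story: at query time the same analyzer should be used so the query is segmented the same way as the indexed text. Below is a minimal search sketch against the same era of the Lucene.Net API (IndexSearcher, QueryParser, Hits); it assumes Lucene.Net.Search, Lucene.Net.QueryParsers, and Lucene.Net.Documents are in scope, and Index_Store_Path is the same path used by mackIndex above.

private void searchIndex(string keywords)
{
    IndexSearcher searcher = new IndexSearcher(Index_Store_Path);
    // Use the same analyzer as at index time so query terms match indexed terms.
    QueryParser parser = new QueryParser("prodname", new KTDictSegAnalyzer());
    Query query = parser.Parse(keywords);
    Hits hits = searcher.Search(query);
    for (int i = 0; i < hits.Length(); i++)
    {
        Document doc = hits.Doc(i);
        Console.WriteLine(doc.Get("prodname"));
    }
    searcher.Close();
}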
This article presented KTDictSegAnalyzer, a custom analyzer for Lucene.NET, together with the KTDictSegTokenizer that does its work. The tokenizer delegates word splitting to a third-party Chinese word-segmentation component: it instantiates the CSimpleDictSeg class, loads the dictionary from the configured path, and segments the input text into terms.