Source: http://www.cnblogs.com/suyuan/archive/2008/03/25/1120827.html
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using Lucene.Net;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.KTDictSeg
{
    public class KTDictSegAnalyzer : Analyzer
    {
        public KTDictSegAnalyzer()
        {
        }

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new KTDictSegTokenizer(reader);
            result = new LowerCaseFilter(result);
            return result;
        }
    }
}
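To check what the analyzer actually emits, a small driver like the one below can feed it a string and print every token with its offsets. This is a minimal sketch, assuming the same early Lucene.Net API as the code above (where TokenStream.Next() returns a Token or null); the sample text and the AnalyzerDemo class are only for illustration.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.KTDictSeg;

class AnalyzerDemo
{
    static void Main()
    {
        Analyzer analyzer = new KTDictSegAnalyzer();
        // The field name is arbitrary; KTDictSegAnalyzer ignores it.
        TokenStream stream = analyzer.TokenStream("prodname", new StringReader("Lucene.Net中文分词测试"));
        Token token;
        while ((token = stream.Next()) != null)
        {
            // TermText/StartOffset/EndOffset are the old-style Token accessors.
            Console.WriteLine("{0} [{1},{2})", token.TermText(), token.StartOffset(), token.EndOffset());
        }
    }
}

The tokenizer that produces these tokens, and that does the actual Chinese segmentation, is implemented as follows.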
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Collections;
using Lucene.Net;
using Lucene.Net.Analysis;
using KTDictSeg;

namespace Lucene.Net.Analysis.KTDictSeg
{
    public class KTDictSegTokenizer : Tokenizer
    {
        public static CSimpleDictSeg m_SimpleDictSeg;
        private ArrayList ioBuffer;
        private int offSet = 0;    // Current offset into the input.
        private int position = -1; // Position of the term in the buffer.
        private int length = 0;    // Length of the current term.
        private int start = 0;     // Start offset of the current term.

        public KTDictSegTokenizer(System.IO.TextReader input) : base(input)
        {
            // A third-party Chinese word-segmentation component is used here.
            // ioBuffer = Sj110.Com.Chinese.Tokenizer.Tokenize(input.ReadToEnd());
            if (m_SimpleDictSeg == null)
            {
                try
                {
                    m_SimpleDictSeg = new CSimpleDictSeg();
                    m_SimpleDictSeg.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                    m_SimpleDictSeg.LoadDict();
                }
                catch
                {
                    m_SimpleDictSeg = null;
                    throw;
                }
            }

            m_SimpleDictSeg.FilterStopWords = true;
            m_SimpleDictSeg.MatchName = true;
            ioBuffer = m_SimpleDictSeg.Segment(input.ReadToEnd());
        }

        // Simply put, a DotLucene tokenizer implements Tokenizer's Next method and wraps each
        // segmented word in a Token, because the Token is the basic unit of DotLucene analysis.
        public override Token Next()
        {
            position++;
            if (position < ioBuffer.Count)
            {
                length = ioBuffer[position].ToString().Length;
                start = offSet;
                offSet += length;
                return new Token(ioBuffer[position].ToString(), start, start + length);
            }
            return null;
        }
    }
}
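Note that m_SimpleDictSeg is a static field initialized lazily in the constructor, so two tokenizers created concurrently could both try to load the dictionary. If the indexer is multithreaded, guarding the initialization with a lock is a sensible precaution. The following is a sketch of how the constructor could be reworked; the _lockObj field and the double-checked pattern are additions for illustration, not part of the original component.

private static readonly object _lockObj = new object();

public KTDictSegTokenizer(System.IO.TextReader input) : base(input)
{
    if (m_SimpleDictSeg == null)
    {
        lock (_lockObj)
        {
            // Re-check inside the lock so only one thread loads the dictionary.
            if (m_SimpleDictSeg == null)
            {
                CSimpleDictSeg seg = new CSimpleDictSeg();
                seg.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                seg.LoadDict();
                seg.FilterStopWords = true;
                seg.MatchName = true;
                m_SimpleDictSeg = seg;
            }
        }
    }
    ioBuffer = m_SimpleDictSeg.Segment(input.ReadToEnd());
}

Setting FilterStopWords and MatchName once during initialization, rather than on every construction, also avoids mutating shared state from multiple threads. With the analyzer and tokenizer in place, building the index looks like this: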
private void mackIndex()
{
    Analyzer analyzer = new KTDictSegAnalyzer();
    // Lucene.Net's default analyzer:
    // Analyzer analyzer = new StandardAnalyzer();
    FSDirectory fsDir = FSDirectory.GetDirectory(Index_Store_Path, true);
    IndexWriter fswriter = new IndexWriter(fsDir, analyzer, true);
    ProductDao productDao = new ProductDao();
    // Fetch the data source.
    IList<Product> PList = productDao.GetProduct();
    IEnumerator<Product> _p = PList.GetEnumerator();
    // Build a Document for each record in the data source.
    while (_p.MoveNext())
    {
        Document Doc = new Document();
        Field prodname = new Field("prodname", _p.Current.Proname, Field.Store.YES, Field.Index.TOKENIZED);
        if (_p.Current.Proshuoming == null)
        {
            _p.Current.Proshuoming = "null";
        }
        Field profunction = new Field("profunction", _p.Current.Proshuoming, Field.Store.YES, Field.Index.UN_TOKENIZED);
        Doc.Add(prodname);
        Doc.Add(profunction);
        fswriter.AddDocument(Doc);
    }
    fswriter.Close();
}
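Indexing is only half the story: at query time the same analyzer should be used so the query is segmented the same way as the indexed text. Below is a minimal search sketch against the same era of the Lucene.Net API (IndexSearcher, QueryParser, Hits); it assumes Lucene.Net.Search, Lucene.Net.QueryParsers, and Lucene.Net.Documents are in scope, and Index_Store_Path is the same path used by mackIndex above.

private void searchIndex(string keywords)
{
    IndexSearcher searcher = new IndexSearcher(Index_Store_Path);
    // Use the same analyzer as at index time so query terms match indexed terms.
    QueryParser parser = new QueryParser("prodname", new KTDictSegAnalyzer());
    Query query = parser.Parse(keywords);
    Hits hits = searcher.Search(query);
    for (int i = 0; i < hits.Length(); i++)
    {
        Document doc = hits.Doc(i);
        Console.WriteLine(doc.Get("prodname"));
    }
    searcher.Close();
}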
This article presented KTDictSegAnalyzer, a custom analyzer for Lucene.NET, together with the KTDictSegTokenizer that does its work. The tokenizer delegates word splitting to a third-party Chinese word-segmentation component: it instantiates the CSimpleDictSeg class, loads the dictionary from the configured path, and segments the input text into terms.