I've been meaning to write a blog but never knew what to write about (it's only recently that I started thinking I should have a blog of my own). So here is a little something I once used in a project. Just my two cents :).
When you use Lucene, searching for a term like 'c++' returns an empty result. The cause lies in how the index was built: indexing requires tokenization, and the reason you can't find 'C++' is that no token like 'c++' was ever added to the index.
So let's get to work. Following the way English text is tokenized, I make every special character a token of its own, so 'c++' is split into three tokens: 'c', '+', '+', which then go into the index.
using System;
using System.IO;
using System.Globalization;
using Lucene.Net.Analysis;

public sealed class ChineseTokenizer : Tokenizer
{
    public ChineseTokenizer(TextReader _in)
    {
        input = _in;
    }

    private int offset = 0;                      // current offset into the input stream
    private int bufferIndex = 0;                 // position of the next character in the I/O buffer
    private int dataLen = 0;                     // number of characters currently in the I/O buffer
    private static int MAX_WORD_LEN = 255;
    private static int IO_BUFFER_SIZE = 1024;
    private char[] buffer = new char[MAX_WORD_LEN];
    private char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private int length;                          // length of the current token
    private int start;                           // start offset of the current token

    private void Push(char c)
    {
        if (length == 0) start = offset - 1;     // record where the token starts
        buffer[length++] = Char.ToLower(c);      // lower-case and append to the token buffer
    }

    private Token Flush()
    {
        if (length > 0)
            return new Token(new String(buffer, 0, length), start, start + length);
        else
            return null;
    }

    public override Token Next()
    {
        length = 0;
        start = offset;

        while (true)
        {
            char c;
            offset++;

            if (bufferIndex >= dataLen)
            {
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                bufferIndex = 0;
            }

            if (dataLen == 0) return Flush();
            else
                c = ioBuffer[bufferIndex++];

            // Exclude the half-width and full-width ',' and ';' from tokenization
            if (c == ',' || c == ',' || c == ';' || c == ';')
            {
                if (length > 0) return Flush();
                continue;
            }

            switch (Char.GetUnicodeCategory(c))
            {
                case UnicodeCategory.DecimalDigitNumber: // decimal digit, 0-9
                case UnicodeCategory.LowercaseLetter:    // lowercase letter
                case UnicodeCategory.UppercaseLetter:    // uppercase letter
                    Push(c);
                    if (length == MAX_WORD_LEN) return Flush();
                    break;

                case UnicodeCategory.MathSymbol:         // math symbol, e.g. '+' or '='
                case UnicodeCategory.OpenPunctuation:    // opening of a paired mark: '(', '[', '{'
                case UnicodeCategory.ClosePunctuation:   // closing of a paired mark: ')', ']', '}'
                case UnicodeCategory.CurrencySymbol:     // currency symbol
                case UnicodeCategory.DashPunctuation:    // dash or hyphen
                case UnicodeCategory.ModifierSymbol:     // modifier symbol, e.g. '^'
                case UnicodeCategory.OtherPunctuation:   // other punctuation, e.g. '%'
                case UnicodeCategory.OtherLetter:        // other letters, including CJK ideographs
                    if (length > 0)
                    {
                        // A word is pending: put this character back and emit the word first
                        bufferIndex--;
                        offset--;
                        return Flush();
                    }
                    Push(c);
                    return Flush();                      // each such character is a token of its own

                default:
                    if (length > 0)
                        return Flush();
                    break;
            }
        }
    }
}
This is the heart of the tokenizer. You can decide for yourself which characters take part in tokenization, then drop it into your own Analyzer.
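To see what it actually emits, here is a minimal sketch. The TokenizerDemo class is my own illustration, not part of the original post; it assumes the same old-style Lucene.Net TokenStream API used above, where Next() returns a Token and TermText() returns its text:

using System;
using System.IO;
using Lucene.Net.Analysis;

public class TokenizerDemo
{
    public static void Main(String[] args)
    {
        // Hypothetical driver: tokenize the problematic term and print each token.
        TokenStream ts = new ChineseTokenizer(new StringReader("c++"));
        for (Token t = ts.Next(); t != null; t = ts.Next())
            Console.WriteLine(t.TermText());
        // Expected output: "c", "+", "+", one per line.
    }
}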
While I'm at it, here is the ChineseAnalyzer found online; I modified it as follows so it works with the tokenizer above:
using System;
using System.Collections;
using Lucene.Net.Analysis;

public sealed class ChineseFilter : TokenFilter
{
    // Common English stop words to drop from the token stream.
    public static String[] STOP_WORDS =
    {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    private Hashtable stopTable;

    public ChineseFilter(TokenStream _in) : base(_in)
    {
        stopTable = new Hashtable(STOP_WORDS.Length);

        for (int i = 0; i < STOP_WORDS.Length; i++)
            stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
    }

    public override Token Next()
    {
        // Pass through every token that is not a stop word.
        for (Token token = input.Next(); token != null; token = input.Next())
        {
            String text = token.TermText();

            if (stopTable[text] == null)
            {
                return token;
            }
        }
        return null;
    }
}
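A quick way to convince yourself the filter works is to chain it behind the tokenizer, exactly as the analyzer below does, and print what survives. The FilterDemo class is my own sketch under the same old Lucene.Net API assumptions as before:

using System;
using System.IO;
using Lucene.Net.Analysis;

public class FilterDemo
{
    public static void Main(String[] args)
    {
        // "this" and "is" are in STOP_WORDS and get dropped.
        TokenStream ts = new ChineseFilter(new ChineseTokenizer(new StringReader("this is c++")));
        for (Token t = ts.Next(); t != null; t = ts.Next())
            Console.WriteLine(t.TermText());
        // Prints: "c", "+", "+", one per line; the stop words never come out.
    }
}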

using System;
using System.IO;
using Lucene.Net.Analysis;

public class ChineseAnalyzer : Analyzer
{
    public ChineseAnalyzer()
    {
    }

    // Chain the tokenizer and the stop-word filter into one token stream.
    public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
    {
        TokenStream result = new ChineseTokenizer(reader);
        result = new ChineseFilter(result);
        return result;
    }
}
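Putting it all together, here is a rough end-to-end sketch of indexing and searching with this analyzer. It is my own illustration against a Lucene.Net 1.4-era API (IndexWriter(path, analyzer, create), Field.Text, Hits); the "demo-index" path and "title" field are made up. Note that QueryParser treats '+' as a special character, so the query is built by hand as a PhraseQuery over the three consecutive tokens the tokenizer produces for 'c++':

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;

public class SearchDemo
{
    public static void Main(String[] args)
    {
        Analyzer analyzer = new ChineseAnalyzer();

        // Index one document whose title contains "c++".
        IndexWriter writer = new IndexWriter("demo-index", analyzer, true);
        Document doc = new Document();
        doc.Add(Field.Text("title", "learning c++ programming"));
        writer.AddDocument(doc);
        writer.Close();

        // "c++" was indexed as the consecutive tokens "c", "+", "+",
        // so a phrase query over those tokens finds the document.
        PhraseQuery query = new PhraseQuery();
        query.Add(new Term("title", "c"));
        query.Add(new Term("title", "+"));
        query.Add(new Term("title", "+"));

        IndexSearcher searcher = new IndexSearcher("demo-index");
        Hits hits = searcher.Search(query);
        Console.WriteLine(hits.Length());   // prints 1
        searcher.Close();
    }
}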
To sum up: this post presented a custom tokenizer implementation for the Lucene search engine that solves the problem of searching for specially formatted terms such as 'C++'. By indexing special symbols and letters as individual tokens and filtering out common stop words, it makes such searches return the expected results.