public
class
MM2
{
private
static
final
Log log
=
LogFactory.getLog(MM2.
class
);
private
static
HashMap
<
String, Integer
>
dictionary
=
null
;
private
static
final
int
WORD_MAX_LENGTH
=
9
;
private
Reader reader;
static
{
loadDictionary();
}
public
MM2(Reader reader)
{
this
.reader
=
reader;
}
//
切分出由中文、字母、数字组成的句子
public
ArrayList
<
Sentence
>
getSentence()
throws
IOException
{
ArrayList
<
Sentence
>
list
=
new
ArrayList
<
Sentence
>
();
StringBuffer cb
=
new
StringBuffer();
int
d
=
reader.read();
int
offset
=
0
;
boolean
b
=
false
;
while
(d
>-
1
)
{
int
type
=
Character.getType(d);
if
(type
==
2
||
type
==
9
||
type
==
5
)
{
d
=
toAscii(d);
cb.append((
char
)d);
}
else
{
b
=
true
;
}
d
=
reader.read();
if
(d
==-
1
||
b)
{
if
(d
==-
1
) offset
++
;
b
=
false
;
char
[] ioBuffer
=
new
char
[cb.length()];
cb.getChars(
0
, cb.length(), ioBuffer,
0
);
Sentence sen
=
new
Sentence(ioBuffer,offset
-
cb.length());
list.add(sen);
cb.setLength(
0
);
}
offset
++
;
}
return
list;
}
//
将句子切分出词
public
ArrayList
<
Token
>
getToken(ArrayList
<
Sentence
>
list)
throws
IOException
{
ArrayList
<
Token
>
tokenlist
=
new
ArrayList
<
Token
>
();
for
(Sentence sen:list)
{
StringBuffer word
=
new
StringBuffer();
int
offset
=
sen.getStartOffset();
int
bufferIndex
=
0
;
char
c;
boolean
b
=
false
;
while
(bufferIndex
<
sen.getText().length)
{
offset
++
;
c
=
sen.getText()[bufferIndex
++
];
if
(word.length()
==
0
)
word.append(c);
else
{
String temp
=
(word.toString()
+
c).intern();
if
(dictionary.containsKey(temp)
&&
dictionary.get(temp)
==
1
)
word.append(c);
else
if
(dictionary.containsKey(temp)
&&
bufferIndex
<
sen.getText().length)
word.append(c);
else
{
bufferIndex
--
;
offset
--
;
while
(word.length()
>
1
&&
dictionary.get(word.toString())
!=
null
&&
dictionary.get(word.toString())
==
2
)
{
word.deleteCharAt(word.length()
-
1
);
bufferIndex
--
;
offset
--
;
}
b
=
true
;
}
}
if
(b
||
bufferIndex
==
sen.getText().length)
{
Token token
=
new
Token(word.toString(),offset
-
word.length(),offset,
"
word
"
);
word.setLength(
0
);
tokenlist.add(token);
b
=
false
;
}
}
}
return
tokenlist;
}
//
将相连的单个英文或数字组合成词
public
ArrayList
<
Token
>
getNewToken(ArrayList
<
Token
>
list)
throws
IOException
{
ArrayList
<
Token
>
tokenlist
=
new
ArrayList
<
Token
>
();
Token word
=
null
;
for
(
int
i
=
0
;i
<
list.size();i
++
)
{
Token t
=
list.get(i);
if
(t.getWord().length()
==
1
&&
Character.getType((
int
)t.getWord().charAt(
0
))
!=
5
)
{
if
(word
==
null
)
word
=
t;
else
if
(word.getEnd()
==
t.getStart())
{
word.setEnd(t.getEnd());
word.setWord(word.getWord()
+
t.getWord());
}
else
{
tokenlist.add(word);
word
=
t;
}
}
else
if
(word
!=
null
)
{
tokenlist.add(word);
word
=
null
;
tokenlist.add(t);
}
else
tokenlist.add(t);
}
if
(word
!=
null
)
tokenlist.add(word);
return
tokenlist;
}
//
双角转单角
public
static
int
toAscii(
int
codePoint)
{
if
((codePoint
>=
65296
&&
codePoint
<=
65305
)
//
0-9
||
(codePoint
>=
65313
&&
codePoint
<=
65338
)
//
A-Z
||
(codePoint
>=
65345
&&
codePoint
<=
65370
)
//
a-z
)
{
codePoint
-=
65248
;
}
return
codePoint;
}
//
加载词典
public
static
void
loadDictionary()
{
if
(dictionary
==
null
)
{
dictionary
=
new
HashMap
<
String, Integer
>
();
InputStream is
=
null
;
BufferedReader br
=
null
;
try
{
is
=
new
FileInputStream(
new
File(MM2.
class
.getClassLoader().getResource(
"
dictionary.txt
"
).toURI()));
br
=
new
BufferedReader(
new
InputStreamReader(is,
"
UTF-8
"
));
String word
=
null
;
while
((word
=
br.readLine())
!=
null
)
{
word
=
word.toLowerCase();
if
((word.indexOf(
"
#
"
)
==
-
1
)
&&
(word.length()
<=
WORD_MAX_LENGTH))
{
dictionary.put(word.intern(),
1
);
int
i
=
word.length()
-
1
;
while
(i
>=
2
)
{
String temp
=
word.substring(
0
, i).intern();
if
(
!
dictionary.containsKey(temp))
dictionary.put(temp,
2
);
i
--
;
}
}
}
}
catch
(Exception e)
{
log.info(e);
}
finally
{
try
{
if
(br
!=
null
)
br.close();
if
(is
!=
null
)
is.close();
}
catch
(IOException e)
{
log.info(e);
}
}
}
}
public
static
String[] segWords(Reader input)
{
ArrayList
<
String
>
list
=
new
ArrayList
<
String
>
();
try
{
MM2 f
=
new
MM2(input);
ArrayList
<
Token
>
tlist
=
f.getNewToken(f.getToken(f.getSentence()));
for
(Token t:tlist)
{
list.add(t.getWord());
}
}
catch
(IOException e)
{
log.info(e);
}
return
(String[])list.toArray(
new
String[
0
]);
}
public
static
void
main(String[] args)
{
String[] cc
=
MM2.segWords(
new
StringReader(
"
ibm商务机t60p
"
.toLowerCase()));
for
(String c:cc)
{
System.out.println(c);
}
}
}
// 第一次写中文分词程序
// 最新推荐文章于 2024-06-25 10:33:00 发布