第一次写中文分词程序

最新推荐文章于 2024-06-25 10:33:00 发布

iteye_6832

最新推荐文章于 2024-06-25 10:33:00 发布

阅读量159

点赞数

/**
 * Dictionary-driven word segmenter for text made of Chinese characters,
 * lowercase letters and digits.
 *
 * <p>Segmentation runs in three passes:
 * <ol>
 *   <li>{@link #getSentence()} splits the input into "sentences": maximal runs
 *       of accepted characters, everything else acting as a delimiter;</li>
 *   <li>{@link #getToken(ArrayList)} cuts every sentence into words by greedy
 *       longest match against the dictionary;</li>
 *   <li>{@link #getNewToken(ArrayList)} glues adjacent single-character
 *       letter/digit tokens back together (e.g. {@code t}, {@code 6}, {@code 0}
 *       becomes {@code t60}).</li>
 * </ol>
 *
 * <p>The dictionary is a static map shared by all instances. It is loaded once
 * from the classpath when the class is initialized. If loading fails the error
 * is logged and the dictionary stays empty (or partially filled).
 */
public class MM2
{
    private static final Log log = LogFactory.getLog(MM2.class);

    /** Dictionary value marking an entry that is a complete word. */
    private static final int FULL_WORD = 1;

    /** Dictionary value marking an entry that is only a leading part of a longer word. */
    private static final int PREFIX_ONLY = 2;

    /** Classpath name of the dictionary file (one word per line). */
    private static final String DICTIONARY_RESOURCE = "dictionary.txt";

    /** Encoding of the dictionary file. */
    private static final String DICTIONARY_CHARSET = "UTF-8";

    // NOTE(review): the surrounding spaces in the two literals below look like a
    // formatting artifact (the intended values are probably "#" and "word"), but
    // they are kept byte-for-byte because they affect runtime behavior - confirm.
    /** Dictionary lines containing this marker are skipped. */
    private static final String COMMENT_MARKER = " # ";

    /** Type string passed to every {@code Token} created by {@link #getToken(ArrayList)}. */
    private static final String TOKEN_TYPE = " word ";

    // Must stay declared before the static initializer below: a "= null"
    // initializer placed after it would wipe the freshly loaded dictionary.
    private static HashMap<String, Integer> dictionary = null;

    /** Dictionary lines longer than this many characters are ignored. */
    private static final int WORD_MAX_LENGTH = 9;

    private final Reader reader;

    static
    {
        loadDictionary();
    }

    /**
     * Creates a segmenter over the given character source.
     *
     * @param reader source of the text to segment; it is read to the end by
     *               {@link #getSentence()} and is not closed by this class
     */
    public MM2(Reader reader)
    {
        this.reader = reader;
    }

    /**
     * Splits the input into sentences made of Chinese characters, letters and digits.
     *
     * <p>A character belongs to a sentence when its Unicode general category is
     * {@code LOWERCASE_LETTER}, {@code DECIMAL_DIGIT_NUMBER} or {@code OTHER_LETTER};
     * every other character (including uppercase letters) ends the current
     * sentence and is dropped. Accepted characters are passed through
     * {@link #toAscii(int)} before being stored.
     *
     * <p>Runs of consecutive delimiters do not produce empty sentences, and the
     * start offset of each sentence is the zero-based index of its first
     * character in the input, whether or not the input ends with a delimiter.
     *
     * @return the sentences in input order; empty when the input holds no accepted character
     * @throws IOException if reading from the underlying reader fails
     */
    public ArrayList<Sentence> getSentence() throws IOException
    {
        ArrayList<Sentence> list = new ArrayList<Sentence>();
        StringBuilder run = new StringBuilder();
        int position = 0; // index of the character currently being examined
        int d;
        while ((d = reader.read()) > -1)
        {
            int type = Character.getType(d);
            if (type == Character.LOWERCASE_LETTER
                    || type == Character.DECIMAL_DIGIT_NUMBER
                    || type == Character.OTHER_LETTER)
            {
                run.append((char) toAscii(d));
            }
            else
            {
                // Delimiter: the pending run ends just before this position.
                flushSentence(run, position, list);
            }
            position++;
        }
        // End of input: the pending run (if any) ends at the total length.
        flushSentence(run, position, list);
        return list;
    }

    /**
     * Turns the pending run into a sentence, appends it to {@code list} and
     * clears the run. Does nothing when the run is empty.
     *
     * @param run  characters collected so far
     * @param end  input index just past the last character of the run
     * @param list destination list
     */
    private static void flushSentence(StringBuilder run, int end, ArrayList<Sentence> list)
    {
        int length = run.length();
        if (length == 0)
        {
            return;
        }
        char[] text = new char[length];
        run.getChars(0, length, text, 0);
        list.add(new Sentence(text, end - length));
        run.setLength(0);
    }

    /**
     * Cuts every sentence into words by greedy longest match against the dictionary.
     *
     * <p>A word is grown one character at a time for as long as the grown
     * string is a dictionary word, or is a prefix of one and more input
     * follows. When it cannot grow any further, trailing characters are handed
     * back to the input until the word is a complete dictionary word or a
     * single character, and that word is emitted.
     *
     * @param list sentences as produced by {@link #getSentence()}
     * @return the tokens in input order, with start/end offsets derived from
     *         each sentence's start offset
     * @throws IOException declared for interface compatibility; not thrown by this implementation
     */
    public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
    {
        ArrayList<Token> tokenlist = new ArrayList<Token>();
        for (Sentence sen : list)
        {
            char[] text = sen.getText();
            StringBuilder word = new StringBuilder();
            int offset = sen.getStartOffset(); // input offset just past the last consumed char
            int bufferIndex = 0;               // index of the next char of text to consume
            while (bufferIndex < text.length)
            {
                boolean wordComplete = false;
                char c = text[bufferIndex++];
                offset++;
                if (word.length() == 0)
                {
                    // A word always starts with whatever character comes next.
                    word.append(c);
                }
                else
                {
                    Integer kind = dictionary.get(word.toString() + c);
                    // A prefix-only entry may be followed only while more input
                    // remains; a complete word may always be taken.
                    boolean canExtend = kind != null
                            && (kind.intValue() == FULL_WORD || bufferIndex < text.length);
                    if (canExtend)
                    {
                        word.append(c);
                    }
                    else
                    {
                        // Hand c back to the input.
                        bufferIndex--;
                        offset--;
                        // Also hand back trailing chars until the word is a complete
                        // dictionary word or a single character.
                        while (word.length() > 1 && isPrefixOnly(word.toString()))
                        {
                            word.deleteCharAt(word.length() - 1);
                            bufferIndex--;
                            offset--;
                        }
                        wordComplete = true;
                    }
                }
                if (wordComplete || bufferIndex == text.length)
                {
                    tokenlist.add(new Token(word.toString(), offset - word.length(), offset, TOKEN_TYPE));
                    word.setLength(0);
                }
            }
        }
        return tokenlist;
    }

    /** Returns whether {@code candidate} is in the dictionary only as a prefix of a longer word. */
    private static boolean isPrefixOnly(String candidate)
    {
        Integer kind = dictionary.get(candidate);
        return kind != null && kind.intValue() == PREFIX_ONLY;
    }

    /**
     * Combines adjacent single-character letter/digit tokens into one token.
     *
     * <p>A token is mergeable when its word is exactly one character long and
     * that character's general category is not {@code OTHER_LETTER}. A
     * mergeable token is appended to the previous mergeable one when the
     * previous token's end equals its start. All other tokens are passed
     * through unchanged.
     *
     * <p>Note: merging is done in place, so the first token of each merged
     * group (an element of {@code list}) has its word and end modified.
     *
     * @param list tokens as produced by {@link #getToken(ArrayList)}
     * @return the tokens in input order with mergeable neighbours combined
     * @throws IOException declared for interface compatibility; not thrown by this implementation
     */
    public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
    {
        ArrayList<Token> tokenlist = new ArrayList<Token>();
        Token pending = null; // merged group currently being built, not yet emitted
        for (Token t : list)
        {
            String text = t.getWord();
            boolean mergeable = text.length() == 1
                    && Character.getType(text.charAt(0)) != Character.OTHER_LETTER;
            if (!mergeable)
            {
                // A regular word closes any group in progress.
                if (pending != null)
                {
                    tokenlist.add(pending);
                    pending = null;
                }
                tokenlist.add(t);
            }
            else if (pending == null)
            {
                pending = t;
            }
            else if (pending.getEnd() == t.getStart())
            {
                pending.setEnd(t.getEnd());
                pending.setWord(pending.getWord() + t.getWord());
            }
            else
            {
                // Not contiguous with the group in progress: start a new group.
                tokenlist.add(pending);
                pending = t;
            }
        }
        if (pending != null)
        {
            tokenlist.add(pending);
        }
        return tokenlist;
    }

    /**
     * Converts a fullwidth digit or Latin letter to its halfwidth (ASCII) form.
     *
     * @param codePoint the character to convert
     * @return the ASCII equivalent for U+FF10..U+FF19 (digits), U+FF21..U+FF3A
     *         (uppercase) and U+FF41..U+FF5A (lowercase); otherwise
     *         {@code codePoint} unchanged
     */
    public static int toAscii(int codePoint)
    {
        boolean fullwidthDigit = codePoint >= 0xFF10 && codePoint <= 0xFF19;
        boolean fullwidthUpper = codePoint >= 0xFF21 && codePoint <= 0xFF3A;
        boolean fullwidthLower = codePoint >= 0xFF41 && codePoint <= 0xFF5A;
        if (fullwidthDigit || fullwidthUpper || fullwidthLower)
        {
            // The fullwidth forms sit exactly 0xFEE0 above their ASCII counterparts.
            return codePoint - 0xFEE0;
        }
        return codePoint;
    }

    /**
     * Loads the dictionary from the classpath; does nothing if it is already loaded.
     *
     * <p>Each accepted line is lowercased and stored as a complete word; every
     * leading part of it that is at least two characters long is stored as a
     * prefix-only entry unless it is already present. Lines containing the
     * comment marker, empty lines and lines longer than
     * {@code WORD_MAX_LENGTH} are skipped.
     *
     * <p>Failures are logged rather than thrown, because this method runs from
     * the static initializer; the dictionary then holds whatever was read
     * before the failure.
     */
    public static void loadDictionary()
    {
        if (dictionary != null)
        {
            return;
        }
        HashMap<String, Integer> loaded = new HashMap<String, Integer>();
        InputStream is = null;
        BufferedReader br = null;
        try
        {
            // getResourceAsStream also works when the dictionary is packaged in a jar.
            is = MM2.class.getClassLoader().getResourceAsStream(DICTIONARY_RESOURCE);
            if (is == null)
            {
                log.error("Dictionary resource not found on classpath: " + DICTIONARY_RESOURCE);
            }
            else
            {
                br = new BufferedReader(new InputStreamReader(is, DICTIONARY_CHARSET));
                String word;
                while ((word = br.readLine()) != null)
                {
                    // Fixed locale so casing does not depend on the JVM default.
                    word = word.toLowerCase(java.util.Locale.ENGLISH);
                    if (word.length() == 0
                            || word.length() > WORD_MAX_LENGTH
                            || word.indexOf(COMMENT_MARKER) != -1)
                    {
                        continue;
                    }
                    // A complete word overrides an earlier prefix-only entry.
                    loaded.put(word, FULL_WORD);
                    for (int i = word.length() - 1; i >= 2; i--)
                    {
                        String prefix = word.substring(0, i);
                        if (!loaded.containsKey(prefix))
                        {
                            loaded.put(prefix, PREFIX_ONLY);
                        }
                    }
                }
            }
        }
        catch (Exception e)
        {
            // Deliberately broad: this runs inside the static initializer, where
            // an escaping exception would make the whole class unusable.
            log.error("Failed to load dictionary " + DICTIONARY_RESOURCE, e);
        }
        finally
        {
            try
            {
                if (br != null)
                {
                    br.close(); // also closes the wrapped stream
                }
                else if (is != null)
                {
                    is.close();
                }
            }
            catch (IOException e)
            {
                log.warn("Failed to close dictionary " + DICTIONARY_RESOURCE, e);
            }
            dictionary = loaded;
        }
    }

    /**
     * Segments all text available from {@code input} and returns the words.
     *
     * @param input source of the text to segment
     * @return the words in input order; an empty array if reading fails (the
     *         failure is logged)
     */
    public static String[] segWords(Reader input)
    {
        ArrayList<String> list = new ArrayList<String>();
        try
        {
            MM2 f = new MM2(input);
            ArrayList<Token> tlist = f.getNewToken(f.getToken(f.getSentence()));
            for (Token t : tlist)
            {
                list.add(t.getWord());
            }
        }
        catch (IOException e)
        {
            log.error("Failed to read input while segmenting", e);
        }
        return list.toArray(new String[0]);
    }

    /** Small demo: segments a sample string and prints one word per line. */
    public static void main(String[] args)
    {
        String[] cc = MM2.segWords(new StringReader(" ibm商务机t60p ".toLowerCase(java.util.Locale.ENGLISH)));
        for (String c : cc)
        {
            System.out.println(c);
        }
    }
}