中文分词系统之-导入文本文件类词库建立哈希表索引-JAVA源码

本文给出用 Java 实现的中文分词系统词典模块源码:从文本文件逐行导入词库,建立基于二次探测的哈希表索引,并提供初步的关键字查询。内容包括词库读取、哈希表构建与查找的关键步骤;分词过程本身不在下列代码范围内。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

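/*
 * Dictionary-import class of the word-segmentation system. It reads a dictionary
 * from a text file (one entry per line), builds a hash table over the entries and
 * offers basic keyword lookup against that table.
 * Input:  String filename - path of the dictionary text file
 * Output: none
 * author 张顺明
 * email  mingzhingjien@hotmail.com
 */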

package chinese_participle;
import java.io.*;
import java.util.Vector;

// Referenced classes of package chinese_participle:
//                 HashProcessing

public class InPutTextFile
{
         HashProcessing hashproce;
         Vector vector = new Vector();
         static int fdMax = 0x21a31;
    
         public InPutTextFile(String filename)
         {
             //fdHp = new HashProcessing(fdMax);
        
             File readfile = new File(filename);
             try{
        
          InputStreamReader readinput = new InputStreamReader(new FileInputStream(readfile));
                 BufferedReader reader = new BufferedReader(readinput);
                 String s;
                   while((s = reader.readLine()) != null)
                   {
                      vector.addElement(s);
                   }
                   hashproce=new HashProcessing(vector);
                   reader.close();
                   readinput.close();
                 }
             catch(Exception ey)
                 {
                   System.out.println(ey);
                 }
           
                return;
            }
            ///////////////////////////////////////////////////检查某词是否在词典里面
            public boolean isInVector(String x)
            {
              for (int i=0;i<vector.size();i++)
             {
               if( x.equals( vector.elementAt( i ).toString() ) );
               return true;
             }
             return true;
            }
            //////////////////////////////////////////////////检查某词是否在新建的哈希表里面
            public boolean isInHash(String x)
            {
                return hashproce.isActive( hashproce.hashStr,hashproce.findAddress( x ) ) ;
           
            }
}

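/*
 * Hash table used by the dictionary import of the word-segmentation system,
 * built on quadratic probing.
 * Input:  Collection other - the entries to insert
 * Output: none
 * author 张顺明
 * email  mingzhingjien@hotmail.com
 */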

package chinese_participle;

import java.util.*;
public class HashProcessing
{   
         private int max;
         public HashEntry hashStr[];
         private int currentSize=0;
         private int occupied=0;
         private int modCount=0;
    
         public HashProcessing()
         {        
             hashStr = new HashEntry[max];
             clear();
         }
    
         public HashProcessing(Collection other)
         {        
             hashStr = new HashEntry[other.size()*2];
             clear();
        
             Iterator itr = other.iterator();
               while( itr.hasNext() )
                add( itr.next() );
         }
         //////////////////////////////////////////////二次探测哈希表的isActive方法
         public       boolean isActive(HashEntry[] arr,int pos)
         {
          return arr[pos]!=null&&arr[pos].isActive;
         }
         //////////////////////////////////////////////二次探测哈希表的remove方法和clear方法
         public boolean remove( Object x)
         {
          int address=findAddress( x );
          if( !isActive( hashStr,address ) )
            return false;
          hashStr[address].isActive=false;
          currentSize--;
          modCount++;
          if(currentSize<hashStr.length/8)
           rehash();
          return true;
         }
         public void clear()
         {
          currentSize=occupied=0;
          modCount++;
          for(int i=0;i<hashStr.length;i++)
          hashStr[i]=null;
         }
         //////////////////////////////////////////////查找对应地址函数
         public int findAddress( Object x )
         {
          int colliCounter = 0;
             int address =( x==null )? 0:Math.abs( x.hashCode( )% hashStr.length );
        
             while( hashStr[address]!=null )
             {
              if(x==null)
              {
               if(hashStr[address].element==null)
               break;
              }
              else if (x.equals(hashStr[address].element))
              break;
              address+=2*++colliCounter-1;
              if(address>=hashStr.length)
                 address-=hashStr.length;
          
             }
             return address;
         }
         ////////////////////////////////////////////////二次探测哈希表的add方法
         public boolean add(Object x)
         {
          int address=findAddress(x);
          if( isActive( hashStr,address ) )
             return false;
      
           hashStr[address]=new HashEntry(x,true);
           currentSize++;
           occupied++;
           modCount++;
      
           if( occupied>hashStr.length/2 )
              rehash();
           return true;
         }
    
         ////////////////////////////////////////////////实现再哈希
         public void rehash()
         {
          HashEntry[] oldArry=hashStr;
          //Create a new,empty table
          hashStr=new HashEntry[nextPrime( 4*currentSize ) ];
          currentSize=0;
          occupied=0;
          //Copy table over
          for(int i=0;i<oldArry.length;i++)
            if( isActive( oldArry,i ) )
               add( oldArry[i].element );
         }
/*        ////////////////////////////////////////////////
         public static void createHash(String keyStr)
         {
             int colliCounter = 0;
             int address =(keyStr==null)? 0:hashMod(keyStr);
             do
             {
                 if(hashStr[address] == null)
                 {
                     hashStr[address] = keyStr;
                     break;
                 }
                 colliCounter++;
                 address = collisionOffset(address, colliCounter);
             } while(true);
         }
*/
         ///////////////////////////////////////////////二次探测哈希表的查找函数
         public       Object hashSearch(Object x)
         {   
           int address=findAddress( x );
           if( hashStr[address]==null )
              return null;
           return hashStr[address].element;
         }
         public boolean contains(Object x)
         {
           return isActive( hashStr,findAddress( x ) );
         }
         //////////////////////////////////////////////寻找一个合适的质数
        public boolean isPrime(int n)
        {
          int i=n%2;
          if(i==0)
           return false;
          for(;i>1;i--)
          {
            if(n/i==0) break;
               return true;
          }
            return true;
        }
        public int nextPrime(int n)
        {
          if( n%2==0)
             n++;
          for( ;!isPrime( n);n+=2 )
          ;
          return n;
        }
}

 
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值