一个用 PHP 编写的中文分词类
-
<?php
/**
 * Dictionary-based Chinese word segmenter (backward maximum matching).
 *
 * Input is processed byte-wise: any byte >= 129 is taken as the lead byte of
 * a two-byte character and is paired with the byte that follows it, while
 * runs of bytes < 129 are kept together as a single "ASCII run". Multi-byte
 * encodings that use more than two bytes per character (e.g. UTF-8) are
 * therefore not handled; the text and the dictionary must share the same
 * double-byte encoding.
 *
 * Every emitted token is prefixed with a single space, and each input line
 * is terminated with "\n" in the output.
 */
class Segmentation
{
    /**
     * Behaviour switches.
     *  - 'lowercase':       lowercase ASCII runs in the output.
     *  - 'segment_english': put spaces around punctuation inside ASCII runs.
     *
     * @var array
     */
    public $options = array(
        'lowercase' => true,
        'segment_english' => false,
    );

    /** @var string Name read from the dictionary header, 'Unknown' if absent. */
    public $dict_name = 'Unknown';

    /** @var array Loaded words, stored as keys (word => 1) for fast lookup. */
    public $dict_words = array();

    /**
     * Enable or disable lowercasing of ASCII runs.
     *
     * @param mixed $value Interpreted as a boolean.
     * @return bool Always true.
     */
    public function setLowercase($value)
    {
        $this->options['lowercase'] = (bool) $value;

        return true;
    }

    /**
     * Enable or disable splitting of punctuation inside ASCII runs.
     *
     * @param mixed $value Interpreted as a boolean.
     * @return bool Always true.
     */
    public function setSegmentEnglish($value)
    {
        $this->options['segment_english'] = (bool) $value;

        return true;
    }

    /**
     * Load a dictionary file and add its words to the current word list.
     *
     * The first line is a header of the form "DICT_WORD_W" or
     * "DICT_WORD_W<TAB>name"; every following non-empty line is one word.
     * Words from earlier successful loads are kept.
     *
     * @param string $dict_file Path of the dictionary file.
     * @return bool True on success; false if the file is missing, cannot be
     *              opened, is empty, or has a header type other than
     *              'DICT_WORD_W'.
     */
    public function load($dict_file)
    {
        if (!file_exists($dict_file)) {
            return false;
        }

        $fp = fopen($dict_file, 'r');
        if ($fp === false) {
            return false;
        }

        $header = fgets($fp, 1024);
        if ($header === false) {
            fclose($fp);

            return false;
        }

        // Split the trimmed header; the name field is optional.
        $fields = explode("\t", trim($header));
        $dict_type = $fields[0];
        $dict_name = isset($fields[1]) ? $fields[1] : 'Unknown';

        // The name is recorded even when the type is rejected just below.
        $this->dict_name = $dict_name;
        if ($dict_type !== 'DICT_WORD_W') {
            fclose($fp);

            return false;
        }

        while (($line = fgets($fp)) !== false) {
            $word = rtrim($line);
            // Skip blank lines so that '' never becomes a dictionary entry.
            if ($word !== '') {
                $this->dict_words[$word] = 1;
            }
        }
        fclose($fp);

        return true;
    }

    /**
     * Return the name of the most recently read dictionary header.
     *
     * @return string
     */
    public function getDictName()
    {
        return $this->dict_name;
    }

    /**
     * Segment a string, treating "\n" as the line separator.
     *
     * @param string $str Text to segment.
     * @return string|false Segmented text, or false if no dictionary is loaded.
     */
    public function segmentString($str)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }

        return $this->_segmentLines(explode("\n", $str));
    }

    /**
     * Segment the contents of a file, line by line.
     *
     * @param string $filename Path of the file to read.
     * @return string|false Segmented text, or false if no dictionary is
     *                      loaded or the file cannot be read.
     */
    public function segmentFile($filename)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }

        $lines = file($filename);
        if ($lines === false) {
            return false;
        }

        return $this->_segmentLines($lines);
    }

    /**
     * Segment each line and join the results, one output line per input line.
     *
     * @param array $lines Lines of text; trailing whitespace is stripped.
     * @return string Segmented text with runs of spaces collapsed to one.
     */
    public function _segmentLines($lines)
    {
        $contents_segmented = '';
        foreach ($lines as $line) {
            $contents_segmented .= $this->_segmentLine(rtrim($line)) . "\n";
        }

        // Token prefixes and punctuation padding can stack up; squeeze them.
        while (strpos($contents_segmented, '  ') !== false) {
            $contents_segmented = str_replace('  ', ' ', $contents_segmented);
        }

        return $contents_segmented;
    }

    /**
     * Segment a single line.
     *
     * Scans from the end of the line towards the start. At each position the
     * longest dictionary word of 2 to 4 double-byte characters ending there
     * is emitted; if none matches, the single character is emitted. ASCII
     * runs are emitted whole (optionally punctuation-split and lowercased).
     *
     * @param string $str One line of text without its line terminator.
     * @return string Tokens, each preceded by one space.
     */
    public function _segmentLine($str)
    {
        $units = $this->_splitIntoUnits($str);
        $str_final = '';
        $pos = count($units);

        while ($pos > 0) {
            $unit = $units[$pos - 1];

            if (is_array($unit)) {
                $run = $unit[0];
                if ($this->options['segment_english']) {
                    $punct = "[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\\\]\^\_\`\{\|\}\~\t\f]";
                    // Pad each punctuation run, then split adjacent pairs.
                    // Matches do not overlap, so runs of three or more
                    // punctuation characters are only split pairwise.
                    $run = preg_replace("/(" . $punct . "+)/", ' $1 ', $run);
                    $run = preg_replace("/(" . $punct . ")(" . $punct . ")/", '$1 $2', $run);
                }
                if ($this->options['lowercase']) {
                    $run = strtolower($run);
                }
                $str_final = ' ' . $run . $str_final;
                $pos--;
                continue;
            }

            // Build candidates of 1..4 characters ending at the current
            // position. Stop at an ASCII run: it can never be part of a
            // dictionary word and must not be concatenated as a string.
            $max_chars = min($pos, 4);
            $candidates = array(0 => '');
            for ($len = 1; $len <= $max_chars; $len++) {
                $piece = $units[$pos - $len];
                if (is_array($piece)) {
                    break;
                }
                $candidates[$len] = $piece . $candidates[$len - 1];
            }

            // Longest match first; single characters need no lookup.
            $word_found = 0;
            for ($len = count($candidates) - 1; $len > 1; $len--) {
                if (isset($this->dict_words[$candidates[$len]])) {
                    $word_found = $len;
                    break;
                }
            }

            if ($word_found) {
                $str_final = ' ' . $candidates[$word_found] . $str_final;
                $pos -= $word_found;
            } else {
                $str_final = ' ' . $unit . $str_final;
                $pos--;
            }
        }

        return $str_final;
    }

    /**
     * Split a line into units: two-byte characters as strings, and runs of
     * bytes < 129 as one-element arrays (so the two kinds can be told apart).
     *
     * @param string $str One line of text.
     * @return array List of units in input order.
     */
    private function _splitIntoUnits($str)
    {
        $units = array();
        $length = strlen($str);

        // Pad so that a lone lead byte at the very end still has a partner.
        if ($length > 0 && ord($str[$length - 1]) >= 129) {
            $str .= ' ';
        }

        $i = 0;
        while ($i < $length) {
            if (ord($str[$i]) >= 129) {
                $units[] = $str[$i] . $str[$i + 1];
                $i += 2;
                continue;
            }

            $start = $i;
            while ($i < $length && ord($str[$i]) < 129) {
                $i++;
            }
            $units[] = array(substr($str, $start, $i - $start));
        }

        return $units;
    }
}
?>
来源参考:
http://www.phpchina.cn/code/2006/0607/381.html
http://www.xuchao.cn/?play=reply&id=851
本文介绍了一个使用PHP编写的中文分词类,该类支持加载词典文件、设置是否转换为小写及是否切分英文字符串等功能。通过实例化此类并调用相应的方法可以实现对中文文本的有效分词。
379

被折叠的 条评论
为什么被折叠?



