一个用 PHP 编写的中文分词类
<?php
/**
 * Dictionary-based word segmenter for double-byte encoded Chinese text.
 *
 * The input is processed byte-wise: every byte >= 129 is treated as the lead
 * byte of a two-byte character (NOTE(review): this matches GBK/GB2312-style
 * encodings, not UTF-8 - confirm the encoding of the dictionary and input).
 * Runs of bytes below 129 are kept together as one "ASCII run".
 *
 * Each line is scanned from its end towards its start. At every position the
 * longest dictionary word of 2 to 4 characters ending there is taken as a
 * token; if none matches, the single character becomes a token.
 *
 * Every token in the result is preceded by exactly one space and every line
 * is terminated by "\n".
 * NOTE(review): the listing this class was recovered from had all whitespace
 * stripped, including whitespace inside string literals. The single-space
 * token delimiter was reconstructed from the "collapse repeated delimiter"
 * loop in _segmentLines(); confirm against an unmangled copy if available.
 */
class Segmentation
{
    /**
     * Runtime switches.
     *  - 'lowercase':       lower-case ASCII runs in the output.
     *  - 'segment_english': additionally split ASCII runs around punctuation.
     */
    public $options = array(
        'lowercase'       => true,
        'segment_english' => false,
    );

    /** Name taken from the dictionary header, 'Unknown' when absent. */
    public $dict_name = 'Unknown';

    /** Loaded words, stored as array keys (word => 1) for direct lookup. */
    public $dict_words = array();

    /**
     * Enable or disable lower-casing of ASCII runs.
     *
     * @param mixed $value Evaluated for truthiness.
     * @return bool Always true.
     */
    public function setLowercase($value)
    {
        $this->options['lowercase'] = (bool) $value;

        return true;
    }

    /**
     * Enable or disable splitting of ASCII runs around punctuation.
     *
     * @param mixed $value Evaluated for truthiness.
     * @return bool Always true.
     */
    public function setSegmentEnglish($value)
    {
        $this->options['segment_english'] = (bool) $value;

        return true;
    }

    /**
     * Load a dictionary file and merge its words into $dict_words.
     *
     * The first line is a header: either "DICT_WORD_W" alone or
     * "DICT_WORD_W<TAB>name". Each following non-empty line is one word.
     * The dictionary name is stored before the type is validated, so it is
     * updated even when the type check fails.
     *
     * @param string $dict_file Path of the dictionary file.
     * @return bool False when the file is missing, cannot be opened, is empty
     *              or has a header type other than "DICT_WORD_W"; true otherwise.
     */
    public function load($dict_file)
    {
        if (!file_exists($dict_file)) {
            return false;
        }

        $fp = fopen($dict_file, 'r');
        if ($fp === false) {
            return false;
        }

        $header = fgets($fp, 1024);
        if ($header === false) {
            fclose($fp);

            return false;
        }

        $parts = explode("\t", trim($header));
        $dict_type = $parts[0];
        $this->dict_name = isset($parts[1]) ? $parts[1] : 'Unknown';

        if ($dict_type !== 'DICT_WORD_W') {
            // Close the handle on this path too; it used to leak.
            fclose($fp);

            return false;
        }

        // Read whole lines (no length cap that could split an entry) and stop
        // on the first failed read instead of registering an empty word.
        while (($line = fgets($fp)) !== false) {
            $word = rtrim($line);
            if ($word !== '') {
                $this->dict_words[$word] = 1;
            }
        }
        fclose($fp);

        return true;
    }

    /**
     * @return string Name read from the most recent dictionary header.
     */
    public function getDictName()
    {
        return $this->dict_name;
    }

    /**
     * Segment a string, treating "\n" as the line separator.
     *
     * @param string $str Text to segment.
     * @return string|false Segmented text, or false when no words are loaded.
     */
    public function segmentString($str)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }

        return $this->_segmentLines(explode("\n", $str));
    }

    /**
     * Segment the contents of a file line by line.
     *
     * @param string $filename Anything accepted by file().
     * @return string|false Segmented text, or false when no words are loaded
     *                      or the file cannot be read.
     */
    public function segmentFile($filename)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }

        $lines = file($filename);
        if ($lines === false) {
            return false;
        }

        return $this->_segmentLines($lines);
    }

    /**
     * Segment every line and join the results, one "\n" after each line.
     *
     * @param array $lines Raw lines; trailing whitespace of each is dropped.
     * @return string Segmented text with runs of spaces collapsed to one.
     */
    public function _segmentLines($lines)
    {
        $contents_segmented = '';
        foreach ($lines as $line) {
            $contents_segmented .= $this->_segmentLine(rtrim($line)) . "\n";
        }

        // Collapse repeated delimiters. The needle must never be empty:
        // since PHP 8 strpos() reports a match for '' and the loop would
        // never terminate.
        while (strpos($contents_segmented, '  ') !== false) {
            $contents_segmented = str_replace('  ', ' ', $contents_segmented);
        }

        return $contents_segmented;
    }

    /**
     * Segment a single line using backward longest-match against the
     * dictionary.
     *
     * @param string $str One line without its line terminator.
     * @return string Tokens in input order, each preceded by one space;
     *                '' for an empty line.
     */
    public function _segmentLine($str)
    {
        $units = $this->splitUnits($str);
        $tokens = array(); // collected back-to-front, reversed at the end
        $pos = count($units);

        while ($pos > 0) {
            $unit = $units[$pos - 1];

            if (is_array($unit)) {
                $tokens[] = $this->formatAsciiRun($unit[0]);
                $pos--;
                continue;
            }

            // candidates[$n] is the $n-character string ending at $pos.
            // Stop at an ASCII run so a word never spans across it (this
            // used to concatenate the array itself into the candidate).
            $candidates = array(0 => '');
            $limit = min(4, $pos);
            for ($i = 1; $i <= $limit; $i++) {
                $previous = $units[$pos - $i];
                if (is_array($previous)) {
                    break;
                }
                $candidates[$i] = $previous . $candidates[$i - 1];
            }

            // Longest candidate first; single characters are never looked up.
            $word_found = 0;
            for ($i = count($candidates) - 1; $i > 1; $i--) {
                if (array_key_exists($candidates[$i], $this->dict_words)) {
                    $word_found = $i;
                    break;
                }
            }

            if ($word_found) {
                $tokens[] = $candidates[$word_found];
                $pos -= $word_found;
            } else {
                $tokens[] = $unit;
                $pos--;
            }
        }

        if (count($tokens) === 0) {
            return '';
        }

        return ' ' . implode(' ', array_reverse($tokens));
    }

    /**
     * Split a byte string into two-byte characters (plain strings) and ASCII
     * runs (one-element arrays, so the two kinds can be told apart).
     *
     * A dangling lead byte at the very end is paired with a padding space so
     * that reading the second byte never runs past the end of the string.
     */
    private function splitUnits($str)
    {
        $units = array();
        $length = strlen($str);
        $i = 0;

        while ($i < $length) {
            if (ord($str[$i]) >= 129) {
                $trail = ($i + 1 < $length) ? $str[$i + 1] : ' ';
                $units[] = $str[$i] . $trail;
                $i += 2;
            } else {
                $start = $i;
                while ($i < $length && ord($str[$i]) < 129) {
                    $i++;
                }
                $units[] = array(substr($str, $start, $i - $start));
            }
        }

        return $units;
    }

    /**
     * Apply the 'segment_english' and 'lowercase' options to one ASCII run.
     */
    private function formatAsciiRun($run)
    {
        if ($this->options['segment_english']) {
            // All ASCII punctuation plus tab and form feed - the same set the
            // original character class enumerated one escape at a time.
            $punct = '[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E\t\f]';
            // Detach punctuation runs from the surrounding text ...
            $run = preg_replace('/(' . $punct . '+)/', ' $1 ', $run);
            // ... then separate adjacent punctuation characters.
            $run = preg_replace('/(' . $punct . ')(' . $punct . ')/', ' $1 $2 ', $run);
        }

        if ($this->options['lowercase']) {
            $run = strtolower($run);
        }

        return $run;
    }
}
?>
来源参考:
http://www.phpchina.cn/code/2006/0607/381.html
http://www.xuchao.cn/?play=reply&id=851