利用SRILM 对注音的句子进行补全（Viterbi）

本文链接：https://blog.youkuaiyun.com/junk2012/article/details/46292693

本文介绍了一种将带有注音声母的字符串转换为标准汉字的算法实现。该算法利用了最大概率路径搜索策略，并结合了N-gram语言模型来确定最可能的汉字序列。此外，还详细介绍了关键函数的实现，包括读取映射表、删除空白字符、计算最佳字符串等。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

给一个句子，部分汉字是用注音的声母代替的，还原成原始的句子

感悟：

调用接受 char* 参数的函数时，记得在字符数组的末尾加上 '\0'；

文件记得及时关闭；

变量的作用域可能会影响结果（scope may matter）

map<string,string> big5 sucks

char* to int

#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
#include "Ngram.h"
#include "Prob.h"
using namespace std;

// Forward declarations of the helpers defined after main().

// Returns a copy of s with every ' ' character removed.
string deleteBlank(string s);
// Reads the file named by mapPath into zyMap (2-byte key -> rest of the line)
// and returns the resulting map.
map<int, string> readMap(map<int, string> zyMap);
// Scores the word pair (w1, w2) with the global language model lm; both
// arguments must be NUL-terminated.
double gramProb(const char *w1, const char *w2);
// Decodes the 2-byte-per-character string s using the candidates in zyMap and
// returns the chosen sentence wrapped in "<s> " ... "  </s>\n".
string bestStr(string s,map<int,string> zyMap);
// Packs the two bytes ch[0], ch[1] into a single int (ch[0] is the high part).
int big5ToInt(char *ch);


// Prefix of the numbered input files ("<testPath><i>.txt") read by main().
string testPath="./testdata/";
// Prefix of the numbered output files ("<resultPath><i>.txt") written by main().
string resultPath="./result2/";
// Path of the candidate map file loaded by readMap().
string mapPath="ZhuYin-Big5.map";

// Vocabulary shared with the language model; queried by gramProb().
Vocab voc;
// Language model built on voc; main() fills it from ./bigram.lm.
// NOTE(review): the second argument is presumably the n-gram order (2 = bigram) — confirm against the SRILM Ngram constructor.
Ngram lm( voc,2);


// Entry point.
// Loads the candidate map (readMap) and the language model (./bigram.lm),
// then for i = 1..10 reads testPath + i + ".txt" line by line, strips the
// blanks from each line, decodes it with bestStr() and appends the result to
// resultPath + i + ".txt".
// Files that cannot be opened are reported on stderr and skipped; the exit
// status stays 0 as before.
int main()
{
	map<int, string> zyMap;
	zyMap=readMap(zyMap);

	File lmFile("./bigram.lm", "r" );
	lm.read(lmFile);
	lmFile.close();

	for(int i=1;i<11;i++)
	{
		// Turn the file number into text without relying on C++11.
		stringstream ss;
		ss<<i;
		const string number=ss.str();

		const string path1=testPath+number+".txt";
		const string path2=resultPath+number+".txt";

		ifstream infile(path1.c_str());
		// NOTE(review): append mode means re-running the program adds to
		// existing result files instead of replacing them — confirm intended.
		ofstream outfile(path2.c_str(),ios::app);

		if(!infile)
		{
			cerr<<"cannot open input file "<<path1<<'\n';
			continue;
		}
		if(!outfile)
		{
			cerr<<"cannot open output file "<<path2<<'\n';
			continue;
		}

		string line;
		while(getline(infile,line))
		{
			const string newline=deleteBlank(line);
			const string bestString=bestStr(newline,zyMap);
			outfile<<bestString.c_str();
		}
		// Both streams close themselves when they go out of scope here.
	}

	return 0;
}

// Packs a 2-byte character into one int: ch[0] is the high part, ch[1] is
// added as the low part. Both bytes are used as plain char values, so the
// result follows the platform's char signedness.
// ch must point to at least two readable bytes.
int big5ToInt(char *ch)
{
	// Multiply instead of shifting: where char is signed, a byte >= 0x80 is
	// negative, and left-shifting a negative value is undefined before C++20.
	// The product is the same number wherever the shift was defined.
	return ch[0]*256+ch[1];
}
// Returns a copy of s without its ' ' characters.
// Only the space character is dropped; tabs and other whitespace are kept.
string deleteBlank(string s)
{
	string kept;
	for(string::size_type pos=0;pos<s.size();++pos)
	{
		const char current=s[pos];
		if(current!=' ')
			kept+=current;
	}

	return kept;
}

// Loads the candidate table from the file named by mapPath.
// Each line is stripped of spaces; its first two bytes form the key and the
// remaining bytes are stored as the value (the candidates, two bytes each,
// as consumed by bestStr). A key that appears twice keeps its last value.
//
// zyMap   map to fill (taken by value)
// returns the filled map; if the file cannot be opened a message is written
//         to stderr and zyMap is returned unchanged.
map<int, string> readMap(map<int, string> zyMap)
{
	ifstream infile(mapPath.c_str());
	if(!infile)
	{
		cerr<<"cannot open map file "<<mapPath<<'\n';
		return zyMap;
	}

	string line;
	while(getline(infile,line))
	{
		line=deleteBlank(line);

		// A usable line holds at least the 2-byte key. Blank or truncated
		// lines are skipped; calling substr(2) on them would throw.
		if(line.length()<2)
			continue;

		// High byte times 256 plus low byte. Multiplying rather than shifting
		// avoids left-shifting a negative char, which is undefined before
		// C++20, and yields the same key.
		const int key=line[1]+line[0]*256;
		zyMap[key]=line.substr(2);
	}

	return zyMap;
}


// Picks the most probable character sequence for s with a Viterbi search.
//
// s      input sentence, two bytes per character, blanks already removed.
//        A dangling odd byte at the end is ignored.
// zyMap  2-byte key -> concatenated 2-byte candidates for that key. The map
//        is taken by value, so lookups use find() and never insert into it.
// returns "<s> " + one " <char>" per input character + "  </s>\n".
//
// Scoring: every candidate of the first character starts at 0; each later
// candidate gets the best "previous score + gramProb(previous, current)".
// Ties keep the earliest candidate, and a column whose scores are all
// LogP_Zero falls back to candidate 0.
//
// A character with no usable candidates in zyMap is passed through as its
// own single candidate, so one unknown character neither drops out of the
// output nor wipes out the scores of the rest of the sentence.
string bestStr(string s,map<int,string> zyMap)
{
	const int charCount=static_cast<int>(s.length()/2);

	// candidates[t]: the 2-byte alternatives for character t, concatenated.
	vector<string> candidates(charCount);
	for(int t=0;t<charCount;t++)
	{
		// Same key layout as readMap: high byte times 256 plus low byte.
		// Multiplying avoids left-shifting a negative char, which is
		// undefined before C++20.
		const int key=s[2*t+1]+s[2*t]*256;
		const map<int,string>::const_iterator hit=zyMap.find(key);
		if(hit!=zyMap.end())
			candidates[t]=hit->second;
		if(candidates[t].length()<2)
			candidates[t]=s.substr(2*t,2);
	}

	// score[t][j]: best score of any path ending in candidate j of character t.
	// backPtr[t][j]: index of the candidate of character t-1 on that path.
	// Rows are sized to the real candidate counts and live on the heap, so
	// long sentences and large candidate lists cannot overflow the stack.
	vector< vector<double> > score(charCount);
	vector< vector<int> > backPtr(charCount);
	for(int t=0;t<charCount;t++)
	{
		const string::size_type count=candidates[t].length()/2;
		score[t].assign(count,0.0);
		backPtr[t].assign(count,0);
	}

	// gramProb needs NUL-terminated words.
	char prevWord[3];
	char curWord[3];
	prevWord[2]='\0';
	curWord[2]='\0';

	for(int t=1;t<charCount;t++)
	{
		const string &prevCands=candidates[t-1];
		const string &curCands=candidates[t];
		const int prevCount=static_cast<int>(prevCands.length()/2);
		const int curCount=static_cast<int>(curCands.length()/2);

		for(int j=0;j<curCount;j++)
		{
			curWord[0]=curCands[2*j];
			curWord[1]=curCands[2*j+1];

			double bestScore=LogP_Zero;
			for(int i=0;i<prevCount;i++)
			{
				prevWord[0]=prevCands[2*i];
				prevWord[1]=prevCands[2*i+1];

				const double total=gramProb(prevWord,curWord)+score[t-1][i];
				if(total>bestScore)
				{
					bestScore=total;
					backPtr[t][j]=i;
				}
			}
			score[t][j]=bestScore;
		}
	}

	// Best final candidate. Done after the loop so it also works when the
	// input has an odd number of bytes.
	int bestId=0;
	if(charCount>0)
	{
		const vector<double> &last=score[charCount-1];
		double bestFinal=LogP_Zero;
		for(int j=0;j<static_cast<int>(last.size());j++)
		{
			if(last[j]>bestFinal)
			{
				bestFinal=last[j];
				bestId=j;
			}
		}
	}

	//back tracking
	string resultStr="";
	for(int t=charCount-1;t>=0;t--)
	{
		resultStr=" "+candidates[t].substr(2*bestId,2)+resultStr;
		if(t>0)
			bestId=backPtr[t][bestId];
	}
	resultStr="<s> "+resultStr+"  </s>\n";

	return resultStr;
}


double gramProb(const char *w1, const char *w2)
{
	VocabIndex wid1 = voc.getIndex(w1);
	VocabIndex wid2 = voc.getIndex(w2);
	if(wid1 == Vocab_None)  //OOV
		wid1 = voc.getIndex(Vocab_Unknown);
	if(wid2 == Vocab_None)  //OOV
		wid2 = voc.getIndex(Vocab_Unknown);
	VocabIndex context[] = { wid1, Vocab_None };
	return lm.wordProb( wid2, context);

}