利用SRILM 对注音的句子进行补全(Viterbi)

给一个句子,部分汉字是用注音的声母代替的,还原成原始的句子

感悟:

调用接受 char* 参数的函数时,记得在字符数组的末尾加上 '\0';

文件记得及时关闭;

变量的作用域可能会影响结果(scope may matter)

用 map<string,string> 保存 Big5 编码的键效果很差,所以下面改用 map<int,string>

把 char*(两个字节的 Big5 字符)转换成 int,作为 map 的键


#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
#include "Ngram.h"
#include "Prob.h"
using namespace std;

// Forward declarations of the helpers defined after main().
// Returns a copy of s with every ' ' character removed.
string deleteBlank(string s);
// Reads the file named by mapPath and returns zyMap extended with its entries.
map<int, string> readMap(map<int, string> zyMap);
// Queries the global language model for the probability of w2 following w1.
double gramProb(const char *w1, const char *w2);
// Decodes one blank-free input line into the best-scoring output sentence.
string bestStr(string s,map<int,string> zyMap);
// Packs the two bytes at ch[0] and ch[1] into a single int.
int big5ToInt(char *ch);


// Directory of the numbered input files ("1.txt" ... "10.txt") read by main().
string testPath="./testdata/";
// Directory of the numbered output files written by main().
string resultPath="./result2/";
// Candidate table file read by readMap().
string mapPath="ZhuYin-Big5.map";

// Vocabulary and language model shared by main(), which loads the model
// from "./bigram.lm", and gramProb(), which queries it.
Vocab voc;
// NOTE(review): the second constructor argument is presumably the n-gram
// order (2 = bigram) -- confirm against the SRILM Ngram class.
Ngram lm( voc,2);


int main()
{
	map<int, string> zyMap;
	zyMap=readMap(zyMap);
	
	File lmFile("./bigram.lm", "r" );
	lm.read(lmFile);
	lmFile.close();

	int i=1;
	for(i=1;i<11;i++)
	{
		stringstream ss;
		string s;
		ss<<i;
		ss>>s;
		string path1=testPath+s+".txt";
		string path2=resultPath+s+".txt";

		
		ifstream infile(path1.c_str());
		ofstream outfile(path2.c_str(),ios::app);
		string line;
		

		while(getline(infile,line))
		{
		//	const  char *cline=line.c_str();
			string newline=deleteBlank(line);
			string bestString=bestStr(newline,zyMap);
			outfile<<bestString.c_str();
			
		}
		infile.close();
		outfile.close();
	}

	return 0;
}

// Packs a two-byte character into one int: ch[0] is weighted by 256 and
// ch[1] is added on top.
//
// The bytes are used as plain `char`, so where `char` is signed a byte
// >= 0x80 contributes a negative value. That is harmless as long as every
// key is computed with the same formula. Multiplication is used instead of
// `ch[0] << 8` because left-shifting a negative value is undefined before
// C++20; the resulting key is the same.
//
// ch must point to at least two readable bytes.
int big5ToInt(char *ch)
{
	return static_cast<int>(ch[0])*256+ch[1];
}
// Returns a copy of s with every ' ' (space) character removed.
// Only the space character is stripped; tabs and other whitespace are kept.
string deleteBlank(string s)
{
	// Erase-remove idiom: one linear pass instead of one erase() per blank.
	s.erase(std::remove(s.begin(),s.end(),' '),s.end());
	return s;
}

// Reads the candidate table from the file named by mapPath.
//
// Each line is stripped of blanks; its first two bytes form the key
// (second byte + first byte * 256, the same formula bestStr uses) and the
// remainder of the line is stored as the value, i.e. the candidates
// concatenated two bytes each.
//
// zyMap: entries to start from; the file's entries are added to it and
//        overwrite existing entries with the same key.
// Returns the extended map. If the file cannot be opened, a message is
// printed to cerr and zyMap is returned unchanged.
map<int, string> readMap(map<int, string> zyMap)
{
	ifstream infile(mapPath.c_str());
	if(!infile)
	{
		cerr<<"readMap: cannot open "<<mapPath<<endl;
		return zyMap;
	}

	string line;
	while(getline(infile,line))
	{
		line=deleteBlank(line);

		// A line without a complete 2-byte key (e.g. a blank line) cannot be
		// parsed; substr(2) would throw std::out_of_range on it.
		if(line.length()<2)
			continue;

		// Multiplication instead of `<< 8`: same key, but no shift of a
		// negative value when char is signed.
		const int key=line[1]+line[0]*256;
		zyMap[key]=line.substr(2);
	}

	return zyMap;
}


// Computes the most probable sentence for one input line (Viterbi search).
//
// s:     the input line with blanks already removed; it is read as a
//        sequence of 2-byte characters (a trailing odd byte is ignored).
// zyMap: candidate table; the key of a character is
//        second byte + first byte * 256, and the value is the
//        concatenation of its 2-byte candidates. A character that is not
//        in the table has no candidates.
//
// Returns "<s> " + (" " + chosen candidate for every character) + "  </s>\n".
// A character without candidates contributes an empty word.
//
// Scoring: every candidate of the first character starts with score 0; each
// later candidate gets the best (previous score + gramProb(previous, current))
// over all candidates of the previous character.
//
// The tables are sized per position from the real candidate counts and live
// on the heap, so neither long lines nor large candidate lists can overflow
// a fixed-size stack array.
string bestStr(string s,map<int,string> zyMap)
{
	const size_t charCount=s.length()/2;

	// candidates[pos]: concatenated 2-byte candidates for character pos.
	// score[pos][j]:   best accumulated score of a path ending in candidate j.
	// backPtr[pos][j]: candidate index at pos-1 on that best path.
	vector<string> candidates(charCount);
	vector<vector<double> > score(charCount);
	vector<vector<size_t> > backPtr(charCount);

	for(size_t pos=0;pos<charCount;++pos)
	{
		const int key=s[2*pos+1]+s[2*pos]*256;
		const map<int,string>::const_iterator found=zyMap.find(key);
		if(found!=zyMap.end())
			candidates[pos]=found->second;

		const size_t count=candidates[pos].length()/2;
		score[pos].assign(count,0.0);
		backPtr[pos].assign(count,0);
	}

	// NUL-terminated 2-byte words handed to gramProb().
	char prevWord[3]={0,0,0};
	char curWord[3]={0,0,0};

	size_t bestLast=0;              // best candidate of the last character
	double bestLastScore=LogP_Zero;

	// forward pass
	for(size_t pos=1;pos<charCount;++pos)
	{
		const string &prev=candidates[pos-1];
		const string &cur=candidates[pos];
		const size_t prevCount=prev.length()/2;
		const size_t curCount=cur.length()/2;

		for(size_t j=0;j<curCount;++j)
		{
			curWord[0]=cur[2*j];
			curWord[1]=cur[2*j+1];

			double bestScore=LogP_Zero;
			for(size_t i=0;i<prevCount;++i)
			{
				prevWord[0]=prev[2*i];
				prevWord[1]=prev[2*i+1];

				const double total=gramProb(prevWord,curWord)+score[pos-1][i];
				if(total>bestScore)
				{
					bestScore=total;
					backPtr[pos][j]=i;
				}
			}

			// Remember the overall winner while filling the last position.
			if(pos==charCount-1&&bestScore>bestLastScore)
			{
				bestLast=j;
				bestLastScore=bestScore;
			}
			score[pos][j]=bestScore;
		}
	}

	// back tracking
	string resultStr;
	size_t choice=bestLast;
	for(size_t pos=charCount;pos-->0;)
	{
		const string &cands=candidates[pos];

		// No candidate at this index (unknown character): emit an empty word
		// instead of reading past the end of the candidate string.
		string word;
		if(2*choice+1<cands.length())
			word.assign(cands,2*choice,2);

		resultStr=" "+word+resultStr;

		if(pos>0)
			choice=(choice<backPtr[pos].size())?backPtr[pos][choice]:0;
	}
	resultStr="<s> "+resultStr+"  </s>\n";

	return resultStr;
}


double gramProb(const char *w1, const char *w2)
{
	VocabIndex wid1 = voc.getIndex(w1);
	VocabIndex wid2 = voc.getIndex(w2);
	if(wid1 == Vocab_None)  //OOV
		wid1 = voc.getIndex(Vocab_Unknown);
	if(wid2 == Vocab_None)  //OOV
		wid2 = voc.getIndex(Vocab_Unknown);
	VocabIndex context[] = { wid1, Vocab_None };
	return lm.wordProb( wid2, context);

}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值