1.
【目标】:将二进制的模型数据lm_sc.t3g转换成 utf8格式 lm_sc.t3g.arpa
【操作】:
./tslminfo -p -v -l ../raw/dict.utf8 ../data/lm_sc.t3g >../raw/lm_sc.t3g.arpa
【解释】:
-p : 输出原始概率值 Pr,而不是默认的 -log(Pr)
-v : 输出arpa格式
-l : 指定utf8格式的字典文件
../raw/dict.utf8 : utf8格式的字典文件
../data/lm_sc.t3g : 需要转换的二进制模型数据
../raw/lm_sc.t3g.arpa : 输出的utf8格式模型数据
2.
【目标】: 将utf8格式的模型数据转换为二进制的模型数据
【操作】: tslmpack <arpa_file> <lexicon_file> <t3g_file>
./tslmpack ../raw/lm_sc.t3g.arpa ../raw/dict.utf8 lm_sc.t3g
3. 如果模型数据文件是2-gram而不是默认的3-gram,执行第2步的命令会报错 “Failed to read from”
【解决方案】:
1) 修改类 CArpaSlm和tslmpack程序以支持2-gram。具体代码如下
diff --git a/src/slm/tslmpack/arpa_slm.cpp b/src/slm/tslmpack/arpa_slm.cpp
old mode 100644
new mode 100755
index 82029c6..5bb242d
--- a/src/slm/tslmpack/arpa_slm.cpp
+++ b/src/slm/tslmpack/arpa_slm.cpp
@@ -150,7 +150,7 @@ CArpaSlm::load(const char* filename, const TLexicon& lexicon)
printf("Loading ARPA slm..."); fflush(stdout);
ifstream file(filename);
char buf[1024];
- for (int i = 0; i <= N_GRAM; ++i) {
+ for (unsigned i = 0; i <= m_N; ++i) {
unsigned lvl;
int size;
file.getline(buf, sizeof(buf));
@@ -159,7 +159,7 @@ CArpaSlm::load(const char* filename, const TLexicon& lexicon)
exit(1);
}
sscanf(buf, "\\%d-gram\\%d%*[\n]", &lvl, &size);
- assert(lvl <= N_GRAM);
+ assert(lvl <= m_N);
if (lvl == 0) {
TNode node0;
node0.load_level0(file);
diff --git a/src/slm/tslmpack/arpa_slm.h b/src/slm/tslmpack/arpa_slm.h
old mode 100644
new mode 100755
index 3c49393..7c8a1eb
--- a/src/slm/tslmpack/arpa_slm.h
+++ b/src/slm/tslmpack/arpa_slm.h
@@ -78,6 +78,7 @@ public:
/* XXX, ARPA file does not provide these information.
so we assume this SLM is trigram, and does not use LogPr */
CArpaSlm() : m_usingLogPr(false), m_N(N_GRAM) {}
+ CArpaSlm(unsigned N) : m_usingLogPr(false), m_N(N) {}
bool good() const { return m_levels[0].size() != 0; }
unsigned getN() const { return m_N; }
bool usingLogPr() const { return m_usingLogPr; }
diff --git a/src/slm/tslmpack/slmpack.cpp b/src/slm/tslmpack/slmpack.cpp
old mode 100644
new mode 100755
index 3f00d72..d267925
--- a/src/slm/tslmpack/slmpack.cpp
+++ b/src/slm/tslmpack/slmpack.cpp
@@ -323,13 +323,14 @@ cleanup(CompressedTable& pr_table, CompressedTable& bow_table,
int
main(int argc, char* argv[])
{
- if (argc != 4)
+ if (argc != 5)
ShowUsage(argv[0]);
const char* arpa_path = argv[1];
const char* lexicon_path = argv[2];
const char* threaded_path = argv[3];
+ unsigned n = (unsigned)(atoi(argv[4]));
- CArpaSlm slm;
+ CArpaSlm slm(n);
TLexicon lexicon = read_lexicon(lexicon_path);
slm.load(arpa_path, lexicon);
2) 应用上述修改并重新编译后,在命令末尾追加 n-gram 的阶数(此处为 2):
【操作】: tslmpack <arpa_file> <lexicon_file> <t3g_file> <n-gram>
./tslmpack ../raw/lm_sc.t2g.arpa ../raw/dict.utf8 lm_sc.t2g 2