Discount.cc Discount.h
文档作者:jianzhu
修改时间: 08.12.15-08.12.17
注:本文档改写自rickjin书写的Discount文档
修正了原文档中存在的一些错误,并扩充了部分内容;
阅读该文档前,建议先阅读srilm的ngram-discount.7.html手册
或者查看jianzhu的翻译文档Ngram折扣平滑算法
--------------
1、基本类
--------------
Discount.h Discount.cc 这两个文件主要实现了最重要的几个折扣算法, 包括
a. Katz Discount (基于 Good-Turing Discounting)
b. Absolute Discounting
c. Natural law of succession [Eric Sven Ristad, 1995]
d. Additive Discounting [Lidstone-Johnson-Jeffrey]
e. Witten-Bell Discounting
f. Kneser-Ney Discounting
g. Modified Kneser-Ney Discounting, [Chen, Goodman, 1998]
修正了原文档中存在的一些错误,并扩充了部分内容;
阅读该文档前,建议先阅读srilm的ngram-discount.7.html手册
或者查看jianzhu的翻译文档Ngram折扣平滑算法
--------------
1、基本类
--------------
Discount.h Discount.cc 这两个文件主要实现了最重要的几个折扣算法, 包括
a. Katz Discount (基于 Good-Turing Discounting)
b. Absolute Discounting
c. Natural law of succession [Eric Sven Ristad, 1995]
d. Additive Discounting [Lidstone-Johnson-Jeffrey]
e. Witten-Bell Discounting
f. Kneser-Ney Discounting
g. Modified Kneser-Ney Discounting, [Chen, Goodman, 1998]
----------------
2、类接口说明
----------------
2.1) Discount 类主要接口
<src>
class Discount
{
public:
/**
* @brief 构造函数
* 通过成员初始化列表的方式将interpolate初始化为false,表示不支持插值平滑
*/
Discount() : interpolate(false) {};
virtual ~Discount() {};
/* 使用折扣算法, 返回频率 count 对应的折扣系数
* @param count 当前 ngram 频率 ( C(a_z) )
* @param totalCount 历史 n-1gram 频率 ( C(a_) )
* @param observedVocab 语料中统计到的历史ngram a_ 后接的 z 的类型数
* */
virtual double discount(Count count, Count totalCount, Count observedVocab);
virtual double discount(FloatCount count, FloatCount totalCount,
Count observedVocab);
class Discount
{
public:
/**
* @brief 构造函数
* 通过成员初始化列表的方式将interpolate初始化为false,表示不支持插值平滑
*/
Discount() : interpolate(false) {};
virtual ~Discount() {};
/* 使用折扣算法, 返回频率 count 对应的折扣系数
* @param count 当前 ngram 频率 ( C(a_z) )
* @param totalCount 历史 n-1gram 频率 ( C(a_) )
* @param observedVocab 语料中统计到的历史ngram a_ 后接的 z 的类型数
* */
virtual double discount(Count count, Count totalCount, Count observedVocab);
virtual double discount(FloatCount count, FloatCount totalCount,
Count observedVocab);
/* 在插值模型中, 低阶模型的权值大小
*
* @param min2Vocab 频率>=2 的ngram type 数目
* @param min3Vocab 频率>=3 的ngram type 数目
* */
virtual double lowerOrderWeight(Count totalCount, Count observedVocab,
Count min2Vocab, Count min3Vocab);
virtual double lowerOrderWeight(FloatCount totalCount, Count observedVocab,
Count min2Vocab, Count min3Vocab);
*
* @param min2Vocab 频率>=2 的ngram type 数目
* @param min3Vocab 频率>=3 的ngram type 数目
* */
virtual double lowerOrderWeight(Count totalCount, Count observedVocab,
Count min2Vocab, Count min3Vocab);
virtual double lowerOrderWeight(FloatCount totalCount, Count observedVocab,
Count min2Vocab, Count min3Vocab);
/* 是否支持该折扣操作 */
virtual Boolean nodiscount();
virtual Boolean nodiscount();
/* 保存折扣系数 */
virtual void write(File &file);
virtual void write(File &file);
/* 读入折扣系数 */
virtual Boolean read(File &file);
virtual Boolean read(File &file);
/* 计算折扣系数
*
* @param counts ngram 计数器
* @param order 对 order 阶ngram 进行平滑计算
* */
virtual Boolean estimate(NgramStats &counts, unsigned order);
virtual Boolean estimate(NgramCounts<FloatCount> &counts, unsigned order);
*
* @param counts ngram 计数器
* @param order 对 order 阶ngram 进行平滑计算
* */
virtual Boolean estimate(NgramStats &counts, unsigned order);
virtual Boolean estimate(NgramCounts<FloatCount> &counts, unsigned order);
/* 在平滑之前, 对 ngram 的频率进行调整,在 KneserNey 平滑算法中使用 */
virtual void prepareCounts(NgramCounts<NgramCount> &counts,
unsigned order, unsigned maxOrder);
virtual void prepareCounts(NgramCounts<FloatCount> &counts,
unsigned order, unsigned maxOrder);
virtual void prepareCounts(NgramCounts<NgramCount> &counts,
unsigned order, unsigned maxOrder);
virtual void prepareCounts(NgramCounts<FloatCount> &counts,
unsigned order, unsigned maxOrder);
/* 是否支持插值,这里将interpolate置为public区域,是为子类的继承 */
Boolean interpolate;
protected:
Boolean interpolate;
protected:
/* Vocab 中有效条目的 ngram 数目 */
static unsigned vocabSize(Vocab &vocab);
};
</src>
static unsigned vocabSize(Vocab &vocab);
};
</src>
2.2) GoodTuring 类主要接口
<src>
class GoodTuring: public Discount
{
public:
/* 构造函数,该构造函数中并未修改从父类继承的interpolate值,
* 因此interpolate为false,表示不支持插值平滑
*/
GoodTuring(unsigned mincount = GT_defaultMinCount,
unsigned maxcount = GT_defaultMaxCount);
/* 折扣系数估算函数
* jianzhu added 2008-12-15
*/
Boolean estimate(NgramStats &counts, unsigned order);
class GoodTuring: public Discount
{
public:
/* 构造函数,该构造函数中并未修改从父类继承的interpolate值,
* 因此interpolate为false,表示不支持插值平滑
*/
GoodTuring(unsigned mincount = GT_defaultMinCount,
unsigned maxcount = GT_defaultMaxCount);
/* 折扣系数估算函数
* jianzhu added 2008-12-15
*/
Boolean estimate(NgramStats &counts, unsigned order);
protected:
/* 小于该值的频率将被设置为 0 */
Count minCount;
/* 小于该值的频率将被设置为 0 */
Count minCount;
/* 大于该值的频率将不平滑, 保持不变 */
Count maxCount;
Count maxCount;
/* 平滑系数 */
Array<double> discountCoeffs;
};
</src>
Array<double> discountCoeffs;
};
</src>
说明:
a. 该类实现标准的基于图灵平滑的 Katz 平滑算法
b. 图灵平滑算法为 $r^* = (r + 1)\, n_{r+1} / n_r$,其中 $n_r$ 表示出现次数恰为 r 的 ngram 类型数(即 count-of-counts);Katz 平滑算法如下
$$
r_{katz} =
\begin{cases}
d_r \cdot r & (0 < r \le k) \\
r & (r > k) \\
\alpha & (r = 0)
\end{cases}
$$
$$
d_r = \frac{r^*/r - \mathrm{commonTerm}}{1 - \mathrm{commonTerm}}, \qquad
\mathrm{commonTerm} = \frac{(k+1)\, n_{k+1}}{n_1}
$$
在此处 k 默认值为 GT_defaultMaxCount = 5;
折扣系数估算函数
<src>
Boolean
GoodTuring::estimate(NgramStats &counts, unsigned order)
{
Array<Count> countOfCounts;
<src>
Boolean
GoodTuring::estimate(NgramStats &counts, unsigned order)
{
Array<Count> countOfCounts;
/*
* First tabulate count-of-counts for the given order of ngrams
* Note we need GT count for up to maxCount + 1 inclusive, to apply
* the GT formula for counts up to maxCount.
*/
makeArray(VocabIndex, wids, order + 1);
* First tabulate count-of-counts for the given order of ngrams
* Note we need GT count for up to maxCount + 1 inclusive, to apply
* the GT formula for counts up to maxCount.
*/
makeArray(VocabIndex, wids, order + 1);
NgramsIter iter(counts, wids, order);
NgramCount *count;
Count i;
NgramCount *count;
Count i;
for (i = 0; i <= maxCount + 1; i++) {
countOfCounts[i] = 0;
}
countOfCounts[i] = 0;
}
while (count = iter.next()) {
if (counts.vocab.isNonEvent(wids[order - 1])) {
continue;
} else if (counts.vocab.isMetaTag(wids[order - 1])) {
unsigned type = counts.vocab.typeOfMetaTag(wids[order - 1]);
if (counts.vocab.isNonEvent(wids[order - 1])) {
continue;
} else if (counts.vocab.isMetaTag(wids[order - 1])) {
unsigned type = counts.vocab.typeOfMetaTag(wids[order - 1]);
/*
* process count-of-count
*/
if (type > 0 && type <= maxCount + 1) {
countOfCounts[type] += *count;
}
} else if (*count <= maxCount + 1) {
countOfCounts[*count] ++;
}
}
* process count-of-count
*/
if (type > 0 && type <= maxCount + 1) {
countOfCounts[type] += *count;
}
} else if (*count <= maxCount + 1) {
countOfCounts[*count] ++;
}
}
if (debug(DEBUG_ESTIMATE_DISCOUNT)) {
dout() << "Good-Turing discounting " &
dout() << "Good-Turing discounting " &