/**
 * Trains the boosted ensemble of weak tree classifiers.
 *
 * The training-set arguments (_train_data, _tflag, _responses, _var_idx,
 * _sample_idx, _var_type, _missing_mask) are forwarded unchanged either to a
 * freshly constructed CvDTreeTrainData or to data->set_data().
 *
 * @param _params  Boosting parameters; stored through set_params() and also
 *                 passed on to the training-data object.
 * @param _update  When true and `data` already exists, the existing
 *                 training-data object is refreshed via set_data() instead of
 *                 clearing the model and rebuilding it.
 * @return true if at least one weak classifier ended up in `weak`;
 *         otherwise the model is cleared and false is returned.
 */
bool CvBoost::train( const CvMat* _train_data, int _tflag,
                     const CvMat* _responses, const CvMat* _var_idx,
                     const CvMat* _sample_idx, const CvMat* _var_type,
                     const CvMat* _missing_mask,
                     CvBoostParams _params, bool _update )
{
    bool ok = false;
    CvMemStorage* storage = 0;

    CV_FUNCNAME( "CvBoost::train" );

    __BEGIN__;

    int i;

    set_params( _params );

    cvReleaseMat( &active_vars );
    cvReleaseMat( &active_vars_abs );

    if( !_update || !data )
    {
        clear();
        // Build the CvDTreeTrainData object that holds the training data.
        data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx,
            _sample_idx, _var_type, _missing_mask, _params, true, true );

        // NOTE(review): CV_ERROR presumably leaves the __BEGIN__/__END__ section,
        // in which case the function returns with ok == false.
        if( data->get_num_classes() != 2 )
            CV_ERROR( CV_StsNotImplemented,
            "Boosted trees can only be used for 2-class classification." );
        CV_CALL( storage = cvCreateMemStorage() );
        // `weak` is a sequence of CvBoostTree* elements, i.e. the weak classifiers;
        // the training loop below appends at most params.weak_count of them.
        weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );
        // The local handle is dropped; the sequence was created in that storage.
        storage = 0;
    }
    else
    {
        data->set_data( _train_data, _tflag, _responses, _var_idx,
            _sample_idx, _var_type, _missing_mask, _params, true, true, true );
    }

    if ( (_params.boost_type == LOGIT) || (_params.boost_type == GENTLE) )
        data->do_responses_copy();

    // No tree exists yet: a null tree requests weight initialization.
    update_weights( 0 );

    // Core loop: after each weak classifier is trained, the sample weights are
    // updated from it, then samples with very low weight are trimmed according
    // to params.weight_trim_rate before the next round. The loop ends when
    // params.weak_count classifiers were requested, when a tree fails to train,
    // or when trimming leaves no selected samples.
    for( i = 0; i < params.weak_count; i++ )
    {
        CvBoostTree* tree = new CvBoostTree;
        // Train one weak classifier.
        if( !tree->train( data, subsample_mask, this ) )
        {
            delete tree;
            break;
        }
        //cvCheckArr( get_weak_response());
        cvSeqPush( weak, &tree );
        update_weights( tree );  // re-weight the samples using this classifier
        trim_weights();          // drop low-weight samples per params.weight_trim_rate
        // subsample_mask has one entry per sample: 0 = sample not selected,
        // non-zero = selected. Stop if no sample remains selected.
        if( cvCountNonZero(subsample_mask) == 0 )
            break;
    }

    if( weak->total > 0 )
    {
        get_active_vars(); // recompute active_vars* maps and condensed_idx's in the splits.
        data->is_classifier = true;
        data->free_train_data();
        ok = true;
    }
    else
        clear();

    __END__;

    return ok;
}
// Convenience overload: delegates to update_weights_impl() with both
// per-class initial weights set to one.
void CvBoost::update_weights( CvBoostTree* tree )
{
    double unit_class_weights[2] = { 1, 1 };

    update_weights_impl( tree, unit_class_weights );
}
// Updates the per-sample weights after a weak classifier has been trained, or
// initializes them when `tree` is null.
//
// tree            - the most recently trained weak classifier, or 0 before the
//                   first tree is built.
// initial_weights - per the original annotation, only relevant when no tree
//                   has been built yet.
//
// NOTE(review): this is an excerpt. The locals used below (step, fdata,
// sample_idx_buf, sample_idx, cur_buf_pos, i, n, sumw, ...) are declared in
// code that the excerpt omits.
void
CvBoost::update_weights_impl( CvBoostTree* tree, double initial_weights[2] )
{
    CV_FUNCNAME( "CvBoost::update_weights_impl" );

    __BEGIN__;

    // ... (code omitted in this excerpt)

    if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) )
    {
        // Element stride of responses_copy (1 when the matrix is continuous).
        step = CV_IS_MAT_CONT(data->responses_copy->type) ?
            1 : data->responses_copy->step / CV_ELEM_SIZE(data->responses_copy->type);
        fdata = data->responses_copy->data.fl;
        // Carve the sample-index buffer out of the scratch area.
        sample_idx_buf = (int*)cur_buf_pos;
        cur_buf_pos = (uchar*)(sample_idx_buf + data->sample_count);
        sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf );
    }
    CvMat* dtree_data_buf = data->buf;
    size_t length_buf_row = data->get_length_subbuf();

    // before training the first tree, initialize weights and other parameters
    if( !tree )
    {
        // ... (code omitted in this excerpt)
        // the first tree, initialize weights and other parameters
    }
    else
    {
        // at this moment, for all the samples that participated in the training of the most
        // recent weak classifier we know the responses. For other samples we need to compute them
        if( have_subsample )
        {
            float* values = (float*)cur_buf_pos;
            cur_buf_pos = (uchar*)(values + data->get_length_subbuf());
            uchar* missing = cur_buf_pos;
            cur_buf_pos = missing + data->get_length_subbuf() * (size_t)CV_ELEM_SIZE(data->buf->type);
            CvMat _sample, _mask;

            // invert the subsample mask
            cvXorS( subsample_mask, cvScalar(1.), subsample_mask );
            data->get_vectors( subsample_mask, values, missing, 0 );

            _sample = cvMat( 1, data->var_count, CV_32F );
            _mask = cvMat( 1, data->var_count, CV_8U );

            // run tree through all the non-processed samples
            for( i = 0; i < n; i++ )
                if( subsample_mask->data.ptr[i] )
                {
                    _sample.data.fl = values;
                    _mask.data.ptr = missing;
                    values += _sample.cols;
                    missing += _mask.cols;

                    // Each sample is run through the tree; the value of the leaf it
                    // reaches is stored as f(x_i). For Real AdaBoost that value is
                    // weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)
                    weak_eval->data.db[i] = tree->predict( &_sample, &_mask, true )->value;
                }
        }

        // now update weights and other parameters for each type of boosting
        if( params.boost_type == DISCRETE )
        {
            // Discrete AdaBoost:
            //   weak_eval[i] (=f(x_i)) is in {-1,1}
            //   err = sum(w_i*(f(x_i) != y_i))/sum(w_i)
            //   C = log((1-err)/err)
            //   w_i *= exp(C*(f(x_i) != y_i))
            // ... (code omitted in this excerpt)
        }
        else if( params.boost_type == REAL )
        {
            // Real AdaBoost:
            //   weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)
            //   w_i *= exp(-y_i*f(x_i))
            for( i = 0; i < n; i++ )
            {
                weak_eval->data.db[i] *= -orig_response->data.i[i];
            }

            cvExp( weak_eval, weak_eval );

            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i]*weak_eval->data.db[i];
                sumw += w;
                weights->data.db[i] = w; // store the updated weight of sample i
            }
        }
        else if( params.boost_type == LOGIT )
        {
            // LogitBoost:
            //   weak_eval[i] = f(x_i) in [-z_max,z_max]
            //   sum_response = F(x_i).
            //   F(x_i) += 0.5*f(x_i)
            //   p(x_i) = exp(F(x_i))/(exp(F(x_i)) + exp(-F(x_i))) = 1/(1+exp(-2*F(x_i)))
            //   reuse weak_eval: weak_eval[i] <- p(x_i)
            //   w_i = p(x_i)*(1 - p(x_i))
            //   z_i = ((y_i+1)/2 - p(x_i))/(p(x_i)*(1 - p(x_i)))
            //   store z_i to the data->data_root as the new target responses
            // ... (code omitted in this excerpt)
        }
        else
        {
            // Gentle AdaBoost:
            //   weak_eval[i] = f(x_i) in [-1,1]
            //   w_i *= exp(-y_i*f(x_i))
            // ... (code omitted in this excerpt)
        }
    }

    // renormalize weights
    if( sumw > FLT_EPSILON )
    {
        sumw = 1./sumw;
        for( i = 0; i < n; ++i )
            weights->data.db[i] *= sumw;
    }

    __END__;
}
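【注】很多资料中讲解的是每棵决策树(可能有好几层深度、很多个节点)整体对应一个系数(原文此处的公式符号已缺失,推测为 $\alpha_t$),但从代码上看并非如此:应该是每个叶子节点 node 各自对应一个输出值 $f(x)$(即 node->value)。在更新样本权重时,将样本送入该决策树,预测它最终落到哪个叶子节点,该叶子节点的 value 便是该样本更新权重时使用的 $f(x_i)$。如此说来,在一次全部样本的权重更新中,虽然使用的是同一棵决策树,但各个样本对应的 $f(x_i)$ 可能并不相同(取决于样本落入哪个叶子节点)。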
除此之外,节点输出值的计算也和资料中有些出入(资料中的公式在原文中为图片,此处已缺失)。
代码中 $p$ 和节点输出值 value 的计算确实有所改变。代码中的计算如下:
(1) 分别计算某个节点所有正样本D(i)的累积和(正样本权重累积和),和所有负样本D(i)的累积和(负样本权重累积和)。
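(2) $p = \dfrac{W_{+}}{W_{+} + W_{-}}$,其中 $W_{+}$ 为正样本权重累积和(代码中的 rcw[1]),$W_{-}$ 为负样本权重累积和(代码中的 rcw[0]);Real AdaBoost 下节点的输出值为 $\text{value} = \tfrac{1}{2}\log\dfrac{p}{1-p}$。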
下面是其代码实现:
// Computes the value stored in a tree node from the weighted samples that fall
// into it. For classification it sets node->class_idx and node->value; it also
// records the per-class weight sums at the end of subtree_weights.
//
// NOTE(review): this is an excerpt. labels_buf, labels, weights,
// subtree_weights and boost_type come from code that is not shown here.
void CvBoostTree::calc_node_value( CvDTreeNode* node )
{
int i, n = node->sample_count;
。。。
// rcw[r] accumulates the weights of the node's samples whose class label is r
// (r = 1 is the class that wins when rcw[1] > rcw[0]).
double rcw[2] = {0,0};
if( data->is_classifier )
{
int* _responses_buf = labels_buf + n;
const int* _responses = data->get_class_labels(node, _responses_buf);
int m = data->get_num_classes();
int* cls_count = data->counts->data.i;
// Reset the per-class sample counters.
for( int k = 0; k < m; k++ )
cls_count[k] = 0;
// Accumulate, per class, the sum of sample weights D(i) in this node (rcw)
// and the number of samples in this node (cls_count).
for( i = 0; i < n; i++ )
{
int idx = labels[i];
double w = weights[idx];
int r = _responses[i];
rcw[r] += w;
cls_count[r]++;
subtree_weights[i] = w;
}
// The node predicts class 1 when class 1 carries more total weight.
node->class_idx = rcw[1] > rcw[0];
if( boost_type == CvBoost::DISCRETE )
{
// ignore cat_map for responses, and use {-1,1},
// as the whole ensemble response is computed as sign(sum_i(weak_response_i))
node->value = node->class_idx*2 - 1;
}
else
{
// p is the weighted fraction of class-1 samples in this node.
double p = rcw[1]/(rcw[0] + rcw[1]);
assert( boost_type == CvBoost::REAL );
// store log-ratio of the probability
// NOTE(review): log_ratio is presumably log(p/(1-p)), which makes value the
// per-node f(x) = 0.5*log(p/(1-p)) of Real AdaBoost - confirm against its definition.
node->value = 0.5*log_ratio(p);
}
}
else
{
// in case of regression tree:
// * node value is 1/n*sum_i(Y_i), where Y_i is i-th response,
// n is the number of samples in the node.
// * node risk is the sum of squared errors: sum_i((Y_i - <node_value>)^2)
。。。。
}
// store summary weights
subtree_weights[n] = rcw[0];
subtree_weights[n+1] = rcw[1];
}
void CvBoost::trim_weights()
{
//CV_FUNCNAME( "CvBoost::trim_weights" );
__BEGIN__;
inti, count = data->sample_count, nz_count = 0;
doublesum, threshold;
if(params.weight_trim_rate <= 0. ||params.weight_trim_rate >= 1. )
EXIT;
// use weak_eval as temporary buffer for sorted weights
cvCopy(weights, weak_eval );
icvSort_64f(weak_eval->data.db,count, 0 );//从小到大排序,找出阈值
// as weight trimming occurs immediately after updating the weights,
// where they are renormalized, we assume that the weight sum = 1.
sum = 1. -params.weight_trim_rate;
for(i = 0; i < count; i++ )
{
doublew = weak_eval->data.db[i];
if(sum <= 0 )
break;
sum -= w;
}
// sum<0对应着某个小于1. - params.weight_trim_rate的阈值
threshold =i < count ?weak_eval->data.db[i] :DBL_MAX;
for(i = 0; i < count; i++ )
{
doublew = weights->data.db[i];
intf = w >= threshold; // 去除权重很小的样本
subsample_mask->data.ptr[i] = (uchar)f;
nz_count += f;
}
//没有去除样本,则下次训练决策树时,不用生成样本子集
have_subsample =nz_count < count;
__END__;
}
【注】资料中说 params.weight_trim_rate 是权重修剪的比例:如果样本权重 $D(i) < 1 - \text{weight\_trim\_rate}$,则去除该样本。资料原话见下文:
但是从上面的代码来看,去除样本所用的阈值并不是 1 - params.weight_trim_rate 本身。代码中 sum 的初值为 1 - params.weight_trim_rate,然后按从小到大的顺序依次减去排序后的权重,直到 sum <= 0 时停止;此时下标 i 处的 weak_eval[i] 才是阈值。这里的 weak_eval 是 update_weights 之后样本权重的副本(由 cvCopy 复制并排序,仅作为临时缓冲区使用),因此阈值实际上是某个样本对应的权重 $D(i)$。