boost的训练过程

最新推荐文章于 2023-02-15 18:59:47 发布

nongfu_spring

最新推荐文章于 2023-02-15 18:59:47 发布

阅读量1.3k

点赞数

分类专栏： OpenCV分类器训练

本文链接：https://blog.youkuaiyun.com/nongfu_spring/article/details/39555953

版权

OpenCV分类器训练专栏收录该内容

11 篇文章

订阅专栏

bool CvBoost::train( const CvMat* _train_data, int _tflag,

const CvMat* _responses, const CvMat* _var_idx,

const CvMat* _sample_idx, const CvMat* _var_type,

const CvMat* _missing_mask,

CvBoostParams _params, bool _update )

{

boolok = false;

CvMemStorage*storage = 0;

CV_FUNCNAME("CvBoost::train" );

__BEGIN__;

inti;

set_params(_params );

cvReleaseMat( &active_vars );

cvReleaseMat( &active_vars_abs );

if( !_update || !data )

{

clear();

//初始化训练数据CvDTreeTrainData data

data = new CvDTreeTrainData(_train_data, _tflag,_responses, _var_idx,

_sample_idx, _var_type, _missing_mask, _params, true, true );

if(data->get_num_classes() != 2 )

CV_ERROR( CV_StsNotImplemented,

"Boosted trees can only be used for 2-class classification." );

CV_CALL( storage = cvCreateMemStorage() );

//weak 为CvBoostTree序列，指向一系列的弱分类器

// weak指向的弱分类器的个数由params.weak_count指定

weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );

storage = 0;

}

else

{

data->set_data( _train_data, _tflag, _responses, _var_idx,

_sample_idx, _var_type, _missing_mask, _params, true, true, true );

}

if ( (_params.boost_type ==LOGIT) || (_params.boost_type ==GENTLE) )

data->do_responses_copy();

update_weights( 0 );

//核心：每训练出一个弱分类器CvBoostTree后，就需要根据此分类器更新样本权重

//然后根据params.weight_trim_rate去除权重很低的样本，再次训练

//直至达到弱分类器的总个数，或者去除权重很低的样本后的样本个数为0，则退出

for(i = 0; i < params.weak_count; i++ )

{

CvBoostTree* tree = new CvBoostTree;

//训练出一个弱分类器CvBoostTree

if( !tree->train(data, subsample_mask,this ) )

{

delete tree;

break;

}

//cvCheckArr( get_weak_response());

cvSeqPush( weak, &tree );

update_weights( tree ); //根据此分类器更新样本权重

trim_weights();//根据params.weight_trim_rate去除权重很低的样本

//subsample_mask和样本个数大小相同，0表示不选择使用这个样本，1表示选择

//cvCountNonZero统计非0的个数

//下面为统计选择的样本的个数，如果选择的样本个数为0，则退出

if( cvCountNonZero(subsample_mask) == 0 )

break;

}

if(weak->total > 0)

{

get_active_vars();// recompute active_vars* maps and condensed_idx's in the splits.

data->is_classifier = true;

data->free_train_data();

ok = true;

}

else

clear();

__END__;

returnok;

}

// Convenience wrapper: forwards `tree` to update_weights_impl() together
// with a pair of unit initial weights.
void CvBoost::update_weights( CvBoostTree* tree )
{
    double unit_weights[2] = { 1, 1 };

    update_weights_impl( tree, unit_weights );
}

// Updates the per-sample boosting weights after a weak classifier has been
// trained, or initializes them when `tree` is null.
//
// tree            - the most recently trained weak classifier, or 0 before
//                   the first tree is trained.
// initial_weights - per the original note, only meaningful when no tree has
//                   been built yet (the code that consumes it is omitted
//                   from this excerpt).
//
// NOTE: this is an excerpt. Sections marked "omitted in this excerpt" stand
// for code that is not shown, including the declarations of the locals used
// below (i, n, step, fdata, sumw, cur_buf_pos, ...).
void
CvBoost::update_weights_impl( CvBoostTree* tree, double initial_weights[2] )
{
    CV_FUNCNAME( "CvBoost::update_weights_impl" );

    __BEGIN__;

    // ... (omitted in this excerpt)

    if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) )
    {
        step = CV_IS_MAT_CONT(data->responses_copy->type) ?
            1 : data->responses_copy->step / CV_ELEM_SIZE(data->responses_copy->type);
        fdata = data->responses_copy->data.fl;
        sample_idx_buf = (int*)cur_buf_pos;
        cur_buf_pos = (uchar*)(sample_idx_buf + data->sample_count);
        sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf );
    }

    CvMat* dtree_data_buf = data->buf;
    size_t length_buf_row = data->get_length_subbuf();

    // before training the first tree, initialize weights and other parameters
    if( !tree )
    {
        // ... (omitted in this excerpt) the first tree, initialize weights and other parameters
    }
    else
    {
        // at this moment, for all the samples that participated in the training of the most
        // recent weak classifier we know the responses. For other samples we need to compute them
        if( have_subsample )
        {
            float* values = (float*)cur_buf_pos;
            cur_buf_pos = (uchar*)(values + data->get_length_subbuf());
            uchar* missing = cur_buf_pos;
            cur_buf_pos = missing + data->get_length_subbuf() * (size_t)CV_ELEM_SIZE(data->buf->type);

            CvMat _sample, _mask;

            // invert the subsample mask
            cvXorS( subsample_mask, cvScalar(1.), subsample_mask );
            data->get_vectors( subsample_mask, values, missing, 0 );

            _sample = cvMat( 1, data->var_count, CV_32F );
            _mask = cvMat( 1, data->var_count, CV_8U );

            // run tree through all the non-processed samples
            for( i = 0; i < n; i++ )
                if( subsample_mask->data.ptr[i] )
                {
                    _sample.data.fl = values;
                    _mask.data.ptr = missing;
                    values += _sample.cols;
                    missing += _mask.cols;

                    // Each sample is run through the tree; the value of the
                    // node returned by predict() is stored as f(x_i), i.e.
                    // weak_eval[i] = f(x_i). For Real AdaBoost that value is
                    // 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i).
                    weak_eval->data.db[i] = tree->predict( &_sample, &_mask, true )->value;
                }
        }

        // now update weights and other parameters for each type of boosting
        if( params.boost_type == DISCRETE )
        {
            // Discrete AdaBoost:
            //   weak_eval[i] (=f(x_i)) is in {-1,1}
            //   err = sum(w_i*(f(x_i) != y_i))/sum(w_i)
            //   C = log((1-err)/err)
            //   w_i *= exp(C*(f(x_i) != y_i))

            // ... (omitted in this excerpt)
        }
        else if( params.boost_type == REAL )
        {
            // Real AdaBoost:
            //   weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)
            //   w_i *= exp(-y_i*f(x_i))

            for( i = 0; i < n; i++ )
            {
                weak_eval->data.db[i] *= -orig_response->data.i[i];
            }

            cvExp( weak_eval, weak_eval );

            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i]*weak_eval->data.db[i];
                sumw += w;
                weights->data.db[i] = w; // store the updated weight of sample i
            }
        }
        else if( params.boost_type == LOGIT )
        {
            // LogitBoost:
            //   weak_eval[i] = f(x_i) in [-z_max,z_max]
            //   sum_response = F(x_i).
            //   F(x_i) += 0.5*f(x_i)
            //   p(x_i) = exp(F(x_i))/(exp(F(x_i)) + exp(-F(x_i))) = 1/(1+exp(-2*F(x_i)))
            //   reuse weak_eval: weak_eval[i] <- p(x_i)
            //   w_i = p(x_i)*(1 - p(x_i))
            //   z_i = ((y_i+1)/2 - p(x_i))/(p(x_i)*(1 - p(x_i)))
            //   store z_i to the data->data_root as the new target responses

            // ... (omitted in this excerpt)
        }
        else
        {
            // Gentle AdaBoost:
            //   weak_eval[i] = f(x_i) in [-1,1]
            //   w_i *= exp(-y_i*f(x_i))

            // ... (omitted in this excerpt)
        }
    }

    // renormalize weights
    if( sumw > FLT_EPSILON )
    {
        sumw = 1./sumw;
        for( i = 0; i < n; ++i )
            weights->data.db[i] *= sumw;
    }

    __END__;
}

【注】很多资料中讲解每个决策树（可能有好几层深度，很多个节点）对应一个系数 $\alpha_t$（即上文 Discrete AdaBoost 注释中的 $C$），从代码上看到的并非如此。应该是每个节点 node 对应着一个 node->value：在样本更新权重时，将样本扔入该决策树中，预测最终走到哪个叶子节点，此时该叶子节点对应的 node->value 便是该样本更新权重时使用的值。如此说来，在一次全部样本的权重更新中，虽然用的都是同一个决策树，但是各个样本对应的 node->value 可以各不相同。

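除此之外，node->value 的计算也和资料中有些出入。资料中先计算加权错误率 $err = \sum_i w_i\,[f(x_i) \neq y_i] \,/\, \sum_i w_i$，再取 $C = \log\frac{1-err}{err}$（即上文 Discrete AdaBoost 注释中的公式）。

代码中 node->value 的计算却有所改变。代码中的计算如下：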

（1）分别计算某个节点所有正样本D(i)的累积和（正样本权重累积和），和所有负样本D(i)的累积和（负样本权重累积和）。

（2）正样本权重累积和/(正样本权重累积和+负样本权重累积和)

下面是其代码实现：

// Computes the value stored in a tree node from the weighted samples that
// fall into it, and records the node's per-class weight sums.
//
// Classification case:
//   * rcw[0] / rcw[1] accumulate the weights of the samples whose response
//     is 0 / 1, and data->counts receives the per-class sample counts;
//   * node->class_idx is 1 when rcw[1] > rcw[0], otherwise 0;
//   * for CvBoost::DISCRETE the node value is class_idx mapped to {-1,1};
//     otherwise (asserted to be CvBoost::REAL) it is 0.5*log_ratio(p) with
//     p = rcw[1]/(rcw[0] + rcw[1]).
//
// NOTE: this is an excerpt. Sections marked "omitted in this excerpt" stand
// for code that is not shown, including the declarations of labels_buf,
// labels, weights, subtree_weights and boost_type.
void CvBoostTree::calc_node_value( CvDTreeNode* node )
{
    int i, n = node->sample_count;

    // ... (omitted in this excerpt)

    // Per-class weight sums for this node, indexed by the response value r
    // (rcw[0] for r == 0, rcw[1] for r == 1).
    double rcw[2] = {0,0};

    if( data->is_classifier )
    {
        int* _responses_buf = labels_buf + n;
        const int* _responses = data->get_class_labels(node, _responses_buf);
        int m = data->get_num_classes();
        int* cls_count = data->counts->data.i;

        for( int k = 0; k < m; k++ )
            cls_count[k] = 0;

        // Accumulate, per class, the sum of sample weights (rcw) and the
        // number of samples (cls_count) in this node.
        for( i = 0; i < n; i++ )
        {
            int idx = labels[i];
            double w = weights[idx];
            int r = _responses[i];
            rcw[r] += w;
            cls_count[r]++;
            subtree_weights[i] = w;
        }

        node->class_idx = rcw[1] > rcw[0];

        if( boost_type == CvBoost::DISCRETE )
        {
            // ignore cat_map for responses, and use {-1,1},
            // as the whole ensemble response is computed as sign(sum_i(weak_response_i))
            node->value = node->class_idx*2 - 1;
        }
        else
        {
            // p: weighted fraction of class-1 samples in this node
            double p = rcw[1]/(rcw[0] + rcw[1]);
            assert( boost_type == CvBoost::REAL );

            // store log-ratio of the probability
            node->value = 0.5*log_ratio(p);
        }
    }
    else
    {
        // in case of regression tree:
        //  * node value is 1/n*sum_i(Y_i), where Y_i is i-th response,
        //    n is the number of samples in the node.
        //  * node risk is the sum of squared errors: sum_i((Y_i - <node_value>)^2)

        // ... (omitted in this excerpt)
    }

    // store summary weights
    subtree_weights[n] = rcw[0];
    subtree_weights[n+1] = rcw[1];
}

void CvBoost::trim_weights()

{

//CV_FUNCNAME( "CvBoost::trim_weights" );

__BEGIN__;

inti, count = data->sample_count, nz_count = 0;

doublesum, threshold;

if(params.weight_trim_rate <= 0. ||params.weight_trim_rate >= 1. )

EXIT;

// use weak_eval as temporary buffer for sorted weights

cvCopy(weights, weak_eval );

icvSort_64f(weak_eval->data.db,count, 0 );//从小到大排序，找出阈值

// as weight trimming occurs immediately after updating the weights,

// where they are renormalized, we assume that the weight sum = 1.

sum = 1. -params.weight_trim_rate;

for(i = 0; i < count; i++ )

{

doublew = weak_eval->data.db[i];

if(sum <= 0 )

break;

sum -= w;

}

// sum<0对应着某个小于1. - params.weight_trim_rate的阈值

threshold =i < count ?weak_eval->data.db[i] :DBL_MAX;

for(i = 0; i < count; i++ )

{

doublew = weights->data.db[i];

intf = w >= threshold; // 去除权重很小的样本

subsample_mask->data.ptr[i] = (uchar)f;

nz_count += f;

}

//没有去除样本，则下次训练决策树时，不用生成样本子集

have_subsample =nz_count < count;

__END__;

}

【注】资料中params.weight_trim_rate是权重修剪的比例，如果样本权重D(i)<(1- params.weight_trim_rate),则去除该样本。资料原话见下文：

但是从上面的代码中，去除权重的阈值，并不是1- params.weight_trim_rate。而是使sum小于等于0时（sum的初值就是1- params.weight_trim_rate，循环中逐个减去排序后的权重）对应位置的weak_eval值。这里的weak_eval是在update_weights之后由weights拷贝并从小到大排序得到的，即为样本对应的权重D(i)。