The boost training process

bool CvBoost::train( const CvMat* _train_data, int _tflag,
             const CvMat* _responses, const CvMat* _var_idx,
             const CvMat* _sample_idx, const CvMat* _var_type,
             const CvMat* _missing_mask,
             CvBoostParams _params, bool _update )
{
    bool ok = false;
    CvMemStorage* storage = 0;

    CV_FUNCNAME( "CvBoost::train" );

    __BEGIN__;

    int i;
    set_params( _params );

    cvReleaseMat( &active_vars );
    cvReleaseMat( &active_vars_abs );

    if( !_update || !data )
    {
        clear();
        // initialize the training data: CvDTreeTrainData data
        data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx,
            _sample_idx, _var_type, _missing_mask, _params, true, true );

        if( data->get_num_classes() != 2 )
            CV_ERROR( CV_StsNotImplemented,
            "Boosted trees can only be used for 2-class classification." );
        CV_CALL( storage = cvCreateMemStorage() );
        // weak is a sequence of CvBoostTree pointers, i.e. the weak classifiers;
        // the number of weak classifiers is given by params.weak_count
        weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );
        storage = 0;
    }
    else
    {
        data->set_data( _train_data, _tflag, _responses, _var_idx,
            _sample_idx, _var_type, _missing_mask, _params, true, true, true );
    }

    if ( (_params.boost_type == LOGIT) || (_params.boost_type == GENTLE) )
        data->do_responses_copy();

    update_weights( 0 );

    // Core loop: after each weak classifier (a CvBoostTree) is trained, the sample
    // weights are updated with it, then low-weight samples are dropped according
    // to params.weight_trim_rate, and training continues on the remaining samples.
    // The loop exits once the requested number of weak classifiers is reached,
    // or when no samples remain after trimming.
    for( i = 0; i < params.weak_count; i++ )
    {
        CvBoostTree* tree = new CvBoostTree;

        // train one weak classifier (CvBoostTree)
        if( !tree->train( data, subsample_mask, this ) )
        {
            delete tree;
            break;
        }
        //cvCheckArr( get_weak_response());
        cvSeqPush( weak, &tree );
        update_weights( tree );  // update the sample weights with this classifier
        trim_weights();          // drop low-weight samples per params.weight_trim_rate

        // subsample_mask has one entry per sample: 0 = sample not selected, 1 = selected.
        // cvCountNonZero counts the selected samples; if none remain, stop training.
        if( cvCountNonZero(subsample_mask) == 0 )
            break;
    }

    if( weak->total > 0 )
    {
        get_active_vars(); // recompute active_vars* maps and condensed_idx's in the splits
        data->is_classifier = true;
        data->free_train_data();
        ok = true;
    }
    else
        clear();

    __END__;

    return ok;
}
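For context, here is a minimal sketch of how this training entry point is typically called through the OpenCV 2.x legacy C API. The toy data, parameter values, and variable names are illustrative assumptions, not taken from the code above:

#include <opencv2/ml/ml.hpp>   // legacy OpenCV 2.x ML module
#include <cstdio>

int main()
{
    // toy training set: 4 samples (rows), 2 ordered features each
    float samplesData[]   = { 1.f, 2.f,   2.f, 3.f,   7.f, 8.f,   8.f, 9.f };
    float responsesData[] = { -1.f, -1.f, 1.f, 1.f }; // exactly 2 classes
    CvMat samples   = cvMat( 4, 2, CV_32FC1, samplesData );
    CvMat responses = cvMat( 4, 1, CV_32FC1, responsesData );

    // var_type: one entry per feature plus one for the response;
    // the response must be categorical for classification
    CvMat* varType = cvCreateMat( 3, 1, CV_8UC1 );
    cvSet( varType, cvScalarAll(CV_VAR_ORDERED) );
    varType->data.ptr[2] = CV_VAR_CATEGORICAL;

    // Real AdaBoost, 50 weak classifiers of depth 1, weight_trim_rate = 0.95
    CvBoostParams params( CvBoost::REAL, 50, 0.95, 1, false, 0 );

    CvBoost boost;
    boost.train( &samples, CV_ROW_SAMPLE, &responses,
                 0, 0, varType, 0, params );

    CvMat sampleRow;
    cvGetRow( &samples, &sampleRow, 0 );
    printf( "predicted class: %f\n", boost.predict( &sampleRow ) );

    cvReleaseMat( &varType );
    return 0;
}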

 

 

 

 

void CvBoost::update_weights( CvBoostTree* tree )
{
    double initial_weights[2] = { 1, 1 };
    update_weights_impl( tree, initial_weights );
}

 

// initial_weights is only used when no tree has been trained yet

void
CvBoost::update_weights_impl( CvBoostTree* tree, double initial_weights[2] )
{
    CV_FUNCNAME( "CvBoost::update_weights_impl" );
    __BEGIN__;

    // ...

    if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) )
    {
        step = CV_IS_MAT_CONT(data->responses_copy->type) ?
            1 : data->responses_copy->step / CV_ELEM_SIZE(data->responses_copy->type);
        fdata = data->responses_copy->data.fl;
        sample_idx_buf = (int*)cur_buf_pos;
        cur_buf_pos = (uchar*)(sample_idx_buf + data->sample_count);
        sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf );
    }
    CvMat* dtree_data_buf = data->buf;
    size_t length_buf_row = data->get_length_subbuf();

    // before training the first tree, initialize weights and other parameters
    if( !tree )
    {
        // ... (first tree: initialize weights and other parameters)
    }
    else
    {
        // at this moment, for all the samples that participated in the training of the most
        // recent weak classifier we know the responses. For other samples we need to compute them
        if( have_subsample )
        {
            float* values = (float*)cur_buf_pos;
            cur_buf_pos = (uchar*)(values + data->get_length_subbuf());
            uchar* missing = cur_buf_pos;
            cur_buf_pos = missing + data->get_length_subbuf() * (size_t)CV_ELEM_SIZE(data->buf->type);

            CvMat _sample, _mask;

            // invert the subsample mask
            cvXorS( subsample_mask, cvScalar(1.), subsample_mask );
            data->get_vectors( subsample_mask, values, missing, 0 );

            _sample = cvMat( 1, data->var_count, CV_32F );
            _mask = cvMat( 1, data->var_count, CV_8U );

            // run tree through all the non-processed samples
            for( i = 0; i < n; i++ )
                if( subsample_mask->data.ptr[i] )
                {
                    _sample.data.fl = values;
                    _mask.data.ptr = missing;
                    values += _sample.cols;
                    missing += _mask.cols;

                    // each sample is run through the tree; the value of the leaf node
                    // it reaches is f(x_i), i.e.
                    // weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)
                    weak_eval->data.db[i] = tree->predict( &_sample, &_mask, true )->value;
                }
        }

        // now update weights and other parameters for each type of boosting
        if( params.boost_type == DISCRETE )
        {
            // Discrete AdaBoost:
            //   weak_eval[i] (=f(x_i)) is in {-1,1}
            //   err = sum(w_i*(f(x_i) != y_i))/sum(w_i)
            //   C = log((1-err)/err)
            //   w_i *= exp(C*(f(x_i) != y_i))
            // ...
        }
        else if( params.boost_type == REAL )
        {
            // Real AdaBoost:
            //   weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)
            //   w_i *= exp(-y_i*f(x_i))

            for( i = 0; i < n; i++ )
            {
                weak_eval->data.db[i] *= -orig_response->data.i[i];
            }

            cvExp( weak_eval, weak_eval );

            for( i = 0; i < n; i++ )
            {
                double w = weights->data.db[i]*weak_eval->data.db[i];
                sumw += w;
                weights->data.db[i] = w;   // update each sample's weight
            }
        }
        else if( params.boost_type == LOGIT )
        {
            // LogitBoost:
            //   weak_eval[i] = f(x_i) in [-z_max,z_max]
            //   sum_response = F(x_i).
            //   F(x_i) += 0.5*f(x_i)
            //   p(x_i) = exp(F(x_i))/(exp(F(x_i)) + exp(-F(x_i))) = 1/(1+exp(-2*F(x_i)))
            //   reuse weak_eval: weak_eval[i] <- p(x_i)
            //   w_i = p(x_i)*(1 - p(x_i))
            //   z_i = ((y_i+1)/2 - p(x_i))/(p(x_i)*(1 - p(x_i)))
            //   store z_i to the data->data_root as the new target responses
            // ...
        }
        else
        {
            // Gentle AdaBoost:
            //   weak_eval[i] = f(x_i) in [-1,1]
            //   w_i *= exp(-y_i*f(x_i))
            // ...
        }
    }

    // renormalize weights
    if( sumw > FLT_EPSILON )
    {
        sumw = 1./sumw;
        for( i = 0; i < n; ++i )
            weights->data.db[i] *= sumw;
    }

    __END__;
}
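To make the Real AdaBoost update above concrete, here is a minimal standalone sketch of the same rule, w_i *= exp(-y_i*f(x_i)) followed by renormalization. It is independent of the OpenCV internals, and the function and variable names are illustrative:

#include <cmath>
#include <cstdio>
#include <vector>

// Real AdaBoost weight update (sketch):
//   w_i <- w_i * exp(-y_i * f(x_i)), then renormalize so the weights sum to 1.
void updateWeightsReal( std::vector<double>& w,
                        const std::vector<int>& y,      // labels in {-1, +1}
                        const std::vector<double>& f )  // leaf values f(x_i)
{
    double sumw = 0.;
    for( size_t i = 0; i < w.size(); i++ )
    {
        w[i] *= std::exp( -y[i] * f[i] );
        sumw += w[i];
    }
    if( sumw > 0. )
        for( size_t i = 0; i < w.size(); i++ )
            w[i] /= sumw;
}

int main()
{
    std::vector<double> w( 4, 0.25 );                // uniform initial weights
    std::vector<int>    y = { -1, -1, 1, 1 };
    std::vector<double> f = { -0.4, 0.4, 0.4, 0.4 }; // sample 1 is misclassified
    updateWeightsReal( w, y, f );
    for( size_t i = 0; i < w.size(); i++ )
        printf( "w[%zu] = %.4f\n", i, w[i] );        // the misclassified sample gains weight
    return 0;
}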

[Note] Many references describe each decision tree (which may be several levels deep, with many nodes) as corresponding to a single f(x) value. That is not what the code shows. Rather, each node corresponds to its own value: when a sample's weight is updated, the sample is pushed through the tree, and the value of the leaf it finally lands in is the f(x_i) used for that sample's weight update. So within one pass of weight updates over all samples, even though the same tree is used throughout, different samples can receive different f(x_i) values.

Besides this, the computation of p also differs somewhat from the reference material. The code computes it as follows:

(1) For a given node, separately accumulate the weights D(i) of all positive samples (the positive weight sum) and of all negative samples (the negative weight sum).

(2) p = positive weight sum / (positive weight sum + negative weight sum).

Here is the code that implements this:

void CvBoostTree::calc_node_value( CvDTreeNode* node )
{
    int i, n = node->sample_count;
    // ...
    double rcw[2] = {0,0}; // the node's negative/positive sample weight sums

    if( data->is_classifier )
    {
        int* _responses_buf = labels_buf + n;
        const int* _responses = data->get_class_labels(node, _responses_buf);
        int m = data->get_num_classes();
        int* cls_count = data->counts->data.i;
        for( int k = 0; k < m; k++ )
            cls_count[k] = 0;

        // accumulate the weight sums D(i) of the node's positive and negative samples;
        // cls_count holds the node's positive and negative sample counts
        for( i = 0; i < n; i++ )
        {
            int idx = labels[i];
            double w = weights[idx];
            int r = _responses[i];
            rcw[r] += w;
            cls_count[r]++;
            subtree_weights[i] = w;
        }

        node->class_idx = rcw[1] > rcw[0];

        if( boost_type == CvBoost::DISCRETE )
        {
            // ignore cat_map for responses, and use {-1,1},
            // as the whole ensemble response is computed as sign(sum_i(weak_response_i))
            node->value = node->class_idx*2 - 1;
        }
        else
        {
            double p = rcw[1]/(rcw[0] + rcw[1]); // p corresponds to p(x) = P(y=1|x)
            assert( boost_type == CvBoost::REAL );

            // store log-ratio of the probability
            node->value = 0.5*log_ratio(p);      // value corresponds to f(x) = 0.5*log(p/(1-p))
        }
    }
    else
    {
        // in case of regression tree:
        //  * node value is 1/n*sum_i(Y_i), where Y_i is i-th response,
        //    n is the number of samples in the node.
        //  * node risk is the sum of squared errors: sum_i((Y_i - <node_value>)^2)
        // ...
    }

    // store summary weights
    subtree_weights[n] = rcw[0];
    subtree_weights[n+1] = rcw[1];
}
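As a quick numeric check of the REAL branch above, the following sketch computes a leaf value from made-up weight sums (rcw[1] = 0.7, rcw[0] = 0.3); log_ratio is approximated locally by log(p/(1-p)):

#include <cmath>
#include <cstdio>

int main()
{
    // made-up leaf statistics: positive weight sum 0.7, negative weight sum 0.3
    double rcw0 = 0.3, rcw1 = 0.7;
    double p = rcw1 / (rcw0 + rcw1);            // estimate of p(x) = P(y=1|x) = 0.7
    double value = 0.5 * std::log( p/(1.-p) );  // f(x) = 0.5*log(p/(1-p)) ≈ 0.4236
    printf( "p = %.3f, leaf value = %.4f\n", p, value );
    return 0;
}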

 

 

 

 

void CvBoost::trim_weights()
{
    //CV_FUNCNAME( "CvBoost::trim_weights" );

    __BEGIN__;

    int i, count = data->sample_count, nz_count = 0;
    double sum, threshold;

    if( params.weight_trim_rate <= 0. || params.weight_trim_rate >= 1. )
        EXIT;

    // use weak_eval as temporary buffer for sorted weights
    cvCopy( weights, weak_eval );

    icvSort_64f( weak_eval->data.db, count, 0 ); // sort ascending to locate the threshold

    // as weight trimming occurs immediately after updating the weights,
    // where they are renormalized, we assume that the weight sum = 1.
    sum = 1. - params.weight_trim_rate;

    for( i = 0; i < count; i++ )
    {
        double w = weak_eval->data.db[i];
        if( sum <= 0 )
            break;
        sum -= w;
    }

    // the index i where sum first becomes non-positive marks the sorted weight
    // whose cumulative sum reaches 1. - params.weight_trim_rate; that weight
    // becomes the trimming threshold
    threshold = i < count ? weak_eval->data.db[i] : DBL_MAX;

    for( i = 0; i < count; i++ )
    {
        double w = weights->data.db[i];
        int f = w >= threshold; // drop samples with very small weights
        subsample_mask->data.ptr[i] = (uchar)f;
        nz_count += f;
    }

    // if no samples were dropped, there is no need to build a sample subset
    // when training the next tree
    have_subsample = nz_count < count;
    __END__;
}
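A minimal standalone sketch of the same trimming logic, independent of OpenCV (names illustrative): sort the normalized weights ascending, walk up until the cumulative sum exceeds 1 - weight_trim_rate, and keep only samples whose weight is at least the weight where the walk stopped:

#include <algorithm>
#include <cstdio>
#include <vector>

// Returns a 0/1 mask: mask[i] == 1 means sample i is kept for the next tree.
std::vector<int> trimWeights( const std::vector<double>& w, // normalized, sum == 1
                              double weightTrimRate )       // e.g. 0.95
{
    std::vector<double> sorted( w );
    std::sort( sorted.begin(), sorted.end() ); // ascending

    // subtract the smallest weights from (1 - weightTrimRate) until it is used up;
    // the sorted weight where we stop becomes the threshold
    double sum = 1. - weightTrimRate;
    size_t i = 0;
    for( ; i < sorted.size(); i++ )
    {
        if( sum <= 0 )
            break;
        sum -= sorted[i];
    }
    double threshold = i < sorted.size() ? sorted[i] : 1e308;

    std::vector<int> mask( w.size() );
    for( size_t j = 0; j < w.size(); j++ )
        mask[j] = w[j] >= threshold;
    return mask;
}

int main()
{
    // made-up normalized weights; the two tiny ones should be trimmed
    std::vector<double> w = { 0.005, 0.015, 0.18, 0.3, 0.5 };
    std::vector<int> mask = trimWeights( w, 0.95 );
    for( size_t j = 0; j < w.size(); j++ )
        printf( "w=%.3f kept=%d\n", w[j], mask[j] );
    return 0;
}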

 

[Note] According to the reference material, params.weight_trim_rate is the weight-trimming ratio: a sample is removed if its weight D(i) < (1 - params.weight_trim_rate).

From the code above, however, the trimming threshold is not 1 - params.weight_trim_rate itself. It is the weak_eval entry of the sample at which sum first becomes non-positive (sum starting out as 1 - params.weight_trim_rate). The weak_eval here is taken after update_weights, i.e. it holds a sorted copy of the samples' updated weights D(i).
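For example, with the normalized weights {0.005, 0.015, 0.18, 0.3, 0.5} (made up for illustration) and weight_trim_rate = 0.95: sum starts at 0.05, and the loop subtracts 0.005, then 0.015, then 0.18 before sum goes non-positive, so the threshold is the next sorted weight, 0.3. The sample with weight 0.18 is therefore trimmed as well, even though 0.18 is far above 1 - 0.95 = 0.05.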
