贝叶斯平滑ctr计算

更新Beta分布里的alpha和beta参数

$\displaystyle Beta(a,b)=\frac{\theta^{a-1}(1-\theta)^{b-1}}{B(a,b)}$ ,其中 $B(a,b)$ 是一个标准化(归一化)函数

用矩估计估计出来的参数alpha和beta => 给ctr计算做平滑

# NOTE(review): in the original blog dump the import block appears *below* this
# line, so running the file top-to-bottom raised NameError on `np`.  Import
# numpy here so the seed call is self-sufficient (re-importing later is harmless).
import numpy as np

np.random.seed(0)  # fix the RNG so Beta sampling / subsampling is reproducible
class HyperParam(object):
    """Beta-prior hyperparameter estimator for Bayesian CTR smoothing.

    CTR is modeled as theta ~ Beta(alpha, beta) with Binomial clicks.  Once
    (alpha, beta) are estimated, a group's smoothed CTR is
    (clicks + alpha) / (impressions + alpha + beta), which shrinks sparse
    groups toward the prior mean alpha / (alpha + beta).
    """

    def __init__(self, alpha, beta):
        # Prior pseudo-counts: alpha acts like prior clicks, beta like prior non-clicks.
        self.alpha = alpha
        self.beta = beta

    def sample_from_beta(self, alpha, beta, num, imp_upperbound):
        """Generate `num` synthetic (impressions, clicks) pairs for testing.

        Each group's CTR is drawn from Beta(alpha, beta); impressions are
        uniform on [0, imp_upperbound).  Returns (impressions, clicks) lists.
        """
        sample = np.random.beta(alpha, beta, num)  # one CTR draw per synthetic group
        imps = []
        clicks = []
        for click_ratio in sample:
            imp = random.random() * imp_upperbound
            imps.append(imp)
            clicks.append(imp * click_ratio)
        return imps, clicks

    def update_from_data_by_FPI(self, tries, success, iter_num, epsilon):
        """Update alpha/beta by fixed-point iteration (approximate MLE).

        Stops after `iter_num` rounds, or early once both parameters move
        less than `epsilon` between consecutive iterations.
        """
        for _ in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(tries, success, self.alpha, self.beta)
            if abs(new_alpha - self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
                break
            self.alpha = new_alpha
            self.beta = new_beta

    def __fixed_point_iteration(self, tries, success, alpha, beta):
        """One fixed-point step x_{i+1} = g(x_i) of the Beta-Binomial fit.

        The digamma differences come from the gradient of the Beta-Binomial
        log-likelihood summed over all groups.
        """
        num_alpha = 0.0  # numerator terms for the alpha update
        num_beta = 0.0   # numerator terms for the beta update
        denom = 0.0      # shared denominator terms
        for i in range(len(tries)):
            # special.digamma(z) = d/dz log(Gamma(z))
            num_alpha += special.digamma(success[i] + alpha) - special.digamma(alpha)
            num_beta += special.digamma(tries[i] - success[i] + beta) - special.digamma(beta)
            denom += special.digamma(tries[i] + alpha + beta) - special.digamma(alpha + beta)
        return alpha * (num_alpha / denom), beta * (num_beta / denom)

    def update_from_data_by_moment(self, tries, success):
        """Update alpha/beta by the method of moments.

        tries[i] is group i's impression count, success[i] its click count.
        The 1e-6 paddings keep the estimates finite when the mean is 0/1 or
        the variance is ~0.
        """
        mean, var = self.__compute_moment(tries, success)
        # Standard Beta moment matching with epsilon padding:
        #   alpha = mean * (mean*(1-mean)/var - 1);  beta = (1-mean) * (same factor)
        self.alpha = (mean + 0.000001) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1)
        self.beta = (1.000001 - mean) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1)

    def __compute_moment(self, tries, success):
        """Return (sample mean, sample variance) of per-group CTRs.

        Fix: the original divided by len-1 unconditionally (and by len for the
        mean), so fewer than two groups raised ZeroDivisionError; degenerate
        inputs now return zero variance (and zero mean for empty input).
        """
        ctr_list = [float(success[i]) / (tries[i] + 0.000000001) for i in range(len(tries))]
        n = len(ctr_list)
        if n == 0:
            return 0.0, 0.0
        mean = sum(ctr_list) / n
        if n == 1:
            return mean, 0.0  # sample variance is undefined for a single group
        var = sum((ctr - mean) ** 2 for ctr in ctr_list)
        return mean, var / (n - 1)  # unbiased (n-1) sample variance
import copy
import os
import random
import time

import jieba
import Levenshtein
import numpy as np
import pandas as pd
from scipy import sparse
from scipy import special
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from xpinyin import Pinyin

# Load the preprocessed dataset; drop the stray index column that
# DataFrame.to_csv wrote out as 'Unnamed: 0'.
all_data = pd.read_csv('test_ctr.csv')
all_data = all_data.drop(columns=['Unnamed: 0'])
all_data.head()

(图:此处原文插有 all_data.head() 的输出截图,抓取时图片丢失)

# Sanity-check the loaded frame: column names and overall shape
# (the commented lines below are the recorded output of the original run).
all_data.columns, all_data.shape
# (Index(['prefix', 'query_prediction', 'title', 'tag', 'label', 'diction_label',
#         'max_query_prediction_keys', 'prefix_pinyin', 'prefix_fix', 'old_index',
#         'random_sector'],
#        dtype='object'), (2300000, 11))
# Row counts per sector id.  Sectors 1-5 are roughly equal (~400k rows each);
# sector 0 has exactly 300k rows and is treated specially by the loop below —
# presumably the held-out test split (verify against the dataset construction).
all_data['random_sector'].value_counts()
# 1    400883
# 3    400553
# 4    399929
# 5    399922
# 2    398713
# 0    300000
# Name: random_sector, dtype: int64
# --- Out-of-fold Bayesian-smoothed conversion-rate (CTR) features ---
sec_size = 5
frac_size = 0.5
is_fill_na = False
convert_feature = ['prefix', 'title', 'tag', 'max_query_prediction_keys', 'prefix_pinyin', 'prefix_fix']
for index, feature in enumerate(convert_feature):
    print('计算' + feature + '转换率')
    for sec in range(sec_size + 1):   # sectors 0/1/2/3/4/5
        # Take feature + label from train rows (label != -1) that are NOT in
        # the current sector: each sector's CTR statistics come only from the
        # other sectors (out-of-fold, to avoid target leakage).
        temp = all_data[(all_data['label'] != -1)&(all_data['random_sector'] != sec)][[feature, 'label']]
        
        if sec != 0:   # sectors 1..5 (train folds): subsample 50% without replacement
            temp = temp.sample(frac = frac_size, random_state = 19960121).reset_index(drop = True)
        
        # Group by the feature value: _all_count = rows in the group,
        # _label_count = rows with label == 1 in the group.
        temp[feature + '_all_count'] = temp.groupby(feature).label.transform('count')
        temp[feature + '_label_count'] = temp.groupby(feature).label.transform('sum')
        HP = HyperParam(1, 1)
        # NOTE(review): the counts are passed per-row (duplicated within each
        # group), so larger groups weigh more in the moment estimate —
        # presumably intentional; confirm against the original experiment.
        HP.update_from_data_by_moment(temp[feature + '_all_count'].values, temp[feature + '_label_count'].values)
        # _convert: Bayesian-smoothed click-through rate for the group.
        temp[feature + '_convert'] = (temp[feature + '_label_count'] + HP.alpha) / (temp[feature + '_all_count'] + HP.alpha + HP.beta)
        # Keep one row per (feature value, smoothed CTR) pair before merging.
        temp = temp[[feature, feature + '_convert']].drop_duplicates()
        
        sec_data = copy.deepcopy(all_data[all_data['random_sector'] == sec])
        sec_data = pd.merge(sec_data, temp, on = [feature], how = 'left')
        if is_fill_na:
            # Unseen feature values fall back to the prior mean alpha/(alpha+beta).
            sec_data[feature + '_convert'].fillna(HP.alpha / (HP.alpha + HP.beta), inplace = True)
        if sec:   # sectors 1..5: append to the accumulated frame
            new_all_data = pd.concat((new_all_data, sec_data))
        else:   # sec == 0 runs first in the loop, so it initializes new_all_data
            new_all_data = copy.deepcopy(sec_data)
    all_data = copy.deepcopy(new_all_data)

test_ctr.zip

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值