Updating the alpha and beta parameters of the Beta distribution
\displaystyle Beta(a,b)=\frac{\theta^{a-1}(1-\theta)^{b-1}}{B(a,b)}, where the B function is a normalizing constant (the Beta function).
The alpha and beta estimated by the method of moments are then used to smooth the CTR computation.
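With a Beta(alpha, beta) prior over an item's true CTR, observing I impressions and C clicks yields the posterior mean as the smoothed rate:

\displaystyle \hat{r}=\frac{C+\alpha}{I+\alpha+\beta}

This is exactly the (label_count + HP.alpha) / (all_count + HP.alpha + HP.beta) expression applied in the merge loop at the end of this section.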
import numpy as np
import random
from scipy import special

np.random.seed(0)

class HyperParam(object):
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def sample_from_beta(self, alpha, beta, num, imp_upperbound):
        '''Draw synthetic (impression, click) pairs whose click ratios follow Beta(alpha, beta).'''
        sample = np.random.beta(alpha, beta, num)  # click ratios drawn from the Beta distribution
        I = []
        C = []
        for click_ratio in sample:  # each click_ratio generated by the Beta distribution
            imp = random.random() * imp_upperbound
            #imp = imp_upperbound
            click = imp * click_ratio
            I.append(imp)
            C.append(click)
        return I, C
    def update_from_data_by_FPI(self, tries, success, iter_num, epsilon):
        '''Update the Beta parameters alpha and beta by fixed-point iteration.'''
        for i in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(tries, success, self.alpha, self.beta)
            if abs(new_alpha - self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
                break
            self.alpha = new_alpha
            self.beta = new_beta
    def __fixed_point_iteration(self, tries, success, alpha, beta):
        '''One fixed-point step x_{i+1} = g(x_i) of the Beta-Binomial likelihood updates:
        alpha <- alpha * sum_i[psi(c_i+alpha) - psi(alpha)] / sum_i[psi(n_i+alpha+beta) - psi(alpha+beta)]
        beta  <- beta  * sum_i[psi(n_i-c_i+beta) - psi(beta)] / sum_i[psi(n_i+alpha+beta) - psi(alpha+beta)]'''
        sumfenzialpha = 0.0  # numerator of the alpha update
        sumfenzibeta = 0.0   # numerator of the beta update
        sumfenmu = 0.0       # shared denominator
        for i in range(len(tries)):
            # special.digamma(z) is the digamma function psi(z), the derivative of ln(gamma(z))
            sumfenzialpha += (special.digamma(success[i] + alpha) - special.digamma(alpha))
            sumfenzibeta += (special.digamma(tries[i] - success[i] + beta) - special.digamma(beta))
            sumfenmu += (special.digamma(tries[i] + alpha + beta) - special.digamma(alpha + beta))
        return alpha * (sumfenzialpha / sumfenmu), beta * (sumfenzibeta / sumfenmu)
    def update_from_data_by_moment(self, tries, success):
        '''Update the Beta parameters alpha and beta by moment matching.
        tries holds the total sample count of each group; success the number of click=1 samples.'''
        mean, var = self.__compute_moment(tries, success)
        #print('mean and variance: ', mean, var)
        # the small constants guard against division by zero and degenerate means
        #self.alpha = mean * (mean*(1-mean)/(var+0.000001) - 1)
        self.alpha = (mean + 0.000001) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1)
        #self.beta = (1-mean) * (mean*(1-mean)/(var+0.000001) - 1)
        self.beta = (1.000001 - mean) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1)
    def __compute_moment(self, tries, success):
        '''Sample mean and unbiased variance of the per-group CTRs (method of moments).'''
        ctr_list = []
        var = 0.0
        for i in range(len(tries)):
            ctr_list.append(float(success[i]) / (tries[i] + 0.000000001))
        mean = sum(ctr_list) / len(ctr_list)
        for ctr in ctr_list:
            var += pow(ctr - mean, 2)  # sum of squared deviations
        return mean, var / (len(ctr_list) - 1)
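A minimal sanity check of the class above (a sketch, not part of the original pipeline; the prior Beta(5, 80), the sample size, and the impression upper bound are made-up values):

hp = HyperParam(1, 1)
I, C = hp.sample_from_beta(5, 80, 10000, 1000)  # synthetic impressions/clicks with CTR ~ Beta(5, 80)
hp.update_from_data_by_FPI(I, C, 1000, 0.00000001)  # fixed-point iteration
print(hp.alpha, hp.beta)  # should land near (5, 80)
hp2 = HyperParam(1, 1)
hp2.update_from_data_by_moment(I, C)  # moment matching
print(hp2.alpha, hp2.beta)  # a rougher estimate of the same pair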
import pandas as pd
from tqdm import tqdm
import jieba, os, Levenshtein, time
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from sklearn import preprocessing
import numpy as np
import random, copy
from xpinyin import Pinyin
all_data = pd.read_csv('test_ctr.csv')
all_data.drop('Unnamed: 0', axis=1, inplace=True)
all_data.head()
all_data.columns, all_data.shape
# (Index(['prefix', 'query_prediction', 'title', 'tag', 'label', 'diction_label',
# 'max_query_prediction_keys', 'prefix_pinyin', 'prefix_fix', 'old_index',
# 'random_sector'],
# dtype='object'), (2300000, 11))
all_data['random_sector'].value_counts()
# 1 400883
# 3 400553
# 4 399929
# 5 399922
# 2 398713
# 0 300000
# Name: random_sector, dtype: int64
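The csv is assumed to ship with random_sector already populated; the counts above suggest sector 0 holds the 300,000 test rows while sectors 1-5 split the training rows roughly evenly. A hypothetical reconstruction, in case the column has to be rebuilt (the label == -1 test convention matches the filtering below; the seed is an arbitrary assumption):

np.random.seed(19960121)  # arbitrary assumed seed
all_data['random_sector'] = np.where(
    all_data['label'] == -1,                      # test rows -> sector 0
    0,
    np.random.randint(1, 6, size=len(all_data)))  # train rows -> sectors 1..5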
sec_size = 5
frac_size = 0.5
is_fill_na = False
convert_feature = ['prefix', 'title', 'tag', 'max_query_prediction_keys', 'prefix_pinyin', 'prefix_fix']
for index, feature in enumerate(convert_feature):
    print('computing conversion rate for ' + feature)
    for sec in range(sec_size + 1):  # 0/1/2/3/4/5
        # take the given feature and label from the train rows whose sector is NOT sec
        temp = all_data[(all_data['label'] != -1) & (all_data['random_sector'] != sec)][[feature, 'label']]
        if sec != 0:  # for sectors 1/2/3/4/5 (splits of the train set), subsample 50% without replacement
            temp = temp.sample(frac=frac_size, random_state=19960121).reset_index(drop=True)
        # group by the feature: _all_count is the group size, _label_count the number of label=1 samples in it
        temp[feature + '_all_count'] = temp.groupby(feature).label.transform('count')
        temp[feature + '_label_count'] = temp.groupby(feature).label.transform('sum')
        HP = HyperParam(1, 1)
        HP.update_from_data_by_moment(temp[feature + '_all_count'].values, temp[feature + '_label_count'].values)
        # _convert is the smoothed click-through rate
        temp[feature + '_convert'] = (temp[feature + '_label_count'] + HP.alpha) / (temp[feature + '_all_count'] + HP.alpha + HP.beta)
        # rows with the same feature value share the same conversion rate, so deduplicate
        temp = temp[[feature, feature + '_convert']].drop_duplicates()
        sec_data = copy.deepcopy(all_data[all_data['random_sector'] == sec])
        sec_data = pd.merge(sec_data, temp, on=[feature], how='left')
        if is_fill_na:
            sec_data[feature + '_convert'].fillna(HP.alpha / (HP.alpha + HP.beta), inplace=True)
        if sec:  # sec = 1/2/3/4/5
            new_all_data = pd.concat((new_all_data, sec_data))
        else:  # the loop enters this branch first, with sec = 0
            new_all_data = copy.deepcopy(sec_data)
    all_data = copy.deepcopy(new_all_data)
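A quick check on the output (illustrative, not from the original script): with is_fill_na left False, feature values never seen in the out-of-fold sample keep NaN in their _convert column, so the miss rate per feature is worth inspecting:

for feature in convert_feature:
    col = feature + '_convert'
    print(col, all_data[col].isna().mean())  # fraction of rows without a smoothed CTR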