import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import (train_test_split, cross_val_score, ShuffleSplit,
                                     cross_validate, learning_curve, KFold)
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
'''
TODO list:
1. Confidence-interval evaluation of predicted probabilities for a multiclass model
2. PSI metric for a multiclass model (a hedged sketch follows this list)
3. Given two samples, select from sample B a sub-population similar to sample A
   (drawing a balanced number of people from each group of B)
4. Split the multiclass problem into binary classifiers, then fuse the results
5. Sample weighting (w = np.random.rand(5,1)); each class gets its own weight
'''
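# Item 2 above (a PSI metric for multiclass models) is never implemented in this file.
# Below is a minimal sketch under my own assumptions: the equal-width probability bins
# on [0, 1] and the hypothetical helper name multiclass_psi are mine, not the author's.
def multiclass_psi(expected_proba, actual_proba, bins=10, eps=1e-6):
    '''Per-class PSI between two (n_samples, n_classes) predicted-probability matrices.'''
    edges = np.linspace(0.0, 1.0, bins + 1)
    psi_per_class = []
    for k in range(expected_proba.shape[1]):
        # Bin the class-k probabilities of both samples with the same edges
        e_cnt, _ = np.histogram(expected_proba[:, k], bins=edges)
        a_cnt, _ = np.histogram(actual_proba[:, k], bins=edges)
        e_pct = e_cnt / max(e_cnt.sum(), 1) + eps
        a_pct = a_cnt / max(a_cnt.sum(), 1) + eps
        # PSI = sum((actual% - expected%) * ln(actual% / expected%))
        psi_per_class.append(float(np.sum((a_pct - e_pct) * np.log(a_pct / e_pct))))
    return psi_per_class
# Usage sketch: compare predict_proba on a baseline sample vs. a new sample; the usual
# <0.1 (stable) / 0.1-0.25 (watch) / >0.25 (shifted) thresholds then apply per class.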
###################################################### BP neural network ########################################################
def BP_DNN(xtrain, ytrain, xtest, ytest):
    from keras import models
    from keras import layers
    from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, classification_report
    import time
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    # Simple fully-connected network for binary classification
    model = models.Sequential()
    model.add(layers.Dense(256, activation='relu', input_shape=(xtrain.shape[1],)))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    # The original code predicted without ever training; epochs/batch_size here are assumed
    model.fit(xtrain, ytrain, epochs=10, batch_size=32, verbose=0)
    nn_pred_y = model.predict(xtest)
    y_scores = nn_pred_y.ravel()
    # Threshold the predicted probabilities at 0.5 to get hard labels
    nn_pred_y_list = [1 if p > 0.5 else 0 for p in y_scores]
    print("BP_DNN", "confusion matrix")
    print(confusion_matrix(ytest, nn_pred_y_list))
    print('Test_Accuracy:', accuracy_score(ytest, nn_pred_y_list))
    print('Test_Precision:', precision_score(ytest, nn_pred_y_list))
    print('Test_Recall:', recall_score(ytest, nn_pred_y_list))
    print('Test_F1:', f1_score(ytest, nn_pred_y_list))
    print('Test_AUC:', roc_auc_score(ytest, y_scores))  # AUC needs scores, not hard labels
    elapsed = time.perf_counter() - start
    print("BP_DNN", "Time used:", elapsed)
    print('=========')
    print("BP_DNN", "Report:", "\n", classification_report(ytest, nn_pred_y_list))
################################################ Confidence-interval evaluation for multiclass probabilities ########################################################
if __name__ == '__main__':
    # Multiclass dataset
    from sklearn import datasets
    iris = datasets.load_iris()
    data_x = iris.data
    data_y = iris.target
    # Fit a logistic regression
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    model = LogisticRegression()
    # model = DecisionTreeClassifier()
    model.fit(data_x, data_y)
    model_predict_proba = model.predict_proba(data_x)
    model_predict = model.predict(data_x)
    # Column 0: predicted label; columns 1..n_classes: per-class probabilities
    data = np.column_stack([model_predict, model_predict_proba])
    print(data.shape)
def clf_confidence_score(data):
    '''
    data = np.column_stack([model_predict, model_predict_proba])
    `index` defines how the confidence intervals (probability bins) are cut.
    '''
    # Build the bins: (0, 0.05], (0.05, 0.1], ..., (0.95, 1.0]
    confidence = np.arange(0, 1.05, 0.05).round(2)
    index = []
    for i in range(len(confidence) - 1):
        index.append([confidence[i], confidence[i + 1]])
    pd_list = []
    data = pd.DataFrame(data)
    # Per-class sample counts and class proportions (column 0 holds the predicted label);
    # iterating over sorted labels keeps row j aligned with class j below
    labels = sorted(data.iloc[:, 0].unique())
    len_s_list = [[lab, int((data.iloc[:, 0] == lab).sum())] for lab in labels]
    len_s_list_ratio = [x / sum(np.array(len_s_list)[:, 1]) for x in np.array(len_s_list)[:, 1]]
    clf_data_array = np.array(data)
    '''Compute coverage / precision / lift per bin and per class'''
    for i in range(len(index)):
        for j in range(data.shape[1] - 1):
            # Samples whose class-j probability falls inside this bin
            count_yh = clf_data_array[:, j + 1][np.where((clf_data_array[:, j + 1] > index[i][0])
                                                         & (clf_data_array[:, j + 1] <= index[i][1]))].shape[0]
            # Of those, the samples actually predicted as class j
            count_label = clf_data_array[np.where((clf_data_array[:, j + 1] > index[i][0])
                                                  & (clf_data_array[:, j + 1] <= index[i][1])
                                                  & (clf_data_array[:, 0] == j))].shape[0]
            pd_list.append(["-".join(map(str, index[i])), j, len_s_list[j][1],
                            count_label, count_yh,
                            count_label / (len_s_list[j][1] + 0.00001),                   # recall within the class
                            count_label / (count_yh + 0.00001),                           # precision within the bin
                            (count_label / (count_yh + 0.00001)) / len_s_list_ratio[j]])  # lift vs. the class base rate
    pd_data = pd.DataFrame(pd_list, columns=['conf_interval', 'flag', 'class_total',
                                             'recalled_in_bin', 'bin_total',
                                             'recall_rate', 'precision', 'lift'])
    pivot_pd_data = pd.pivot_table(pd_data, index=['conf_interval', 'flag'])
    print(pivot_pd_data)
    return pivot_pd_data

if __name__ == '__main__':
    clf_confidence_score(data)
################################################# Select from sample B a population similar to sample A ############################################
'''
Method 1:
    Cluster sample A and collect the cluster centers into a list of lists (or a dict) to
    iterate over. (Applying PCA beforehand can badly distort the similarity computation.)
    For every sample in B, compute the Euclidean distance / cosine similarity against each
    center; if the similarity to any center exceeds a threshold, that B sample joins the
    candidate population.
    To be clear: we only apply unsupervised dimensionality reduction to the supervised
    training sample, which is not large.
Method 2:
    Read the split features and split thresholds off a decision tree (unstable, and the
    feature selection is somewhat random, but fast) -- see the sketch after this docstring.
'''
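# Method 2 is never implemented below, so here is a minimal sketch under my own
# assumptions: fit a shallow tree on an A-vs-B label and read the splits from
# scikit-learn's tree_ attribute. The helper name tree_split_rules and the
# A-vs-B framing are mine, not the original author's.
def tree_split_rules(X_a, X_b, max_depth=3):
    '''List the (feature, threshold) pairs a tree uses to separate A from B.'''
    from sklearn.tree import DecisionTreeClassifier
    X = np.vstack([X_a, X_b])
    y = np.hstack([np.ones(len(X_a)), np.zeros(len(X_b))])  # 1 = sample A
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)
    rules = []
    for node, feat in enumerate(tree.tree_.feature):
        if feat >= 0:  # internal node; leaves are marked with -2
            rules.append((int(feat), float(tree.tree_.threshold[node])))
    return rules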
from sklearn.cluster import KMeans, MeanShift, DBSCAN
def distEclud(vecA, vecB):
    '''Squared Euclidean distance (no square root is taken)'''
    return np.sum(np.power(vecA - vecB, 2))
def cosine_dis(x, y):
    '''Cosine similarity'''
    num = float(np.dot(x, y))
    value2 = np.linalg.norm(x) * np.linalg.norm(y)
    return round(num / value2, 3) if value2 != 0 else 0
if __name__ == '__main__':
    train_data = data_x[:100]
    spread_data = data_x[100:]
    # Elbow method: inspect the SSE curve to pick the number of clusters
    sse = []
    for k in range(1, 10):
        model = KMeans(n_clusters=k, max_iter=500, random_state=1)
        model.fit(train_data)
        sse.append(model.inertia_)
    # plt.plot(range(1, 10), sse, 'o-')
    # plt.show()
    k = 2
    iteration = 500
    # Note: the n_jobs argument was removed from KMeans in scikit-learn 1.0
    clf = KMeans(n_clusters=k, max_iter=iteration, random_state=1)
    clf.fit(train_data)
    kmeans_labels = clf.labels_
    wait_spred_center_ = clf.cluster_centers_
    import time
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    spread_list = []
    # Compare every candidate in B against every cluster center of A
    for i in range(spread_data.shape[0]):  # the original range(...-1) skipped the last sample
        for j in range(len(wait_spred_center_)):
            spread_list.append([spread_data[i], wait_spred_center_[j],
                                cosine_dis(spread_data[i], wait_spred_center_[j]),
                                np.array(distEclud(spread_data[i], wait_spred_center_[j])).clip(0, 1)])
    spread_np = np.array(spread_list, dtype=object)  # ragged rows need an object array
    # A candidate joins the target population if its similarity to any center >= 0.95
    cos_sims = spread_np[:, -2].astype(float)
    can_spread = spread_np[cos_sims >= 0.95]
    cant_spread = spread_np[cos_sims < 0.95]
    # Deduplicate with np.unique (row-wise) to get the candidate user_ids
    elapsed = time.perf_counter() - start
    # print("Time used:", elapsed)
################################################# Sample weighting: each class gets its own weight ############################################
if __name__ == '__main__':
    w = np.random.rand(5, 1)  # per-sample random weights, as in item 5 of the TODO list
    weight_data = np.column_stack([data_x, data_y])
    # Per-class weights: class 0 -> 0.3, class 1 -> 0.3, class 2 -> 0.4
    weight_array = np.array(([0, 0.3], [1, 0.3], [2, 0.4]))
    pd_weight_data = pd.DataFrame(weight_data, columns=['x1', 'x2', 'x3', 'x4', 'flag'])
    pd_weight_array = pd.DataFrame(weight_array, columns=['flag', 'weights'])
    # Join each sample with its class weight on 'flag'
    final_weight_data = pd.merge(pd_weight_data, pd_weight_array)
    datax = final_weight_data.iloc[:, :4]
    datay = final_weight_data['flag']
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier()
    # Pass the merged class weights as per-sample weights (the original fit ignored them)
    model.fit(datax, datay, sample_weight=final_weight_data['weights'])
    model_predict_proba = model.predict_proba(datax)
    model_predict = model.predict(datax)
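    # An alternative to merging a weight column: scikit-learn classifiers accept a
    # class_weight mapping directly, which gives the same per-class weighting without
    # the join. Minimal sketch (the values mirror weight_array above):
    weighted_tree = DecisionTreeClassifier(class_weight={0: 0.3, 1: 0.3, 2: 0.4})
    weighted_tree.fit(data_x, data_y)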