import pandas as pd
import numpy as np
pd.set_option('display.height', 9999)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)
churn_df = pd.read_csv('churn.csv')
'''
State Account Length Area Code Phone Int'l Plan VMail Plan VMail Message Day Mins Day Calls Day Charge Eve Mins Eve Calls Eve Charge Night Mins Night Calls Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn?
0 KS 128 415 382-4657 no yes 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 False.
1 OH 107 415 371-7191 no yes 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 False.
2 NJ 137 415 358-1921 no no 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 False.
3 OH 84 408 375-9999 yes no 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 False.
4 OK 75 415 330-6626 yes no 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 False.
'''
churn_feat_space = churn_df.drop(['State', 'Area Code', 'Phone', 'Churn?'], axis=1)
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
# features = churn_feat_space.columns
# print(churn_feat_space.head())
'''
Account Length Int'l Plan VMail Plan VMail Message Day Mins Day Calls Day Charge Eve Mins Eve Calls Eve Charge Night Mins Night Calls Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls
0 128 False True 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1
1 107 False True 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1
2 137 False False 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0
3 84 True False 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2
4 75 True False 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3
'''
X = churn_feat_space.as_matrix().astype(np.float)
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.', 1, 0)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
# print(X[0])
'''
[ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315
1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436
0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202]
'''
'''交叉验证通用函数'''
from sklearn.cross_validation import KFold
# X,y,选择的分类器,参数
def run_cv(X, y, clf_class, **kwargs):
# Construct a kfolds object
kf = KFold(len(y), n_folds=5, shuffle=True)
y_pred = y.copy()
# Iterate through folds
for train_index, test_index in kf:
X_train, X_test = X[train_index], X[test_index]
y_train = y[train_index]
# Initialize a classifier with key word arguments
clf = clf_class(**kwargs)
clf.fit(X_train, y_train)
y_pred[test_index] = clf.predict(X_test)
return y_pred
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
# 精度
def accuracy(y_true, y_pred):
# NumPy interprets True and False as 1. and 0.
return np.mean(y_true == y_pred)
print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X, y, KNN)))
# 客户流失的概率
def run_prob_cv(X, y, clf_class, **kwargs):
kf = KFold(len(y), n_folds=5, shuffle=True)
y_prob = np.zeros((len(y), 2))
for train_index, test_index in kf:
X_train, X_test = X[train_index], X[test_index]
y_train = y[train_index]
clf = clf_class(**kwargs)
clf.fit(X_train, y_train)
# Predict probabilities, not classes
y_prob[test_index] = clf.predict_proba(X_test)
return y_prob
# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# print pred_prob[0]
pred_churn = pred_prob[:, 1]
is_churn = y == 1
# Number of times a predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)
# print counts
# calculate true probabilities
true_prob = {}
for prob in counts.index:
true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)
# pandas-fu
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
print(counts)
Python数据分析与机器学习-用户流失预警
最新推荐文章于 2023-09-18 17:18:29 发布
本文介绍了一个使用多种机器学习算法预测客户流失概率的案例研究。通过对客户数据进行预处理和特征工程,采用支持向量机、随机森林及K近邻算法进行交叉验证,并评估了这些模型的准确性。
1万+

被折叠的 条评论
为什么被折叠?



