Python数据分析与机器学习-用户流失预警

最新推荐文章于 2023-09-18 17:18:29 发布

原创最新推荐文章于 2023-09-18 17:18:29 发布 · 2.3k 阅读

5 ·

CC 4.0 BY-SA版权

文章标签：

#Python数据分析与机器学习

python 专栏收录该内容

30 篇文章

订阅专栏

本文介绍了一个使用多种机器学习算法预测客户流失概率的案例研究。通过对客户数据进行预处理和特征工程，采用支持向量机、随机森林及K近邻算法进行交叉验证，并评估了这些模型的准确性。

import pandas as pd
import numpy as np

pd.set_option('display.height', 9999)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

churn_df = pd.read_csv('churn.csv')
'''
  State  Account Length  Area Code     Phone Int'l Plan VMail Plan  VMail Message  Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls  Churn?
0    KS             128        415  382-4657         no        yes             25     265.1        110       45.07     197.4         99       16.78       244.7           91         11.01       10.0           3         2.70               1  False.
1    OH             107        415  371-7191         no        yes             26     161.6        123       27.47     195.5        103       16.62       254.4          103         11.45       13.7           3         3.70               1  False.
2    NJ             137        415  358-1921         no         no              0     243.4        114       41.38     121.2        110       10.30       162.6          104          7.32       12.2           5         3.29               0  False.
3    OH              84        408  375-9999        yes         no              0     299.4         71       50.90      61.9         88        5.26       196.9           89          8.86        6.6           7         1.78               2  False.
4    OK              75        415  330-6626        yes         no              0     166.7        113       28.34     148.3        122       12.61       186.9          121          8.41       10.1           3         2.73               3  False.

'''
churn_feat_space = churn_df.drop(['State', 'Area Code', 'Phone', 'Churn?'], axis=1)
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
# features = churn_feat_space.columns
# print(churn_feat_space.head())
'''
   Account Length  Int'l Plan  VMail Plan  VMail Message  Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls
0             128       False        True             25     265.1        110       45.07     197.4         99       16.78       244.7           91         11.01       10.0           3         2.70               1
1             107       False        True             26     161.6        123       27.47     195.5        103       16.62       254.4          103         11.45       13.7           3         3.70               1
2             137       False       False              0     243.4        114       41.38     121.2        110       10.30       162.6          104          7.32       12.2           5         3.29               0
3              84        True       False              0     299.4         71       50.90      61.9         88        5.26       196.9           89          8.86        6.6           7         1.78               2
4              75        True       False              0     166.7        113       28.34     148.3        122       12.61       186.9          121          8.41       10.1           3         2.73               3
'''
X = churn_feat_space.as_matrix().astype(np.float)
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.', 1, 0)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
# print(X[0])
'''
[ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
  1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
  0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]
'''

'''交叉验证通用函数'''
from sklearn.cross_validation import KFold


# X,y,选择的分类器,参数
def run_cv(X, y, clf_class, **kwargs):
    # Construct a kfolds object
    kf = KFold(len(y), n_folds=5, shuffle=True)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN


# 精度
def accuracy(y_true, y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)


print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X, y, KNN)))


# 客户流失的概率
def run_prob_cv(X, y, clf_class, **kwargs):
    kf = KFold(len(y), n_folds=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob


# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# print pred_prob[0]
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)
# print counts

# calculate true probabilities
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
    true_prob = pd.Series(true_prob)

# pandas-fu
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
print(counts)