政治献金预测
# 导入相关库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
# Temporarily extend PATH (this process only) with the Graphviz bin directory
os.environ['PATH'] = os.environ['PATH'] + ';...\\Graphviz2.38\\bin'
数据预处理
### 导入数据
# Fix the NumPy random seed so that runs are repeatable
SEED = 222
np.random.seed(SEED)

df = pd.read_csv(r'...\data\federal_giving.csv', low_memory=False)

# Keep only the columns in which at least one value differs from the first
# row, i.e. drop columns whose value is identical in every row
df = df.loc[:, df.ne(df.iloc[0]).any(axis=0)]

# Columns used as model inputs
feature_columns = [
    'entity_tp',
    'classification',
    'rpt_tp',
    'cycle',
    'transaction_amt',
    'state',
    'transaction_tp',
]
### 训练集和测试集
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
def get_train_test(test_size=0.95):  # large test share: fast to train, handy for learning
    """Build the feature matrix and binary target, then split them.

    Reads the module-level ``df``, ``feature_columns`` and ``SEED``.

    Args:
        test_size: Forwarded to ``train_test_split``. The default of 0.95
            leaves only 5% of the rows for training.

    Returns:
        The result of ``train_test_split(X, y, ...)``, i.e. the train/test
        parts of X followed by the train/test parts of y.
    """
    # Binary target: 1 where the candidate's party is "REP", 0 otherwise
    y = 1 * (df['cand_pty_affiliation'] == "REP")
    X = df[feature_columns].copy()
    # Fill missing values with the column mode (most frequent value).
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: that chained in-place form is deprecated and does not
    # modify X when pandas Copy-on-Write is active.
    for col in ['cycle', 'entity_tp', 'state']:
        X[col] = X[col].fillna(X[col].mode()[0])
    X = pd.get_dummies(X, sparse=True)  # one-hot encoding; memory hungry
    return train_test_split(X, y, test_size=test_size, random_state=SEED)
# Build the split once; with the default test_size=0.95 only 5% of the rows
# end up in the training set
xtrain, xtest, ytrain, ytest = get_train_test()
特征值说明
- entity_tp: 个人还是组织
- classification: 领域
- rpt_tp: 贡献的大小
- cycle: 捐赠年份
- transaction_amt: 捐赠金额
- state: 州
- transaction_tp: 交易类型
查看目标值的种类和数量
# Bar chart of the relative frequency of each value of the target column
(
    df['cand_pty_affiliation']
    .value_counts(normalize=True)
    .plot.bar(title="Share of No. donations")
)
plt.show()
定义绘制决策树的方法
import pydotplus
from IPython.display import Image
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
def print_graph(dtr, feature_names):
    """Render a fitted decision tree as a PNG image for inline display.

    Args:
        dtr: Decision tree estimator passed to ``export_graphviz``.
        feature_names: Feature names forwarded to ``export_graphviz`` for
            labelling the split nodes.

    Returns:
        An ``IPython.display.Image`` wrapping the PNG bytes produced by
        pydotplus from the exported DOT source.
    """
    # class_names is documented as an array-like of names in ascending class
    # order, so use a list (class 0 -> "D", class 1 -> "R") rather than a dict.
    dot_data = export_graphviz(
        dtr,
        out_file=None,  # return the DOT source as a string instead of writing a file
        label="root",
        proportion=True,
        impurity=False,
        feature_names=feature_names,
        class_names=["D", "R"],
        filled=True,
        rounded=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_data)
    return Image(graph.create_png())
绘制决策树
# Fit a decision tree limited to depth 3 on the training split
dtr1 = DecisionTreeClassifier(random_state=SEED, max_depth=3).fit(xtrain, ytrain)

# Predicted probability of class 1 (second column) for every test row
p1 = dtr1.predict_proba(xtest)[:, 1]

# ROC-AUC of the tree on the test split
print("Decision tree ROC-AUC score: %.3f" % (roc_auc_score(ytest, p1),))
# 绘制决策树
print_graph(dtr1, xtrain.columns