Notes on a Small Random Forest Exercise

Preface

The code below was exported from a Jupyter Notebook.
Along the way I borrowed some data-cleaning idioms from other write-ups; I will add the references when I have time.
A dull pencil beats a sharp memory, so I am writing this down to avoid hunting for syntax all over again next time.

.py version

# -*- coding: utf-8 -*-
# @Time    : 18-11-1 上午10:43
# @Author  : wanghai
# @Email   : 
# @File    : testt.py
# @Software: PyCharm Community Edition

# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was deprecated in 0.18 and removed in 0.20

# In[2]:


raw_df = pd.read_csv('data.csv')
df1 = raw_df.drop(['apply_id'], axis=1)
# Are there many outliers?
df1.describe()


# In[3]:


def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color="g"):
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc='upper left')
    plt.show()


# # Data cleaning and label preparation

# In[4]:


# Difference between actual and planned repayment time, in days
df1['date'] = (pd.to_datetime(df1['act_repay_dt']) - pd.to_datetime(df1['plan_repay_dt'])).dt.total_seconds() / (24 * 60 * 60)
# Visualization
x = df1['date']
y = x
area = np.pi * 3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")

# In[5]:


date_show = df1['date'].dropna()

# matplotlib histogram
plt.hist(date_show, facecolor='blue', edgecolor='black', bins=155)

# seaborn histogram (kde=False here; distplot can also overlay a kernel density estimate)
sns.distplot(date_show, hist=True, kde=False,
             bins=500, color='blue',
             hist_kws={'edgecolor': 'black'})
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()

# In[6]:


print('The shape of our features is:', df1.shape)

# Label preparation
df1['y'] = np.where((pd.isnull(df1['act_repay_dt'])) | (df1['date'] > 7), 1, 0)

illegal = df1[(pd.isnull(df1['act_repay_dt'])) | (df1['date'] > 7)]
print("至今未还款或者还款时间逾期的人有 %d 人,占比 %.3f" % (len(illegal), float(len(illegal)) / float(len(df1))))
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
# Drop the interfering columns (first pass)
df1.drop(columns, inplace=True, axis=1)

# Drop the rows holding the 3 largest values in each column (TODO: this outlier-trimming heuristic needs improvement)
columns = df1.columns.tolist()
for col in columns:
    indexs = df1.nlargest(3, columns=[col]).index.values
    for i in indexs:
        df1.drop(i, inplace=True)

print('The shape of our features after del is:', df1.shape)
# TODO: compute pairwise correlations and drop features with very high correlation
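# A minimal sketch for the TODO above (assumption: 0.95 as the |corr| cutoff).
# Only the candidate columns are printed, so the shapes reported later stay
# unchanged; uncomment the last line to actually drop them.
corr_abs = df1.drop(columns=['y']).corr().abs()
upper_tri = corr_abs.where(np.triu(np.ones(corr_abs.shape), k=1).astype(bool))
high_corr_cols = [c for c in upper_tri.columns if (upper_tri[c] > 0.95).any()]
print('Highly correlated columns (drop candidates):', high_corr_cols)
# df1 = df1.drop(columns=high_corr_cols)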


# In[7]:


df1.head(3)

# # Fill missing values with column means

# In[8]:


df1 = df1.fillna(df1.mean())
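# Note: filling with the mean of the full dataset before splitting leaks test-set
# statistics into training; computing the fill values on the training rows only
# (after the split) would be cleaner.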
x = np.array(df1.iloc[:, 0:-1])
y = np.array(df1.iloc[:, -1])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=11)

# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())

print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)

# In[9]:


# # Decision tree version
# dtc = DecisionTreeClassifier()

# dtc.fit(x_train, y_train)

# dt_predict = dtc.predict(x_test)

# print(dtc.score(x_test, y_test))

# print(classification_report(y_test, dt_predict, target_names=["plan_repay", "overdue_repay"]))

# Random forest version

rfc = RandomForestClassifier()

rfc.fit(x_train, y_train)

rfc_y_predict = rfc.predict(x_test)
# score() returns the mean accuracy on the given test data and labels.
print("Mean accuracy with mean imputation: {:.2f}".format(rfc.score(x_test, y_test)))

# In[11]:


print("The accuracy/recall rate and other results are as follows:")
print(classification_report(y_test, rfc_y_predict, target_names=["plan_repay", "overdue_repay"]))

# In[12]:


print(rfc_y_predict)

# In[13]:


print(y_test)

# In[14]:


# Feature importances
print(rfc.feature_importances_)
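# To make the importances readable, pair them with column names (the last column
# of df1 is the label y, so the features are everything before it).
feat_names = df1.columns[:-1]
top10 = sorted(zip(feat_names, rfc.feature_importances_), key=lambda t: t[1], reverse=True)[:10]
print(top10)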

Markdown version (code cells together with their notebook output)

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
/home/c/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
raw_df = pd.read_csv('data.csv')
df1 = raw_df.drop(['apply_id'], axis=1)
# Are there many outliers?
df1.describe()
def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color="g"):
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc='upper left')
    plt.show()

# Data cleaning and label preparation

# Difference between actual and planned repayment time, in days
df1['date'] = (pd.to_datetime(df1['act_repay_dt']) - pd.to_datetime(df1['plan_repay_dt'])).dt.total_seconds() / (24 * 60 * 60)
# Visualization
x = df1['date']
y = x
area = np.pi*3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")

[Figure: scatter plot of the repayment time difference]

date_show = df1['date'].dropna()

# matplotlib histogram
plt.hist(date_show, facecolor = 'blue', edgecolor = 'black',bins = 155)

# seaborn histogram (kde=False here; distplot can also overlay a kernel density estimate)
sns.distplot(date_show, hist=True, kde=False, 
             bins=500, color = 'blue',
             hist_kws={'edgecolor':'black'})
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()
/home/c/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "

[Figure: histogram of repayment dates]

print('The shape of our features is:', df1.shape)

# Label preparation
df1['y'] = np.where((pd.isnull(df1['act_repay_dt'])) | (df1['date'] > 7), 1, 0)

illegal = df1[(pd.isnull(df1['act_repay_dt'])) | (df1['date']>7)]
print("至今未还款或者还款时间逾期的人有 %d 人,占比 %.3f" % (len(illegal), float(len(illegal)) / float(len(df1))))
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
# Drop the interfering columns (first pass)
df1.drop(columns, inplace=True, axis=1)

# Drop the rows holding the 3 largest values in each column (TODO: this outlier-trimming heuristic needs improvement)
columns = df1.columns.tolist()
for col in columns:
    indexs = df1.nlargest(3, columns=[col]).index.values
    for i in indexs:
        df1.drop(i, inplace=True)

print('The shape of our features after del is:', df1.shape)
# TODO: compute pairwise correlations and drop features with very high correlation

('The shape of our features is:', (12154, 221))
People who have not repaid yet or repaid late: 1837, ratio 0.151
('The shape of our features after del is:', (11497, 219))
df1.head(3)

# Fill missing values with column means

df1 = df1.fillna(df1.mean())
x = np.array(df1.iloc[:,0:-1])
y = np.array(df1.iloc[:,-1])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=11)

# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())

print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)
('Training Features Shape:', (8047, 218))
('Training Labels Shape:', (8047,))
('Testing Features Shape:', (3450, 218))
('Testing Labels Shape:', (3450,))
# # Decision tree version
# dtc = DecisionTreeClassifier()
 
# dtc.fit(x_train, y_train)
 
# dt_predict = dtc.predict(x_test)
 
# print(dtc.score(x_test, y_test))
# print(classification_report(y_test, dt_predict, target_names=["plan_repay", "overdue_repay"]))

# Random forest version
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)
# score() returns the mean accuracy on the given test data and labels.
print("Mean accuracy with mean imputation: {:.2f}".format(rfc.score(x_test, y_test)))
Mean accuracy with mean imputation: 0.86
print("The accuracy/recall rate and other results are as follows:")
print(classification_report(y_test, rfc_y_predict, target_names=["plan_repay", "overdue_repay"]))
The accuracy/recall rate and other results are as follows:
               precision    recall  f1-score   support

   plan_repay       0.87      0.99      0.92      2976
overdue_repay       0.33      0.04      0.07       474

  avg / total       0.79      0.86      0.81      3450
rfc_y_predict
array([0, 0, 0, ..., 0, 0, 0])
y_test
array([0, 0, 0, ..., 0, 0, 0])
# Feature importances
rfc.feature_importances_

Tuning

The main parameters to tune are max_features, n_estimators, and min_samples_leaf.

For reference, see BYR_jiandong's CSDN post on the important random forest parameters.

Setting up cross-validation

cv_parameter = [{'min_samples_leaf': [5, 15, 25, 35], 'n_estimators': [50, 200, 500], 'max_depth': [2, 3, 5]}]
# n_jobs sets how many jobs run in parallel
clf = GridSearchCV(estimator=rfc, param_grid=cv_parameter, cv=5, n_jobs=1)

max_depth:

integer or None, optional (default=None)
The maximum depth of the tree. If None, nodes are expanded until all leaves are pure or until all leaves contain fewer than min_samples_split samples.

from sklearn.model_selection import GridSearchCV
rfc = RandomForestClassifier(max_features = 'sqrt', random_state = 3)
cv_parameter = [{'n_estimators':[50,200,500], 'min_samples_leaf':[5,15,25,35], 'max_depth':[2, 3, 5]}]
clf = GridSearchCV(estimator=rfc,param_grid=cv_parameter, cv=5, n_jobs=1)

clf.fit(x_train, y_train)
print('Best parameters:')
print(clf.best_params_)

[Output: the best parameter combination printed by the grid search]
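Once the search finishes, the refit best model can be checked against the held-out split (a minimal sketch, reusing x_test/y_test from the split above):

# GridSearchCV refits the best parameter combination on the full training set by default.
best_rfc = clf.best_estimator_
print("Tuned accuracy: {:.2f}".format(best_rfc.score(x_test, y_test)))
print(classification_report(y_test, best_rfc.predict(x_test), target_names=["plan_repay", "overdue_repay"]))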

Setting class weights

rfc = RandomForestClassifier(random_state = 3, class_weight={0: 1, 1: 5})
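To see what the weights do to recall on the minority class, the weighted model can be fit and reported the same way (a sketch, reusing the earlier split):

rfc_weighted = RandomForestClassifier(random_state=3, class_weight={0: 1, 1: 5})
rfc_weighted.fit(x_train, y_train)
print(classification_report(y_test, rfc_weighted.predict(x_test), target_names=["plan_repay", "overdue_repay"]))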

About the classification_report results

[Output: classification_report]

The model predicted 25 positive samples, 11 of them correctly, out of 474 true positives in the test set: precision 0.44, recall 0.023.
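Those figures can be recomputed directly from the confusion matrix, since precision = TP / (TP + FP) and recall = TP / (TP + FN) (a sketch, shown with the unweighted model's predictions):

from sklearn.metrics import confusion_matrix

# e.g. 11 true positives out of 25 predicted positives gives 11/25 = 0.44 precision,
# and 11 out of 474 actual positives gives 11/474 = 0.023 recall.
tn, fp, fn, tp = confusion_matrix(y_test, rfc_y_predict).ravel()
print("precision = %.2f, recall = %.3f" % (tp / float(tp + fp), tp / float(tp + fn)))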

### Feature importance evaluation in random forests

A random forest is a tree-based ensemble: it builds many independent decision trees and aggregates their results to improve performance. Feature importance in a random forest is commonly evaluated with one of two methods, mean decrease in impurity (MDI) and mean decrease in accuracy (MDA), each with its own characteristics.

#### Mean decrease in impurity (MDI)

MDI measures the total amount of information a feature contributes to node splits across the whole forest. At every split in every tree, the impurity reduction (information gain or decrease in Gini index) achieved by the chosen splitting feature is recorded; these values are accumulated and averaged to give the feature's importance score.

The formula is:

\[
\text{Importance}(X_j) = \frac{\sum_{t=1}^{T}\sum_{s \in S_t(X_j)} \Delta i(s,t)}{T},
\]

where \( T \) is the number of trees in the forest, \( S_t(X_j) \) is the set of nodes in tree \( t \) that split on feature \( X_j \), and \( \Delta i(s,t) \) is the impurity decrease produced by that split.

The method is simple, intuitive, and easy to implement; its clear drawback is that it tends to overestimate the importance of high-cardinality features and of continuous variables that are strongly correlated with other variables.

#### Mean decrease in accuracy (MDA)

A more robust approach is permutation importance, also known as mean decrease in accuracy. First record the model's performance on the original dataset (e.g. error rate or R²); then shuffle the values of a single feature column, predict the targets again, and compare the scores. If that input really matters, scrambling it should make the overall result noticeably worse; otherwise the effect is small or negligible.

The concrete steps:

1. Apply the trained random forest to the whole test set to obtain a baseline error.
2. Replace the observed values of one chosen feature with a random permutation of themselves and predict again.
3. Record how much the error changes.
4. Repeat several times to get a stable estimate, which serves as that feature's weight.

Compared with MDI, which infers importance indirectly from internal split statistics, permutation importance is closer to the model's real behavior and easier to interpret, but the extra computation makes it somewhat less efficient at large scale.

```python
from sklearn.inspection import permutation_importance
import numpy as np

def calculate_permutation_importance(model, X_test, y_test):
    # Shuffle each feature column n_repeats times and measure the score drop.
    result = permutation_importance(
        model, X_test, y_test,
        n_repeats=10,
        random_state=42,
        n_jobs=-1
    )
    # Sort features by mean importance (ascending) and pair them with names.
    sorted_idx = result.importances_mean.argsort()
    feature_names = np.array([f'Feature {i}' for i in range(len(sorted_idx))])
    return list(zip(feature_names[sorted_idx], result.importances_mean[sorted_idx]))
```

The code above shows how to use sklearn's permutation_importance to quickly obtain a ranked list of features, which is handy for subsequent visualization work.
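A hypothetical call, reusing the rfc model and test split from earlier in this post (the generic 'Feature i' labels stand in for the real column names):

```python
# The returned list is sorted ascending by importance, so the tail holds the
# most important features.
perm_ranking = calculate_permutation_importance(rfc, x_test, y_test)
print(perm_ranking[-10:])
```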