Notes on a Small Random Forest Exercise

Preface

The code below was exported from a Jupyter Notebook.
Along the way I borrowed some data-cleaning idioms from other write-ups; I will add the references when I have time.
A dull pencil beats a sharp memory, so I am writing this down to avoid hunting for syntax all over again next time.

.py version

# -*- coding: utf-8 -*-
# @Time    : 18-11-1 上午10:43
# @Author  : wanghai
# @Email   : 
# @File    : testt.py
# @Software: PyCharm Community Edition

# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was deprecated in 0.18 and removed in 0.20

# In[2]:


raw_df = pd.read_csv('data.csv')
df1 = raw_df.drop(['apply_id'], axis=1)
# Are there many outliers?
df1.describe()


# In[3]:


def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color="g"):
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc='upper left')
    plt.show()


# # Data cleaning and label preparation

# In[4]:


# Difference between actual and planned repayment time, in days
df1['date'] = (pd.to_datetime(df1['act_repay_dt']) - pd.to_datetime(df1['plan_repay_dt'])).dt.total_seconds() / (24 * 60 * 60)
# Visualization
x = df1['date']
y = x
area = np.pi * 3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")

# In[5]:


date_show = df1['date'].dropna()

# matplotlib histogram
plt.hist(date_show, facecolor='blue', edgecolor='black', bins=155)

# seaborn histogram (kde=False here; distplot can also overlay a kernel density estimate)
sns.distplot(date_show, hist=True, kde=False,
             bins=500, color='blue',
             hist_kws={'edgecolor': 'black'})
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()

# In[6]:


print('The shape of our features is:', df1.shape)

# Label preparation
df1['y'] = np.where((pd.isnull(df1['act_repay_dt'])) | (df1['date'] > 7), 1, 0)

illegal = df1[(pd.isnull(df1['act_repay_dt'])) | (df1['date'] > 7)]
print("至今未还款或者还款时间逾期的人有 %d 人,占比 %.3f" % (len(illegal), float(len(illegal)) / float(len(df1))))
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
# Drop the interfering columns (first pass)
df1.drop(columns, inplace=True, axis=1)

# Drop the rows holding the 3 largest values in each column (TODO: this outlier-trimming heuristic needs improvement)
columns = df1.columns.tolist()
for col in columns:
    indexs = df1.nlargest(3, columns=[col]).index.values
    for i in indexs:
        df1.drop(i, inplace=True)

print('The shape of our features after del is:', df1.shape)
# TODO: compute pairwise correlations and drop features with very high correlation
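# A minimal sketch for the TODO above (assumption: 0.95 as the |corr| cutoff).
# Only the candidate columns are printed, so the shapes reported later stay
# unchanged; uncomment the last line to actually drop them.
corr_abs = df1.drop(columns=['y']).corr().abs()
upper_tri = corr_abs.where(np.triu(np.ones(corr_abs.shape), k=1).astype(bool))
high_corr_cols = [c for c in upper_tri.columns if (upper_tri[c] > 0.95).any()]
print('Highly correlated columns (drop candidates):', high_corr_cols)
# df1 = df1.drop(columns=high_corr_cols)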


# In[7]:


df1.head(3)

# # Fill missing values with column means

# In[8]:


df1 = df1.fillna(df1.mean())
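# Note: filling with the mean of the full dataset before splitting leaks test-set
# statistics into training; computing the fill values on the training rows only
# (after the split) would be cleaner.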
x = np.array(df1.iloc[:, 0:-1])
y = np.array(df1.iloc[:, -1])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=11)

# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())

print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)

# In[9]:


# # Decision tree version
# dtc = DecisionTreeClassifier()

# dtc.fit(x_train, y_train)

# dt_predict = dtc.predict(x_test)

# print(dtc.score(x_test, y_test))

# print(classification_report(y_test, dt_predict, target_names=["plan_repay", "overdue_repay"]))

# Random forest version

rfc = RandomForestClassifier()

rfc.fit(x_train, y_train)

rfc_y_predict = rfc.predict(x_test)
# score() returns the mean accuracy on the given test data and labels.
print("Mean accuracy with mean imputation: {:.2f}".format(rfc.score(x_test, y_test)))

# In[11]:


print("The accuracy/recall rate and other results are as follows:")
print(classification_report(y_test, rfc_y_predict, target_names=["plan_repay", "overdue_repay"]))

# In[12]:


print(rfc_y_predict)

# In[13]:


print(y_test)

# In[14]:


# Feature importances
print(rfc.feature_importances_)
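# To make the importances readable, pair them with column names (the last column
# of df1 is the label y, so the features are everything before it).
feat_names = df1.columns[:-1]
top10 = sorted(zip(feat_names, rfc.feature_importances_), key=lambda t: t[1], reverse=True)[:10]
print(top10)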

Markdown version (code cells together with their notebook output)

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
/home/c/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
raw_df = pd.read_csv('data.csv')
df1 = raw_df.drop(['apply_id'], axis=1)
# Are there many outliers?
df1.describe()
def scatterplot(x_data, y_data, area, alpha, x_label="", y_label="", title="", color="g"):
    plt.scatter(x_data, y_data, s=area, alpha=alpha, c=color)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc='upper left')
    plt.show()

# Data cleaning and label preparation

# Difference between actual and planned repayment time, in days
df1['date'] = (pd.to_datetime(df1['act_repay_dt']) - pd.to_datetime(df1['plan_repay_dt'])).dt.total_seconds() / (24 * 60 * 60)
# Visualization
x = df1['date']
y = x
area = np.pi*3
scatterplot(x, y, area, 0.7, x_label="date", y_label="y", title="pay time img")

[Figure: scatter plot of the repayment time difference]

date_show = df1['date'].dropna()

# matplotlib histogram
plt.hist(date_show, facecolor = 'blue', edgecolor = 'black',bins = 155)

# seaborn histogram (kde=False here; distplot can also overlay a kernel density estimate)
sns.distplot(date_show, hist=True, kde=False, 
             bins=500, color = 'blue',
             hist_kws={'edgecolor':'black'})
plt.title('Histogram of pay date')
plt.xlabel('pay date')
plt.ylabel('people count')
plt.show()
/home/c/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "

[Figure: histogram of repayment dates]

print('The shape of our features is:', df1.shape)

# Label preparation
df1['y'] = np.where((pd.isnull(df1['act_repay_dt'])) | (df1['date'] > 7), 1, 0)

illegal = df1[(pd.isnull(df1['act_repay_dt'])) | (df1['date']>7)]
print("至今未还款或者还款时间逾期的人有 %d 人,占比 %.3f" % (len(illegal), float(len(illegal)) / float(len(df1))))
columns = ['act_repay_dt', 'plan_repay_dt', 'date']
# Drop the interfering columns (first pass)
df1.drop(columns, inplace=True, axis=1)

# Drop the rows holding the 3 largest values in each column (TODO: this outlier-trimming heuristic needs improvement)
columns = df1.columns.tolist()
for col in columns:
    indexs = df1.nlargest(3, columns=[col]).index.values
    for i in indexs:
        df1.drop(i, inplace=True)

print('The shape of our features after del is:', df1.shape)
# TODO: compute pairwise correlations and drop features with very high correlation

('The shape of our features is:', (12154, 221))
People who have not repaid yet or repaid late: 1837, ratio 0.151
('The shape of our features after del is:', (11497, 219))
df1.head(3)

# Fill missing values with column means

df1 = df1.fillna(df1.mean())
x = np.array(df1.iloc[:,0:-1])
y = np.array(df1.iloc[:,-1])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=11)

# dt = DictVectorizer(sparse=False)
# x_train = dt.fit_transform(x_train.to_dict())
# x_test = dt.fit_transform(x_test.to_dict())

print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)
('Training Features Shape:', (8047, 218))
('Training Labels Shape:', (8047,))
('Testing Features Shape:', (3450, 218))
('Testing Labels Shape:', (3450,))
# # Decision tree version
# dtc = DecisionTreeClassifier()
 
# dtc.fit(x_train, y_train)
 
# dt_predict = dtc.predict(x_test)
 
# print(dtc.score(x_test, y_test))
# print(classification_report(y_test, dt_predict, target_names=["plan_repay", "overdue_repay"]))

# Random forest version
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)
# score() returns the mean accuracy on the given test data and labels.
print("Mean accuracy with mean imputation: {:.2f}".format(rfc.score(x_test, y_test)))
Mean accuracy with mean imputation: 0.86
print("The accuracy/recall rate and other results are as follows:")
print(classification_report(y_test, rfc_y_predict, target_names=["plan_repay", "overdue_repay"]))
The accuracy/recall rate and other results are as follows:
               precision    recall  f1-score   support

   plan_repay       0.87      0.99      0.92      2976
overdue_repay       0.33      0.04      0.07       474

  avg / total       0.79      0.86      0.81      3450
rfc_y_predict
array([0, 0, 0, ..., 0, 0, 0])
y_test
array([0, 0, 0, ..., 0, 0, 0])
# Feature importances
rfc.feature_importances_

Tuning

The main parameters to tune are max_features, n_estimators, and min_samples_leaf.

For reference, see BYR_jiandong's CSDN post on the important random forest parameters.

Setting up cross-validation

cv_parameter = [{'min_samples_leaf': [5, 15, 25, 35], 'n_estimators': [50, 200, 500], 'max_depth': [2, 3, 5]}]
# n_jobs sets how many jobs run in parallel
clf = GridSearchCV(estimator=rfc, param_grid=cv_parameter, cv=5, n_jobs=1)

max_depth:

integer or None, optional (default=None)
The maximum depth of the tree. If None, nodes are expanded until all leaves are pure or until all leaves contain fewer than min_samples_split samples.

from sklearn.model_selection import GridSearchCV
rfc = RandomForestClassifier(max_features = 'sqrt', random_state = 3)
cv_parameter = [{'n_estimators':[50,200,500], 'min_samples_leaf':[5,15,25,35], 'max_depth':[2, 3, 5]}]
clf = GridSearchCV(estimator=rfc,param_grid=cv_parameter, cv=5, n_jobs=1)

clf.fit(x_train, y_train)
print('Best parameters:')
print(clf.best_params_)

[Output: the best parameter combination printed by the grid search]
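Once the search finishes, the refit best model can be checked against the held-out split (a minimal sketch, reusing x_test/y_test from the split above):

# GridSearchCV refits the best parameter combination on the full training set by default.
best_rfc = clf.best_estimator_
print("Tuned accuracy: {:.2f}".format(best_rfc.score(x_test, y_test)))
print(classification_report(y_test, best_rfc.predict(x_test), target_names=["plan_repay", "overdue_repay"]))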

Setting class weights

rfc = RandomForestClassifier(random_state = 3, class_weight={0: 1, 1: 5})
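To see what the weights do to recall on the minority class, the weighted model can be fit and reported the same way (a sketch, reusing the earlier split):

rfc_weighted = RandomForestClassifier(random_state=3, class_weight={0: 1, 1: 5})
rfc_weighted.fit(x_train, y_train)
print(classification_report(y_test, rfc_weighted.predict(x_test), target_names=["plan_repay", "overdue_repay"]))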

About the classification_report results

[Output: classification_report]

The model predicted 25 positive samples, 11 of them correctly, out of 474 true positives in the test set: precision 0.44, recall 0.023.
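Those figures can be recomputed directly from the confusion matrix, since precision = TP / (TP + FP) and recall = TP / (TP + FN) (a sketch, shown with the unweighted model's predictions):

from sklearn.metrics import confusion_matrix

# e.g. 11 true positives out of 25 predicted positives gives 11/25 = 0.44 precision,
# and 11 out of 474 actual positives gives 11/474 = 0.023 recall.
tn, fp, fn, tp = confusion_matrix(y_test, rfc_y_predict).ravel()
print("precision = %.2f, recall = %.3f" % (tp / float(tp + fp), tp / float(tp + fn)))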

### Feature importance evaluation in random forests

A random forest is a tree-based ensemble: it builds many independent decision trees and aggregates their results to improve performance. Feature importance in a random forest is commonly evaluated with one of two methods, mean decrease in impurity (MDI) and mean decrease in accuracy (MDA), each with its own characteristics.

#### Mean decrease in impurity (MDI)

MDI measures the total amount of information a feature contributes to node splits across the whole forest. At every split in every tree, the impurity reduction (information gain or decrease in Gini index) achieved by the chosen splitting feature is recorded; these values are accumulated and averaged to give the feature's importance score.

The formula is:

\[
\text{Importance}(X_j) = \frac{\sum_{t=1}^{T}\sum_{s \in S_t(X_j)} \Delta i(s,t)}{T},
\]

where \( T \) is the number of trees in the forest, \( S_t(X_j) \) is the set of nodes in tree \( t \) that split on feature \( X_j \), and \( \Delta i(s,t) \) is the impurity decrease produced by that split.

The method is simple, intuitive, and easy to implement; its clear drawback is that it tends to overestimate the importance of high-cardinality features and of continuous variables that are strongly correlated with other variables.

#### Mean decrease in accuracy (MDA)

A more robust approach is permutation importance, also known as mean decrease in accuracy. First record the model's performance on the original dataset (e.g. error rate or R²); then shuffle the values of a single feature column, predict the targets again, and compare the scores. If that input really matters, scrambling it should make the overall result noticeably worse; otherwise the effect is small or negligible.

The concrete steps:

1. Apply the trained random forest to the whole test set to obtain a baseline error.
2. Replace the observed values of one chosen feature with a random permutation of themselves and predict again.
3. Record how much the error changes.
4. Repeat several times to get a stable estimate, which serves as that feature's weight.

Compared with MDI, which infers importance indirectly from internal split statistics, permutation importance is closer to the model's real behavior and easier to interpret, but the extra computation makes it somewhat less efficient at large scale.

```python
from sklearn.inspection import permutation_importance
import numpy as np

def calculate_permutation_importance(model, X_test, y_test):
    # Shuffle each feature column n_repeats times and measure the score drop.
    result = permutation_importance(
        model, X_test, y_test,
        n_repeats=10,
        random_state=42,
        n_jobs=-1
    )
    # Sort features by mean importance (ascending) and pair them with names.
    sorted_idx = result.importances_mean.argsort()
    feature_names = np.array([f'Feature {i}' for i in range(len(sorted_idx))])
    return list(zip(feature_names[sorted_idx], result.importances_mean[sorted_idx]))
```

The code above shows how to use sklearn's permutation_importance to quickly obtain a ranked list of features, which is handy for subsequent visualization work.
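A hypothetical call, reusing the rfc model and test split from earlier in this post (the generic 'Feature i' labels stand in for the real column names):

```python
# The returned list is sorted ascending by importance, so the tail holds the
# most important features.
perm_ranking = calculate_permutation_importance(rfc, x_test, y_test)
print(perm_ranking[-10:])
```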