01 集成学习 - 概述、Bagging - 随机森林、袋外错误率
02 集成学习 - 特征重要度、Extra Tree、TRTE、IForest、随机森林总结
03 集成学习 - Boosting - AdaBoost算法原理
04 集成学习 - Boosting - AdaBoost算法构建
05 集成学习 - Boosting - GBDT初探
06 集成学习 - Boosting - GBDT算法原理、总结
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
# 引入了集成学习的随机森林库
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import label_binarize
from sklearn import metrics
# Plot configuration: select the SimHei font (presumably so Chinese labels
# render) and disable the Unicode minus glyph on axes.
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# Column names used to select from the CSV; the last four entries are the
# prediction targets, everything before them is a feature.
# NOTE(review): presumably mirrors df.columns of the data file - confirm.
names = [
    'Age',
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes',
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives',
    'Hormonal Contraceptives (years)',
    'IUD',
    'IUD (years)',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:HIV',
    'STDs:Hepatitis B',
    'STDs:HPV',
    'STDs: Number of diagnosis',
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
    'Dx:Cancer',
    'Dx:CIN',
    'Dx:HPV',
    'Dx',
    'Hinselmann',
    'Schiller',
    'Citology',
    'Biopsy',
]

# Location of the data file.
path = "datas/risk_factors_cervical_cancer.csv"
data = pd.read_csv(path)

# Split the frame column-wise: features first, then the four target columns.
# A random forest can handle several target variables at once.
X = data[names[:-4]]
Y = data[names[-4:]]

# Preview the first five rows of each frame (only displayed in an
# interactive session; as plain statements these produce no output).
X.head(n=5)
Y.head(n=5)
这个案例中需要预测的目标变量有四个:Hinselmann、Schiller、Citology、Biopsy。随机森林模型的一个特点是它可以同时预测多个属性。