一、Doing Basic classifications with Decision Trees
1.创建数据集(make classification datasets)
# Synthetic classification data: 1000 samples with 3 features, none of them
# redundant.
from sklearn import datasets

X, y = datasets.make_classification(
    n_samples=1000,
    n_features=3,
    n_redundant=0,
)
2. Import the object and then fit the model
from sklearn.tree import DecisionTreeClassifier

# Fit a tree with every hyper-parameter left at its default.
dt = DecisionTreeClassifier().fit(X, y)
# Recorded output of the fit call in the original session:
# Out[47]:
# DecisionTreeClassifier(class_weight=None, criterion='gini',
# max_depth=None,max_features=None, max_leaf_nodes=None,
# min_samples_leaf=1,min_samples_split=2,
# min_weight_fraction_leaf=0.0,presort=False, random_state=None,
# splitter='best')

# Predictions are made on the same samples the tree was fitted on, so the
# value below is training accuracy, not a held-out estimate.
preds = dt.predict(X)
(preds == y).mean()
# Out[49]: 1.0
3. Look at the accuracy for different max_depth values (on a dataset with n_features = 200)
n_features = 200
X, y = datasets.make_classification(750, n_features, n_informative=5)

import numpy as np

# Random boolean mask: each row goes to the training set with probability
# 0.75; the rows where the mask is False are held out for scoring.
training = np.random.choice([True, False], p=[.75, .25], size=len(y))

# Held-out accuracy for every max_depth from 1 up to n_features.
# The list has to exist before the loop appends to it.
depths = range(1, n_features + 1)
accuracies = []
for depth in depths:
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X[training], y[training])
    preds = dt.predict(X[~training])
    accuracies.append((preds == y[~training]).mean())

import matplotlib.pyplot as plt

f, ax = plt.subplots(figsize=(7, 5))
ax.plot(depths, accuracies, color='k')
ax.set_title("Decision Tree Accuracy")
ax.set_ylabel("% Correct")
ax.set_xlabel("Max Depth")
From the above graph, we can see that we can actually get pretty good accuracy at a low max depth. Let’s take a closer look at the accuracy at low levels, say the first 15:
# Zoom in on the first N max_depth values of the accuracy curve.
N = 15
import matplotlib.pyplot as plt

f, ax = plt.subplots(figsize=(7, 5))
# Axes objects draw lines with plot(); they have no subplot() method.
ax.plot(range(1, n_features + 1)[:N], accuracies[:N], color='k')
ax.set_title("Decision Tree Accuracy")
ax.set_ylabel("% Correct")
ax.set_xlabel("Max Depth")
4. Tuning a Decision Tree Model
from io import StringIO

import pydotplus
from sklearn import datasets, tree

# 1000 samples, 20 features, 3 of them informative.
X, y = datasets.make_classification(1000, 20, n_informative=3)
dt = tree.DecisionTreeClassifier()
dt.fit(X, y)

# Dump the fitted tree as Graphviz DOT text into an in-memory buffer, then
# let pydotplus render that text to image files.
str_buffer = StringIO()
tree.export_graphviz(dt, out_file=str_buffer)
dot_source = str_buffer.getvalue()
graph = pydotplus.graph_from_dot_data(dot_source)
graph.write_jpeg("myfile.jpg")
graph.write_pdf("myfile.pdf")
写成函数:
dt = tree.DecisionTreeClassifier(max_depth=5).fit(X, y)


def plot_dt(model, filename):
    """Render a fitted decision tree to an image file.

    The tree is exported as Graphviz DOT text and rendered with pydotplus.
    The output format is taken from the extension of ``filename`` so the
    file content matches its name (e.g. "myfile.png" is written as PNG).
    A filename without an extension is written as JPEG.

    Args:
        model: Fitted tree estimator accepted by ``tree.export_graphviz``.
        filename: Path of the image file to write.
    """
    import os

    str_buffer = StringIO()
    tree.export_graphviz(model, out_file=str_buffer)
    graph = pydotplus.graph_from_dot_data(str_buffer.getvalue())
    # Always calling write_jpg would store JPEG data under whatever name
    # the caller passed, including "*.png".
    image_format = os.path.splitext(filename)[1].lstrip(".").lower() or "jpg"
    graph.write(filename, format=image_format)


plot_dt(dt, "myfile.png")
5. Using many Decision Trees – random forests
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

X, y = datasets.make_classification(1000)
rf = RandomForestClassifier()
rf.fit(X, y)

# To see how well we fit the training data: one boolean per sample, True
# where the forest's prediction equals the label.
train_hits = rf.predict(X) == y
print("Accuracy:\t", train_hits.mean())
print("Total Correct:\t", train_hits.sum())
# Recorded output of the original session:
#Accuracy: 0.996
#Total Correct: 996
预测分类概率
import matplotlib.pyplot as plt
import pandas as pd

# Per-sample class probabilities from the forest, one column per class.
probs = rf.predict_proba(X)
probs_df = pd.DataFrame(probs, columns=['0', '1'])
probs_df['was_correct'] = rf.predict(X) == y

# Group the samples by their class-0 probability and plot the share of
# correct predictions inside each group.
accuracy_by_prob = probs_df.groupby('0')['was_correct'].mean()

f, ax = plt.subplots(figsize=(7, 5))
accuracy_by_prob.plot(kind='bar', ax=ax)
ax.set_title("Accuracy at 0 class probability")
ax.set_ylabel("% Correct")
ax.set_xlabel("% trees for 0")
查看字段重要性分布
# Refit a default forest and draw one bar per feature importance.
rf = RandomForestClassifier()
rf.fit(X, y)

importances = rf.feature_importances_
positions = range(len(importances))

f, ax = plt.subplots(figsize=(7, 5))
ax.bar(positions, importances)
ax.set_title("Feature Importances")
6.Tuning a random forest model
# Choices for max_features -- the maximum number of features considered per split (最大特征数量)
from sklearn.metrics import confusion_matrix

# The train/test mask has to have one entry per row of the current X, y.
# Rebuild it when its length does not match, otherwise the boolean
# indexing below fails.
if len(training) != len(y):
    training = np.random.choice([True, False], p=[.75, .25], size=len(y))

# 'auto' is intentionally not in the list: for classifiers it was an alias
# of 'sqrt', and current scikit-learn releases reject it.
max_feature_params = ['sqrt', 'log2', .01, .5, .99]
confusion_matrixes = {}
for max_feature in max_feature_params:
    rf = RandomForestClassifier(max_features=max_feature)
    rf.fit(X[training], y[training])
    y_pred = rf.predict(X[~training])
    # ravel() flattens the matrix row by row; confusion_matrix puts the
    # actual class on the rows and the predicted class on the columns.
    confusion_matrixes[max_feature] = confusion_matrix(y[~training], y_pred).ravel()

import pandas as pd

# One column per max_features setting, one row per confusion-matrix cell.
confusion_df = pd.DataFrame(confusion_matrixes)

import itertools
from matplotlib import pyplot as plt

f, ax = plt.subplots(figsize=(7, 5))
confusion_df.plot(kind='bar', ax=ax)
ax.legend(loc='best')
# Labels follow the (actual, predicted) order of the flattened matrix.
ax.set_title("Actual vs Guessed (i,j) where i is the actual and j is the guess")
ax.grid()
ax.set_xticklabels([str((i, j)) for i, j in itertools.product(range(2), range(2))])
ax.set_xlabel("Actual vs Guessed")
ax.set_ylabel("Count")
# Choices of n_estimators -- The number of the trees in the forest
def accuracy(x):
    """Return the accuracy encoded in confusion matrix *x*.

    The result is the sum of the diagonal of *x* (its trace) divided by
    the sum of all its entries, computed as a float.

    Args:
        x: Square 2-D array-like of counts, e.g. a confusion matrix.

    Returns:
        The trace of *x* divided by the total of *x*.
    """
    return np.trace(x) / np.sum(x, dtype=float)
# The train/test mask has to have one entry per row of the current X, y.
# Rebuild it when its length does not match, otherwise the boolean
# indexing below fails.
if len(training) != len(y):
    training = np.random.choice([True, False], p=[.75, .25], size=len(y))

# Held-out accuracy for forests of 1 to 19 trees.
n_estimator_params = range(1, 20)
accuracy_by_n_estimators = {}
for n_estimator in n_estimator_params:
    rf = RandomForestClassifier(n_estimators=n_estimator)
    rf.fit(X[training], y[training])
    matrix = confusion_matrix(y[~training], rf.predict(X[~training]))
    accuracy_by_n_estimators[n_estimator] = accuracy(matrix)

accuracy_series = pd.Series(accuracy_by_n_estimators)

from matplotlib import pyplot as plt

f, ax = plt.subplots(figsize=(7, 5))
accuracy_series.plot(kind='bar', ax=ax, color='k', alpha=.75)
ax.grid()
ax.set_title("Accuracy by Number of Estimators")
ax.set_ylim(0, 1)
ax.set_ylabel("Accuracy")
ax.set_xlabel("Number of estimators")
plt.show()