1. Python有已经封装好的XGBoost库,直接pip install就可以安装调用,参考的源代码如下:
from xgboost import XGBClassifier
from numpy import loadtxt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import plot_tree
import matplotlib.pyplot as plt
# Load the Pima Indians diabetes data: 8 feature columns followed by 1 label column.
data = loadtxt('pima-indians-diabetes.data', delimiter='\t')
features = data[:, 0:8]
labels = data[:, 8]
# Hold out 90% of the samples for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.9, random_state=22)
model1 = XGBClassifier()
model1.fit(X_train, y_train)
# plot_tree(model1)
# plt.show()
# Round the raw predictions to hard class labels before scoring.
rounded_preds = [round(p) for p in model1.predict(X_test)]
accuracy = accuracy_score(y_test, rounded_preds)
print("accuracy: ")
print(accuracy)
2. XGBoost 以及所有的Boost tree只能处理数值型特征,无法处理枚举离散型特征。
离散型特征可以使用下面的方法转成数值型特征。详情见 https://blog.youkuaiyun.com/m0_37870649/article/details/104550054
总结起来,有两个方法:label encoding 和 one-hot encoding
下面是按照one-hot encoding的方式进行编码
def OneHot_DB():
    """Load car.data and one-hot encode every column.

    Each categorical column is first label-encoded, then one-hot encoded.
    Returns a pair of DataFrames: (feature columns one-hot encoded,
    classifier column one-hot encoded).
    """
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    raw_data_car = np.loadtxt(
        'car.data', delimiter=',',
        dtype={'names': ('buying', 'maint', 'doors', 'persons', 'boost',
                         'safety', 'classifier'),
               'formats': ('S5', 'S5', 'S5', 'S4', 'S5', 'S5', 'S5')})
    print(raw_data_car)
    df = pd.DataFrame(raw_data_car)
    df.columns = ['buying', 'maint', 'doors', 'persons', 'boost',
                  'safety', 'classifier']

    # One-hot column-name prefixes, reproduced exactly as originally written
    # (note the mixed casing of "Buying_" vs the upper-cased rest).
    prefixes = {'buying': 'Buying_', 'maint': 'MAINT_', 'doors': 'DOORS_',
                'persons': 'PERSONS_', 'boost': 'BOOST_', 'safety': 'SAFETY_',
                'classifier': 'CLASSIFIER_'}

    digit_data_frame = pd.DataFrame()
    digit_classifier_frame = pd.DataFrame()
    for column, prefix in prefixes.items():
        # Label-encode the raw byte strings to integer codes.
        codes = LabelEncoder().fit_transform(df[column])
        df[column + '_encoder'] = codes
        # Expand the integer codes to a dense one-hot matrix.
        onehot = OneHotEncoder().fit_transform(codes.reshape(-1, 1)).toarray()
        part = pd.DataFrame(
            onehot,
            columns=[prefix + str(i) for i in range(onehot.shape[1])])
        # The class label goes into its own frame; features accumulate together.
        if column == 'classifier':
            digit_classifier_frame = pd.concat(
                [digit_classifier_frame, part], axis=1)
        else:
            digit_data_frame = pd.concat([digit_data_frame, part], axis=1)
    return digit_data_frame, digit_classifier_frame
# Build the one-hot encoded feature/label frames and display them.
digit_data_frame, digit_classifier_frame = OneHot_DB()
for frame in (digit_data_frame, digit_classifier_frame):
    print(frame)
3. 这样会产生两个问题:
(1)One-Hot编码方式会产生很大的稀疏矩阵,一般通过PCA进行降维
(2)XGBoost tree只适用于类别是一列的问题,对离散型类别进行One-Hot编码后,类别就出现了多列,无法使用XGBoost tree
4. 鉴于One-Hot编码的这两个缺点,可以采用sklearn中提供的preprocessing方法进行离散/类别值处理成数值。如下
def load_by_pandas(sample_file, splitter, columns_name):
    """Load a headerless delimited sample file into a pandas DataFrame.

    Parameters:
        sample_file: path to the delimited text file (no header row, e.g. car.data).
        splitter: field separator passed to pandas.
        columns_name: list of column names to assign to the frame.

    Returns the loaded DataFrame with ``columns_name`` as its columns.
    """
    import pandas as pd
    # Fix: header=None is required for a headerless file. The previous code
    # used read_csv's default header='infer', which consumed the first data
    # row as a header (then renamed it via df.columns), silently dropping
    # one sample from the data set.
    df = pd.read_csv(sample_file, sep=splitter, header=None, names=columns_name)
    print("number of raw samples: {}".format(len(df)))
    return df
def feature_preprocess(df, target_col):
    """Label-encode categorical columns of ``df`` in place.

    Every column whose dtype is ``object`` — plus every column listed in
    ``target_col`` — is replaced by its LabelEncoder integer codes.
    Remaining NaN cells are filled with -999. Mutates ``df`` and returns it.
    """
    from sklearn import preprocessing
    for column in df.columns:
        # Skip numeric, non-target columns: they are already model-ready.
        if not (df[column].dtype == 'object' or column in target_col):
            continue
        encoder = preprocessing.LabelEncoder()
        df[column] = encoder.fit_transform(list(df[column].values))
    df.fillna(-999, inplace=True)
    return df
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # digit_data_frame, digit_classifier_frame = OneHot_DB()
    # print(digit_data_frame)
    # print(digit_classifier_frame)
    columns_name = ['buying', 'maint', 'doors', 'persons', 'boost', 'safety', 'classifier']
    raw_data = load_by_pandas('car.data', ',', columns_name)
    target_column = ['classifier']
    # All columns except the last ('classifier') are features.
    data_columns = columns_name[:-1]
    df = feature_preprocess(raw_data, target_column)
    print(df)
    # Fixed: the original called XGB_train_model(df), a name defined nowhere
    # in this file (NameError). The actual training entry point is
    # XGB_train(dataset, data_cols_name, target_col_name).
    XGB_train(df, data_columns, target_column)
附完整代码,预测准确率达到97%
def load_by_pandas(sample_file, splitter, columns_name):
    """Load a headerless delimited sample file into a pandas DataFrame.

    Parameters:
        sample_file: path to the delimited text file (no header row, e.g. car.data).
        splitter: field separator passed to pandas.
        columns_name: list of column names to assign to the frame.

    Returns the loaded DataFrame with ``columns_name`` as its columns.
    """
    import pandas as pd
    # Fix: header=None is required for a headerless file. The previous code
    # used read_csv's default header='infer', which consumed the first data
    # row as a header (then renamed it via df.columns), silently dropping
    # one sample from the data set.
    df = pd.read_csv(sample_file, sep=splitter, header=None, names=columns_name)
    print("number of raw samples: {}".format(len(df)))
    return df
def feature_preprocess(df, target_col):
    """Convert categorical columns of ``df`` to integer codes, in place.

    A column is encoded when its dtype is ``object`` or its name appears in
    ``target_col``; each such column is replaced by LabelEncoder codes.
    Any remaining NaN cells become -999. Returns the mutated ``df``.
    """
    from sklearn import preprocessing
    categorical = [c for c in df.columns
                   if df[c].dtype == 'object' or c in target_col]
    for c in categorical:
        le = preprocessing.LabelEncoder()
        df[c] = le.fit_transform(list(df[c].values))
    df.fillna(-999, inplace=True)
    return df
def XGB_train(dataset, data_cols_name, target_col_name, test_size=0.33):
    """Train an XGBoost classifier on ``dataset``, plot one boosted tree,
    and print the hold-out accuracy.

    Parameters:
        dataset: DataFrame of already numeric-encoded samples.
        data_cols_name: list of feature column names.
        target_col_name: list holding the label column name.
        test_size: fraction of samples held out for evaluation.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier, plot_tree

    n_rows, n_cols = dataset.shape
    print(n_cols)
    features = dataset[data_cols_name]
    targets = dataset[target_col_name]
    # Fixed seed keeps the train/test split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=test_size, random_state=123456)
    model = XGBClassifier()
    model.fit(X_train, y_train)
    # Visualize the first boosted tree (blocks until the window is closed).
    plot_tree(model)
    plt.show()
    rounded_preds = [round(v) for v in model.predict(X_test)]
    accuracy = accuracy_score(y_test, rounded_preds)
    print("accuracy: ")
    print(accuracy)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    all_columns = ['buying', 'maint', 'doors', 'persons', 'boost', 'safety', 'classifier']
    # The last column is the class label; everything before it is a feature.
    feature_columns = all_columns[:-1]
    label_columns = all_columns[-1:]
    samples = load_by_pandas('car.data', ',', all_columns)
    encoded = feature_preprocess(samples, label_columns)
    XGB_train(encoded, feature_columns, label_columns)
生成的boost tree如下