XGBoost tree

本文介绍了如何使用Python的XGBoost库处理离散型特征,包括One-Hot编码和LabelEncoding,同时讨论了这种编码方法对XGBoost适用性的局限性。作者还提供了另一种使用sklearn预处理离散特征的解决方案,并给出了一个实际案例,展示了97%的预测准确率。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

1. Python有已经封装好的XGBoost库,直接pip install就可以调用,示例源代码如下:

from xgboost import XGBClassifier
from numpy import loadtxt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import plot_tree
import matplotlib.pyplot as plt

# Load the Pima Indians diabetes data: columns 0-7 are features, column 8 is the label.
dataset = loadtxt('pima-indians-diabetes.data', delimiter='\t')
X, Y = dataset[:, :8], dataset[:, 8]

# Hold out 90% of the rows for testing (seed fixed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.9, random_state=22)

# Fit a default-parameter XGBoost classifier on the small training split.
model1 = XGBClassifier()
model1.fit(X_train, y_train)

# plot_tree(model1)
# plt.show()

# predict() returns class labels; round() keeps them as clean 0/1 ints.
predictions = [round(p) for p in model1.predict(X_test)]
accuracy = accuracy_score(y_test, predictions)
print("accuracy: ")
print(accuracy)

2. XGBoost 以及所有的Boost tree只能处理数值型特征,无法处理枚举离散型特征。

离散型特征可以使用下面的方法转成数值型特征。详情见 https://blog.youkuaiyun.com/m0_37870649/article/details/104550054

总结起来,有两个方法:label encoding 和 one-hot encoding

下面是按照one-hot encoding的方式进行编码


def OneHot_DB(data_file='car.data'):
    """Load the UCI car-evaluation dataset and one-hot encode every column.

    Each categorical column is first label-encoded, then one-hot encoded,
    exactly as the original seven copy-pasted stanzas did — the duplication
    is collapsed into a single loop over the columns.

    Args:
        data_file: path to the comma-separated car data file
                   (default 'car.data', matching the original hard-coded path).

    Returns:
        (digit_data_frame, digit_classifier_frame): one-hot feature columns
        for the six input attributes, and one-hot columns for the class label.
    """
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    import pandas as pd

    columns = ['buying', 'maint', 'doors', 'persons', 'boost', 'safety', 'classifier']
    raw_data_car = np.loadtxt(
        data_file, delimiter=',',
        dtype={'names': tuple(columns),
               'formats': ('S5', 'S5', 'S5', 'S4', 'S5', 'S5', 'S5')})
    print(raw_data_car)
    df = pd.DataFrame(raw_data_car)
    df.columns = columns

    # Output-column prefixes preserved byte-for-byte from the original code
    # (note the original's mixed casing: "Buying_" vs. "MAINT_", etc.).
    prefixes = {'buying': 'Buying', 'maint': 'MAINT', 'doors': 'DOORS',
                'persons': 'PERSONS', 'boost': 'BOOST', 'safety': 'SAFETY',
                'classifier': 'CLASSIFIER'}

    def _one_hot(col):
        # LabelEncoder maps the raw byte-string categories to integer codes;
        # OneHotEncoder then expands those codes into indicator columns.
        codes = LabelEncoder().fit_transform(df[col])
        dense = OneHotEncoder().fit_transform(codes.reshape(-1, 1)).toarray()
        names = ["{}_{}".format(prefixes[col], i) for i in range(dense.shape[1])]
        return pd.DataFrame(dense, columns=names)

    # Features: all columns except the last; label: the 'classifier' column.
    digit_data_frame = pd.concat([_one_hot(c) for c in columns[:-1]], axis=1)
    digit_classifier_frame = _one_hot('classifier')

    return digit_data_frame, digit_classifier_frame

# Demo: build the one-hot encoded feature frame and label frame, then inspect them.
digit_data_frame, digit_classifier_frame = OneHot_DB()
print(digit_data_frame)
print(digit_classifier_frame)

3. 这样会产生两个问题:

(1)One-Hot编码方式会产生很大的稀疏矩阵,一般通过PCA进行降维 

(2)XGBoost tree只适用于类别是一列的问题,对离散型类别进行One-Hot编码后,类别就出现了多列,无法直接使用XGBoost tree

4. 鉴于One-Hot编码的这两个缺点,可以采用sklearn中提供的preprocessing方法进行离散/类别值处理成数值。如下

def load_by_pandas(sample_file, splitter, columns_name):
    """Load a headerless delimited sample file into a DataFrame.

    Bug fix: the original called read_csv without header=None, so pandas
    consumed the first DATA row as the header; the subsequent
    `df.columns = columns_name` then renamed the columns, silently
    dropping one sample. Passing header=None with names= keeps every row.

    Args:
        sample_file: path to the delimited data file (no header row).
        splitter: field separator passed to read_csv (e.g. ',').
        columns_name: list of column labels to assign.

    Returns:
        DataFrame with the given column names and ALL data rows.
    """
    import pandas as pd

    df = pd.read_csv(sample_file, sep=splitter, header=None, names=columns_name)
    print("number of raw samples: {}".format(len(df)))

    return df


def feature_preprocess(df, target_col):
    """Label-encode object-typed columns and the target column(s), in place.

    Every column whose dtype is 'object' (string/categorical) — plus any
    column listed in target_col — is replaced by integer codes so that
    XGBoost can consume it as a numeric feature.

    Idiom fix: the original called lbl.fit(list(...)) followed by
    lbl.transform(list(...)); a single fit_transform does the same work
    without the redundant list copies.

    Args:
        df: DataFrame to encode (modified in place and also returned).
        target_col: iterable of target column names to always encode.

    Returns:
        The same DataFrame, with categorical columns encoded and
        missing values filled with -999.
    """
    from sklearn import preprocessing
    for f in df.columns:
        if df[f].dtype == 'object' or f in target_col:
            df[f] = preprocessing.LabelEncoder().fit_transform(df[f].values)
    df.fillna(-999, inplace=True)
    return df


# Press the green button in the gutter to run the script.
if __name__ == '__main__':

    columns_name = ['buying', 'maint', 'doors', 'persons', 'boost', 'safety', 'classifier']
    raw_data = load_by_pandas('car.data', ',', columns_name)

    target_column = ['classifier']
    # Features are every column except the trailing 'classifier' label.
    data_columns = columns_name[:-1]

    df = feature_preprocess(raw_data, target_column)
    print(df)
    # Bug fix: the original called XGB_train_model(df), which is not defined
    # anywhere — the trainer in this file is XGB_train(df, features, target).
    XGB_train(df, data_columns, target_column)

附完整代码,预测准确率达到97%


def load_by_pandas(sample_file, splitter, columns_name):
    """Load a headerless delimited sample file into a DataFrame.

    Bug fix: the original called read_csv without header=None, so pandas
    consumed the first DATA row as the header; the subsequent
    `df.columns = columns_name` then renamed the columns, silently
    dropping one sample. Passing header=None with names= keeps every row.

    Args:
        sample_file: path to the delimited data file (no header row).
        splitter: field separator passed to read_csv (e.g. ',').
        columns_name: list of column labels to assign.

    Returns:
        DataFrame with the given column names and ALL data rows.
    """
    import pandas as pd

    df = pd.read_csv(sample_file, sep=splitter, header=None, names=columns_name)
    print("number of raw samples: {}".format(len(df)))

    return df


def feature_preprocess(df, target_col):
    """Label-encode object-typed columns and the target column(s), in place.

    Every column whose dtype is 'object' (string/categorical) — plus any
    column listed in target_col — is replaced by integer codes so that
    XGBoost can consume it as a numeric feature.

    Idiom fix: the original called lbl.fit(list(...)) followed by
    lbl.transform(list(...)); a single fit_transform does the same work
    without the redundant list copies.

    Args:
        df: DataFrame to encode (modified in place and also returned).
        target_col: iterable of target column names to always encode.

    Returns:
        The same DataFrame, with categorical columns encoded and
        missing values filled with -999.
    """
    from sklearn import preprocessing
    for f in df.columns:
        if df[f].dtype == 'object' or f in target_col:
            df[f] = preprocessing.LabelEncoder().fit_transform(df[f].values)
    df.fillna(-999, inplace=True)
    return df

def XGB_train(dataset, data_cols_name, target_col_name, test_size=0.33):
    """Train an XGBoost classifier, plot one tree, and report accuracy.

    Args:
        dataset: numerically-encoded DataFrame (see feature_preprocess).
        data_cols_name: list of feature column names.
        target_col_name: list containing the target column name.
        test_size: fraction of rows held out for evaluation (default 0.33).

    Returns:
        The test-set accuracy as a float (previously computed but discarded;
        returning it is backward compatible since the original returned None).
    """
    from sklearn.model_selection import train_test_split
    import matplotlib.pyplot as plt
    from xgboost import XGBClassifier, plot_tree
    from sklearn.metrics import accuracy_score

    X = dataset[data_cols_name]
    Y = dataset[target_col_name]

    # Fixed seed keeps the split — and hence the reported accuracy — reproducible.
    seed = 123456
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
    model = XGBClassifier()
    model.fit(X_train, y_train)

    # Visualize the first boosted tree (blocks until the window is closed).
    plot_tree(model)
    plt.show()

    # predict() yields class labels; round() normalizes them to clean ints.
    y_pre = model.predict(X_test)
    predictions = [round(value) for value in y_pre]
    accuracy = accuracy_score(y_test, predictions)
    print("accuracy: ")
    print(accuracy)
    return accuracy


# Entry point: load the raw car-evaluation data, label-encode the
# categorical columns, and train/evaluate the XGBoost classifier.
if __name__ == '__main__':

    columns_name = ['buying', 'maint', 'doors', 'persons', 'boost', 'safety', 'classifier']
    raw_data = load_by_pandas('car.data', ',', columns_name)

    # The last column is the class label; everything before it is a feature.
    target_column = ['classifier']
    data_columns = columns_name[:-1]

    encoded = feature_preprocess(raw_data, target_column)
    XGB_train(encoded, data_columns, target_column)

生成的boost tree如下

 

参考:https://www.jianshu.com/p/e76ff469183d

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值