import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
INPUT_DIR = "../child-mind-institute-problematic-internet-use/"
df = pd.read_csv(INPUT_DIR + "train.csv")
df_test = pd.read_csv(INPUT_DIR + "test.csv")
df = df.dropna(subset=["sii"]).reset_index(drop=True)  # drop rows with a missing target "sii" and reset the index
drop_cols = list(set(df.columns) - set(df_test.columns))  # columns present in train.csv but not in test.csv
X = df.drop(["id"] + drop_cols, axis=1)
y = df["sii"].astype(int)
cat_cols = X.columns[X.dtypes == "object"].tolist()
if cat_cols:
    # LightGBM handles pandas categorical columns natively
    X[cat_cols] = X[cat_cols].astype("category")
#%%
a = y.mean()       # target mean, used by the custom objective below
b = y.var(ddof=0)  # target variance (population), used by the custom objective below
y_min = y.min()
y_max = y.max()
#%%
# Custom evaluation function (LightGBM feval): returns (name, value, is_higher_better)
def quadratic_weighted_kappa(preds, data):
    y_true = data.get_label()
    # clip predictions to the target range and round to the nearest integer class
    y_pred = preds.clip(y_min, y_max).round()
    # "quadratic" selects quadratic weighting
    qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    return "QWK", qwk, True
# Custom objective: a differentiable surrogate of QWK.
# It minimises N * f / g with f = 1/2 * sum((preds - labels)^2) (observed disagreement)
# and g = 1/2 * sum((preds - a)^2 + b) (expected disagreement), i.e. it maximises 1 - f/g.
def qwk_obj(preds, dtrain):
    labels = dtrain.get_label()
    preds = preds.clip(y_min, y_max)
    f = 1/2 * np.sum((preds - labels)**2)
    g = 1/2 * np.sum((preds - a)**2 + b)  # a = y.mean(), b = y.var(ddof=0)
    df = preds - labels  # element-wise df/dpreds
    dg = preds - a       # element-wise dg/dpreds
    grad = (df/g - f*dg/g**2) * len(labels)  # gradient of N * f / g
    hess = np.ones(len(labels))              # constant Hessian approximation
    return grad, hess
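#%%
# A minimal sanity check (added sketch, not part of the original pipeline): verify the
# analytic gradient returned by qwk_obj against a central finite difference of the loss
# N * f / g. `_LabelStub` and `_soft_kappa_loss` are hypothetical helpers introduced only
# for this check; `_LabelStub` mimics the get_label() method of a LightGBM Dataset.
class _LabelStub:
    def __init__(self, label):
        self._label = np.asarray(label, dtype=float)
    def get_label(self):
        return self._label

def _soft_kappa_loss(preds, labels):
    f_ = 1/2 * np.sum((preds - labels)**2)
    g_ = 1/2 * np.sum((preds - a)**2 + b)
    return len(labels) * f_ / g_

rng = np.random.default_rng(0)
labels_chk = rng.integers(int(y_min), int(y_max) + 1, size=20).astype(float)
preds_chk = rng.uniform(y_min + 0.1, y_max - 0.1, size=20)  # stay away from the clip bounds
grad_chk, _ = qwk_obj(preds_chk, _LabelStub(labels_chk))
eps = 1e-6
grad_fd = np.array([
    (_soft_kappa_loss(preds_chk + eps * np.eye(len(preds_chk))[i], labels_chk)
     - _soft_kappa_loss(preds_chk - eps * np.eye(len(preds_chk))[i], labels_chk)) / (2 * eps)
    for i in range(len(preds_chk))
])
print("max abs gradient error:", np.max(np.abs(grad_chk - grad_fd)))  # should be tiny (~1e-6 or smaller)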
#%%
params = {
    "objective": qwk_obj,   # custom objective, passed as a callable in params
    "metric": "None",       # disable built-in metrics; QWK is supplied via feval below
    "verbosity": -1,
    "learning_rate": 0.01,
    "num_leaves": 16,
    "feature_fraction": 0.5,
}
init_score = 2.0  # constant base score; booster.predict() does not include it, so it is added back below
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
folds = [(idx_train, idx_valid) for idx_train, idx_valid in skf.split(X, y)]
models = lgb.cv(
    params=params,
    train_set=lgb.Dataset(X, y, init_score=[init_score]*len(X)),
    num_boost_round=10000,
    folds=folds,
    feval=quadratic_weighted_kappa,  # custom evaluation function
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(100),
    ],
    return_cvbooster=True,
)["cvbooster"].boosters
preds_oof = np.zeros(len(df))
for model, (idx_train, idx_valid) in zip(models, folds):
    # booster.predict() does not include the Dataset init_score, so add it back
    preds_oof[idx_valid] = model.predict(X.iloc[idx_valid]) + init_score
preds_oof = preds_oof.clip(y_min, y_max).round()
qwk = cohen_kappa_score(y, preds_oof, weights="quadratic")
print("QWK:", qwk)
Extended Cohen Kappa score calculation
def extended_cohen_kappa_score(labels, preds):
    f = np.sum((preds - labels)**2)
    g = np.sum((preds - a)**2 + b)
    return 1 - f / g
extended_cohen_kappa_score(y, preds_oof)
This function computes QWK in closed form: for the rounded integer predictions, f is the observed quadratically weighted disagreement, and g equals the expected disagreement under independent marginals (n times the prediction variance, plus the squared mean shift, plus the label variance), so 1 - f/g returns the same value as cohen_kappa_score with weights="quadratic". It is also the quantity that the custom objective qwk_obj maximises, since qwk_obj minimises f/g.
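A quick numerical check of that equivalence (added for illustration; it assumes the integer classes 0..3 all appear in y, so that the quadratic weights coincide with squared label differences):
print("closed-form score:", extended_cohen_kappa_score(y, preds_oof))
print("sklearn QWK      :", cohen_kappa_score(y, preds_oof, weights="quadratic"))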