#导入需要的模块
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from heamy.pipeline import ModelsPipeline
from heamy.dataset import Dataset
from heamy.estimator import Classifier
from mlxtend.classifier import StackingClassifier
from sklearn import tree
# Default decision-tree instance. It is not referenced again in the visible
# part of this script; kept so any later code that uses `clf` still works.
clf = tree.DecisionTreeClassifier()
from sklearn.linear_model import LogisticRegression
import warnings
# Silence every warning for the rest of the run (this includes warnings
# emitted by pandas and scikit-learn).
# Fixed: the original call delimited the string with typographic quotes,
# which Python rejects as a SyntaxError.
warnings.filterwarnings('ignore')
# Load the data set and keep only the first 10,000 rows.
# Fixed: all string literals in this section used typographic quotes, which
# Python rejects as a SyntaxError; they are plain ASCII quotes now.
data = pd.read_csv("train.csv")
# .copy() makes `train` an independent frame, so the column assignments below
# modify it directly instead of writing into a slice of `data`.
train = data.iloc[0:10000, :].copy()

# --- Convert text columns to datetime / numeric values ---
train['issueDate'] = pd.to_datetime(train['issueDate'], format='%Y-%m-%d')

# Ordinal codes for the letter grades. They are spaced 6 apart so that the
# five sub-grades of each letter fit in between (A=1, A1..A5=2..6, B=7, ...).
grade_codes = {'A': 1, 'B': 7, 'C': 13, 'D': 19, 'E': 25, 'F': 31, 'G': 37}
train['grade'] = train['grade'].map(grade_codes)

employment_years = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
}
train['employmentLength'] = train['employmentLength'].map(employment_years)

# Sub-grade code = code of its letter grade + sub-grade level (1..5).
# This reproduces the original hand-written 35-entry table exactly
# (e.g. 'A1' -> 2, 'E2' -> 27, 'G5' -> 42).
subgrade_codes = {
    letter + str(level): base + level
    for letter, base in grade_codes.items()
    for level in range(1, 6)
}
train['subGrade'] = train['subGrade'].map(subgrade_codes)

# Keep the last four characters as an integer (presumably the year part of
# the credit-line date -- confirm against the data).
train['earliesCreditLine'] = train['earliesCreditLine'].apply(lambda s: int(s[-4:]))

# Fill missing values with the per-column median.
# numeric_only=True restricts the medians to numeric columns, because recent
# pandas versions no longer skip non-numeric columns silently. The only
# non-numeric column converted above (issueDate) is dropped right below, so
# leaving it unfilled does not affect the result.
train = train.fillna(train.median(numeric_only=True))

# Drop the zero-variance feature (policyCode, per the original author's
# note), the row id and the date column.
train = train.drop(["policyCode", "id", "issueDate"], axis=1)
# Split into features / target and hold out a test set.
# Fixed: all string literals in this section used typographic quotes, which
# Python rejects as a SyntaxError; they are plain ASCII quotes now.
x = train.drop(["isDefault"], axis=1)
y = train[["isDefault"]]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2021)

########## Serial (sequential) models
# `y_train` is a single-column DataFrame; it is flattened to a 1-D array for
# fit(), which is the target shape scikit-learn classifiers expect.

# Baseline 1: shallow decision tree.
tree_model = tree.DecisionTreeClassifier(max_depth=3).fit(x_train, y_train.values.ravel())
print("决策树auc:", roc_auc_score(y_test, tree_model.predict_proba(x_test)[:, -1]))

# Baseline 2: logistic regression on the same features.
luoji_model = LogisticRegression().fit(x_train, y_train.values.ravel())
print("逻辑回归auc:", roc_auc_score(y_test, luoji_model.predict_proba(x_test)[:, -1]))
# Add the decision tree's output to the data as an extra feature.
# `x` at this point is still the feature frame without prob_0, i.e. exactly
# the columns the tree was trained on. Only the first probability column is
# added; the second one was left out by the original author.
# Fixed: all string literals in this section used typographic quotes, which
# Python rejects as a SyntaxError; they are plain ASCII quotes now.
train['prob_0'] = tree_model.predict_proba(x)[:, 0]

# Rebuild features / target so that they include the new column.
x = train.drop(["isDefault"], axis=1)
y = train[["isDefault"]]
# Same random_state and same number of rows as the first split, so this
# reproduces the same train/test partition the tree was fitted on.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2021)

# Fixed: the original fitted on the full (x, y), which contains the test rows,
# and then scored on x_test -- the reported AUC was inflated by data leakage.
# The model is now fitted on the training partition only.
luoji_model = LogisticRegression().fit(x_train, y_train.values.ravel())
print("串行后逻辑回归auc:", roc_auc_score(y_test, luoji_model.predict_proba(x_test)[:, -1]))