Data preprocessing
1. Reading the data
import pandas as pd

df = pd.read_csv('./dataset/train.csv')      # read the training set
testDf = pd.read_csv('./dataset/test.csv')   # read the test set; it has no Survived column
y = df.pop('Survived')
X = df
result = pd.concat([X, testDf], axis=0)      # concatenate train and test so both get the same preprocessing
2. Handling missing values
Check for missing values:
result.isnull().sum()
The output:
PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Handling the missing values
Cabin has too many missing values to be useful, so drop it outright:
result.pop('Cabin')
For missing Age values, fill each one with the mean age of passengers sharing the same title (extracted from the Name column):
def getCell(s):
    # 'Braund, Mr. Owen Harris' -> 'Mr'
    return s.split(',')[1].split('.')[0].strip()

result['title'] = result['Name'].apply(getCell)
age_title_mean = result.groupby('title')['Age'].transform('mean')
result['Age'] = result['Age'].fillna(age_title_mean)
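A quick sanity check (a sketch, assuming every title group has at least one known age; otherwise transform('mean') leaves NaN for that group):

print(result.groupby('title')['Age'].mean().round(1))  # mean age per title used for the fill
print(result['Age'].isnull().sum())                    # expected: 0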
Fill Embarked with the mode:
result['Embarked']=result['Embarked'].fillna(result['Embarked'].mode()[0])
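Note that mode() returns a Series (there can be ties), hence the [0] to take the first value; on this dataset the mode is 'S' (Southampton):

print(result['Embarked'].mode())  # a one-element Series containing 'S'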
Feature engineering
Two family flags are derived from SibSp and Parch; the counts are first turned into booleans so that | and & act as logical OR/AND (on the raw counts they would be bitwise, e.g. 1|2 == 3):

result['sibOrParch'] = ((result['SibSp'] > 0) | (result['Parch'] > 0)).astype(int)   # has any relative aboard
result['sibandparch'] = ((result['SibSp'] > 0) & (result['Parch'] > 0)).astype(int)  # has both siblings/spouses and parents/children
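A quick look at how the two flags combine (an inspection step only; DataFrame.value_counts requires pandas >= 1.1):

print(result[['sibOrParch', 'sibandparch']].value_counts())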
# bin Age into 5 groups with k-means clustering
import numpy as np
from sklearn import preprocessing

income = np.array(result['Age'].tolist()).reshape(-1, 1)
kbins = preprocessing.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
result['age_kmeans'] = kbins.fit_transform(income).ravel()  # assign positionally; the concat left duplicate indices
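To inspect where the k-means cut points fall (the exact edges depend on the data):

print(kbins.bin_edges_[0])                               # the 6 edges of the 5 Age bins
print(result['age_kmeans'].value_counts().sort_index())  # passengers per bin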
# bin Fare into 5 groups with k-means clustering
# KBinsDiscretizer cannot handle NaN, so fill the single missing Fare first (median used here)
result['Fare'] = result['Fare'].fillna(result['Fare'].median())
income = np.array(result['Fare'].tolist()).reshape(-1, 1)
kbins = preprocessing.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
result['Fare_kmeans'] = kbins.fit_transform(income).ravel()
One-hot encoding and normalization
onehotColumns = ['Pclass','Sex','SibSp','Parch','Embarked','title','age_kmeans','Fare_kmeans','sibOrParch','sibandparch']
for c in onehotColumns:
    tmpDf = pd.get_dummies(result[c], dtype=int, prefix=c)
    result = pd.concat([result, tmpDf], axis=1)
result = result.drop(columns=onehotColumns)  # drop the raw columns once encoded (Sex added to the list so no string columns remain)
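One way to confirm the dummy columns came out as expected, using Embarked as an example:

print([c for c in result.columns if c.startswith('Embarked_')])  # ['Embarked_C', 'Embarked_Q', 'Embarked_S']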
MinMax-scale the remaining numeric columns to [0, 1] (x -> (x - min) / (max - min)); Name and Ticket are raw strings, so they are dropped instead of scaled:

result = result.drop(columns=['Name', 'Ticket'])
numCols = ['Age', 'PassengerId', 'Fare']
normalizer = preprocessing.MinMaxScaler()
result[numCols] = normalizer.fit_transform(result[numCols])
Feature selection
Chi-squared selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# split the preprocessed frame back into training rows (which have labels) and test rows
X = result.iloc[:len(y)]
X_test = result.iloc[len(y):]

selector = SelectKBest(chi2, k=37)
X_selected = selector.fit_transform(X, y)
columns = selector.get_feature_names_out()
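To see which features score highest under the chi-squared test (an inspection step, not part of the pipeline):

scores = pd.Series(selector.scores_, index=X.columns).sort_values(ascending=False)
print(scores.head(10))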
RFE selection
from sklearn.feature_selection import RFECV
from sklearn.svm import LinearSVC

clf = LinearSVC()
rfecv = RFECV(estimator=clf, step=1, cv=3, scoring='accuracy')
rfecv.fit(X[columns], y)
print("Optimal number of features : %d" % rfecv.n_features_)
print("Ranking of features : %s" % rfecv.ranking_)
print(rfecv.get_feature_names_out())
print(rfecv.score(X[columns], y))
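To visualize how cross-validated accuracy changes with the number of features kept, the scores recorded by RFECV can be plotted (a sketch; cv_results_ requires scikit-learn >= 1.0):

import matplotlib.pyplot as plt

mean_scores = rfecv.cv_results_['mean_test_score']
plt.plot(range(1, len(mean_scores) + 1), mean_scores)  # one point per feature-subset size
plt.xlabel('Number of features selected')
plt.ylabel('Mean CV accuracy')
plt.show()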
Model training and prediction
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

X1 = X[rfecv.get_feature_names_out()]
lr = LogisticRegression(solver='liblinear')  # liblinear supports both penalties in the grid below
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],  # inverse of the regularization strength
    'penalty': ['l1', 'l2']              # regularization type
}
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X[rfecv.get_feature_names_out()], y)
best_model = grid_search.best_estimator_
print(grid_search.best_params_)
y_pred = best_model.predict(X_test[rfecv.get_feature_names_out()])
# X2 = lr.fit(X[rfecv.get_feature_names_out()], y)
# y3 = lr.predict(X_test[rfecv.get_feature_names_out()])
# y2 holds the reference labels for the test set, loaded separately (the test CSV itself has no Survived column)
accuracy = accuracy_score(y2, y_pred)
print("Accuracy:", accuracy)
# scores = cross_val_score(lr, X1, y, cv=20)
# print(scores)
Best parameters and accuracy:
{'C': 1, 'penalty': 'l2'}
Accuracy: 0.9521531100478469
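For a Kaggle-style submission, the test-set predictions can be written out next to their PassengerIds (a minimal sketch, assuming testDf still holds the raw test rows in their original order):

submission = pd.DataFrame({'PassengerId': testDf['PassengerId'].values, 'Survived': y_pred})
submission.to_csv('submission.csv', index=False)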