Data Mining Final Project
Name | Student ID
---|---
段阳阳 | 20185111075 |
王浩 | 20185111045 |
贾德智 | 20185111076 |
周成成 | 20185111033 |
乔伟 | 20185111046 |
刘宗敏 | 20185111006 |
Data Loading
import pandas as pd
import numpy as np
#train = np.loadtxt('zhengqi_train.txt', delimiter='\t')  # alternative loader for the raw data
train = pd.read_csv('zhengqi_train.txt', sep='\t')
test = pd.read_csv('zhengqi_test.txt', sep='\t')
#test = np.loadtxt('zhengqi_test.txt', delimiter='\t')
train_x = train.drop(['target'], axis=1)
all_data = pd.concat([train_x, test])
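As a quick sanity check (an illustrative addition, not part of the original submission), printing the shapes confirms that all_data stacks the training features on top of the test features:
print(train.shape, test.shape, all_data.shape)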
Data Analysis and Observation (Visualization)
import seaborn
import matplotlib.pyplot as plt
for col in all_data.columns:
    seaborn.distplot(train[col])
    seaborn.distplot(test[col])
    plt.show()
The plots above show that for V5, V17, V28, V22, V11 and V9 the training and test distributions are clearly inconsistent. Such train/test shift badly hurts the model's ability to generalize, so these features should be removed.
all_data.drop(['V5', 'V17', 'V28', 'V22', 'V11', 'V9'], axis=1, inplace=True)
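The visual judgment above can also be quantified. A minimal sketch, assuming scipy is available: the two-sample Kolmogorov-Smirnov statistic is large (and the p-value tiny) exactly for columns whose train and test distributions disagree.
from scipy.stats import ks_2samp

for col in ['V5', 'V17', 'V28', 'V22', 'V11', 'V9']:
    # train and test still contain these columns; only all_data dropped them
    stat, p = ks_2samp(train[col], test[col])
    print('{}: KS statistic={:.3f}, p-value={:.3g}'.format(col, stat, p))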
Project the remaining data onto its principal components to reduce the dimensionality.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def meanX(dataX):
    return np.mean(dataX, axis=0)  # column-wise mean

# Return the eigenvalues of the sample covariance matrix
def variance(XMat):
    XMat = np.asarray(XMat)  # accept a DataFrame as input
    average = meanX(XMat)
    m, n = np.shape(XMat)
    avgs = np.tile(average, (m, 1))  # repeat the mean vector m times along the rows
    data_adjust = XMat - avgs
    covX = np.cov(data_adjust.T)
    featValue, featVec = np.linalg.eig(covX)  # eigenvalues and eigenvectors of the covariance matrix
    return featValue

def pca(XMat, k):
    XMat = np.asarray(XMat)  # accept a DataFrame as input
    average = meanX(XMat)
    m, n = np.shape(XMat)
    avgs = np.tile(average, (m, 1))
    data_adjust = XMat - avgs  # subtract the column mean from every element
    covX = np.cov(data_adjust.T)  # covariance matrix
    featValue, featVec = np.linalg.eig(covX)  # its eigenvalues and eigenvectors
    index = np.argsort(-featValue)  # indices of featValue in descending order
    if k > n:
        print("k must be lower than the number of features")
        return
    else:
        # Eigenvectors come out as columns, but row a[i] of a NumPy 2-D array
        # is a row vector, so the eigenvector matrix is transposed here.
        selectVec = np.matrix(featVec.T[index[:k]])
        finalData = data_adjust * selectVec.T  # project onto the top-k components
        reconData = (finalData * selectVec) + average  # reconstruct in the original space
    return finalData, reconData
XMat=all_data
Visualize the eigenvalue spectrum
plt.bar(range(XMat.shape[1]), variance(XMat), width=0.25)
plt.ylabel('variance')
plt.show()
k = 18  # from the plot above, the first 18 components capture the main structure
list_1, list_2 = pca(XMat, k)  # list_1 holds the projected coordinates, list_2 the reconstruction
print(list_2)
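As a cross-check of the hand-rolled PCA above (a sketch, not in the original), sklearn's PCA reports the cumulative explained-variance ratio directly, which is another way to justify the choice of k = 18:
from sklearn.decomposition import PCA

pca_check = PCA(n_components=18)
pca_check.fit(all_data)
print(pca_check.explained_variance_ratio_.cumsum())  # share of variance kept by the top 18 components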
Data Standardization
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
data_minmax = pd.DataFrame(min_max_scaler.fit_transform(list_2), columns=all_data.columns)  # list_2 is a plain matrix, so the column names come from all_data
import math
data_minmax['V0'] = data_minmax['V0'].apply(lambda x: math.exp(x))
data_minmax['V1'] = data_minmax['V1'].apply(lambda x: math.exp(x))
data_minmax['V6'] = data_minmax['V6'].apply(lambda x: math.exp(x))
data_minmax['V30'] = np.log1p(data_minmax['V30'])
#train['exp'] = train['target'].apply(lambda x: math.pow(1.5, x) + 10)
X_scaled = pd.DataFrame(preprocessing.scale(data_minmax),columns = data_minmax.columns)
train_x = X_scaled.iloc[0:len(train)]  # .ix was removed from pandas; iloc's end index is exclusive
test = X_scaled.iloc[len(train):]
Y = train['target']
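The exp and log1p transforms above are presumably aimed at reducing skew. A quick illustrative check (assuming scipy is installed) ranks the min-max-scaled columns by sample skewness; heavily skewed columns are the natural candidates for such transforms:
from scipy.stats import skew

skewness = data_minmax.apply(skew).sort_values()
print(skewness.head())  # most negatively skewed columns
print(skewness.tail())  # most positively skewed columns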
Feature Selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
Variance threshold
threshold = 0.85
vt = VarianceThreshold().fit(train_x)
# Keep features whose variance exceeds threshold * (1 - threshold), a
# heuristic borrowed from the variance p(1-p) of a Bernoulli variable
feat_var_threshold = train_x.columns[vt.variances_ > threshold * (1 - threshold)]
train_x = train_x[feat_var_threshold]
test = test[feat_var_threshold]
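An illustrative one-liner (not in the original) to see how aggressive the filter is:
print('{} of {} features pass the variance filter'.format(len(feat_var_threshold), len(vt.variances_)))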
Univariate selection
X_scored = SelectKBest(score_func=f_regression, k='all').fit(train_x, Y)
feature_scoring = pd.DataFrame({
    'feature': train_x.columns,
    'score': X_scored.scores_
})
head_feature_num = 18
feat_scored_headnum = feature_scoring.sort_values('score', ascending=False).head(head_feature_num)['feature']
train_x_head = train_x[train_x.columns[train_x.columns.isin(feat_scored_headnum)]]
X_scaled = pd.DataFrame(preprocessing.scale(train_x),columns = train_x.columns)
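To make the cut-off at 18 features easier to judge, the ranked f_regression scores can be plotted (an illustrative addition; it assumes a matplotlib recent enough to accept string categories in plt.bar):
fs = feature_scoring.sort_values('score', ascending=False)
plt.bar(fs['feature'], fs['score'], width=0.25)
plt.xticks(rotation=90)
plt.ylabel('f_regression score')
plt.show()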
Model Experiments
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC,LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import lightgbm as lgb
n_folds = 10
def rmsle_cv(model, train_x_head=train_x_head):
    # Pass the KFold object itself; calling get_n_splits() here would throw
    # away the shuffle and the random seed.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    # Despite the name, this returns the per-fold MSE (not a root metric).
    rmse = -cross_val_score(model, train_x_head, Y, scoring="neg_mean_squared_error", cv=kf)
    return rmse
svr = make_pipeline(SVR(kernel='linear'))
line = make_pipeline(LinearRegression())
lasso = make_pipeline(Lasso(alpha=0.0005, random_state=1))
ENet = make_pipeline(ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR1 = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
KRR2 = KernelRidge(alpha=1.5, kernel='linear', degree=2, coef0=2.5)
model_xgb = xgb.XGBRegressor(booster='gbtree', colsample_bytree=0.8, gamma=0.1,
                             learning_rate=0.02, max_depth=5,
                             n_estimators=500, min_child_weight=0.8,
                             reg_alpha=0, reg_lambda=1,
                             subsample=0.8, silent=1,
                             random_state=42, nthread=2)
score = rmsle_cv(svr)
print("\nSVR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
svr.fit(train_x_head, Y)
score = rmsle_cv(line)
print("\nLinearRegression score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(KRR2)
print("Kernel Ridge2 score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
KRR2.fit(train_x_head, Y)
head_feature_num = 18
feat_scored_headnum = feature_scoring.sort_values('score', ascending=False).head(head_feature_num)['feature']
train_x_head2 = train_x[train_x.columns[train_x.columns.isin(feat_scored_headnum)]]
X_scaled = pd.DataFrame(preprocessing.scale(train_x), columns=train_x.columns)
score = rmsle_cv(KRR1, train_x_head2)
print("Kernel Ridge1 score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
head_feature_num = 22
feat_scored_headnum = feature_scoring.sort_values('score', ascending=False).head(head_feature_num)['feature']
train_x_head3 = train_x[train_x.columns[train_x.columns.isin(feat_scored_headnum)]]
X_scaled = pd.DataFrame(preprocessing.scale(train_x), columns=train_x.columns)
score = rmsle_cv(model_xgb, train_x_head3)
print("XGBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
model_xgb.fit(train_x_head, Y)
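GridSearchCV is imported above but never used. A minimal sketch of how it could tune the XGBoost model (the grid values here are illustrative, not the ones behind the reported scores):
param_grid = {'max_depth': [4, 5, 6], 'learning_rate': [0.01, 0.02, 0.05]}
grid = GridSearchCV(model_xgb, param_grid, scoring='neg_mean_squared_error', cv=5)
grid.fit(train_x_head3, Y)
print(grid.best_params_, -grid.best_score_)  # best settings and their CV MSE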
Simple Model Ensembling
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # Fit a fresh clone of every base model on the same data
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X, y)
        return self

    # Average the base models' predictions
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)
averaged_models = AveragingModels(models=(svr, KRR2, model_xgb))
score = rmsle_cv(averaged_models)
print("Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
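To turn the ensemble into a prediction file, it can be refit on the full training set and applied to the test features (a sketch; the output filename 'result.txt' is hypothetical):
averaged_models.fit(train_x_head, Y)
pred = averaged_models.predict(test[train_x_head.columns])
np.savetxt('result.txt', pred)  # hypothetical output path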