import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
def parseData(df):
"""
预处理数据
"""
df['rentType'][df['rentType']=='--'] = '未知方式'
def parseRoom(info, index):
res = int(info[index*2])
return res
df.insert(3,'室',None)
df.insert(4, '厅', None)
df.insert(5, '卫', None)
df['室'] = df['houseType'].apply(parseRoom, index=0)
df['厅'] = df['houseType'].apply(parseRoom, index=1)
df['卫'] = df['houseType'].apply(parseRoom, index=2)
df['交易月份'] = df['tradeTime'].apply(lambda x: int(x.split('/')[1]))
#统计相同小区相同室,相同交易时间相同室,相同交易时间相同小区的特征.
df['houseType_1sumcsu']=df['室'].map(lambda x:str(x))+df['communityName'].map(lambda x:str(x))
df['houseType_2sumcsu']=df['室'].map(lambda x:str(x))+df['交易月份'].map(lambda x:str(x))
df['houseType_3sumcsu']=df['communityName'].map(lambda x:str(x))+df['交易月份'].map(lambda x:str(x))
#平滑操作
big_num_cols = ['totalTradeMoney','totalTradeArea','tradeMeanPrice','totalNewTradeMoney', 'totalNewTradeArea',
'tradeNewMeanPrice','remainNewNum', 'supplyNewNum', 'supplyLandArea',
'tradeLandArea','landTotalPrice','landMeanPrice','totalWorkers','newWorkers',
'residentPopulation','pv','uv']
for col in big_num_cols:
df[col] = df[col].map(lambda x: np.log1p(x))
# 转换object类型数据
columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName', 'region', 'plate','houseType_1sumcsu','houseType_2sumcsu','houseType_3sumcsu']
for col in columns:
df[col] = df[col].astype('category')
# 处理pv和uv的空值
df['pv'].fillna(df['pv'].mean(),inplace=True)
df['uv'].fillna(df['uv'].mean(),inplace=True)
df['pv'] = df['pv'].astype('int')
df['uv'] = df['uv'].astype('int')
#df.loc[df['buildYear']=='暂无信息','buildYear'] = None
# 将buildYear列转换为整型数据
tmp = df['buildYear'].copy()
tmp2 = tmp[tmp!='暂无信息'].astype('int')
tmp[tmp=='暂无信息'] = tmp2.mode().iloc[0]
df['buildYear'] = tmp
df['buildYear'] = df['buildYear'].astype('int')
# 去掉部分特征,房屋朝向直接剔除
#df.drop('communityName',axis=1, inplace=True)
df.drop('city',axis=1,inplace=True)
df.drop('houseToward',axis=1,inplace=True)
df.drop('houseDecoration',axis=1,inplace=True)
df.drop(['ID'],axis=1,inplace=True)
return df
def washData(df_train, df_test):
    """
    Remove outlier rows from the training set; the test set is untouched.

    A training row is kept only when all of the following hold:

    * ``6 < area <= 200`` (the original author notes that the test set only
      contains areas up to 200, so training is restricted to match);
    * ``tradeMoney <= 100000``;
    * ``25 <= tradeMoney / area <= 300``;
    * ``houseType`` is not '0室0厅1卫';
    * ``totalFloor`` is not 0;
    * it is not both ``tradeMoney > 25000`` and ``area < 100``.

    Rows are selected with a boolean mask rather than ``drop(index)``:
    dropping by index label also removes valid rows that share a label with
    an outlier when the index is not unique. The former
    ``tradeMoney < 75000 and area > 800`` rule is omitted because it can
    never match once ``area <= 200`` has been enforced.

    Args:
        df_train: training DataFrame with ``area``, ``tradeMoney``,
            ``houseType`` and ``totalFloor`` columns.
        df_test: test DataFrame, returned as is.

    Returns:
        Tuple ``(filtered_train, df_test)``.
    """
    area = df_train['area']
    money = df_train['tradeMoney']
    price_per_area = money / area

    keep = (
        (area > 6) & (area <= 200)
        & (money <= 100000)
        & price_per_area.between(25, 300)
        & (df_train['houseType'] != '0室0厅1卫')
        & (df_train['totalFloor'] != 0)
        & ~((money > 25000) & (area < 100))
    )
    return df_train[keep], df_test
def feature(df):
    """
    Build the model features from a preprocessed frame.

    The raw ``houseType`` and ``tradeTime`` columns are removed, the
    individual facility counts are merged into the aggregate ``traffic`` and
    ``edu`` columns, several market ratios are added, and the source columns
    plus a set of features removed on the basis of feature importance are
    dropped.

    The input frame is not modified; a new frame is returned. The returned
    columns and their order are: the surviving input columns in their
    original order, followed by ``traffic``, ``edu``, ``tradeMoneynew``,
    ``meanarea``, ``meanNewarea``, ``aggpeople``, '剩余新房卖出比',
    '二手售出比', '新二价格比' and '当月新房售出比'.

    Args:
        df: DataFrame containing the columns referenced below.

    Returns:
        Tuple ``(features_frame, categorical_feats)`` where
        ``categorical_feats`` is the list of column names to be treated as
        categorical by the model.

    Raises:
        KeyError: if a referenced column is missing from ``df``.
    """
    # drop() returns a new frame, so the caller's data is left untouched.
    df = df.drop(columns=['houseType', 'tradeTime'])

    # Merge redundant count columns into aggregates and build ratio features.
    df['traffic'] = df['subwayStationNum'] + df['busStationNum']
    df['edu'] = (df['interSchoolNum'] + df['schoolNum']
                 + df['privateSchoolNum'])
    df['tradeMoneynew'] = df['tradeMeanPrice'] / df['tradeNewMeanPrice']
    df['meanarea'] = df['totalTradeArea'] / df['tradeSecNum']
    df['meanNewarea'] = df['totalNewTradeArea'] / df['tradeNewNum']
    df['aggpeople'] = df['newWorkers'] / df['residentPopulation']
    df['剩余新房卖出比'] = df['tradeNewNum'] / df['remainNewNum']
    df['二手售出比'] = df['saleSecHouseNum'] / df['tradeSecNum']
    # NOTE(review): same formula as 'tradeMoneynew'; kept so the feature set
    # stays unchanged.
    df['新二价格比'] = df['tradeMeanPrice'] / df['tradeNewMeanPrice']
    df['当月新房售出比'] = df['tradeNewNum'] / df['supplyNewNum']

    # Columns folded into the aggregates above (hospitalNum is dropped
    # without being aggregated).
    merged_sources = [
        'subwayStationNum', 'busStationNum', 'interSchoolNum', 'schoolNum',
        'privateSchoolNum', 'drugStoreNum', 'bankNum', 'shopNum', 'parkNum',
        'mallNum', 'superMarketNum', 'hospitalNum', 'gymNum',
    ]
    # Columns removed on the basis of feature importance. The former
    # 'livecondition', 'pepleroute' and 'lostnum' features were computed only
    # to be dropped here, so they are no longer created at all.
    low_importance = [
        'tradeLandNum', 'landMeanPrice', 'supplyLandNum', 'landTotalPrice',
        'tradeLandArea', 'supplyNewNum', 'supplyLandArea', 'region',
        'tradeNewNum', 'lookNum', 'uv', 'saleSecHouseNum', 'pv',
    ]
    df = df.drop(columns=merged_sources + low_importance)

    categorical_feats = [
        'rentType', 'houseFloor', 'plate', 'communityName',
        'houseType_1sumcsu', 'houseType_2sumcsu', 'houseType_3sumcsu',
    ]
    return df, categorical_feats
def getData(feature,
            train_path=r'C:\Users\lxc\Desktop\featurecup\train_data.csv',
            test_path=r'C:\Users\lxc\Desktop\featurecup\test_a.csv'):
    """
    Load the train / test CSV files and run the full preparation pipeline.

    Both files are read with ``pandas.read_csv``, passed through
    ``parseData``, then ``washData`` (which filters the training rows only),
    then the supplied ``feature`` callable. The ``tradeMoney`` column is
    popped from the training frame and returned as the target.

    Args:
        feature: callable taking a DataFrame and returning
            ``(features_frame, categorical_feature_names)``. Note that this
            parameter shadows the module-level function of the same name.
        train_path: path of the training CSV. Defaults to the location the
            script was originally written for.
        test_path: path of the test CSV. Defaults to the location the script
            was originally written for.

    Returns:
        Tuple ``(train, test, target, features, categorical_feats)`` where
        ``features`` is the column index of the training frame after the
        target has been removed.
    """
    train = parseData(pd.read_csv(train_path))
    test = parseData(pd.read_csv(test_path))

    train, test = washData(train, test)

    train, categorical_feats = feature(train)
    # The second call's list is the one originally returned.
    test, categorical_feats = feature(test)

    target = train.pop('tradeMoney')
    features = train.columns
    return train, test, target, features, categorical_feats
# Load and prepare the data, then train LightGBM with 5-fold cross-validation.
train, test, target, features, categorical_feats = getData(feature)

params = {
    'num_leaves': 31,
    # 'min_child_samples' is an alias of this parameter; the duplicate entry
    # (same value) was removed to avoid LightGBM's alias-conflict warning.
    'min_data_in_leaf': 20,
    'objective': 'regression',
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "bagging_fraction": 0.85,
    "bagging_seed": 23,
    "metric": 'rmse',
    "lambda_l1": 0.2,
    "nthread": 4,
}

folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_lgb = np.zeros(len(train))          # out-of-fold predictions on train
predictions_lgb = np.zeros(len(test))   # fold-averaged predictions on test
num_round = 10000
fold_importances = []

# KFold only needs the number of samples, so the frame is passed directly
# instead of materialising train.values.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx],
                           categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx],
                           categorical_feature=categorical_feats)

    # The early_stopping_rounds / verbose_eval keyword arguments were removed
    # from lgb.train in LightGBM 4.0; the callbacks are the supported
    # equivalent.
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=500),
        ],
    )
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx],
                                   num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    predictions_lgb += (clf.predict(test, num_iteration=clf.best_iteration)
                        / folds.n_splits)

# Concatenate once instead of growing a frame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV Score: {:<8.5f}".format(r2_score(target, oof_lgb)))
# Output: CV Score: 0.91368
from sklearn.metrics import r2_score
def online_score(pred, reference_path=r'C:\Users\lxc\Desktop\sub_a_913.csv'):
    """
    Print the range of ``pred`` and its R^2 against a reference submission.

    The reference CSV is the earlier leaderboard-A submission that the
    original script compares new predictions against.

    Args:
        pred: array-like of predictions exposing ``max()`` and ``min()``.
        reference_path: path of the reference submission CSV. Defaults to
            the location the script was originally written for.

    Returns:
        None. The results are printed.
    """
    print("预测结果最大值:{},预测结果最小值:{}".format(pred.max(), pred.min()))
    # Score against the leaderboard-A reference submission.
    # NOTE(review): read_csv treats the first row as a header by default;
    # confirm the reference file actually has one.
    reference = pd.read_csv(reference_path, engine="python")
    # NOTE(review): r2_score's signature is (y_true, y_pred) and the metric is
    # not symmetric; here pred is passed as y_true. The order is kept so the
    # printed number stays comparable with earlier runs - confirm the intent.
    score = r2_score(pred, reference)
    print("对比913分数:{}".format(score))
预测结果最大值:18220.05740957245,预测结果最小值:1253.910262976532
对比913分数:0.9542471406953082
总体来说,基于小区做特征工程很容易过拟合:我尝试构造了很多小区级别的特征,结果都出现了过拟合。
博客内容提到,特征工程在基于小区级别进行操作时容易导致过拟合现象,即使创建多个特征也会遇到这个问题。
1011

被折叠的 条评论
为什么被折叠?



