房租赛-特征

博客内容提到:基于小区级别做特征工程很容易导致过拟合,即使构造了多个特征,也同样会遇到这个问题。
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score


def parseData(df):
    """
    Preprocess the raw rental table in place and return it.

    Steps, in order:
      * replace the ``'--'`` placeholder in ``rentType`` with ``'未知方式'``;
      * split ``houseType`` into the integer columns ``室`` / ``厅`` / ``卫``,
        inserted at column positions 3, 4 and 5;
      * take the second ``/``-separated field of ``tradeTime`` as ``交易月份``;
      * build three string cross-features (room x community, room x month,
        community x month);
      * apply ``log1p`` to the large-magnitude numeric columns;
      * cast the categorical columns to ``category`` dtype;
      * fill missing ``pv`` / ``uv`` with the column mean and cast to int;
      * replace ``'暂无信息'`` in ``buildYear`` with the most frequent known
        year and cast to int;
      * drop ``city``, ``houseToward``, ``houseDecoration`` and ``ID``.

    Args:
        df: DataFrame containing every column referenced above. It is
            modified in place.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: if a required column is missing.
        IndexError: if no row has a known ``buildYear``.
    """
    # Assign through .loc: chained indexing (df[col][mask] = ...) may write to
    # a temporary copy and does nothing under pandas copy-on-write.
    df.loc[df['rentType'] == '--', 'rentType'] = '未知方式'

    # Pull the three digit groups out by pattern instead of by fixed character
    # position, so multi-digit counts such as '10室2厅1卫' parse correctly.
    room_counts = df['houseType'].str.extract(r'(\d+)\D+(\d+)\D+(\d+)').astype(int)
    df.insert(3, '室', room_counts[0])
    df.insert(4, '厅', room_counts[1])
    df.insert(5, '卫', room_counts[2])
    df['交易月份'] = df['tradeTime'].str.split('/').str[1].astype(int)

    # Cross-features: same community & room count, same month & room count,
    # same month & community.
    room_key = df['室'].map(str)
    month_key = df['交易月份'].map(str)
    community_key = df['communityName'].map(str)
    df['houseType_1sumcsu'] = room_key + community_key
    df['houseType_2sumcsu'] = room_key + month_key
    df['houseType_3sumcsu'] = community_key + month_key

    # Smooth the large-magnitude columns with log1p.
    big_num_cols = ['totalTradeMoney', 'totalTradeArea', 'tradeMeanPrice', 'totalNewTradeMoney', 'totalNewTradeArea',
                    'tradeNewMeanPrice', 'remainNewNum', 'supplyNewNum', 'supplyLandArea',
                    'tradeLandArea', 'landTotalPrice', 'landMeanPrice', 'totalWorkers', 'newWorkers',
                    'residentPopulation', 'pv', 'uv']
    for col in big_num_cols:
        df[col] = np.log1p(df[col])

    # Convert object columns to the category dtype. houseToward and
    # houseDecoration are dropped below, so they are not converted.
    category_cols = ['rentType', 'houseFloor', 'communityName', 'region', 'plate',
                     'houseType_1sumcsu', 'houseType_2sumcsu', 'houseType_3sumcsu']
    for col in category_cols:
        df[col] = df[col].astype('category')

    # Fill missing pv / uv with the column mean. Assign the result back rather
    # than calling fillna(inplace=True) on a column selection, which is not
    # guaranteed to update the frame.
    # NOTE(review): pv / uv were already log1p-transformed above, so the int
    # cast truncates them to small integers - confirm this is intended.
    for col in ('pv', 'uv'):
        df[col] = df[col].fillna(df[col].mean()).astype('int')

    # Replace the "no information" placeholder in buildYear with the most
    # frequent known year, then convert the column to integers.
    missing_year = df['buildYear'] == '暂无信息'
    known_years = df.loc[~missing_year, 'buildYear'].astype('int')
    build_year = df['buildYear'].astype(object)
    build_year[missing_year] = known_years.mode().iloc[0]
    df['buildYear'] = build_year.astype('int')

    # Remove features that are not used; house orientation is discarded outright.
    df.drop(columns=['city', 'houseToward', 'houseDecoration', 'ID'], inplace=True)

    return df


def washData(df_train, df_test):
    """
    Remove outlier rows from the training set.

    A training row is kept only if all of the following hold:
      * ``6 < area <= 200`` (the test set only contains areas up to 200, so
        training is restricted to the same range);
      * ``tradeMoney <= 100000``;
      * ``tradeMoney / area`` is neither above 300 nor below 25;
      * ``houseType`` is not ``'0室0厅1卫'``;
      * ``totalFloor`` is not 0;
      * it is not the combination ``tradeMoney > 25000`` with ``area < 100``.

    Rows are selected with a single boolean mask rather than by dropping index
    labels, so the result is correct even when the index contains duplicate
    labels. The original additional rule (``tradeMoney < 75000`` with
    ``area > 800``) can never match once ``area <= 200`` is enforced and is
    therefore omitted.

    Args:
        df_train: training DataFrame with ``area``, ``tradeMoney``,
            ``houseType`` and ``totalFloor`` columns. Not modified.
        df_test: test DataFrame; returned unchanged.

    Returns:
        Tuple ``(filtered_train, df_test)`` where ``filtered_train`` is a new
        DataFrame.
    """
    area = df_train['area']
    money = df_train['tradeMoney']
    unit_price = money / area

    keep = (
        (area > 6) & (area <= 200)
        & (money <= 100000)
        # Negated comparisons keep rows whose ratio is NaN, as before.
        & ~(unit_price > 300)
        & ~(unit_price < 25)
        & (df_train['houseType'] != '0室0厅1卫')
        & (df_train['totalFloor'] != 0)
        & ~((money > 25000) & (area < 100))
    )

    return df_train[keep].copy(), df_test


def feature(df):
    """
    Build the model features from a preprocessed table.

    The function works on a new frame, so the DataFrame passed in is left
    unmodified. It:
      * drops ``houseType`` and ``tradeTime``;
      * merges related facility counts into ``traffic`` (subway + bus
        stations) and ``edu`` (three school counts);
      * adds ratio features derived from the market and population columns;
      * drops the raw facility-count columns and a set of columns that were
        removed based on feature importance.

    Community-level aggregate statistics are deliberately not built here: the
    original author removed them after they were found to overfit.

    Args:
        df: DataFrame containing the columns referenced below.

    Returns:
        Tuple ``(features_df, categorical_feats)`` where ``categorical_feats``
        is the list of column names to treat as categorical.

    Raises:
        KeyError: if a required column is missing.
    """
    # drop() without inplace returns a new frame, so every assignment below
    # lands on that new frame rather than on the caller's DataFrame.
    df = df.drop(columns=['houseType', 'tradeTime'])

    # Merge redundant count features and construct new ratio features.
    # NOTE(review): the ratios divide raw columns directly; a zero denominator
    # yields inf/NaN - confirm the downstream model is expected to absorb that.
    df['traffic'] = df['subwayStationNum'] + df['busStationNum']
    df['edu'] = df['interSchoolNum'] + df['schoolNum'] + df['privateSchoolNum']
    second_to_new_price = df['tradeMeanPrice'] / df['tradeNewMeanPrice']
    df['tradeMoneynew'] = second_to_new_price
    df['meanarea'] = df['totalTradeArea'] / df['tradeSecNum']
    df['meanNewarea'] = df['totalNewTradeArea'] / df['tradeNewNum']

    df['aggpeople'] = df['newWorkers'] / df['residentPopulation']
    df['剩余新房卖出比'] = df['tradeNewNum'] / df['remainNewNum']
    df['二手售出比'] = df['saleSecHouseNum'] / df['tradeSecNum']
    # NOTE(review): identical to 'tradeMoneynew'; both are kept so the
    # returned feature set stays the same.
    df['新二价格比'] = second_to_new_price
    df['当月新房售出比'] = df['tradeNewNum'] / df['supplyNewNum']

    # Raw facility counts are no longer needed after the merge above.
    raw_count_cols = ['subwayStationNum', 'busStationNum', 'interSchoolNum', 'schoolNum',
                      'privateSchoolNum', 'drugStoreNum', 'bankNum', 'shopNum', 'parkNum',
                      'mallNum', 'superMarketNum', 'hospitalNum', 'gymNum']
    # Columns removed based on feature importance.
    low_importance_cols = ['tradeLandNum', 'landMeanPrice', 'supplyLandNum', 'landTotalPrice',
                           'tradeLandArea', 'supplyNewNum', 'supplyLandArea', 'region',
                           'tradeNewNum', 'lookNum', 'uv', 'saleSecHouseNum', 'pv']
    df = df.drop(columns=raw_count_cols + low_importance_cols)

    categorical_feats = ['rentType', 'houseFloor', 'plate', 'communityName', 'houseType_1sumcsu', 'houseType_2sumcsu', 'houseType_3sumcsu']
    return df, categorical_feats


def getData(feature,
            train_path=r'C:\Users\lxc\Desktop\featurecup\train_data.csv',
            test_path=r'C:\Users\lxc\Desktop\featurecup\test_a.csv'):
    """
    Load the train / test CSV files and run the full preparation pipeline.

    Both tables go through ``parseData``, the training table is filtered by
    ``washData``, and ``feature`` is applied to each. The target column
    ``tradeMoney`` is then split off the training table.

    Args:
        feature: callable taking a DataFrame and returning
            ``(features_df, categorical_feats)``.
        train_path: path of the training CSV. Defaults to the original
            author's local path.
        test_path: path of the test CSV. Defaults to the original author's
            local path.

    Returns:
        Tuple ``(train, test, target, features, categorical_feats)`` where
        ``features`` is the column index of ``train`` after the target has
        been removed.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    train = parseData(train)
    test = parseData(test)
    train, test = washData(train, test)

    train, _ = feature(train)
    # As before, the categorical list reported for the test table is the one
    # returned to the caller.
    test, categorical_feats = feature(test)

    target = train.pop('tradeMoney')
    features = train.columns

    return train, test, target, features, categorical_feats


# Load the data, then train LightGBM with 5-fold cross-validation, collecting
# out-of-fold predictions, averaged test predictions and feature importances.
train, test, target, features, categorical_feats = getData(feature)
params = {
    'num_leaves': 31,
    # 'min_child_samples' is an alias of 'min_data_in_leaf' in LightGBM; it
    # was listed twice with the same value, so only one spelling is kept.
    'min_data_in_leaf': 20,
    'objective': 'regression',
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "bagging_fraction": 0.85,
    "bagging_seed": 23,
    "metric": 'rmse',
    "lambda_l1": 0.2,
    "nthread": 4,
}


folds = KFold(n_splits=5, shuffle=True, random_state=2333)
num_round = 10000

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

# KFold only needs the number of rows, so the frame is passed directly instead
# of materialising train.values.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    # The verbose_eval / early_stopping_rounds keyword arguments were removed
    # from lgb.train in LightGBM 4.0; the callbacks below are the supported
    # equivalent (available since LightGBM 3.3).
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=500),
        ],
    )

    # Out-of-fold predictions for the held-out part of this fold.
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # Average the test predictions over the folds.
    predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV Score: {:<8.5f}".format(r2_score(target, oof_lgb)))

CV Score: 0.91368

from sklearn.metrics import r2_score
def online_score(pred, ref_path=r'C:\Users\lxc\Desktop\sub_a_913.csv'):
    """
    Compare predictions against a previously saved reference submission.

    Prints the maximum and minimum of ``pred`` and the R^2 score between
    ``pred`` and the contents of the reference CSV.

    Args:
        pred: predictions exposing ``.max()`` and ``.min()`` (e.g. a NumPy
            array), with one value per row of the reference file.
        ref_path: path of the reference submission CSV. Defaults to the
            original author's local file.

    Returns:
        None. The results are only printed.
    """
    print("预测结果最大值:{},预测结果最小值:{}".format(pred.max(),pred.min()))
    # Score against the reference submission for leaderboard A.
    reference = pd.read_csv(ref_path, engine="python")
    # NOTE(review): r2_score is not symmetric and its signature is
    # (y_true, y_pred); here `pred` is passed as y_true and the reference as
    # y_pred. The order is kept as written - confirm which one is meant to be
    # the baseline.
    score1 = r2_score(pred, reference)
    print("对比913分数:{}".format(score1))

预测结果最大值:18220.05740957245,预测结果最小值:1253.910262976532
对比913分数:0.9542471406953082

总体来说,基于小区做特征工程很容易过拟合:尝试构造的很多小区级特征,最终都出现了过拟合。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值