Kaggle beginner: Rossmann sales prediction with XGBoost

This post walks through a store sales prediction case using an XGBoost model. Through feature-engineering improvements, including handling missing values and creating new features, the model improves substantially: the final model scores an RMSPE of 0.14 on the validation set.

Task: predict a store's sales on a given day.

This feature processing works well: running XGBoost on the previous feature set scored 0.32, while this feature set scores 0.14 (RMSPE; lower is better).
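For reference, the score is the competition metric, Root Mean Square Percentage Error (RMSPE):

$$
\mathrm{RMSPE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\frac{\hat{y}_i - y_i}{y_i}\right)^2}
$$

This is exactly what the rmspe function in the code below computes, since $\hat{y}/y - 1 = (\hat{y}-y)/y$.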

Features:

Fill missing Open values with 1 (assume the store is open).

In train, keep only records where Open is 1 and Sales > 0.

Merge store.csv into both train and test.

Fill all remaining missing values with 0.

Label-encode 'StoreType', 'Assortment', and 'StateHoliday': map 0/a/b/c/d to 0/1/2/3/4.

Extract Year, Month, Day, DayOfWeek (DOW), and WeekOfYear (WOY) from the date.

Create CompetitionOpen (months since the nearest competitor opened), PromoOpen (months since Promo2 started), and IsPromoMonth (whether the record's month falls in the store's promo interval); see the sketch right after this list.
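As a quick illustration, here is a minimal sketch of how CompetitionOpen and PromoOpen are derived. The rows are made up, but the column names and formulas match the full script below:

```python
import pandas as pd

# Made-up rows with the columns the real script uses
toy = pd.DataFrame({
    'Year': [2014, 2015],
    'Month': [8, 3],
    'WeekOfYear': [33, 11],
    'CompetitionOpenSinceYear': [2012, 2014],
    'CompetitionOpenSinceMonth': [9, 11],
    'Promo2SinceYear': [2013, 0],  # 0 = store does not participate in Promo2
    'Promo2SinceWeek': [14, 0],
})

# Months since the nearest competitor opened
toy['CompetitionOpen'] = 12 * (toy.Year - toy.CompetitionOpenSinceYear) + \
                         (toy.Month - toy.CompetitionOpenSinceMonth)

# Months since Promo2 started (weeks / 4 approximates months)
toy['PromoOpen'] = 12 * (toy.Year - toy.Promo2SinceYear) + \
                   (toy.WeekOfYear - toy.Promo2SinceWeek) / 4.0
toy.loc[toy.Promo2SinceYear == 0, 'PromoOpen'] = 0  # no Promo2 -> no promo months

print(toy[['CompetitionOpen', 'PromoOpen']])
```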


Feature importance ranking: Day > Store > CompetitionOpen > WeekOfYear > DayOfWeek > PromoOpen > CompetitionDistance > Month.


Code:

#!/usr/bin/python

import csv
import operator

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
import xgboost as xgb
import matplotlib

matplotlib.use("Agg")  # Needed to save figures
import matplotlib.pyplot as plt


def create_feature_map(features):
	# Write one "<index>\t<name>\tq" line per feature; 'q' marks a quantitative feature
	with open('xgb.fmap', 'w') as outfile:
		for i, feat in enumerate(features):
			outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def rmspe(y, yhat):
	# Root Mean Square Percentage Error on the original sales scale
	return np.sqrt(np.mean((yhat / y - 1) ** 2))


def rmspe_xg(yhat, y):
	# Custom evaluation metric for xgb.train: undo the log1p transform before scoring
	y = np.expm1(y.get_label())
	yhat = np.expm1(yhat)
	return "rmspe", rmspe(y, yhat)


# Gather some features
def build_features(features, data):
	# Default Open to 1 where missing, then zero-fill the remaining NaNs
	# (the Open default has to run before the global fillna, or it never matches)
	data.loc[data.Open.isnull(), 'Open'] = 1
	data.fillna(0, inplace=True)
	# Use some properties directly
	features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

	# Label encode some features
	features.extend(['StoreType', 'Assortment', 'StateHoliday'])
	mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
	data.StoreType.replace(mappings, inplace=True)
	data.Assortment.replace(mappings, inplace=True)
	data.StateHoliday.replace(mappings, inplace=True)

	features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
	data['Year'] = data.Date.dt.year
	data['Month'] = data.Date.dt.month
	data['Day'] = data.Date.dt.day
	data['DayOfWeek'] = data.Date.dt.dayofweek
	data['WeekOfYear'] = data.Date.dt.isocalendar().week.astype(int)  # .dt.weekofyear was removed in pandas 2.0

	# CompetitionOpen and PromoOpen from https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
	# Calculate how long the competition has been open, in months
	features.append('CompetitionOpen')
	data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
							  (data.Month - data.CompetitionOpenSinceMonth)
	# Promo open time in months
	features.append('PromoOpen')
	data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \
						(data.WeekOfYear - data.Promo2SinceWeek) / 4.0
	data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)
	data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

	# Indicate that sales on that day are in promo interval
	features.append('IsPromoMonth')
	month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', \
				 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
	data['monthStr'] = data.Month.map(month2str)
	data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
	data['IsPromoMonth'] = 0
	for interval in data.PromoInterval.unique():
		if interval != '':
			for month in interval.split(','):
				data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

	return data


## Start of main script

print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
		 'CompetitionOpenSinceMonth': np.dtype(int),
		 'StateHoliday': np.dtype(str),
		 'Promo2SinceWeek': np.dtype(int),
		 'SchoolHoliday': np.dtype(float),
		 'PromoInterval': np.dtype(str)}
train = pd.read_csv("../input/train.csv", parse_dates=[2], dtype=types)  # column 2 is Date
test = pd.read_csv("../input/test.csv", parse_dates=[3], dtype=types)  # column 3 is Date
store = pd.read_csv("../input/store.csv")

print("Assume store open, if not provided")
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
train = train[train["Sales"] > 0]

print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

features = []

print("augment features")
build_features(features, train)
build_features([], test)  # pass an empty list so the feature names aren't appended twice
print(features)


print('training data processed')

params = {"objective": "reg:squarederror",  # "reg:linear" was renamed in XGBoost >= 1.0
		  "booster": "gbtree",
		  "eta": 0.3,  # learning rate
		  "max_depth": 10,
		  "subsample": 0.9,  # row subsampling per tree
		  "colsample_bytree": 0.7,  # feature subsampling per tree
		  "verbosity": 0,  # replaces the removed "silent" parameter
		  "seed": 1301
		  }
num_boost_round = 300

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
				early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)  # newer XGBoost prefers custom_metric= over feval=

print("Validating")
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)
# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("xgboost_10_submission.csv", index=False)

# XGB feature importances
# Based on https://www.kaggle.com/mmueller/liberty-mutual-group-property-inspection-prediction/xgb-feature-importance-python/code

create_feature_map(features)
importance = gbm.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))

df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

featp = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
fig_featp = featp.get_figure()
fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)

Result: RMSPE ≈ 0.14 on the validation split.
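One detail worth noting: the model is trained on log1p(Sales) and predictions are inverted with expm1. Squared error in log space roughly corresponds to relative error, which aligns the training objective with the percentage-based RMSPE metric. A quick sanity check of the round trip:

```python
import numpy as np

sales = np.array([0.0, 100.0, 5000.0])
log_sales = np.log1p(sales)           # log(1 + x), safe at x = 0
recovered = np.expm1(log_sales)       # exact inverse of log1p
print(np.allclose(sales, recovered))  # True
```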