By aigle / 2021-12-15
"""
This is an upgraded version of Ceshine's LGBM starter script, simply adding more
average features and weekly average features to it.
"""
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
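# Read the training data from 2016-01-01 onward (skiprows jumps past earlier
# rows) and log1p-transform unit_sales on load: the competition metric
# (NWRMSLE) is an RMSE computed on log1p of sales.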
df_train = pd.read_csv(
'../input/train.csv', usecols=[1, 2, 3, 4, 5],
dtype={'onpromotion': bool},
converters={'unit_sales': lambda u: np.log1p(
float(u)) if float(u) > 0 else 0},
parse_dates=["date"],
skiprows=range(1, 66458909) # 2016-01-01
)
df_test = pd.read_csv(
"../input/test.csv", usecols=[0, 1, 2, 3, 4],
dtype={'onpromotion': bool},
parse_dates=["date"] # , date_parser=parser
).set_index(
['store_nbr', 'item_nbr', 'date']
)
items = pd.read_csv(
"../input/items.csv",
).set_index("item_nbr")
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]  # pd.datetime was removed in pandas 1.0
del df_train
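# Pivot to wide format: one row per (store_nbr, item_nbr), one column per
# date, for both the promotion flags and the (log) sales. This makes the
# rolling-window features below simple column slices.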
promo_2017_train = df_2017.set_index(
["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train
df_2017 = df_2017.set_index(
["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)
items = items.reindex(df_2017.index.get_level_values(1))
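# Helper: slice a window of `periods` columns from the wide frame, starting
# `minus` days before `dt`; with freq='7D' it picks the same weekday across
# consecutive weeks.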
def get_timespan(df, dt, minus, periods, freq='D'):
return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]
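# One feature row per (store, item): previous day's sales, trailing means over
# 3/7/14/30/60/140 days, promotion counts over 14/60/140 days, per-weekday
# means over the last 4 and 20 weeks, plus the 16 future promotion flags.
# The target y is the next 16 days of (log) sales.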
def prepare_dataset(t2017, is_train=True):
X = pd.DataFrame({
"day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
"mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
"mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
"mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
"mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
"mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
"mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
"promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
"promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
"promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
})
for i in range(7):
X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
for i in range(16):
X["promo_{}".format(i)] = promo_2017[
t2017 + timedelta(days=i)].values.astype(np.uint8)
if is_train:
y = df_2017[
pd.date_range(t2017, periods=16)
].values
return X, y
return X
print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):
delta = timedelta(days=7 * i)
X_tmp, y_tmp = prepare_dataset(
t2017 + delta
)
X_l.append(X_tmp)
y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)
print("Training and predicting models...")
params = {
'num_leaves': 31,
'objective': 'regression',
'min_data_in_leaf': 300,
'learning_rate': 0.1,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 2,
'metric': 'l2',
'num_threads': 4
}
MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []
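# Train 16 independent models, one per day of the forecast horizon.
# Perishable items get weight 1.25 vs. 1.0, matching the competition metric;
# the item weights are tiled 6x because X_train stacks six training cutoffs.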
for i in range(16):
print("=" * 50)
print("Step %d" % (i+1))
print("=" * 50)
dtrain = lgb.Dataset(
X_train, label=y_train[:, i],
categorical_feature=cate_vars,
weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
)
dval = lgb.Dataset(
X_val, label=y_val[:, i], reference=dtrain,
weight=items["perishable"] * 0.25 + 1,
categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval],
        # early_stopping_rounds/verbose_eval were removed in LightGBM 4;
        # the callbacks API is the portable equivalent.
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )
print("\n".join(("%s: %.2f" % x) for x in sorted(
zip(X_train.columns, bst.feature_importance("gain")),
key=lambda x: x[1], reverse=True
)))
val_pred.append(bst.predict(
X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
test_pred.append(bst.predict(
X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))
print("Validation mse:", mean_squared_error(
y_val, np.array(val_pred).transpose()))
print("Making submission...")
y_test = np.array(test_pred).transpose()
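# Reattach the 16 per-day predictions to the (store, item) index as a long
# (store, item, date) series, then invert the log1p transform and clip.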
df_preds = pd.DataFrame(
y_test, index=df_2017.index,
columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb.csv', float_format='%.4f', index=False)
https://www.kaggle.com/vrtjso/lgbm-one-step-ahead/script
# -*- coding: utf-8 -*-
import gc
from datetime import timedelta
import numpy as np
import pandas as pd
dtypes = {'id':'uint32', 'item_nbr':'int32', 'store_nbr':'int8', 'unit_sales':'float32'}
train = pd.read_csv('../input/train.csv', usecols=[1,2,3,4], dtype=dtypes, parse_dates=['date'],
skiprows=range(1, 101688779) # 2017-01-01
)
train.loc[(train.unit_sales<0),'unit_sales'] = 0 # eliminate negatives
train['unit_sales'] = np.log1p(train['unit_sales'])  # log transform (pd.np was removed in pandas 1.0)
# creating records for all items, in all markets on all dates
# for correct calculation of daily unit sales averages.
u_dates = train.date.unique()
u_stores = train.store_nbr.unique()
u_items = train.item_nbr.unique()
train.set_index(['date', 'store_nbr', 'item_nbr'], inplace=True)
train = train.reindex(
pd.MultiIndex.from_product(
(u_dates, u_stores, u_items),
names=['date','store_nbr','item_nbr']
)
)
del u_dates, u_stores, u_items
gc.collect()
train['unit_sales'] = train['unit_sales'].fillna(0)  # fill NaNs; .loc[...].fillna(inplace=True) operates on a copy
train.reset_index(inplace=True) # reset index and restoring unique columns
lastdate = train.iloc[train.shape[0]-1].date
train['dow'] = train['date'].dt.dayofweek
#Unit sales mean by item and store
ma_is = train[['item_nbr','store_nbr','unit_sales']].groupby(['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('maisall')
#Days of Week Means
#By tarobxl: https://www.kaggle.com/c/favorita-grocery-sales-forecasting/discussion/42948
ma_dw = train[['item_nbr','store_nbr','dow','unit_sales']].groupby(['item_nbr','store_nbr','dow'])['unit_sales'].mean().to_frame('madw')
ma_dw.reset_index(inplace=True)
train = train[train['date'].dt.year == 2017]
gc.collect()
ma_wk = ma_dw[['item_nbr','store_nbr','madw']].groupby(['store_nbr', 'item_nbr'])['madw'].mean().to_frame('mawk')
ma_wk.reset_index(inplace=True)
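# madw / mawk is a day-of-week factor: the mean for that weekday relative to
# the overall weekly level. It rescales the base forecast further below.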
#Moving Averages
for i in [112,56,28,14,7,3,1]:
tmp = train[train.date>lastdate-timedelta(int(i))]
tmpg = tmp.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais'+str(i))
ma_is = ma_is.join(tmpg, how='left')
del tmp,tmpg
gc.collect()
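# Use the median across the 1- to 112-day window means (plus the all-period
# mean) as a robust estimate of each item/store's current sales level.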
ma_is['mais'] = ma_is.median(axis=1)
ma_is.reset_index(inplace=True)
#Load test
test = pd.read_csv('../input/test.csv', dtype=dtypes, parse_dates=['date'])
test['dow'] = test['date'].dt.dayofweek
test = pd.merge(test, ma_is, how='left', on=['item_nbr','store_nbr'])
test = pd.merge(test, ma_wk, how='left', on=['item_nbr','store_nbr'])
test = pd.merge(test, ma_dw, how='left', on=['item_nbr','store_nbr','dow'])
del ma_is, ma_wk, ma_dw
gc.collect()
#Forecasting Test
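# Base forecast = median moving average; where the weekly level is positive,
# rescale it by the day-of-week factor madw / mawk.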
test['unit_sales'] = test.mais
pos_idx = test['mawk'] > 0
test_pos = test.loc[pos_idx]
test.loc[pos_idx, 'unit_sales'] = test_pos['mais'] * test_pos['madw'] / test_pos['mawk']
test['unit_sales'] = test['unit_sales'].fillna(0)
test['unit_sales'] = np.expm1(test['unit_sales'])  # restore unit scale (pd.np was removed in pandas 1.0)
#20% more for holidays
#By nimesh: https://www.kaggle.com/nimesh280/ma-forecasting-with-holiday-effect-lb-0-529
holiday = pd.read_csv('../input/holidays_events.csv', parse_dates=['date'])
holiday = holiday.loc[holiday['transferred'] == False]
test = pd.merge(test, holiday, how='left', on=['date'])
test['transferred'].fillna(True, inplace=True)
test.loc[test['transferred'] == False, 'unit_sales'] *= 1.2
#50% more for promotion items
#By tarobxl: https://www.kaggle.com/tarobxl/overfit-lb-0-532-log-ma
test.loc[test['onpromotion'] == True, 'unit_sales'] *= 1.5
#Make submit
test[['id','unit_sales']].to_csv('ma8dspdays.csv', index=False, float_format='%.3f')
https://www.kaggle.com/paulorzp/log-ma-with-special-days-lb-0-529/data
https://www.kaggle.com/sbongo/lgbm-xgb-lr-weighted-average-lb-0-514/notebook
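The kernel linked above blends LGBM, XGB and LR submissions with fixed weights.
A minimal sketch of that blending idea, reusing the three submission files
produced by the scripts in this post (the weights below are illustrative, not
the kernel's actual values):

import numpy as np
import pandas as pd

# Hypothetical weights over the three submissions generated in this post.
subs = {'lgb.csv': 0.5, 'cat1.csv': 0.3, 'ma8dspdays.csv': 0.2}
blend = None
for path, w in subs.items():
    s = pd.read_csv(path).sort_values('id').reset_index(drop=True)
    # Blend in log space, consistent with the log1p training target.
    part = w * np.log1p(s['unit_sales'])
    blend = part if blend is None else blend + part
out = pd.read_csv('lgb.csv').sort_values('id').reset_index(drop=True)
out['unit_sales'] = np.expm1(blend)
out[['id', 'unit_sales']].to_csv('blend.csv', index=False, float_format='%.4f')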
"""
This is an upgraded version of Ceshine's LGBM starter script, simply adding more
average features and weekly average features to it. It also replaces LGBM with
CatBoost. There is still room for improvement, but the current version is the
best that can run in a kernel.
"""
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
df_train = pd.read_csv(
'../input/train.csv', usecols=[1, 2, 3, 4, 5],
dtype={'onpromotion': bool},
converters={'unit_sales': lambda u: np.log1p(
float(u)) if float(u) > 0 else 0},
parse_dates=["date"],
skiprows=range(1, 66458909) # 2016-01-01
)
df_test = pd.read_csv(
"../input/test.csv", usecols=[0, 1, 2, 3, 4],
dtype={'onpromotion': bool},
parse_dates=["date"] # , date_parser=parser
).set_index(
['store_nbr', 'item_nbr', 'date']
)
items = pd.read_csv(
"../input/items.csv",
).set_index("item_nbr")
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]  # pd.datetime was removed in pandas 1.0
del df_train
promo_2017_train = df_2017.set_index(
["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train
df_2017 = df_2017.set_index(
["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)
items = items.reindex(df_2017.index.get_level_values(1))
def get_timespan(df, dt, minus, periods, freq='D'):
return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]
def prepare_dataset(t2017, is_train=True):
X = pd.DataFrame({
"day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
"mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
"mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
"mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
"mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
"mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
"mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
"promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
"promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
"promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
})
for i in range(7):
X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
for i in range(16):
X["promo_{}".format(i)] = promo_2017[
t2017 + timedelta(days=i)].values.astype(np.uint8)
if is_train:
y = df_2017[
pd.date_range(t2017, periods=16)
].values
return X, y
return X
print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):
delta = timedelta(days=7 * i)
X_tmp, y_tmp = prepare_dataset(
t2017 + delta
)
X_l.append(X_tmp)
y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)
print("Training and predicting models...")
MAX_ROUNDS = 280
val_pred = []
test_pred = []
cate_vars = []
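# Same one-model-per-horizon-day scheme as the LGBM script above, but with
# CatBoost: a fixed 280 iterations (no early stopping), depth-4 trees, and
# learning rate 0.5.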
for i in range(16):
print("=" * 50)
print("Step %d" % (i+1))
print("=" * 50)
model = CatBoostRegressor(
iterations=MAX_ROUNDS, learning_rate=0.5,
depth=4)
model.fit(
X_train, y_train[:, i],
cat_features=cate_vars)
val_pred.append(model.predict(X_val))
test_pred.append(model.predict(X_test))
print("Validation mse:", mean_squared_error(
y_val, np.array(val_pred).transpose()))
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
y_test, index=df_2017.index,
columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('cat1.csv', float_format='%.4f', index=False)
https://www.kaggle.com/tunguz/catboost-starter-lb-0-517
https://www.kaggle.com/c/forest-cover-type-prediction/code?competitionId=3936&sortBy=scoreDescending