In this project, I will build a Machine Learning model to forecast the next 7 days of water demand in the UK using historic demand data and weather data.
import pandas as pd
# also requires openpyxl to open .xlsx files with pandas
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib qt
# need PyQt installed
import calendar
from datetime import datetime, timedelta
plt.rcParams['figure.figsize'] = [16, 10]
from statsmodels.tsa.seasonal import seasonal_decompose
# Load the daily demand data
df_demand = pd.read_excel('Water Demand Data July 2022.xlsx')
df_demand['Date'] = pd.to_datetime(df_demand['Date'])
df_demand = df_demand.set_index('Date')
df_demand.plot(y='Demand', use_index=True)
# demand looks to have increased over time in this area
# looks like there may be some spurious datapoints in the dataset,
# it doesn't seem likely that true demand would drop so significantly
Firstly, we plot the demand data provided. Two things become apparent upon inspection: demand appears to have increased over time in this area, and there are a few suspiciously low datapoints that are unlikely to reflect true demand.
decomp = seasonal_decompose(df_demand, model='multiplicative', period=365)
ax_decomp = decomp.plot()
shift = 1
ax = df_demand.pct_change(shift).plot()
ax.axhline(-0.25, c='r') # 0.25 seems to be a sensible cutoff
dates_with_unusual_demand = df_demand[df_demand.pct_change(shift)<-0.25].dropna()
dates_with_unusual_demand
| Date | Demand |
| --- | --- |
| 2009-08-02 | 472.683428 |
| 2014-12-31 | 308.646637 |
| 2022-07-13 | 413.161290 |
df_demand[(df_demand.index.month==12) & (df_demand.index.day==31)].hist(label = 'New Years Eve Demand Histogram')
plt.legend()
The histogram above gives us evidence that demand on New Year's Eve 2014 is far from typical for that day; it is more likely a metering error, so we can safely interpolate this day.
df_demand[(df_demand.index.month==8) & (df_demand.index.day==2)].hist(label = "2nd August Demand Histogram")
plt.legend()
The same may be true for the 2nd of August 2009; however, I'm less certain that this is spurious data. I still think it is safe to interpolate it.
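As a quick sanity check (a minimal sketch, not part of the original analysis), we can also quantify how unusual each flagged day is relative to the same calendar day in other years:
# Hedged sketch: z-score of each flagged day against the same calendar day in other years.
for flagged_date in dates_with_unusual_demand.index:
    same_day_other_years = df_demand[(df_demand.index.month == flagged_date.month) &
                                     (df_demand.index.day == flagged_date.day) &
                                     (df_demand.index.year != flagged_date.year)]['Demand']
    z_score = (df_demand.loc[flagged_date, 'Demand'] - same_day_other_years.mean()) / same_day_other_years.std()
    print(flagged_date.date(), round(z_score, 2))  # a strongly negative z-score supports the metering-error theory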
"""
Because we are only interpolating non-consecutive days, we can simply replace each day with a metering error with the previous day's data.
There are better ways of doing this (see the alternative sketch after this code block); however, because it's only 3 days of data, it won't make a huge difference to our model output.
"""
df_demand_before = df_demand.copy()
# note: .values is needed on the right-hand side, otherwise pandas aligns on the (different) dates and assigns NaN
df_demand.loc[df_demand.index.isin(dates_with_unusual_demand.index), 'Demand'] = df_demand.loc[df_demand.index.isin(dates_with_unusual_demand.index+timedelta(days=-1)), 'Demand'].values
fig, axs = plt.subplots(2)
df_demand_before.plot(ax=axs[0])
axs[0].set_title('Before cleaning')
df_demand.plot(ax=axs[1])
axs[1].set_title('After cleaning')
plt.legend()
del df_demand_before
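As mentioned in the comment above, better approaches exist. One minimal alternative sketch (assuming a simple linear fill between neighbouring days is acceptable) is to mask the suspect days and use pandas' time-based interpolation instead of copying the previous day:
# Alternative sketch (illustration only, not used in the rest of the notebook):
# mask the flagged days and let pandas interpolate them from neighbouring dates.
df_alt = df_demand.copy()
df_alt.loc[df_alt.index.isin(dates_with_unusual_demand.index), 'Demand'] = np.nan
df_alt['Demand'] = df_alt['Demand'].interpolate(method='time')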
Now let's start investigating potential features that we may be able to use in our ML model.
"""
In this code block, we're going to group the demand data by Month and Year
to see if there are any significant shifts in the Month-Month pattern of each year
"""
mean_monthly_demand = df_demand.groupby([df_demand.index.month, df_demand.index.year]).mean()
mean_monthly_demand.index = mean_monthly_demand.index.rename(['Month', 'Year'])
fig, axs = plt.subplots(2, figsize=(12,10))
for year in range(2007,2022):
mean_monthly_by_year = mean_monthly_demand.loc[mean_monthly_demand.index.get_level_values('Year') == year, 'Demand']
colour_weight = 1-((2022-year)/(2022-2007))+0.01
axs[0].plot(mean_monthly_by_year.index.get_level_values('Month'), mean_monthly_by_year.values, alpha=colour_weight, c = 'blue', label = year)
max_month = mean_monthly_by_year[mean_monthly_by_year.values == mean_monthly_by_year.max()]
min_month = mean_monthly_by_year[mean_monthly_by_year.values == mean_monthly_by_year.min()]
axs[0].scatter(max_month.index.get_level_values('Month'), max_month.values, c = 'r')
axs[0].scatter(min_month.index.get_level_values('Month'), min_month.values, c = 'g')
axs[0].set_title('Mean demand grouped by Month and Year')
axs[0].legend(bbox_to_anchor=(1., 1))
##############################################
"""
In this code block, we're going to group the demand data by Year
and look at the variation in mean yearly demand.
"""
mean_yearly_demand = df_demand.groupby(df_demand.index.year).mean()
for year in range(mean_yearly_demand.index.min(),mean_yearly_demand.index.max()):
mean_by_year = mean_yearly_demand.loc[mean_yearly_demand.index == year, 'Demand']
axs[1].bar(mean_by_year.index, mean_by_year.values, color = 'blue',label = year)
axs[1].set_title('Mean demand grouped by Year')
axs[1].set_ylim(mean_yearly_demand['Demand'].min()-50, mean_yearly_demand['Demand'].max()+50)
plt.show()
##############################################
"""
Next, we're going to group the demand data by day of the week
and look for variations in the average weekday (MTWTFSS).
"""
fig, ax = plt.subplots(1)
mean_weekday_demand = df_demand.groupby(df_demand.index.weekday).mean()
weekdays=dict(enumerate(calendar.day_name))
for wd in range(0,7):
mean_weekday = mean_weekday_demand.loc[mean_weekday_demand.index == wd, 'Demand']
ax.bar(weekdays[mean_weekday.index[0]], mean_weekday.values, color = 'blue')
ax.set_title('Mean demand grouped by Day of week')
ax.set_ylim(mean_weekday_demand['Demand'].min()-5, mean_weekday_demand['Demand'].max()+5)
plt.show()
##############################################
"""
Next, let's see if public holidays show us anything interesting.
To do this, we use the 'holidays' package to get the UK's holidays for each year.
Because we are analysing data from the West Midlands area,
we only want the holidays celebrated in England (or in all nations of the UK on the same date, e.g. Christmas).
"""
import holidays
uk_holidays = [holidays.UnitedKingdom(years=y) for y in range(2007,2022)]
uk_holidays = {'Holiday': dict(pair for d in uk_holidays for pair in d.items())}
df_holidays = pd.DataFrame.from_dict(uk_holidays)
df_holidays_celebrated_by_all_nations = df_holidays[~df_holidays['Holiday'].str.contains("]")]
df_holidays_celebrated_by_one_or_multiple_nations = df_holidays[df_holidays['Holiday'].str.contains("]")]
df_holidays_celebrated_in_England = df_holidays_celebrated_by_one_or_multiple_nations[df_holidays_celebrated_by_one_or_multiple_nations['Holiday'].str.contains('England')]
df_holidays_celebrated_in_England = pd.concat([df_holidays_celebrated_in_England,df_holidays_celebrated_by_all_nations])
df_holidays_celebrated_in_England.index = pd.to_datetime(df_holidays_celebrated_in_England.index)
#####
fig, ax = plt.subplots(1)
mean_dayofyear_demand = df_demand.groupby(df_demand.index.dayofyear).mean()
mean_dayofyear_demand.plot(ax=ax)
ax.set_title('Mean demand grouped by Day of Year (holidays in dashed red)')
ax.set_ylim(mean_dayofyear_demand['Demand'].min()-5, mean_dayofyear_demand['Demand'].max()+5)
[ax.axvline(h, c='r', alpha = 0.7, ls='--') for h in df_holidays_celebrated_in_England.index.dayofyear.drop_duplicates()]
plt.show()
Interestingly, it appears that Tuesday is the day with the lowest average demand, while Sunday has the highest. As we saw before, demand is increasing over the years; the month with the maximum demand in each year tends to fall in the summer, while the minimum monthly demand tends to fall in autumn/winter.
Also, days marked as Holidays appear to show a decrease in demand. Let's explore this further:
from scipy.stats import ttest_ind, mannwhitneyu
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot
holiday_demand = df_demand[df_demand.index.isin(df_holidays_celebrated_in_England.index)]
# let's take a random sample of non-holiday days to compare
random_sample_of_non_holiday_demand = df_demand[~df_demand.index.isin(df_holidays_celebrated_in_England.index)].sample(len(df_holidays_celebrated_in_England), random_state = 123)
qqplot(holiday_demand['Demand'], line='s')
qqplot(random_sample_of_non_holiday_demand['Demand'], line='s')
# it looks like both datasets are approximately normally distributed, we can use a T test
ax = (random_sample_of_non_holiday_demand).hist(label = 'Non-holiday demand', alpha = 0.7, bins=20)
holiday_demand.hist(ax = ax, label = 'Holiday demand', alpha = 0.7, bins=20)
plt.legend()
test_result, p_value = ttest_ind(holiday_demand,random_sample_of_non_holiday_demand, axis=0)
print(p_value)
# combined with the sample means, this indicates lower demand on holidays than non-holidays, so the holiday flag is worth keeping as a feature
[0.00016727]
Indeed, there does appear to be a statistically significant difference in mean demand between holidays and non-holidays, with demand lower on holidays. This is possibly because commercial properties, including high-demand factories, are closed on public holidays, leading to a drop in overall demand.
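Since mannwhitneyu was imported above but not used, we can also run a non-parametric check as a sketch of a robustness test (useful if we doubt the normality assumption behind the t-test):
# Non-parametric robustness check: the Mann-Whitney U test makes no normality assumption.
u_stat, u_p_value = mannwhitneyu(holiday_demand['Demand'],
                                 random_sample_of_non_holiday_demand['Demand'],
                                 alternative='two-sided')
print(u_p_value)  # a small p-value would again indicate that the two distributions differ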
Next, let's take a look at weather data to see the impact it has on demand.
I have used the Met Office MIDAS Open dataset for this analysis (you may need to create an account to download the data).
def format_temperature_data(df):
"""
As the Met Office temperature time series data has a 12 hour interval, we want to convert this to 24 hours (to match our demand dataset)
"""
df['ob_end_time'] = pd.to_datetime(df['ob_end_time'])
df['Date'] = df['ob_end_time'].dt.date
max_air_temp = df.groupby('Date').max()['max_air_temp']
min_air_temp = df.groupby('Date').min()['min_air_temp']
air_temps = pd.concat([max_air_temp, min_air_temp], axis=1)
return air_temps
def format_rainfall_data(df):
"""
Format the MIDAS rainfall dataset. Data is at 24-hour intervals, so there is no need to group.
"""
df['ob_date'] = pd.to_datetime(df['ob_date'])
df = df.set_index('ob_date')
return df['prcp_amt']
def format_wind_data(df):
"""
Format the MIDAS wind dataset. Data is at 1-hour intervals, so we group by date and take the mean wind speed.
"""
df['ob_end_time'] = pd.to_datetime(df['ob_end_time'])
df['Date'] = df['ob_end_time'].dt.date
mean_wind_df = df.groupby('Date').mean()['mean_wind_speed']
return mean_wind_df
def load_weather_data(path, header_size, data_type):
"""
Single function to load all data types.
"""
print(f'\n{data_type:#^20} ')
dfs = []
for year in range(2007,2022):
print(f'\n{year:#^20} ')
file_path = f'{path}_{year}.csv'
df_temporary = pd.read_csv(file_path, skiprows = header_size, skipfooter=1, engine='python') # skipfooter requires the python parser engine
if data_type=='Temperature':
df_temporary = format_temperature_data(df_temporary)
elif data_type=='Rainfall':
df_temporary = format_rainfall_data(df_temporary)
elif data_type=='Wind':
df_temporary = format_wind_data(df_temporary)
dates_in_year = pd.date_range(start=f"{year}-01-01", end=f"{year}-12-31")
missing_dates = dates_in_year.difference(df_temporary.index)
if len(missing_dates)>0:
print("Missing dates: ", missing_dates.values)
dfs.append(df_temporary)
df = pd.concat(dfs)
return df
temperature_df = load_weather_data('./Temperature Data/midas-open_uk-daily-temperature-obs_dv-202207_west-midlands_24102_coventry-coundon_qcv-1', 90, 'Temperature')
rainfall_df = load_weather_data('./Rainfall Data/midas-open_uk-daily-rain-obs_dv-202207_shropshire_00636_edge_qcv-1', 61, 'Rainfall')
wind_df = load_weather_data('./Wind Data/midas-open_uk-mean-wind-obs_dv-202207_west-midlands_24102_coventry-coundon_qcv-1', 79, 'Wind')
Output (condensed): the loader reports the following missing dates.
Temperature: 2007-01-27 to 2007-01-31 and 2019-07-10 to 2019-07-19.
Rainfall: 2018-06-21 to 2018-06-23 and 2018-08-02 to 2018-08-31.
Wind: 2007-01-27 to 2007-01-31, 2010-07-31 to 2010-08-01, and 2019-07-11 to 2019-07-18.
Next, we create a dataset with all of the variables we want to train our ML model on.
dataset = pd.merge(df_demand,temperature_df, left_index=True, right_index = True,how="outer")
dataset = pd.merge(dataset,rainfall_df, left_index=True, right_index = True,how="outer")
dataset = pd.merge(dataset,wind_df, left_index=True, right_index = True,how="outer")
dataset = pd.merge(dataset,df_holidays_celebrated_in_England, left_index=True, right_index = True,how="outer")
dataset.loc[~dataset['Holiday'].isna(), 'Holiday'] = 1
dataset.loc[dataset['Holiday'].isna(), 'Holiday'] = 0
dataset['Month'] = dataset.index.month
dataset['DayOfYear'] = dataset.index.dayofyear
dataset['DayOfWeek'] = dataset.index.dayofweek
ax = dataset.plot.scatter(x='max_air_temp', y='Demand', label = 'Max Air temp vs Demand', alpha = 0.5)
ax = dataset.plot.scatter(x='min_air_temp', y='Demand', label = 'Min Air temp vs Demand', alpha = 0.5)
ax = dataset.plot.scatter(x='min_air_temp', y='max_air_temp', label = 'Min Air temp vs Max Air Temp', alpha = 0.5)
ax = dataset.plot.scatter(x='prcp_amt', y='Demand', label = 'Rainfall vs Demand', alpha = 0.5)
ax = dataset.plot.scatter(x='mean_wind_speed', y='Demand', label = 'Wind Speed vs Demand', alpha = 0.5)
It is very clear that when the max temperature goes above ~22 degrees C, demand increases. This is going to be a very important feature in our dataset. Rainfall and wind show negligible correlation with daily demand.
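To put a rough number on that threshold effect (a quick sketch only; the ~22 degrees C cutoff is simply eyeballed from the scatter plot above):
# Compare mean demand on days above vs at-or-below the eyeballed 22 degC threshold.
hot_days = dataset.loc[dataset['max_air_temp'] > 22, 'Demand']
cool_days = dataset.loc[dataset['max_air_temp'] <= 22, 'Demand']
print(f"Mean demand above 22 degC: {hot_days.mean():.1f}, at or below: {cool_days.mean():.1f}")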
Now let's make sure our data is clean
"""
because we don't have 2022 data for the weather data, let's cut this year out
"""
dataset = dataset[dataset.index.year <2022]
dataset[dataset['Demand'].isna()]
| Date | Demand | max_air_temp | min_air_temp | prcp_amt | mean_wind_speed | Holiday | Month | DayOfYear | DayOfWeek |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 2007-01-01 | NaN | 9.3 | 2.9 | 6.6 | 8.260870 | 1 | 1 | 1 | 0 |
| 2009-08-02 | NaN | 19.9 | 9.6 | 1.5 | 4.041667 | 0 | 8 | 214 | 6 |
| 2014-12-31 | NaN | 6.4 | -2.0 | 0.0 | 4.041667 | 0 | 12 | 365 | 2 |
"""
because we are working with time series data,
we may want to interpolate the dataset to make sure any nans are filled sensibly
"""
print("Nans Before: \n", dataset.isna().sum())
print(" ")
#still a few nans in the dataset, we can simply interpolate these rows
dataset = dataset.interpolate()
dataset = dataset.dropna()
print("Nans After:\n", dataset.isna().sum())
dataset = dataset.sort_index()
NaNs before interpolation: Demand 3, max_air_temp 15, min_air_temp 16, prcp_amt 33, mean_wind_speed 15 (Holiday, Month, DayOfYear and DayOfWeek all 0). NaNs after interpolation: 0 in every column.
rainfall = dataset[dataset['prcp_amt']>0]['Demand'].dropna() # demand on days with rainfall
no_rainfall = dataset[dataset['prcp_amt']==0]['Demand'].dropna() # demand on days with no rainfall
fig, ax = plt.subplots()
rainfall.hist(ax = ax,alpha = 0.7, label = 'Rainfall')
no_rainfall.hist(ax=ax,alpha = 0.7, label = 'No Rainfall')
plt.legend()
plt.show()
ax = pd.plotting.scatter_matrix(dataset, diagonal='kde')
Firstly, let's identify which features are important
# let's look at feature importance
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
def prepare_training_set_and_test_set(df, label_col, split):
"""
Split the dataset into training/test sets and separate the features and labels
"""
training_set = df.loc[df.index[:int(len(df.index)*split)]]
test_set = df.loc[df.index[int(len(df.index)*split):]]
y_train = training_set[label_col].values
X_train = training_set.drop(columns = label_col)
X_train = X_train.values.astype(np.float64)
y_test = test_set[label_col].values
X_test = test_set.drop(columns = label_col)
X_test = X_test.values.astype(np.float64)
return X_train, y_train, X_test, y_test, training_set, test_set
X_train, y_train, X_test, y_test, df_training_set, df_test_set = prepare_training_set_and_test_set(dataset, 'Demand', 0.8)
def mape(y_true, y_pred):
return round(np.mean(np.abs((y_true - y_pred) / y_true)) * 100, 2)
"""
Next, let's choose a model to use.
We will evaluate each model's performance using the mean absolute percentage error (MAPE).
"""
models = {
"GradientBoostingRegressor":GradientBoostingRegressor(),
"DecisionTreeRegressor":DecisionTreeRegressor(),
"LinearRegression":LinearRegression(),
"SGDRegressor" : SGDRegressor(),
"KernelRidge" : KernelRidge(),
"ElasticNet" : ElasticNet(),
"BayesianRidge" : BayesianRidge(),
"SVR" : SVR()}
for k,m in models.items():
mod = m.fit(X_train, y_train)
print(k, mape(y_test, mod.predict(X_test)))
# GradientBoostingRegressor seems to do a good job here
| Model | MAPE (%) |
| --- | --- |
| GradientBoostingRegressor | 6.36 |
| DecisionTreeRegressor | 7.14 |
| LinearRegression | 6.4 |
| SGDRegressor | 1452975775542.07 |
| KernelRidge | 14.79 |
| ElasticNet | 6.41 |
| BayesianRidge | 6.41 |
| SVR | 6.74 |
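The enormous SGDRegressor error is almost certainly down to unscaled features rather than the algorithm itself; here is a hedged sketch of how one might re-test it with scaling (results not shown here):
# Sketch: SGDRegressor (and KernelRidge) are sensitive to feature scale, so wrap the model in a scaling pipeline.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_sgd = make_pipeline(StandardScaler(), SGDRegressor())
scaled_sgd.fit(X_train, y_train)
print("Scaled SGDRegressor MAPE:", mape(y_test, scaled_sgd.predict(X_test)))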
Once we've chosen our model, we fit it to the data, then use the 'shap' package to identify feature importance
rgr = GradientBoostingRegressor().fit(X_train, y_train)
import shap
explainer = shap.Explainer(rgr.predict, X_train.astype(np.float64))
shap_values = explainer(df_test_set.drop(columns = ['Demand']).astype(np.float64))
fig, ax = plt.subplots()
ax = shap.plots.bar(shap_values)
fig, ax = plt.subplots()
ax = shap.summary_plot(shap_values, alpha=0.5)
The plots above show that, as expected, max temperature is a very important feature when predicting the demand on a given day. Day of the year is also quite important; however, this may just be a proxy for temperature.
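One quick way to sanity-check the 'proxy for temperature' idea (a rough sketch, not a formal test) is to look at how strongly the seasonal mean temperature tracks the seasonal mean demand:
# Sketch: correlation between mean demand and mean max temperature for each day of the year.
seasonal_means = dataset.groupby('DayOfYear')[['Demand', 'max_air_temp']].mean()
print(seasonal_means['Demand'].corr(seasonal_means['max_air_temp']))  # a high value supports the proxy explanation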
The above model simply predicts a given day's demand when given that day's temperature, rainfall, etc. However, we want to forecast the demand for n days into the future. To do this, I am assuming that forecast weather data will be available at prediction time, and that the previous days' demand data will also be available.
# Now we need to shift the data
def shift_df_forward(df, shift_rows_forward):
"""
Shift the dataframe forwards in time
"""
dfs = []
for n in range(1,shift_rows_forward+1):
cols = [c + f' (t+{n})' for c in df.columns]
new_df = df.shift(-n)
new_df.columns = cols
dfs.append(new_df)
df_shifted_forwards = pd.concat(dfs, axis=1)
return df_shifted_forwards
def shift_df_backwards(df, shift_rows_backwards):
"""
Shift the dataframe backwards in time
"""
dfs = []
for n in range(1,shift_rows_backwards+1):
cols = [c + f' (t-{n})' for c in df.columns]
new_df = df.shift(n)
new_df.columns = cols
dfs.append(new_df)
df_shifted_backwards = pd.concat(dfs, axis=1)
return df_shifted_backwards
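# To make the shifting concrete, here is a tiny illustration on a toy frame
# (a sketch only; 'toy' is a hypothetical example, not part of the pipeline):
toy = pd.DataFrame({'x': [10, 20, 30]})
print(shift_df_backwards(toy, 1))
#    x (t-1)
# 0      NaN
# 1     10.0
# 2     20.0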
n_days_ahead_to_predict = 2
shifted_dataset_backwards = shift_df_backwards(dataset,7) # we provide 7 days of data prior to 'today'
shifted_dataset_forwards = shift_df_forward(dataset,n_days_ahead_to_predict)
shifted_dataset = pd.concat([dataset,shifted_dataset_backwards,shifted_dataset_forwards], axis=1)
print(shifted_dataset.columns)
shifted_dataset = shifted_dataset.dropna()
shifted_dataset = shifted_dataset.sort_index()
Output (condensed): the shifted dataset has 90 columns, i.e. the 9 base features plus their (t-1) to (t-7) and (t+1), (t+2) shifted copies.
def prepare_training_set_and_test_set_with_shifts(df, split):
training_set = df.loc[df.index[:int(len(df.index)*split)]]
test_set = df.loc[df.index[int(len(df.index)*split):]]
training_set = training_set.sort_index()
test_set = test_set.sort_index()
return training_set, test_set
def clean_training_and_test_sets(training_set, test_set, label_col, predict_days_ahead):
"""
We need to remove the additional demand columns that we would not have if we were predicting new data.
i.e. if we are predicting day n+7, we're not going to have days n+1 -> n+6
"""
y_train = training_set[label_col + f' (t+{predict_days_ahead})'].values
all_future_demand_cols = training_set.columns[training_set.columns.str.contains(label_col+" \(t\+")]
X_train = training_set.drop(columns = all_future_demand_cols)
X_train = X_train.values.astype(np.float64)
y_test = test_set[label_col + f' (t+{predict_days_ahead})'].values
X_test = test_set.drop(columns = all_future_demand_cols)
X_test = X_test.values.astype(np.float64)
return X_train, y_train, X_test, y_test
def prepare_row_for_prediction(row, label_col, predict_days_ahead):
y_true = row[label_col + f' (t+{predict_days_ahead})']
all_future_demand_cols = row.index[row.index.str.contains(label_col+" \(t\+")]
cleaned_row = row.drop(index=all_future_demand_cols, axis=0)
return y_true, cleaned_row
df_training_set, df_test_set = prepare_training_set_and_test_set_with_shifts(shifted_dataset, 0.9)
X_train, y_train, X_test, y_test = clean_training_and_test_sets(df_training_set, df_test_set, 'Demand', n_days_ahead_to_predict)
rgr = GradientBoostingRegressor(random_state=123).fit(X_train, y_train)
explainer = shap.Explainer(rgr.predict, X_train.astype(np.float64))
all_future_demand_cols = df_test_set.columns[df_test_set.columns.str.contains('Demand'+" \(t\+")]
shap_values = explainer(df_test_set.drop(columns = all_future_demand_cols).astype(np.float64))
fig, ax = plt.subplots()
ax = shap.plots.bar(shap_values, max_display=30)
fig, ax = plt.subplots()
ax = shap.summary_plot(shap_values, alpha = 0.5)
There are two approaches we can take next: train a separate single-output model for each day ahead (multiple single-output models), or train one model that predicts all of the days at once (a single multi-output model). We'll build both and compare them.
############ single output models
n_days_ahead_to_predict = 7
shifted_dataset_backwards = shift_df_backwards(dataset,7)
shifted_dataset_forwards = shift_df_forward(dataset,n_days_ahead_to_predict)
shifted_dataset = pd.concat([dataset,shifted_dataset_backwards,shifted_dataset_forwards], axis=1)
print(shifted_dataset.columns)
shifted_dataset = shifted_dataset.dropna()
shifted_dataset = shifted_dataset.sort_index()
##########################################
random_date = datetime(2021,7,29) # random date in the test set
df_training_set, df_test_set = prepare_training_set_and_test_set_with_shifts(shifted_dataset, 0.9)
# Next, select a random set of rows (20 days before the random date above)
random_rows = df_test_set.loc[(df_test_set.index>random_date-timedelta(days=20)) & (df_test_set.index<=random_date)]
df_test_set_method1 = df_test_set.copy()
df_output = pd.DataFrame([])
t1 = datetime.now()
for d in range(1,n_days_ahead_to_predict+1):
print(f'{d:#^20}')
#####################################
X_train, y_train, X_test, y_test = clean_training_and_test_sets(df_training_set, df_test_set, 'Demand', d) # train on day d's demand as the label for this iteration
#######################################
"""
Here, we are splitting the training/test set into a set of features
with a single label column (and removing all of the extra demand
data that we would not have if we were predicting a new dataset).
Then, we get a random row from the test set, use this data to predict (using a GradientBoostingRegressor)
a median value, a 5th percentile value, and a 95th percentile value for the label and add these to a dataframe to later compare.
"""
y_true, row_to_predict = prepare_row_for_prediction(random_rows.iloc[-1], 'Demand', d)
df_output.loc[random_rows.iloc[-1].name+timedelta(days=d), 'True'] = y_true
for alpha in [0.05, 0.5, 0.95]:
rgr = GradientBoostingRegressor(loss='quantile', alpha = alpha).fit(X_train, y_train)
pred = rgr.predict(row_to_predict.values.reshape(1, -1))
df_output.loc[random_rows.iloc[-1].name+timedelta(days=d), f'Prediction_{alpha}'] = pred[0]
df_test_set_method1[f'Prediction (t+{d})_{alpha}'] = rgr.predict(X_test)
print(f"Time to train method 2: {datetime.now()-t1}")
Output (condensed): the shifted dataset now has 135 columns (the 9 base features plus their (t-1) to (t-7) and (t+1) to (t+7) shifted copies); the loop prints a header for each of days 1 to 7. Time to train method 1: 0:02:50.153578
"""
plot the predictions of a random set of 7 days
"""
def plot_method1_output():
fig, ax = plt.subplots()
ax_single_output_model = random_rows['Demand'].plot()
df_output.plot(ax = ax_single_output_model)
ax_single_output_model.fill_between(df_output.index, df_output['Prediction_0.05'],df_output['Prediction_0.95'], interpolate=True, color='skyblue', alpha=0.5)
ax_single_output_model.set_title(f'Single Output Forecasting Model using Gradient Boosting')
ax_single_output_model.legend()
return ax_single_output_model
plot_method1_output()
"""
Extract the column names for the median, upper bound, and lower bound predictions.
"""
median_cols = df_test_set_method1.columns[df_test_set_method1.columns.str.endswith('0.5')]
upper_bound_cols = df_test_set_method1.columns[df_test_set_method1.columns.str.endswith('0.95')]
lower_bound_cols = df_test_set_method1.columns[df_test_set_method1.columns.str.endswith('0.05')]
############# Multi output model
all_label_cols = df_test_set.columns[df_test_set.columns.str.contains('Demand'+" \(t\+")]
all_feature_cols = df_test_set.columns[~df_test_set.columns.str.contains('Demand'+" \(t\+")]
X_train = df_training_set[all_feature_cols].values
y_train = df_training_set[all_label_cols].values
X_test = df_test_set[all_feature_cols].values
y_test = df_test_set[all_label_cols].values
df_test_set_method2 = df_test_set.copy()
"""
We need to use a MultiOutputRegressor in combination with a GradientBoostingRegressor,
as GradientBoostingRegressor does not support multi-output on its own.
"""
from sklearn.multioutput import MultiOutputRegressor
t1 = datetime.now()
for alpha in [0.05, 0.5, 0.95]:
rgr = MultiOutputRegressor(GradientBoostingRegressor(loss='quantile', alpha=alpha, random_state=123)).fit(X_train, y_train)
pred_cols = [f'Prediction (t+{d})_{alpha}' for d in range(1,y_test.shape[1]+1)]
df_test_set_method2.loc[:,pred_cols] = rgr.predict(X_test)
print(f"Time to train method 2: {datetime.now()-t1}")
Time to train method 2: 0:03:13.404533
"""
plot the predictions of a random set of 7 days
"""
def plot_method_2_output():
fig, ax = plt.subplots()
random_date_range = df_test_set_method2.index[(df_test_set_method2.index >random_date-timedelta(days=20)) & (df_test_set_method2.index <=random_date)]
predicted_date_range = df_test_set_method2.index[(df_test_set_method2.index >random_date) & (df_test_set_method2.index <=random_date+timedelta(days = n_days_ahead_to_predict))]
ax_multi_output_model = df_test_set_method2.loc[random_date_range,'Demand'].plot(label = 'History')
ax_multi_output_model = df_test_set_method2.loc[predicted_date_range,'Demand'].plot(label = 'Actual')
median_cols = df_test_set_method2.columns[df_test_set_method2.columns.str.endswith('0.5')]
upper_bound_cols = df_test_set_method2.columns[df_test_set_method2.columns.str.endswith('0.95')]
lower_bound_cols = df_test_set_method2.columns[df_test_set_method2.columns.str.endswith('0.05')]
ax_multi_output_model.plot(predicted_date_range,df_test_set_method2.loc[random_date,median_cols], label = 'Median Predicted')
ax_multi_output_model.plot(predicted_date_range,df_test_set_method2.loc[random_date,upper_bound_cols], label = 'Upper bound')
ax_multi_output_model.plot(predicted_date_range,df_test_set_method2.loc[random_date,lower_bound_cols], label = 'Lower Bound')
ax_multi_output_model.fill_between(predicted_date_range, df_test_set_method2.loc[random_date,lower_bound_cols].values.astype(np.float64),df_test_set_method2.loc[random_date,upper_bound_cols].values.astype(np.float64), interpolate=True, color='skyblue', alpha=0.5)
ax_multi_output_model.set_title(f'Multi Output Forecasting Model using Gradient Boosting')
ax_multi_output_model.legend()
return ax_multi_output_model
plot_method_2_output()
def evaluate_methods(df, nth_day, method_number, median_cols, upper_bound_cols, lower_bound_cols, plot=True):
"""
I chose 3 different evaluation metrics for the entire test set:
1. mean bound width (90% confidence interval)
2. percentage of days where actual demand is outside of these bounds
3. mean MAPE (using the median value as the predicted values)
"""
actual_demand_n_days_ahead = df.loc[:,df.columns.str.contains(f'Demand \(t\+{nth_day}\)')]
median_predicted_n_days_ahead = df[median_cols].loc[:,df[median_cols].columns.str.contains(f't\+{nth_day}\)')]
upper_bound_predicted_n_days_ahead = df[upper_bound_cols].loc[:,df[upper_bound_cols].columns.str.contains(f't\+{nth_day}\)')]
lower_bound_predicted_n_days_ahead = df[lower_bound_cols].loc[:,df[lower_bound_cols].columns.str.contains(f't\+{nth_day}\)')]
if plot:
ax = actual_demand_n_days_ahead.plot()
median_predicted_n_days_ahead.plot(ax=ax)
upper_bound_predicted_n_days_ahead.plot(ax=ax)
lower_bound_predicted_n_days_ahead.plot(ax=ax)
plt.title(f'Prediction Method {method_number} at day {nth_day}')
actual_data_above_upper_bound = (actual_demand_n_days_ahead.values>upper_bound_predicted_n_days_ahead.values).sum()
actual_data_below_lower_bound = (actual_demand_n_days_ahead.values<lower_bound_predicted_n_days_ahead.values).sum()
pctage_of_days_outside_of_bound = (actual_data_above_upper_bound+actual_data_below_lower_bound)/len(actual_demand_n_days_ahead)
mean_bound_width = np.mean(upper_bound_predicted_n_days_ahead.values - lower_bound_predicted_n_days_ahead.values)
mean_mape = mape(actual_demand_n_days_ahead.values,median_predicted_n_days_ahead.values )
return mean_bound_width, pctage_of_days_outside_of_bound, mean_mape
marker_size = 50
fig, ax1 = plt.subplots()
fig, ax2 = plt.subplots()
fig, ax3 = plt.subplots()
for n in range(1,n_days_ahead_to_predict+1):
mean_bound_width_method1, pctage_of_days_outside_of_bound_method1, mean_mape_method1 = evaluate_methods(df_test_set_method1, n, 1, median_cols, upper_bound_cols, lower_bound_cols, plot=False)
mean_bound_width_method2, pctage_of_days_outside_of_bound_method2, mean_mape_method2 = evaluate_methods(df_test_set_method2, n, 2, median_cols, upper_bound_cols, lower_bound_cols, plot=False)
if n ==1:
ax1.scatter(n, mean_bound_width_method1, c='b', marker = 'x', s=marker_size, label = 'Method 1 - Mean Bound Width')
ax1.scatter(n, mean_bound_width_method2, c='r', marker = 'o', s=marker_size, label = 'Method 2 - Mean Bound Width')
ax2.scatter(n, pctage_of_days_outside_of_bound_method1, c='b', marker = 'x', s=marker_size, label = 'Method 1 - % outside of bounds')
ax2.scatter(n, pctage_of_days_outside_of_bound_method2, c='r', marker = 'o', s=marker_size, label = 'Method 2 - % outside of bounds')
ax3.scatter(n, mean_mape_method1, c='b', marker = 'x', s=marker_size, label = 'Method 1 - Mean MAPE')
ax3.scatter(n, mean_mape_method2, c='r', marker = 'o', s=marker_size, label = 'Method 2 - Mean MAPE')
else:
ax1.scatter(n, mean_bound_width_method1, c='b', marker = 'x' ,s=marker_size)
ax1.scatter(n, mean_bound_width_method2, c='r', marker = 'o',s=marker_size)
ax2.scatter(n, pctage_of_days_outside_of_bound_method1, c='b', marker = 'x',s=marker_size)
ax2.scatter(n, pctage_of_days_outside_of_bound_method2, c='r', marker = 'o',s=marker_size)
ax3.scatter(n, mean_mape_method1, c='b', marker = 'x', s=marker_size)
ax3.scatter(n, mean_mape_method2, c='r', marker = 'o', s=marker_size)
ax1.legend()
ax2.legend()
ax3.legend()
Conclusion:
If I had more time I would like to: