import pandas as pd
#also require openpyxl to open xlsx file with pandas
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib qt
# need PyQt installed
import calendar
from datetime import datetime, timedelta

plt.rcParams['figure.figsize'] = [16, 10]

from statsmodels.tsa.seasonal import seasonal_decompose

# Load the daily demand data 
df_demand = pd.read_excel('Water Demand Data July 2022.xlsx')
df_demand['Date'] = pd.to_datetime(df_demand['Date'])
df_demand = df_demand.set_index('Date')

df_demand.plot(y='Demand', use_index=True)
# demand looks to have increased over time in this area
# looks like there may be some spurious datapoints in the dataset, 
# it doesn't seem likely that true demand would drop so significantly

<AxesSubplot:xlabel='Date'>


decomp = seasonal_decompose(df_demand, model='multiplicable', period=365)


ax_decomp = decomp.plot()


shift = 1
ax = df_demand.pct_change(shift).plot()
ax.axhline(-0.25, c='r') # 0.25 seems to be a sensible cutoff

dates_with_unusual_demand = df_demand[df_demand.pct_change(shift)<-0.25].dropna()
dates_with_unusual_demand


df_demand[(df_demand.index.month==12) & (df_demand.index.day==31)].hist(label = 'New Years Eve Demand Histogram')
plt.legend()

<matplotlib.legend.Legend at 0x231213bcc10>


df_demand[(df_demand.index.month==8) & (df_demand.index.day==2)].hist(label = "8th August Demand Histogram")
plt.legend()

<matplotlib.legend.Legend at 0x2312121dfa0>


"""
Because we are only interpolating non-consecutive days, we can simply replace the days with metering errors with the previous days data.
There are better ways of doing this, however, because it's only 3 days of data, it won't make a huge difference to our model output.
"""
df_demand_before = df_demand.copy()
df_demand.loc[df_demand.index.isin(dates_with_unusual_demand.index), 'Demand'] = df_demand.loc[df_demand.index.isin(dates_with_unusual_demand.index+timedelta(days=-1)), 'Demand']


fig, axs = plt.subplots(2)
df_demand_before.plot(ax=axs[0])
axs[0].set_title('Before cleaning')
df_demand.plot(ax=axs[1])
axs[1].set_title('After cleaning')
plt.legend()
del df_demand_before


"""
In this code block, we're going to group the demand data by Month and Year 
to see if there are any significant shifts in the Month-Month pattern of each year
"""

mean_monthly_demand = df_demand.groupby([df_demand.index.month, df_demand.index.year]).mean()
mean_monthly_demand.index = mean_monthly_demand.index.rename(['Month', 'Year'])

fig, axs = plt.subplots(2, figsize=(12,10))
for year in range(2007,2022):
    mean_monthly_by_year = mean_monthly_demand.loc[mean_monthly_demand.index.get_level_values('Year') == year, 'Demand']
    colour_weight = 1-((2022-year)/(2022-2007))+0.01
    axs[0].plot(mean_monthly_by_year.index.get_level_values('Month'), mean_monthly_by_year.values, alpha=colour_weight,  c = 'blue', label = year)
    max_month = mean_monthly_by_year[mean_monthly_by_year.values == mean_monthly_by_year.max()]
    
    min_month = mean_monthly_by_year[mean_monthly_by_year.values == mean_monthly_by_year.min()]
    
    axs[0].scatter(max_month.index.get_level_values('Month'), max_month.values,  c = 'r')
    axs[0].scatter(min_month.index.get_level_values('Month'), min_month.values,  c = 'g')
axs[0].set_title('Mean demand grouped by Month and Year')
axs[0].legend(bbox_to_anchor=(1., 1))


##############################################

"""
In this code block, we're going to group the demand data by Year 
and look at the variation in mean yearly demand.
"""

mean_yearly_demand = df_demand.groupby(df_demand.index.year).mean()

for year in range(mean_yearly_demand.index.min(),mean_yearly_demand.index.max()):
    mean_by_year = mean_yearly_demand.loc[mean_yearly_demand.index == year, 'Demand']
    axs[1].bar(mean_by_year.index, mean_by_year.values, color = 'blue',label = year)
axs[1].set_title('Mean demand grouped by Year')
axs[1].set_ylim(mean_yearly_demand['Demand'].min()-50, mean_yearly_demand['Demand'].max()+50)
plt.show()


##############################################

"""
Next, we're going to group the demand data by day of the week
and look for variations in the average weekday (MTWTFSS).
"""


fig, ax = plt.subplots(1)
mean_weekday_demand = df_demand.groupby(df_demand.index.weekday).mean()

weekdays=dict(enumerate(calendar.day_name))

for wd in range(0,7):
    mean_weekday = mean_weekday_demand.loc[mean_weekday_demand.index == wd, 'Demand']
    ax.bar(weekdays[mean_weekday.index[0]], mean_weekday.values, color = 'blue')
ax.set_title('Mean demand grouped by Day of week')
ax.set_ylim(mean_weekday_demand['Demand'].min()-5, mean_weekday_demand['Demand'].max()+5)
plt.show()

##############################################

"""
Next, let's see if public holidays show us anything interesting.
To do this, we use the 'holidays' package, then get the UKs holidays for each year.
Because we are analysing data from the West Midlands area, 
we only want the holidays celebrated in England (or all Nations of the UK on the same date, e.g. Christmas)
"""

import holidays
uk_holidays = [holidays.UnitedKingdom(years=y) for y in range(2007,2022)]
uk_holidays = {'Holiday': dict(pair for d in uk_holidays for pair in d.items())}

df_holidays = pd.DataFrame.from_dict(uk_holidays)
df_holidays_celebrated_by_all_nations = df_holidays[~df_holidays['Holiday'].str.contains("]")]
df_holidays_celebrated_by_one_or_multiple_nations = df_holidays[df_holidays['Holiday'].str.contains("]")]
df_holidays_celebrated_in_England = df_holidays_celebrated_by_one_or_multiple_nations[df_holidays_celebrated_by_one_or_multiple_nations['Holiday'].str.contains('England')]
df_holidays_celebrated_in_England = pd.concat([df_holidays_celebrated_in_England,df_holidays_celebrated_by_all_nations])
df_holidays_celebrated_in_England.index = pd.to_datetime(df_holidays_celebrated_in_England.index)

#####

fig, ax = plt.subplots(1)
mean_weekday_demand = df_demand.groupby(df_demand.index.dayofyear).mean()
mean_weekday_demand.plot(ax=ax)

ax.set_title('Mean demand grouped by Day of Year (holidays in dashed red)')
ax.set_ylim(mean_weekday_demand['Demand'].min()-5, mean_weekday_demand['Demand'].max()+5)
[ax.axvline(h, c='r', alpha = 0.7, ls='--') for h in df_holidays_celebrated_in_England.index.dayofyear.drop_duplicates()]
plt.show()


from scipy.stats import ttest_ind, mannwhitneyu
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot

holiday_demand = df_demand[df_demand.index.isin(df_holidays_celebrated_in_England.index)]

# let's take a random sample of non-holiday days to compare
random_sample_of_non_holiday_demand = df_demand[~df_demand.index.isin(df_holidays_celebrated_in_England.index)].sample(len(df_holidays_celebrated_in_England), random_state = 123)

qqplot(holiday_demand.sort_values(by='Demand'), line='s')
qqplot(random_sample_of_non_holiday_demand.sort_values(by='Demand'), line='s')
# it looks like both datasets are approximately normally distributed, we can use a T test



ax = (random_sample_of_non_holiday_demand).hist(label = 'Non-holiday demand', alpha = 0.7, bins=20)
holiday_demand.hist(ax = ax, label = 'Holiday demand', alpha = 0.7, bins=20)
plt.legend()



test_result, p_value = ttest_ind(holiday_demand,random_sample_of_non_holiday_demand, axis=0)
print(p_value)
# this dataset shows that there is less demand during holidays than non holidays, worth keeping in as a feature

[0.00016727]


def format_temperature_data(df):
    """
    As the Met Office temperature time series data has a 12 hour interval, we want to convert this to 24 hours (to match our demand dataset)
    """
    df['ob_end_time'] = pd.to_datetime(df['ob_end_time'])
    df['Date'] = df['ob_end_time'].dt.date
    max_air_temp = df.groupby('Date').max()['max_air_temp']
    min_air_temp = df.groupby('Date').min()['min_air_temp']

    air_temps = pd.concat([max_air_temp, min_air_temp], axis=1)
    return air_temps

def format_rainfall_data(df):
    """
    Load Midas rainfall dataset. Data is at 24 hour intervals so no need to group.
    """
    df['ob_date'] = pd.to_datetime(df['ob_date'])
    df = df.set_index('ob_date')
    return df['prcp_amt']

def format_wind_data(df):
    """
    Load Midas wind dataset. Data is at 1 hour intervals we need to group by date and take the mean windfall.
    """
    df['ob_end_time'] = pd.to_datetime(df['ob_end_time'])
    df['Date'] = df['ob_end_time'].dt.date
    mean_wind_df = df.groupby('Date').mean()['mean_wind_speed']
    return mean_wind_df
def load_weather_data(path, header_size, data_type):
    """
    Single function to load all data types.
    """
    print(f'\n{data_type:#^20} ')
    
    dfs = []
    for year in range(2007,2022):
        print(f'\n{year:#^20} ')
        file_path = f'{path}_{year}.csv'
        df_temporary = pd.read_csv(file_path, skiprows = header_size, skipfooter=1)
        if data_type=='Temperature':
            df_temporary = format_temperature_data(df_temporary)
        elif data_type=='Rainfall':
            df_temporary = format_rainfall_data(df_temporary)
        elif data_type=='Wind':
            df_temporary = format_wind_data(df_temporary)
            

        dates_in_year = pd.date_range(start=f"{year}-01-01", end=f"{year}-12-31")
        missing_dates = dates_in_year.difference(df_temporary.index)
        if len(missing_dates)>0: 
            print("Missing dates: ", missing_dates.values)
        dfs.append(df_temporary)
    df = pd.concat(dfs)
    return df

temperature_df = load_weather_data('./Temperature Data/midas-open_uk-daily-temperature-obs_dv-202207_west-midlands_24102_coventry-coundon_qcv-1', 90, 'Temperature')
rainfall_df = load_weather_data('./Rainfall Data/midas-open_uk-daily-rain-obs_dv-202207_shropshire_00636_edge_qcv-1', 61, 'Rainfall')
wind_df = load_weather_data('./Wind Data/midas-open_uk-mean-wind-obs_dv-202207_west-midlands_24102_coventry-coundon_qcv-1', 79, 'Wind')

####Temperature##### 

########2007########

c:\users\johnd\appdata\local\programs\python\python38\lib\site-packages\pandas\util\_decorators.py:311: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  return func(*args, **kwargs)

Missing dates:  ['2007-01-27T00:00:00.000000000' '2007-01-28T00:00:00.000000000'
 '2007-01-29T00:00:00.000000000' '2007-01-30T00:00:00.000000000'
 '2007-01-31T00:00:00.000000000']

########2008######## 

########2009######## 

########2010######## 

########2011######## 

########2012######## 

########2013######## 

########2014######## 

########2015######## 

########2016######## 

########2017######## 

########2018######## 

########2019######## 
Missing dates:  ['2019-07-10T00:00:00.000000000' '2019-07-11T00:00:00.000000000'
 '2019-07-12T00:00:00.000000000' '2019-07-13T00:00:00.000000000'
 '2019-07-14T00:00:00.000000000' '2019-07-15T00:00:00.000000000'
 '2019-07-16T00:00:00.000000000' '2019-07-17T00:00:00.000000000'
 '2019-07-18T00:00:00.000000000' '2019-07-19T00:00:00.000000000']

########2020######## 

########2021######## 

######Rainfall###### 

########2007######## 

########2008######## 

########2009######## 

########2010######## 

########2011######## 

########2012######## 

########2013######## 

########2014######## 

########2015######## 

########2016######## 

########2017######## 

########2018######## 
Missing dates:  ['2018-06-21T00:00:00.000000000' '2018-06-22T00:00:00.000000000'
 '2018-06-23T00:00:00.000000000' '2018-08-02T00:00:00.000000000'
 '2018-08-03T00:00:00.000000000' '2018-08-04T00:00:00.000000000'
 '2018-08-05T00:00:00.000000000' '2018-08-06T00:00:00.000000000'
 '2018-08-07T00:00:00.000000000' '2018-08-08T00:00:00.000000000'
 '2018-08-09T00:00:00.000000000' '2018-08-10T00:00:00.000000000'
 '2018-08-11T00:00:00.000000000' '2018-08-12T00:00:00.000000000'
 '2018-08-13T00:00:00.000000000' '2018-08-14T00:00:00.000000000'
 '2018-08-15T00:00:00.000000000' '2018-08-16T00:00:00.000000000'
 '2018-08-17T00:00:00.000000000' '2018-08-18T00:00:00.000000000'
 '2018-08-19T00:00:00.000000000' '2018-08-20T00:00:00.000000000'
 '2018-08-21T00:00:00.000000000' '2018-08-22T00:00:00.000000000'
 '2018-08-23T00:00:00.000000000' '2018-08-24T00:00:00.000000000'
 '2018-08-25T00:00:00.000000000' '2018-08-26T00:00:00.000000000'
 '2018-08-27T00:00:00.000000000' '2018-08-28T00:00:00.000000000'
 '2018-08-29T00:00:00.000000000' '2018-08-30T00:00:00.000000000'
 '2018-08-31T00:00:00.000000000']

########2019######## 

########2020######## 

########2021######## 

########Wind######## 

########2007######## 
Missing dates:  ['2007-01-27T00:00:00.000000000' '2007-01-28T00:00:00.000000000'
 '2007-01-29T00:00:00.000000000' '2007-01-30T00:00:00.000000000'
 '2007-01-31T00:00:00.000000000']

########2008######## 

########2009######## 

########2010######## 
Missing dates:  ['2010-07-31T00:00:00.000000000' '2010-08-01T00:00:00.000000000']

########2011######## 

########2012######## 

########2013######## 

########2014######## 

########2015######## 

########2016######## 

########2017######## 

########2018######## 

########2019######## 
Missing dates:  ['2019-07-11T00:00:00.000000000' '2019-07-12T00:00:00.000000000'
 '2019-07-13T00:00:00.000000000' '2019-07-14T00:00:00.000000000'
 '2019-07-15T00:00:00.000000000' '2019-07-16T00:00:00.000000000'
 '2019-07-17T00:00:00.000000000' '2019-07-18T00:00:00.000000000']

########2020######## 

########2021########


dataset = pd.merge(df_demand,temperature_df, left_index=True, right_index = True,how="outer")
dataset = pd.merge(dataset,rainfall_df, left_index=True, right_index = True,how="outer")
dataset = pd.merge(dataset,wind_df, left_index=True, right_index = True,how="outer")
dataset = pd.merge(dataset,df_holidays_celebrated_in_England, left_index=True, right_index = True,how="outer")

dataset.loc[~dataset['Holiday'].isna(), 'Holiday'] = 1
dataset.loc[dataset['Holiday'].isna(), 'Holiday'] = 0

dataset['Month'] = dataset.index.month
dataset['DayOfYear'] = dataset.index.dayofyear
dataset['DayOfWeek'] = dataset.index.dayofweek


ax = dataset.plot.scatter(x='max_air_temp', y='Demand', label = 'Max Air temp vs Demand', alpha = 0.5)
ax = dataset.plot.scatter(x='min_air_temp', y='Demand', label = 'Min Air temp vs Demand', alpha = 0.5)
ax = dataset.plot.scatter(x='min_air_temp', y='max_air_temp', label = 'Min Air temp vs Max Air Temp', alpha = 0.5)
ax = dataset.plot.scatter(x='prcp_amt', y='Demand', label = 'Rainfall vs Demand', alpha = 0.5)
ax = dataset.plot.scatter(x='mean_wind_speed', y='Demand', label = 'Wind Speed vs Demand', alpha = 0.5)


"""
because we don't have 2022 data for the weather data, let's cut this year out
"""
dataset = dataset[dataset.index.year <2022]


dataset[dataset.isna()['Demand']==True]


"""
because we are working with time series data,
we may want to interpolate the dataset to make sure any nans are filled sensibly
"""
print("Nans Before: \n", dataset.isna().sum())
print(" ")
#still a few nans in the dataset, we can simply interpolate these rows
dataset = dataset.interpolate()
dataset = dataset.dropna()
print("Nans After:\n", dataset.isna().sum())

dataset = dataset.sort_index()

Nans Before: 
 Demand              3
max_air_temp       15
min_air_temp       16
prcp_amt           33
mean_wind_speed    15
Holiday             0
Month               0
DayOfYear           0
DayOfWeek           0
dtype: int64
 
Nans After:
 Demand             0
max_air_temp       0
min_air_temp       0
prcp_amt           0
mean_wind_speed    0
Holiday            0
Month              0
DayOfYear          0
DayOfWeek          0
dtype: int64


rainfall = dataset[dataset['prcp_amt']==0]['Demand'].dropna()
no_rainfall = dataset[dataset['prcp_amt']>0]['Demand'].dropna()

fig, ax = plt.subplots()
rainfall.hist(ax = ax,alpha = 0.7, label = 'Rainfall')
no_rainfall.hist(ax=ax,alpha = 0.7, label = 'No Rainfall')
plt.legend()
plt.show()


ax = pd.plotting.scatter_matrix(dataset, diagonal='kde')


# let's look at feature importance
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR


def prepare_training_set_and_test_set(df, label_col, split):
    """
    Split the dataset into training/test sets and separate the features and labels
    """
    training_set = df.loc[df.index[:int(len(df.index)*split)]]
    test_set = df.loc[df.index[int(len(df.index)*split):]]
    
    y_train = training_set[label_col].values
    X_train = training_set.drop(columns = label_col)
    X_train = X_train.values.astype(np.float64)
    
    y_test = test_set[label_col].values
    X_test = test_set.drop(columns = label_col)
    X_test = X_test.values.astype(np.float64)
    return X_train, y_train, X_test, y_test, training_set, test_set

X_train, y_train, X_test, y_test, df_training_set, df_test_set = prepare_training_set_and_test_set(dataset, 'Demand', 0.8)



def mape(y_true, y_pred):
    return round(np.mean(np.abs((y_true - y_pred) / y_true)) * 100, 2)

"""
Next, let's choose a model for us to use. 
We will evaluate each models performace using the mean absolute percentage error (MAPE)
"""


models = {
"GradientBoostingRegressor":GradientBoostingRegressor(),
"DecisionTreeRegressor":DecisionTreeRegressor(),
"LinearRegression":LinearRegression(),
"SGDRegressor" : SGDRegressor(),
"KernelRidge" : KernelRidge(),
"ElasticNet" : ElasticNet(),
"BayesianRidge" : BayesianRidge(),
"SVR" : SVR()}

for k,m in models.items():
    mod = m.fit(X_train, y_train)
    print(k, mape(y_test, mod.predict(X_test)))
    
# GradientBoostingRegressor seems to do a good job here

GradientBoostingRegressor 6.36
DecisionTreeRegressor 7.14
LinearRegression 6.4
SGDRegressor 1452975775542.07
KernelRidge 14.79
ElasticNet 6.41
BayesianRidge 6.41
SVR 6.74


rgr = GradientBoostingRegressor().fit(X_train, y_train)


import shap
explainer = shap.Explainer(rgr.predict, X_train.astype(np.float64))

shap_values = explainer(df_test_set.drop(columns = ['Demand']).astype(np.float64))

Exact explainer: 1097it [00:17, 33.40it/s]


fig, ax = plt.subplots()
ax = shap.plots.bar(shap_values)


fig, ax = plt.subplots()
ax = shap.summary_plot(shap_values, alpha=0.5)


# Now we need to shift the data


def shift_df_forward(df, shift_rows_forward):
    """ 
    Shift the dataframe forwards in time
    """
    dfs = []
    for n in range(1,shift_rows_forward+1):
        cols = [c + f' (t+{n})' for c in df.columns]
        new_df = df.shift(-n)
        new_df.columns = cols
        
        dfs.append(new_df)
    df_shifted_forwards = pd.concat(dfs, axis=1)
    
    return df_shifted_forwards
    
def shift_df_backwards(df, shift_rows_backwards):
    """ 
    Shift the dataframe backwards in time
    """
    dfs = []
    for n in range(1,shift_rows_backwards+1):
        cols = [c + f' (t-{n})' for c in df.columns]
        new_df = df.shift(n)
        new_df.columns = cols
        
        dfs.append(new_df)
    df_shifted_backwards = pd.concat(dfs, axis=1)
    
    return df_shifted_backwards


n_days_ahead_to_predict = 2

shifted_dataset_backwards = shift_df_backwards(dataset,7) # we provide 7 days of data prior to 'today'
shifted_dataset_forwards = shift_df_forward(dataset,n_days_ahead_to_predict)

shifted_dataset = pd.concat([dataset,shifted_dataset_backwards,shifted_dataset_forwards], axis=1)
print(shifted_dataset.columns)
shifted_dataset = shifted_dataset.dropna()

shifted_dataset = shifted_dataset.sort_index()

Index(['Demand', 'max_air_temp', 'min_air_temp', 'prcp_amt', 'mean_wind_speed',
       'Holiday', 'Month', 'DayOfYear', 'DayOfWeek', 'Demand (t-1)',
       'max_air_temp (t-1)', 'min_air_temp (t-1)', 'prcp_amt (t-1)',
       'mean_wind_speed (t-1)', 'Holiday (t-1)', 'Month (t-1)',
       'DayOfYear (t-1)', 'DayOfWeek (t-1)', 'Demand (t-2)',
       'max_air_temp (t-2)', 'min_air_temp (t-2)', 'prcp_amt (t-2)',
       'mean_wind_speed (t-2)', 'Holiday (t-2)', 'Month (t-2)',
       'DayOfYear (t-2)', 'DayOfWeek (t-2)', 'Demand (t-3)',
       'max_air_temp (t-3)', 'min_air_temp (t-3)', 'prcp_amt (t-3)',
       'mean_wind_speed (t-3)', 'Holiday (t-3)', 'Month (t-3)',
       'DayOfYear (t-3)', 'DayOfWeek (t-3)', 'Demand (t-4)',
       'max_air_temp (t-4)', 'min_air_temp (t-4)', 'prcp_amt (t-4)',
       'mean_wind_speed (t-4)', 'Holiday (t-4)', 'Month (t-4)',
       'DayOfYear (t-4)', 'DayOfWeek (t-4)', 'Demand (t-5)',
       'max_air_temp (t-5)', 'min_air_temp (t-5)', 'prcp_amt (t-5)',
       'mean_wind_speed (t-5)', 'Holiday (t-5)', 'Month (t-5)',
       'DayOfYear (t-5)', 'DayOfWeek (t-5)', 'Demand (t-6)',
       'max_air_temp (t-6)', 'min_air_temp (t-6)', 'prcp_amt (t-6)',
       'mean_wind_speed (t-6)', 'Holiday (t-6)', 'Month (t-6)',
       'DayOfYear (t-6)', 'DayOfWeek (t-6)', 'Demand (t-7)',
       'max_air_temp (t-7)', 'min_air_temp (t-7)', 'prcp_amt (t-7)',
       'mean_wind_speed (t-7)', 'Holiday (t-7)', 'Month (t-7)',
       'DayOfYear (t-7)', 'DayOfWeek (t-7)', 'Demand (t+1)',
       'max_air_temp (t+1)', 'min_air_temp (t+1)', 'prcp_amt (t+1)',
       'mean_wind_speed (t+1)', 'Holiday (t+1)', 'Month (t+1)',
       'DayOfYear (t+1)', 'DayOfWeek (t+1)', 'Demand (t+2)',
       'max_air_temp (t+2)', 'min_air_temp (t+2)', 'prcp_amt (t+2)',
       'mean_wind_speed (t+2)', 'Holiday (t+2)', 'Month (t+2)',
       'DayOfYear (t+2)', 'DayOfWeek (t+2)'],
      dtype='object')


def prepare_training_set_and_test_set_with_shifts(df, split):
    training_set = df.loc[df.index[:int(len(df.index)*split)]]
    test_set = df.loc[df.index[int(len(df.index)*split):]]
    
    training_set = training_set.sort_index()
    test_set = test_set.sort_index()
    return training_set, test_set

def clean_training_and_test_sets(training_set, test_set, label_col, predict_days_ahead):
    """
    We need to remove the additional demand columns that we would not have if we were predicting new data.
    i.e. if we are predicting day n+7, we're not going to have days n+1 -> n+6
    """
    y_train = training_set[label_col + f' (t+{predict_days_ahead})'].values
    all_future_demand_cols = training_set.columns[training_set.columns.str.contains(label_col+" \(t\+")]
    X_train = training_set.drop(columns = all_future_demand_cols)
    X_train = X_train.values.astype(np.float64)
    
    y_test = test_set[label_col + f' (t+{predict_days_ahead})'].values
    X_test = test_set.drop(columns = all_future_demand_cols)
    X_test = X_test.values.astype(np.float64)
    
    return X_train, y_train, X_test, y_test

def prepare_row_for_prediction(row, label_col, predict_days_ahead):
    y_true = row[label_col + f' (t+{d})']
    
    all_future_demand_cols = row.index[row.index.str.contains(label_col+" \(t\+")]
    
    cleaned_row = row.drop(index=all_future_demand_cols, axis=0)
    return y_true, cleaned_row


df_training_set, df_test_set = prepare_training_set_and_test_set_with_shifts(shifted_dataset,   0.9)
X_train, y_train, X_test, y_test = clean_training_and_test_sets(df_training_set, df_test_set, 'Demand', n_days_ahead_to_predict)


rgr = GradientBoostingRegressor(random_state=123).fit(X_train, y_train)


explainer = shap.Explainer(rgr.predict, X_train.astype(np.float64))

all_future_demand_cols = df_test_set.columns[df_test_set.columns.str.contains('Demand'+" \(t\+")]

shap_values = explainer(df_test_set.drop(columns = all_future_demand_cols).astype(np.float64))

Permutation explainer: 548it [00:53,  8.79it/s]


fig, ax = plt.subplots()
ax = shap.plots.bar(shap_values, max_display=30)


fig, ax = plt.subplots()
ax = shap.summary_plot(shap_values, alpha = 0.5)


############ single output models

n_days_ahead_to_predict = 7

shifted_dataset_backwards = shift_df_backwards(dataset,7)
shifted_dataset_forwards = shift_df_forward(dataset,n_days_ahead_to_predict)

shifted_dataset = pd.concat([dataset,shifted_dataset_backwards,shifted_dataset_forwards], axis=1)
print(shifted_dataset.columns)
shifted_dataset = shifted_dataset.dropna()

shifted_dataset = shifted_dataset.sort_index()

##########################################

random_date = datetime(2021,7,29) # random date in the test set

df_training_set, df_test_set = prepare_training_set_and_test_set_with_shifts(shifted_dataset,   0.9)

# Next, select a random set of rows (20 days before the random date above)
random_rows = df_test_set.loc[(df_test_set.index>random_date-timedelta(days=20)) & (df_test_set.index<=random_date)]

df_test_set_method1 = df_test_set.copy()

df_output = pd.DataFrame([])

t1 = datetime.now()
for d in range(1,n_days_ahead_to_predict+1):
    print(f'{d:#^20}')
    
    #####################################
    
    X_train, y_train, X_test, y_test = clean_training_and_test_sets(df_training_set, df_test_set, 'Demand', n_days_ahead_to_predict)
    
    #######################################
    
    

    """
    Here, we are splitting the training/test set into a set of features 
    with a single label column (and removing all of the extra demand 
    data that we would not have if we were predicting a new dataset).
    
    Then, we get a random row from the test set, use this data to predict (using a GradientBoostingRegressor)
    a median value, a 5th percentile value, and a 95th percentile value for the label and add these to a dataframe to later compare.
    
    """
    
    
    y_true, row_to_predict = prepare_row_for_prediction(random_rows.iloc[-1], 'Demand', d)
    df_output.loc[random_rows.iloc[-1].name+timedelta(days=d), 'True'] = y_true
    
    for alpha in [0.05, 0.5, 0.95]:
        rgr = GradientBoostingRegressor(loss='quantile', alpha = alpha).fit(X_train, y_train)
        pred = rgr.predict(row_to_predict.values.reshape(1, -1))
        df_output.loc[random_rows.iloc[-1].name+timedelta(days=d), f'Prediction_{alpha}'] = pred[0]
        
        df_test_set_method1[f'Prediction (t+{d})_{alpha}'] = rgr.predict(X_test)


print(f"Time to train method 2: {datetime.now()-t1}")

Index(['Demand', 'max_air_temp', 'min_air_temp', 'prcp_amt', 'mean_wind_speed',
       'Holiday', 'Month', 'DayOfYear', 'DayOfWeek', 'Demand (t-1)',
       ...
       'DayOfWeek (t+6)', 'Demand (t+7)', 'max_air_temp (t+7)',
       'min_air_temp (t+7)', 'prcp_amt (t+7)', 'mean_wind_speed (t+7)',
       'Holiday (t+7)', 'Month (t+7)', 'DayOfYear (t+7)', 'DayOfWeek (t+7)'],
      dtype='object', length=135)
#########1##########
#########2##########
#########3##########
#########4##########
#########5##########
#########6##########
#########7##########
Time to train method 2: 0:02:50.153578


"""
plot the predictions of a random set of 7 days
"""
def plot_method1_output():
    fig, ax = plt.subplots()
    ax_single_output_model = random_rows['Demand'].plot()
    df_output.plot(ax = ax_single_output_model)
    ax_single_output_model.fill_between(df_output.index, df_output['Prediction_0.05'],df_output['Prediction_0.95'], interpolate=True, color='skyblue', alpha=0.5)

    ax_single_output_model.set_title(f'Single Output Forecasting Model using Gradient Boosting')
    ax_single_output_model.legend()
    return ax_single_output_model
plot_method1_output()

<AxesSubplot:title={'center':'Single Output Forecasting Model using Gradient Boosting'}>


"""
Extract the columns names for the median, upper and lower bound predictions
"""
median_cols = df_test_set_method1.columns[df_test_set_method1.columns.str.endswith('0.5')]
upper_bound_cols = df_test_set_method1.columns[df_test_set_method1.columns.str.endswith('0.95')]
lower_bound_cols = df_test_set_method1.columns[df_test_set_method1.columns.str.endswith('0.05')]


############# Multi output model
all_label_cols = df_test_set.columns[df_test_set.columns.str.contains('Demand'+" \(t\+")]
all_feature_cols = df_test_set.columns[~df_test_set.columns.str.contains('Demand'+" \(t\+")]
X_train = df_training_set[all_feature_cols].values
y_train = df_training_set[all_label_cols].values

X_test = df_test_set[all_feature_cols].values
y_test = df_test_set[all_label_cols].values


df_test_set_method2 = df_test_set.copy()


"""
Need to use a MultiOutputRegressor in combination with a GradientBoostingRegressor
as GradientBoostingRegressor does not support multi output on it's own
"""
from sklearn.multioutput import MultiOutputRegressor
t1 = datetime.now()
for alpha in [0.05, 0.5, 0.95]:
    rgr = MultiOutputRegressor(GradientBoostingRegressor(loss='quantile', alpha=alpha, random_state=123)).fit(X_train, y_train)
    pred_cols = [f'Prediction (t+{d})_{alpha}' for d in range(1,y_test.shape[1]+1)]
    df_test_set_method2.loc[:,pred_cols] = rgr.predict(X_test)
    
print(f"Time to train method 2: {datetime.now()-t1}")

Time to train method 2: 0:03:13.404533


"""
plot the predictions of a random set of 7 days
"""
def plot_method_2_output():
    fig, ax = plt.subplots()
    random_date_range = df_test_set_method2.index[(df_test_set_method2.index >random_date-timedelta(days=20)) & (df_test_set_method2.index <=random_date)]
    predicted_date_range = df_test_set_method2.index[(df_test_set_method2.index >random_date) & (df_test_set_method2.index <=random_date+timedelta(days = n_days_ahead_to_predict))]
    ax_multi_output_model = df_test_set_method2.loc[random_date_range,'Demand'].plot(label = 'Data')
    ax_multi_output_model = df_test_set_method2.loc[predicted_date_range,'Demand'].plot(label = 'Actual')

    median_cols = df_test_set_method2.columns[df_test_set_method2.columns.str.endswith('0.5')]
    upper_bound_cols = df_test_set_method2.columns[df_test_set_method2.columns.str.endswith('0.95')]
    lower_bound_cols = df_test_set_method2.columns[df_test_set_method2.columns.str.endswith('0.05')]

    ax_multi_output_model.plot(predicted_date_range,df_test_set_method2.loc[random_date,median_cols], label = 'Median Predicted')
    ax_multi_output_model.plot(predicted_date_range,df_test_set_method2.loc[random_date,upper_bound_cols], label = 'Upper bound')
    ax_multi_output_model.plot(predicted_date_range,df_test_set_method2.loc[random_date,lower_bound_cols], label = 'Lower Bound')

    ax_multi_output_model.fill_between(predicted_date_range, df_test_set_method2.loc[random_date,lower_bound_cols].values.astype(np.float64),df_test_set_method2.loc[random_date,upper_bound_cols].values.astype(np.float64), interpolate=True, color='skyblue', alpha=0.5)
    ax_multi_output_model.set_title(f'Multi Output Forecasting Model using Gradient Boosting')
    ax_multi_output_model.legend()
    return ax_multi_output_model
plot_method_2_output()

<AxesSubplot:title={'center':'Multi Output Forecasting Model using Gradient Boosting'}>


def evaluate_methods(df, nth_day, method_number, median_cols, upper_bound_cols, lower_bound_cols, plot=True):
    """
    I chose 3 different evaluation metrics for the entire test set: 
        1. mean bound width (90% confidence interval)
        2. percentage of days where actual demand is outside of these bounds
        3. mean MAPE (using the median value as the predicted values)
    """
    actual_demand_n_days_ahead = df.loc[:,df.columns.str.contains(f'Demand \(t\+{nth_day}\)')]
    median_predicted_n_days_ahead = df[median_cols].loc[:,df[median_cols].columns.str.contains(f't\+{nth_day}\)')]

    upper_bound_predicted_n_days_ahead = df[upper_bound_cols].loc[:,df[upper_bound_cols].columns.str.contains(f't\+{nth_day}\)')]
    lower_bound_predicted_n_days_ahead = df[lower_bound_cols].loc[:,df[lower_bound_cols].columns.str.contains(f't\+{nth_day}\)')]
    #plt.fill_between(predicted_date_range, df_test_set_method2.loc[random_date,lower_bound_cols].values.astype(np.float64),df_test_set_method2.loc[random_date,upper_bound_cols].values.astype(np.float64), interpolate=True, color='skyblue', alpha=0.5)

    if plot == True:
        ax = actual_demand_n_days_ahead.plot()
        median_predicted_n_days_ahead.plot(ax=ax)
        upper_bound_predicted_n_days_ahead.plot(ax=ax)
        lower_bound_predicted_n_days_ahead.plot(ax=ax)
        plt.title(f'Prediction Method {method_number} at day {nth_day}')

    actual_data_above_upper_bound = (actual_demand_n_days_ahead.values>upper_bound_predicted_n_days_ahead.values).sum()
    actual_data_below_lower_bound = (actual_demand_n_days_ahead.values<lower_bound_predicted_n_days_ahead.values).sum()

    pctage_of_days_outside_of_bound =  (actual_data_above_upper_bound+actual_data_below_lower_bound)/len(actual_demand_n_days_ahead)
    
    mean_bound_width = np.mean(upper_bound_predicted_n_days_ahead.values - lower_bound_predicted_n_days_ahead.values)
    
    mean_mape = mape(actual_demand_n_days_ahead.values,median_predicted_n_days_ahead.values )
    return mean_bound_width, pctage_of_days_outside_of_bound, mean_mape


marker_size = 50
fig, ax1 = plt.subplots()
fig, ax2 = plt.subplots()
fig, ax3 = plt.subplots()
for n in range(1,n_days_ahead_to_predict+1):

    mean_bound_width_method1, pctage_of_days_outside_of_bound_method1, mean_mape_method1 = evaluate_methods(df_test_set_method1, n, 1, median_cols, upper_bound_cols, lower_bound_cols, plot=False)
    mean_bound_width_method2, pctage_of_days_outside_of_bound_method2, mean_mape_method2 = evaluate_methods(df_test_set_method2, n, 2, median_cols, upper_bound_cols, lower_bound_cols, plot=False)
    
    if n ==1:
        ax1.scatter(n, mean_bound_width_method1, c='b', marker = 'x', s=marker_size, label = 'Method 1 - Mean Bound Width')
        ax1.scatter(n, mean_bound_width_method2, c='r', marker = 'o', s=marker_size, label = 'Method 2 - Mean Bound Width')

        ax2.scatter(n, pctage_of_days_outside_of_bound_method1, c='b', marker = 'x', s=marker_size, label = 'Method 1 - % outside of bounds')
        ax2.scatter(n, pctage_of_days_outside_of_bound_method2, c='r', marker = 'o', s=marker_size, label = 'Method 2 - % outside of bounds')
        
        ax3.scatter(n, mean_mape_method1, c='b', marker = 'x', s=marker_size, label = 'Method 1 - Mean MAPE')
        ax3.scatter(n, mean_mape_method2, c='r', marker = 'o', s=marker_size, label = 'Method 2 - Mean MAPE')
    else:
        ax1.scatter(n, mean_bound_width_method1, c='b', marker = 'x' ,s=marker_size)
        ax1.scatter(n, mean_bound_width_method2, c='r', marker = 'o',s=marker_size)

        ax2.scatter(n, pctage_of_days_outside_of_bound_method1, c='b', marker = 'x',s=marker_size)
        ax2.scatter(n, pctage_of_days_outside_of_bound_method2, c='r', marker = 'o',s=marker_size)
        
        ax3.scatter(n, mean_mape_method1, c='b', marker = 'x', s=marker_size)
        ax3.scatter(n, mean_mape_method2, c='r', marker = 'o', s=marker_size)
        
ax1.legend()
ax2.legend()
ax3.legend()

<matplotlib.legend.Legend at 0x231351a73d0>

	Demand
Date
2009-08-02	472.683428
2014-12-31	308.646637
2022-07-13	413.161290

	Demand	max_air_temp	min_air_temp	prcp_amt	mean_wind_speed	Holiday	Month	DayOfYear	DayOfWeek
2007-01-01	NaN	9.3	2.9	6.6	8.260870	1	1	1	0
2009-08-02	NaN	19.9	9.6	1.5	4.041667	0	8	214	6
2014-12-31	NaN	6.4	-2.0	0.0	4.041667	0	12	365	2

Water Demand Forecasting¶

EDA¶

Machine Learning training¶

Method 1¶

Method 2¶