import pandas as pd
import re
from io import StringIO
import numpy as np
import requests
import seaborn as sn
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error
from scipy import stats
from scipy.stats import ttest_ind
from datetime import datetime

def month_cols_to_rows(df, output_value_col_name):
    """
    Formats a string containing time series data to a pandas DataFrame

    This function simply wraps the ``+`` operator, and does not
    do anything interesting, except for illustrating what
    the docstring of a very simple function looks like.

    Parameters
    ----------
    num1 : int
        First number to add.
    num2 : int
        Second number to add.

    Returns
    -------
    int
        The sum of ``num1`` and ``num2``.
    """
    
    if len(df.columns) != 12:
        print('Are you sure there are 12 months (plus the year as index) present in this DataFrame?')
        return
    new_df = []
    for y in df.index:
        month_start = 0
        for row in df[df.index == y].values:
            for r in row:
                month_start = month_start++1
                new_df.append([y,month_start,r])

    new_df=pd.DataFrame(new_df)
    new_df.columns = ['Year', 'Month', output_value_col_name]
    #new_df = new_df.set_index('Year')
    
    return new_df

def format_type_1(s):
    """
    Formats a string containing time series data to a pandas DataFrame

    This function simply wraps the ``+`` operator, and does not
    do anything interesting, except for illustrating what
    the docstring of a very simple function looks like.

    Parameters
    ----------
    num1 : int
        First number to add.
    num2 : int
        Second number to add.

    Returns
    -------
    int
        The sum of ``num1`` and ``num2``.
    """
    
    s = StringIO(re.sub(' +', ',', s))

    d = pd.read_csv(s)

    # When reading a string as a CSV, pandas automatically assigns the first row as the header, 
    # we want to move that header back to the rows and recreate an appropriate header
    d.loc[len(d)] = list(d.columns)
    d.columns = np.arange(0,len(d.columns),1)
    return d


############################
r = requests.get('http://climate.rutgers.edu/snowcover/files/moncov.eurasia.txt')
eurasian_snow_cover_data = format_type_1(r.text)
eurasian_snow_cover_data.columns = ['Year', 'Month', 'Eurasian_Snow_Cover']
eurasian_snow_cover_data = eurasian_snow_cover_data.apply(pd.to_numeric, errors='coerce')
eurasian_snow_cover_data['Eurasian_Snow_Cover'] = eurasian_snow_cover_data['Eurasian_Snow_Cover']/10e6 # Sq Km to Mil Sq km
#####################


############################
# r = requests.get("https://www.metoffice.gov.uk/hadobs/hadcet/data/meantemp_daily_totals.txt")
# s = StringIO(re.sub(' +', ',', r.text)) #replace all spaces (and consecutive spaces) with commas
# CET = pd.read_csv(s)
# CET = CET.set_index('Date')
# CET['Year and Month'] = CET.index.str[:-3]
# CE_Monthly = CET.groupby('Year and Month').mean()
# CE_Monthly = CE_Monthly.reset_index()

# CE_Monthly[['Year', 'Month']] = CE_Monthly['Year and Month'].str.split('-', expand=True)
# CE_Monthly = CE_Monthly.drop(columns = 'Year and Month')
# CE_Monthly = CE_Monthly.rename(columns={'Value':'CET Mean'})
############################


############################
r = requests.get("https://origin.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/detrend.nino34.ascii.txt")
s = StringIO(re.sub(' +', ',', r.text[1:])) #replace all spaces (and consecutive spaces) with commas
Nino = pd.read_csv(s)
Nino = Nino[['YR','MON','TOTAL']]
Nino = Nino.rename(columns = {'YR':'Year','MON':'Month','TOTAL':'Nino3.4'})
Nino = Nino.apply(pd.to_numeric, errors='coerce')
############################


############################
r = requests.get("https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/index/ersst.v5.pdo.dat")
s = StringIO(re.sub(' +', ',', r.text[18:])) #replace all spaces (and consecutive spaces) with commas
PDO = pd.read_csv(s)
PDO = PDO.set_index('Year')
PDO = month_cols_to_rows(PDO, 'PDO')
PDO = PDO.apply(pd.to_numeric, errors='coerce')
############################


############################
r = requests.get("https://www.cpc.ncep.noaa.gov/data/indices/soi")
s = StringIO(re.sub(' +', ',', r.text[242:-6485].replace('-999.9',' '))) #replace all spaces (and consecutive spaces) with commas

SOI = pd.read_csv(s)
SOI = SOI.set_index('YEAR')
SOI = SOI.drop(columns = ['Unnamed: 13'])
SOI = month_cols_to_rows(SOI, 'SOI')
SOI = SOI.apply(pd.to_numeric, errors='coerce')
############################


############################
sunspots = pd.read_json("https://services.swpc.noaa.gov/json/solar-cycle/observed-solar-cycle-indices.json")
sunspots = sunspots[['time-tag','ssn']]
sunspots[['Year', 'Month']]=sunspots['time-tag'].str.split('-', expand=True)
sunspots = sunspots.drop(columns = ['time-tag'])
sunspots = sunspots.rename(columns={'ssn': 'Sunspots'})
sunspots = sunspots.apply(pd.to_numeric, errors='coerce')


r = requests.get("https://www.cpc.ncep.noaa.gov/data/indices/qbo.u50.index")
s = StringIO(re.sub(' +', ',', r.text[243:4765].replace('-999.9',' ')))
QBO = pd.read_csv(s)
QBO = QBO.drop(columns = ['Unnamed: 13'])
QBO = QBO.set_index('YEAR')
QBO = month_cols_to_rows(QBO, 'QBO')
QBO = QBO.apply(pd.to_numeric, errors='coerce')


#####################
r = requests.get("https://psl.noaa.gov/data/correlation/amon.us.data")
AMO = format_type_1(r.text[21:-135])
AMO = AMO.set_index(0)
AMO = month_cols_to_rows(AMO, 'AMO')
AMO = AMO.apply(pd.to_numeric, errors='coerce')
#####################


#####################
r = requests.get("https://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii")
AO = format_type_1(r.text[2:])
AO.columns = ['Year', 'Month', 'AO']
AO = AO.apply(pd.to_numeric, errors='coerce')
#####################


#####################
r = requests.get("https://www.cpc.ncep.noaa.gov/products/precip/CWlink/pna/norm.nao.monthly.b5001.current.ascii")
NAO = format_type_1(r.text[2:])
NAO.columns = ['Year', 'Month', 'NAO']
NAO = NAO.apply(pd.to_numeric, errors='coerce')
#####################


#####################
r = requests.get("https://www.cpc.ncep.noaa.gov/products/precip/CWlink/pna/norm.pna.monthly.b5001.current.ascii")
PNA = format_type_1(r.text[2:])
PNA.columns = ['Year', 'Month', 'PNA']
PNA = PNA.apply(pd.to_numeric, errors='coerce')
#####################


############################
r = requests.get("https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Rainfall/date/England.txt")
s = StringIO(re.sub(' +', ',', r.text[264:]))
UK_rainfall = pd.read_csv(s)
UK_rainfall = UK_rainfall.drop(columns = ['win','spr','sum','aut','ann'])
UK_rainfall = UK_rainfall.set_index('year')
UK_rainfall = month_cols_to_rows(UK_rainfall, 'UK_rainfall')
UK_rainfall = UK_rainfall.apply(pd.to_numeric, errors='coerce')
############################


############################
r = requests.get("https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmean/date/England.txt")
s = StringIO(re.sub(' +', ',', r.text[259:]))
UK_mean_temp = pd.read_csv(s)
UK_mean_temp = UK_mean_temp.drop(columns = ['win','spr','sum','aut','ann'])
UK_mean_temp = UK_mean_temp.set_index('year')
UK_mean_temp = month_cols_to_rows(UK_mean_temp, 'UK_mean_temp')
UK_mean_temp = UK_mean_temp.apply(pd.to_numeric, errors='coerce')
############################


features_df = pd.merge(eurasian_snow_cover_data,SOI, on=['Year', 'Month'])
features_df = pd.merge(features_df,Nino, on=['Year', 'Month'])
features_df = pd.merge(features_df,PDO, on=['Year', 'Month'])
features_df = pd.merge(features_df,sunspots, on=['Year', 'Month'])
features_df = pd.merge(features_df,QBO, on=['Year', 'Month'])
features_df = pd.merge(features_df,AMO, on=['Year', 'Month'])
features_df = pd.merge(features_df,AO, on=['Year', 'Month'])
features_df = pd.merge(features_df,NAO, on=['Year', 'Month'])
features_df = pd.merge(features_df,PNA, on=['Year', 'Month'])
features_df = pd.merge(features_df,UK_rainfall, on=['Year', 'Month'])
features_df = pd.merge(features_df,UK_mean_temp, on=['Year', 'Month'])



features_df = features_df.sort_values(by=['Year', 'Month'])
features_df['Year and Month'] = pd.to_datetime(features_df['Year'].astype(str)+"-"+features_df['Month'].astype(str), format = '%Y-%m')
features_df = features_df.set_index('Year and Month')
features_df = features_df.drop(columns = ['Year', 'Month'])


features_df = features_df.replace(99.99, np.nan)
features_df = features_df.replace(-99.99, np.nan)
print(features_df.isna().sum(axis=1))
features_df = features_df.interpolate() # interpolate between points around missing values, if there's a Nan at the last row, just use the previous months value

Year and Month
1979-01-01    0
1979-02-01    0
1979-03-01    0
1979-04-01    0
1979-05-01    0
             ..
2022-07-01    0
2022-08-01    0
2022-09-01    0
2022-10-01    0
2022-11-01    0
Length: 527, dtype: int64


pd.options.mode.chained_assignment = None  # default='warn' 

def combine_winters(features_df, winter_months):
    winter_month_mask = [x in winter_months for x in features_df.index.month]
    all_winters = features_df.loc[winter_month_mask,:]
    
    
    all_winters.loc[:,'Winter Start Year'] = 0
    for ind in all_winters.index:
        if ind.month > 11:
            all_winters.loc[ind, 'Winter Start Year'] = ind.year
        elif ind.month < 3:
            all_winters.loc[ind, 'Winter Start Year'] = ind.year-1
            
    all_winter_means = all_winters.groupby('Winter Start Year').mean()['UK_mean_temp']#.values
    all_winter_means.name = 'Winter Mean Temperature'
    
    return all_winters, all_winter_means


winter_months = [12,1,2]

all_winters, all_winter_means = combine_winters(features_df, winter_months)


plt.rcParams['figure.figsize'] = [12, 10]
all_winter_means.plot(legend=True)

<AxesSubplot:xlabel='Winter Start Year'>


all_months_with_winter_means = pd.merge(features_df,all_winter_means, left_on=features_df.index.year, right_index=True)
all_months_with_winter_means = all_months_with_winter_means.drop(columns=['key_0'])

corr_mat = all_months_with_winter_means[all_months_with_winter_means.index.month==11].corr()
sn.heatmap(corr_mat, annot=True)

<AxesSubplot:>


pd.plotting.scatter_matrix(all_months_with_winter_means[all_months_with_winter_means.index.month==11], diagonal='kde')

array([[<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='SOI', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='PDO', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='QBO', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='AMO', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='AO', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='NAO', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='PNA', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='Eurasian_Snow_Cover'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='Eurasian_Snow_Cover'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='SOI'>,
        <AxesSubplot:xlabel='SOI', ylabel='SOI'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='SOI'>,
        <AxesSubplot:xlabel='PDO', ylabel='SOI'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='SOI'>,
        <AxesSubplot:xlabel='QBO', ylabel='SOI'>,
        <AxesSubplot:xlabel='AMO', ylabel='SOI'>,
        <AxesSubplot:xlabel='AO', ylabel='SOI'>,
        <AxesSubplot:xlabel='NAO', ylabel='SOI'>,
        <AxesSubplot:xlabel='PNA', ylabel='SOI'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='SOI'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='SOI'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='SOI'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='SOI', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='PDO', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='QBO', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='AMO', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='AO', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='NAO', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='PNA', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='Nino3.4'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='Nino3.4'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='PDO'>,
        <AxesSubplot:xlabel='SOI', ylabel='PDO'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='PDO'>,
        <AxesSubplot:xlabel='PDO', ylabel='PDO'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='PDO'>,
        <AxesSubplot:xlabel='QBO', ylabel='PDO'>,
        <AxesSubplot:xlabel='AMO', ylabel='PDO'>,
        <AxesSubplot:xlabel='AO', ylabel='PDO'>,
        <AxesSubplot:xlabel='NAO', ylabel='PDO'>,
        <AxesSubplot:xlabel='PNA', ylabel='PDO'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='PDO'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='PDO'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='PDO'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='SOI', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='PDO', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='QBO', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='AMO', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='AO', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='NAO', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='PNA', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='Sunspots'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='Sunspots'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='QBO'>,
        <AxesSubplot:xlabel='SOI', ylabel='QBO'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='QBO'>,
        <AxesSubplot:xlabel='PDO', ylabel='QBO'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='QBO'>,
        <AxesSubplot:xlabel='QBO', ylabel='QBO'>,
        <AxesSubplot:xlabel='AMO', ylabel='QBO'>,
        <AxesSubplot:xlabel='AO', ylabel='QBO'>,
        <AxesSubplot:xlabel='NAO', ylabel='QBO'>,
        <AxesSubplot:xlabel='PNA', ylabel='QBO'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='QBO'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='QBO'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='QBO'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='AMO'>,
        <AxesSubplot:xlabel='SOI', ylabel='AMO'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='AMO'>,
        <AxesSubplot:xlabel='PDO', ylabel='AMO'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='AMO'>,
        <AxesSubplot:xlabel='QBO', ylabel='AMO'>,
        <AxesSubplot:xlabel='AMO', ylabel='AMO'>,
        <AxesSubplot:xlabel='AO', ylabel='AMO'>,
        <AxesSubplot:xlabel='NAO', ylabel='AMO'>,
        <AxesSubplot:xlabel='PNA', ylabel='AMO'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='AMO'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='AMO'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='AMO'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='AO'>,
        <AxesSubplot:xlabel='SOI', ylabel='AO'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='AO'>,
        <AxesSubplot:xlabel='PDO', ylabel='AO'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='AO'>,
        <AxesSubplot:xlabel='QBO', ylabel='AO'>,
        <AxesSubplot:xlabel='AMO', ylabel='AO'>,
        <AxesSubplot:xlabel='AO', ylabel='AO'>,
        <AxesSubplot:xlabel='NAO', ylabel='AO'>,
        <AxesSubplot:xlabel='PNA', ylabel='AO'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='AO'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='AO'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='AO'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='NAO'>,
        <AxesSubplot:xlabel='SOI', ylabel='NAO'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='NAO'>,
        <AxesSubplot:xlabel='PDO', ylabel='NAO'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='NAO'>,
        <AxesSubplot:xlabel='QBO', ylabel='NAO'>,
        <AxesSubplot:xlabel='AMO', ylabel='NAO'>,
        <AxesSubplot:xlabel='AO', ylabel='NAO'>,
        <AxesSubplot:xlabel='NAO', ylabel='NAO'>,
        <AxesSubplot:xlabel='PNA', ylabel='NAO'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='NAO'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='NAO'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='NAO'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='PNA'>,
        <AxesSubplot:xlabel='SOI', ylabel='PNA'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='PNA'>,
        <AxesSubplot:xlabel='PDO', ylabel='PNA'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='PNA'>,
        <AxesSubplot:xlabel='QBO', ylabel='PNA'>,
        <AxesSubplot:xlabel='AMO', ylabel='PNA'>,
        <AxesSubplot:xlabel='AO', ylabel='PNA'>,
        <AxesSubplot:xlabel='NAO', ylabel='PNA'>,
        <AxesSubplot:xlabel='PNA', ylabel='PNA'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='PNA'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='PNA'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='PNA'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='SOI', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='PDO', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='QBO', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='AMO', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='AO', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='NAO', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='PNA', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='UK_rainfall'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='UK_rainfall'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='SOI', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='PDO', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='QBO', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='AMO', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='AO', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='NAO', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='PNA', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='UK_mean_temp'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='UK_mean_temp'>],
       [<AxesSubplot:xlabel='Eurasian_Snow_Cover', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='SOI', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='Nino3.4', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='PDO', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='Sunspots', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='QBO', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='AMO', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='AO', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='NAO', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='PNA', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='UK_rainfall', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='UK_mean_temp', ylabel='Winter Mean Temperature'>,
        <AxesSubplot:xlabel='Winter Mean Temperature', ylabel='Winter Mean Temperature'>]],
      dtype=object)


ax = all_months_with_winter_means['AMO'].plot(legend=True)
all_months_with_winter_means['Winter Mean Temperature'].plot(ax=ax, secondary_y=True, legend=True)

<AxesSubplot:>


#######################################################################



def euclidian(arr_a, arr_b):
    return np.sum((arr_a-arr_b)**2)**0.5


def find_analogue_years(search_months, search_year, n_analogues):
    month_mask = [x in search_months for x in features_df.index.month]
    months_to_search_for = features_df[month_mask & (features_df.index.year==search_year)]

    ############# 

    months_to_search = features_df[month_mask]
    
    
    
    years = pd.DataFrame([], columns = ['Similarity'])
    for ind, data in months_to_search.iterrows():
        if ind.year == search_year:
            continue
        
        ed = euclidian(data.values, months_to_search_for.values)

        if ind.year in years.index:
            #years.loc[ind.year]=np.mean([years.loc[ind.year].values[0], abs(row[['Nino3.4', 'AMO', 'AO', 'NAO']].values -months_to_search_for[['Nino3.4', 'AMO', 'AO', 'NAO']].values).sum()])
            years.loc[ind.year]=np.nanmean([years.loc[ind.year].values[0], ed])
            
        else:
            #years.loc[ind.year]=abs(row[['Nino3.4', 'AMO', 'AO', 'NAO']].values -months_to_search_for[['Nino3.4', 'AMO', 'AO', 'NAO']].values).sum()
            years.loc[ind.year]=ed
            
    most_similar_years = years.nsmallest(n_analogues, 'Similarity')
    return most_similar_years


def get_temperatures_of_similar_years(all_winters, most_similar_years):
    temps = []
    for y in most_similar_years.index:
        temps.append(all_winters.loc[(all_winters['Winter Start Year'] ==y)]['UK_mean_temp'].mean())
    temps = np.array(temps)
    return temps


## Finding Analogues

months = [11]

current_year = 2022


most_similar_years = find_analogue_years(months, current_year, 3)

temps = get_temperatures_of_similar_years(all_winters, most_similar_years)

plt.hist(all_winter_means)
plt.axvline(np.mean(all_winter_means), c='red')
plt.axvline(np.mean(all_winter_means)+np.std(all_winter_means), c='yellow')
plt.axvline(np.mean(all_winter_means)-np.std(all_winter_means), c='yellow')
plt.hist(temps)    


percentile = stats.percentileofscore(all_winter_means.values, np.mean(temps))
print("Prediction: this winter (as compared with the data from {date}) \nwill be in the {p}th percentile of all UK winters in the dataset".format(date = current_year, p = round(100-percentile,2)))

Prediction: this winter (as compared with the data from 2022) 
will be in the 43.18th percentile of all UK winters in the dataset


############################################################################
def evaluate_method1(all_winters, months_to_check,axs, subplotx, subploty):
    print("################")
    print("Months: "+str(months_to_check))
    predicted_based_on_analogues = []
    actual_winter_temps = []
    for i in np.arange(min(all_winters.index.year),max(all_winters.index.year),1):

        similar_years = find_analogue_years(months_to_check, i, 2)
        temps = get_temperatures_of_similar_years(all_winters, similar_years)

        predicted_based_on_analogues.append(np.nanmean(temps))
        actual_winter_temps.append(all_winter_means[all_winter_means.index==i].values[0])

    axs[subplotx,subploty].set_title("Months: "+','.join(str(i) for i in months_to_check))
    axs[subplotx,subploty].set_xlabel('Actual Winter Mean Temperatures')
    axs[subplotx,subploty].set_ylabel('Predicted Winter Mean\n Temperatures \n(based on analogue years)')
    axs[subplotx,subploty].plot(np.array(actual_winter_temps), np.array(actual_winter_temps), c='red')
    axs[subplotx,subploty].scatter(np.array(actual_winter_temps), np.array(predicted_based_on_analogues))

    corr, _ = pearsonr(np.array(actual_winter_temps), np.array(predicted_based_on_analogues))
    print("Correlation: "+ str(corr))
    print('Covariance: \n'+ str(np.cov(np.array(actual_winter_temps), np.array(predicted_based_on_analogues))))
    print('r^2:'+ str(r2_score(np.array(actual_winter_temps), np.array(predicted_based_on_analogues))))

    return

fig, axs = plt.subplots(3,2)
fig.tight_layout(pad=3.0)
months_to_check = [9,]
evaluate_method1(all_winters, months_to_check,axs, 0,0)
months_to_check = [10,]
evaluate_method1(all_winters, months_to_check,axs, 1,0)
months_to_check = [11,]
evaluate_method1(all_winters, months_to_check,axs, 2,0)
months_to_check = [9,10,11]
evaluate_method1(all_winters, months_to_check,axs, 0,1)
months_to_check = [9,10]
evaluate_method1(all_winters, months_to_check,axs, 1,1)
months_to_check = [10,11]
evaluate_method1(all_winters, months_to_check,axs, 2,1)

################
Months: [9]
Correlation: 0.2793496919412732
Covariance: 
[[1.29841024 0.28562262]
 [0.28562262 0.80515012]]
r^2:-0.18055949102460334
################
Months: [10]
Correlation: 0.015337089842354663
Covariance: 
[[1.29841024 0.01483327]
 [0.01483327 0.7204039 ]]
r^2:-0.5467480416368937
################
Months: [11]
Correlation: 0.23692215198773736
Covariance: 
[[1.29841024 0.22674542]
 [0.22674542 0.70542974]]
r^2:-0.2115039262029268
################
Months: [9, 10, 11]
Correlation: -0.4151455555482108
Covariance: 
[[ 1.29841024 -0.38786268]
 [-0.38786268  0.67226929]]
r^2:-1.1233995013352693
################
Months: [9, 10]
Correlation: -0.140836717459464
Covariance: 
[[ 1.29841024 -0.13069091]
 [-0.13069091  0.66320383]]
r^2:-0.7140313660305264
################
Months: [10, 11]
Correlation: -0.12157188134393226
Covariance: 
[[ 1.29841024 -0.10451981]
 [-0.10451981  0.56927095]]
r^2:-0.643946677369037


#### ML training with single month

def dataframe_to_training_and_test_sets(df, pct_split, months_used_for_prediction, label_col):
    
    autumn = [x in months_used_for_prediction for x in df.index.month]
    df_filtered = df[autumn]
    
    df_train = df_filtered.loc[df_filtered.index[:int(len(df_filtered.index)*pct_split)],:]
    df_test = df_filtered.loc[df_filtered.index[int(len(df_filtered.index)*pct_split):],:]
    

    y_train = df_train[label_col].round(3).values
    X_train = df_train.drop(columns = [label_col]).values

    y_test = df_test[label_col].round(3).values
    X_test = df_test.drop(columns = [label_col]).values
    return X_train,y_train, X_test, y_test, df_train, df_test

label_col = 'Winter Mean Temperature'
all_months_with_winter_means['Month'] = all_months_with_winter_means.index.month
all_months_with_winter_means_shuffled = all_months_with_winter_means.sample(frac=1, random_state=123) # going to shuffle the data so that we have some of the more recent, warmer years in the training set
X_train, y_train, X_test, y_test, df_train, df_test = dataframe_to_training_and_test_sets(all_months_with_winter_means_shuffled, 0.8, [9,10,11], label_col)


from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import model_selection
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, RepeatedKFold

from sklearn.utils import class_weight
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
def run_exps(X_train: pd.DataFrame , y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame) -> pd.DataFrame:
    '''
    Lightweight script to test many models and find winners
    :param X_train: training split
    :param y_train: training target vector
    :param X_test: test split
    :param y_test: test target vector
    :return: DataFrame of predictions
    '''
    
    dfs = []
    models = [
              ('LR', LinearRegression()), 
              ('SVR', SVR()),
              ('RFR', RandomForestRegressor()),
              ('GBR', GradientBoostingRegressor()),
                ('XGR', XGBRegressor()),
            ]
    results = []
    names = []
    scoring = ['r2']
    for name, model in models:
            kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=123)
            cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
            clf = model.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            print(name)
            print("R^2: ",r2_score(y_test, y_pred))
            results.append(cv_results)
            names.append(name)
    return 

run_exps(X_train, y_train, X_test, y_test)

LR
R^2:  0.1601504961664052
SVR
R^2:  -0.007091468448938043
RFR
R^2:  0.4165154425033264
GBR
R^2:  0.44685758860671754
XGR
R^2:  0.32658177027496804


def naive_confidence_intervals(model, X, y, alpha):
    residuals = y - model.predict(X)
    #plt.hist(residuals) # histogram indicates the errors are not normally distributed, therefore need to use the jackknife method
    ci = np.quantile(residuals, 1 - alpha)
    return ci

def jackknife_confidence_intervals(model, X_train, y_train, X_test, y_test, alpha):
    kf = KFold(n_splits=len(y_train)-1, shuffle=True, random_state=123)
    res = []
    for train_index, test_index in kf.split(X_train):
        X_train_, X_test_ = X_train[train_index], X_train[test_index]
        y_train_, y_test_ = y_train[train_index], y_train[test_index]

        model.fit(X_train_, y_train_)
        res.extend(list(y_test_ - model.predict(X_test_)))
    ci = np.quantile(res, 1 - alpha)
    return ci

def train_and_evaluate(model_type, X_train, y_train, X_test, y_test):
    model = model_type
    model.fit(X_train,y_train)
    # define model evaluation method
    
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

    scores = cross_val_score(model, X_train,y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    scores = np.abs(scores)
    print('Mean MAE: %.5f (%.8f)' % (scores.mean(), scores.std()) )

    mae = mean_absolute_error(y_test, model.predict(X_test))
    mape = mean_absolute_percentage_error(y_test, model.predict(X_test))
    
    ci = jackknife_confidence_intervals(model, X_train, y_train, X_test, y_test, 0.2) # CI at the 80th percentile
    #ci = naive_confidence_intervals(model,X_test, y_test, 0.05)
    
    r2 = r2_score(y_test, model.predict(X_test))
    print("r^2 (test set): ", r2)

    
    print("MAE (test set): ",mae)
    return model, ci

def scatter_plot_of_results(y_test, predicted_vals, interval):
    plt.scatter(y_test, predicted_vals, label='Predictions')
    plt.errorbar(y_test, predicted_vals, yerr=interval, fmt='o', label = "CI at 80th pctile")
    plt.plot(y_test, y_test, label='Perfect prediction line', c='r')
    plt.xlabel('Actual ($^\circ$C)')
    plt.ylabel('Predicted ($^\circ$C)')
    plt.title('Actual vs predicted')
    plt.legend()
    return


model, interval = train_and_evaluate(RandomForestRegressor(), X_train, y_train, X_test, y_test)
scatter_plot_of_results(y_test, model.predict(X_test), interval)

Mean MAE: 0.78609 (0.21484384)
r^2 (test set):  0.45429900014233293
MAE (test set):  0.7878688461538469


def get_forecast(percentile):
    forecasts = {0: 'Very Cold', 20: 'Colder than average', 40: 'Around Average', 60: 'Milder than average', 80: 'Very Mild',100: ""}
    keys = list(forecasts.keys())
    predicted_forecast = forecasts[[keys[k] for k in range(len(keys)) if percentile > keys[k] and  percentile < keys[k+1]][0]]
    return predicted_forecast

def winter_prediction(percentile, date):
    print("Prediction: this winter (as predicted with the latest data from {date} \nwill be in the {p}th percentile of all UK winters from the dataset\n".format(date = predicted_from_date, p = round(percentile,2)))

features_df['Month'] = features_df.index.month
predicted_from_date = features_df.index[-1]
predict_this_winter = model.predict(np.array(features_df.loc[predicted_from_date].values).reshape(1,-1))[0]
predict_this_winter

4.644899999999998


percentile = stats.percentileofscore(all_winter_means.values, predict_this_winter)
winter_prediction(percentile, predicted_from_date)
get_forecast(percentile)

Prediction: this winter (as predicted with the latest data from 2022-11-01 00:00:00 
will be in the 56.82th percentile of all UK winters from the dataset

'Around Average'


#### ML training with many months

def dataframe_with_lookback(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [(df.columns[j]+'(t-%d)' % (i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [(df.columns[j]) for j in range(n_vars)]
        else:
            names += [(df.columns[j]+'(t+%d)' % (i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg


df_shifted = dataframe_with_lookback(all_months_with_winter_means, n_in=2, n_out=1, dropnan=True)
df_shifted['Month'] = df_shifted.index.month

df_shifted = df_shifted.drop(columns = [col for col in df_shifted.columns if label_col in col and col != label_col])

df_shifted = df_shifted.sample(frac=1.0, random_state=123)
X_train, y_train, X_test, y_test, df_train, df_test = dataframe_to_training_and_test_sets(df_shifted, 0.8, [9,10,11], label_col)


run_exps(X_train, y_train, X_test, y_test)

LR
R^2:  -0.10038915887336408
SVR
R^2:  -0.21862456388972906
RFR
R^2:  0.18530770947299635
GBR
R^2:  0.36601567331815943
XGR
R^2:  0.15298225397762866


model, interval = train_and_evaluate(XGBRegressor(), X_train, y_train, X_test, y_test)
scatter_plot_of_results(y_test, model.predict(X_test), interval)

Mean MAE: 0.90439 (0.13379845)
r^2 (test set):  0.17664277208590873
MAE (test set):  0.8521753973594078


predicted_from_date = df_shifted.index[-1]
features = df_shifted.drop(columns=['Winter Mean Temperature']).loc[predicted_from_date]

predict_this_winter = model.predict(features.values.reshape(1,-1))[0]

percentile = stats.percentileofscore(all_winter_means.values, predict_this_winter)
winter_prediction(percentile, predicted_from_date)
get_forecast(percentile)

Prediction: this winter (as predicted with the latest data from 2021-09-01 00:00:00 
will be in the 36.36th percentile of all UK winters from the dataset

'Colder than average'


######## Feature Importance Section

# define the method
rfe = RFE(estimator=XGBRegressor(), n_features_to_select=5)
rfe.fit(X_train,y_train)

df_feature_selection = pd.DataFrame([], columns = ['Feature Name', 'Ranking'])
for i in range(X_train.shape[1]):
    #print('Column: %d, Name: %s,  Selected %s, Rank: %.3f' % (i,df_train.drop(columns=['Winter Mean Temperature']).columns[i],  rfe.support_[i], rfe.ranking_[i]))
    df_feature_selection = df_feature_selection.append({'Feature Name': df_train.drop(columns=['Winter Mean Temperature']).columns[i], 'Ranking': rfe.ranking_[i]}, ignore_index=True)

df_feature_selection.sort_values('Ranking')


#df_shifted = df_shifted[['UK_rainfall','NAO', 'QBO', 'AO(t-1)', 'AMO','Month', 'Winter Mean Temperature']]
df_shifted = df_shifted[['UK_rainfall','Nino3.4(t-1)', 'Sunspots(t-1)', 'SOI(t-1)', 'AMO','Month', 'Winter Mean Temperature']]
X_train, y_train, X_test, y_test, df_train, df_test = dataframe_to_training_and_test_sets(df_shifted, 0.8, [9,10,11], 'Winter Mean Temperature')


model, interval = train_and_evaluate(XGBRegressor(), X_train, y_train, X_test, y_test)
scatter_plot_of_results(y_test, model.predict(X_test), interval)

Mean MAE: 0.69935 (0.17526646)
r^2 (test set):  -0.2521115883065892
MAE (test set):  1.1058803530472976


df_shifted = dataframe_with_lookback(all_months_with_winter_means.drop(columns='Winter Mean Temperature'), n_in=2, n_out=1, dropnan=True)
df_shifted['Month'] = df_shifted.index.month


n_months_ahead = 1

df_shifted['UK_mean_temp (t+1)'] = df_shifted['UK_mean_temp']
df_shifted['UK_mean_temp (t+1)'] = df_shifted.shift(-n_months_ahead)['UK_mean_temp (t+1)']
df_shifted = df_shifted.dropna(axis=0)


X_train, y_train, X_test, y_test, df_train, df_test = dataframe_to_training_and_test_sets(df_shifted, 0.8, [9,10,11], 'UK_mean_temp (t+1)')


run_exps(X_train, y_train, X_test, y_test)

LR
R^2:  0.5209439613753628
SVR
R^2:  -0.02476782421697754
RFR
R^2:  0.6411423830554864
GBR
R^2:  0.6380541197413085
XGR
R^2:  0.6643732491398218


model, interval = train_and_evaluate(XGBRegressor(), X_train, y_train, X_test, y_test)
scatter_plot_of_results(y_test, model.predict(X_test), interval)

Mean MAE: 1.18007 (0.27404972)
r^2 (test set):  0.6657475221909956
MAE (test set):  1.106194936312162


df_shifted2 = dataframe_with_lookback(features_df, n_in=2, n_out=1, dropnan=True)
df_shifted2['Month'] = df_shifted2.index.month

df_shifted2['UK_mean_temp (t+1)'] = df_shifted2['UK_mean_temp']
df_shifted2['UK_mean_temp (t+1)'] = df_shifted2.shift(-n_months_ahead)['UK_mean_temp (t+1)']
#df_shifted2 = df_shifted2.dropna(axis=0)


predict_using_month = df_shifted2.index[-1] # get last row of dataset (should be the latest data)
df_shifted2_to_predict = df_shifted2[df_shifted2.index == predict_using_month]
df_shifted2_to_predict = df_shifted2_to_predict.drop(columns = 'UK_mean_temp (t+1)')
predict_next_month = model.predict(df_shifted2_to_predict.values.reshape(1,-1))[0]


from datetime import timedelta
print(predict_next_month, "+/-", interval)
mean_UK_temps_for_predicted_month = df_shifted2[df_shifted2.index.month ==predict_using_month.month+n_months_ahead]['UK_mean_temp']
mean_UK_temps_for_predicted_month.hist()
plt.axvline(mean_UK_temps_for_predicted_month.mean(), c='r')
plt.axvline(predict_next_month, c='g')
plt.axvline(predict_next_month+interval, c='yellow')
plt.axvline(predict_next_month-interval, c='yellow')
percentile = stats.percentileofscore(mean_UK_temps_for_predicted_month, predict_next_month)


print("Predicted forecast for month of {date} \n{fc}".format(date = str((predict_using_month+timedelta(days=31*n_months_ahead)).date())[:-3], fc= get_forecast(percentile)))

4.5014324 +/- 1.2403095245361329
Predicted forecast for month of 2022-12 
Colder than average


all_months_with_winter_means[all_months_with_winter_means.index.month==11][['AO', 'Nino3.4', 'Winter Mean Temperature']].plot.scatter(x='AO',y= 'Winter Mean Temperature')

<AxesSubplot:xlabel='AO', ylabel='Winter Mean Temperature'>


month_to_predict = 12
df_shifted_test = df_shifted2[['Eurasian_Snow_Cover', 'SOI', 'Nino3.4', 'PDO', 'Sunspots', 'QBO',
       'AMO', 'AO', 'NAO', 'PNA', 'UK_rainfall', 'UK_mean_temp', 'Month',
       'UK_mean_temp (t+1)']][df_shifted2['Month']==month_to_predict]

corr_mat = df_shifted_test.corr()
sn.heatmap(corr_mat, annot=True)
plt.show()


colder_than_average = (df_shifted_test['UK_mean_temp (t+1)'] < df_shifted_test['UK_mean_temp (t+1)'].mean())
warmer_than_average = (df_shifted_test['UK_mean_temp (t+1)'] > df_shifted_test['UK_mean_temp (t+1)'].mean())
df_shifted_test['Class'] = 0
df_shifted_test.loc[warmer_than_average,'Class'] = 0
df_shifted_test.loc[colder_than_average,'Class'] = 1
ao = df_shifted_test['AO']
month_temp_class = df_shifted_test['Class']#['UK_mean_temp (t+1)']
df_shifted_test = df_shifted_test.dropna()
print(r2_score(ao, month_temp_class))
plt.scatter(ao, month_temp_class)

-0.5169153689455155

<matplotlib.collections.PathCollection at 0x1f62fb86fa0>


from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(ao.values.reshape(-1, 1), month_temp_class.values)

print("Acc: ",clf.score(ao.values.reshape(-1, 1), month_temp_class.values))
print(f"Probability of colder than avg month {month_to_predict}: ",clf.predict_proba(ao[-1].reshape(1, -1))[0][1])

Acc:  0.6976744186046512
Probability of colder than avg month 12:  0.40535315552836126


from sklearn.decomposition import PCA
pca = PCA(n_components=1)

principalComponents = pca.fit_transform(df_shifted_test[['AO', 'NAO']].values)


clf = LogisticRegression(random_state=0).fit(principalComponents.reshape(-1, 1), month_temp_class.values)


print(clf.score(principalComponents.reshape(-1, 1), month_temp_class.values))
print(f"Probability of colder than avg month {month_to_predict}: ",clf.predict_proba(principalComponents[-1].reshape(1, -1))[0][1])

0.6744186046511628
Probability of colder than avg month 12:  0.4190134249885004


# AO daily data

r2 = requests.get("https://ftp.cpc.ncep.noaa.gov/cwlinks/norm.daily.ao.cdas.z1000.19500101_current.csv")
response = [l.split(',') for l in r2.text.split('\n')]
daily_ao = pd.DataFrame(response)
daily_ao.columns = daily_ao.iloc[0]
daily_ao = daily_ao.drop(daily_ao.index[0])
daily_ao = daily_ao[(daily_ao['day'].astype(str)=='None')==False]

daily_ao['ao_index_cdas'] = pd.to_numeric(daily_ao['ao_index_cdas'])
daily_ao['year'] = pd.to_numeric(daily_ao['year'])
daily_ao['month'] = pd.to_numeric(daily_ao['month'])
daily_ao['day'] = pd.to_numeric(daily_ao['day'])

#daily_ao['Datetime'] = daily_ao.apply(lambda x: datetime(int(x['year']), int(x['month']), int(x['day']),0,0,0), axis=1) 
daily_ao_grouped = daily_ao[daily_ao['year']==2022].groupby([daily_ao['year'],daily_ao['month']])['ao_index_cdas'].mean()


# NAO daily data
r3 = requests.get("https://ftp.cpc.ncep.noaa.gov/cwlinks/norm.daily.nao.cdas.z500.19500101_current.csv")
response = [l.split(',') for l in r3.text.split('\n')]
daily_nao = pd.DataFrame(response)
daily_nao.columns = daily_nao.iloc[0]
daily_nao = daily_nao.drop(daily_nao.index[0])
daily_nao = daily_nao[(daily_nao['day'].astype(str)=='None')==False]

daily_nao['nao_index_cdas'] = pd.to_numeric(daily_nao['nao_index_cdas'])
daily_nao['year'] = pd.to_numeric(daily_nao['year'])
daily_nao['month'] = pd.to_numeric(daily_nao['month'])
daily_nao['day'] = pd.to_numeric(daily_nao['day'])

#daily_nao['Datetime'] = daily_nao.apply(lambda x: datetime(int(x['year']), int(x['month']), int(x['day']),0,0,0), axis=1) 

daily_nao_grouped = daily_nao[daily_nao['year']==2022].groupby([daily_nao['year'],daily_nao['month']])['nao_index_cdas'].mean()


principalComponents = pca.fit_transform(np.array([daily_ao_grouped.iloc[-1], daily_nao_grouped.iloc[-1]]).reshape(1, -1))

c:\users\johnd\appdata\local\programs\python\python38\lib\site-packages\sklearn\decomposition\_pca.py:525: RuntimeWarning: invalid value encountered in true_divide
  explained_variance_ = (S**2) / (n_samples - 1)


print(f"Probability next month being older than avg: ",clf.predict_proba(principalComponents)[0][1])

Probability next month being older than avg:  0.4378278474821144


from scipy.stats import linregress
new_df = df_shifted2[(df_shifted2.index.month>11) | (df_shifted2.index.month<3)]
new_df.plot.scatter(x='AO', y='UK_mean_temp (t+1)')#[df_shifted2.index.year==2018]
slope, intercept, r_value, p_value, std_err = linregress(new_df['AO'].values, new_df['UK_mean_temp (t+1)'].values)
line = intercept + slope*new_df['AO']
stdev = np.std(line -new_df['AO'])
plt.plot(new_df['AO'], line, 'r', label='fitted line')
plt.plot(new_df['AO'], line+stdev*1.96, 'r', label='fitted line')
plt.plot(new_df['AO'], line-stdev*1.96, 'r', label='fitted line')

[<matplotlib.lines.Line2D at 0x1f6301d4ac0>]


new_df['Cooler than average'] = 0
colder_than_average = (new_df['UK_mean_temp (t+1)'] < new_df['UK_mean_temp (t+1)'].mean())
new_df.loc[colder_than_average,'Cooler than average'] = 1
dist_cta = new_df[new_df['Cooler than average'] == 0]['AO']
dist_wta = new_df[new_df['Cooler than average'] == 1]['AO']
dist_cta.hist(alpha = 0.5)
dist_wta.hist(alpha = 0.5)
print(ttest_ind(dist_cta,dist_wta, equal_var=True))

Ttest_indResult(statistic=3.542370582880197, pvalue=0.0005553922279930768)

UK Winter Temperature prediction¶

Can we use the Climate Indices data from the Autumn months to predict/estimate the mean upcoming winter temperature?¶

Method 1: Finding Analogues¶

Method 2: Machine learning¶

Method 3: ML with a lookback¶

Method 4: Using ML to predict next months mean temperature¶

Additional EDA¶

Using PCA?¶

What about using the latest months data to predict the next months temperature?¶

Is there a statistically significant difference between colder than avg Decembers that have a negative AO and a positive AO?¶

	Feature Name	Ranking
36	UK_rainfall	1
32	AMO	1
17	Sunspots(t-1)	1
15	Nino3.4(t-1)	1
14	SOI(t-1)	1
35	PNA	2
4	Sunspots(t-2)	3
7	AO(t-2)	4
16	PDO(t-1)	5
30	Sunspots	6
6	AMO(t-2)	7
3	PDO(t-2)	8
2	Nino3.4(t-2)	9
27	SOI	10
26	Eurasian_Snow_Cover	11
33	AO	12
13	Eurasian_Snow_Cover(t-1)	13
10	UK_rainfall(t-2)	14
9	PNA(t-2)	15
8	NAO(t-2)	16
28	Nino3.4	17
19	AMO(t-1)	18
22	PNA(t-1)	19
11	UK_mean_temp(t-2)	20
23	UK_rainfall(t-1)	21
31	QBO	22
0	Eurasian_Snow_Cover(t-2)	23
20	AO(t-1)	24
29	PDO	25
1	SOI(t-2)	26
5	QBO(t-2)	27
18	QBO(t-1)	28
34	NAO	29
21	NAO(t-1)	30
37	UK_mean_temp	31
24	UK_mean_temp(t-1)	32
12	Month(t-2)	33
25	Month(t-1)	34
38	Month	35