LSTM for streamflow prediction
In [2]: import numpy as np
import pandas as pd
import torch
from [Link] import Dataset
from torch import nn
import numpy as np
from [Link] import DataLoader
import [Link] as plt
# tutorial used for inspiration: [Link]
Define input variables, define input data
into Pandas DataFrame and normalize
data
In [20]: # define target dataset to predict and other variables for model
target = 'WL'
sequence_length = 365
batch_size = 512
learning_rate = 0.1
num_hidden_units = 20
epochs = 50
epochs_to_save = [1, 2, 3, 5, 10, 50]
train_start = '2015-01-01' # starts 1 year later due to padding in figure
train_end = '2020-12-31'
# for reproducibility figure
test_start = '2015-01-01' # starts 1 year later due to padding in figure
test_end = '2020-12-31'
# for prediction plot in training data
start_prediction = '2015-01-01' # starts 1 year later due to padding in figur
end_prediction = '2020-12-31'
In [3]: rf = pd.read_csv('[Link]', parse_dates=['Time'], index_col='Time'
q = pd.read_csv('[Link]', parse_dates=['Time'], index_col='Time')
data = [Link](q)
data = [Link](columns=['Hour', 'BAHADURABAD'])
data[target] = data[target].fillna(data[target].min())
In [26]: [Link][-1]
Out[26]: Timestamp('2020-12-31 [Link]+0000', tz='UTC')
In [4]: df = data
In [5]: # plot feature and taget data
fig, ax = [Link](nrows=6, ncols=1, figsize=(20,17))
[Link](ax=ax,subplots=True);
y_labels = ['Precipitation [mm/day', 'Maximum air temperature [°C]',
'Minimum air temperature [°C]', 'Vapour pressure [Pa]',
'Solar radiation [W/m2]', 'Discharge [m3/d]']
ax[0].set_title(f'Feature and target (discharge) data');
for i, iax in enumerate(ax):
iax.set_ylabel(y_labels[i])
[Link]().remove()
if iax is ax[-1]:
iax.set_xlabel('Date [days]')
else:
iax.set_xlabel('')
In [6]: # divide dataset in training and test set
features = list([Link]([target]))
df_train = [Link][train_start:train_end].copy()
df_test = [Link][test_start:test_end].copy()
df_predict = [Link][start_prediction:end_prediction].copy()
# print fraction the test set is of all the used data
print('Test set fraction: ', len(df_test) / (len(df_train) + len(df_test)))
# normalize data
target_mean = df_train[target].mean()
target_stdev = df_train[target].std()
for col in df_train.columns:
mean = df_train[col].mean()
stdev = df_train[col].std()
df_train[col] = (df_train[col] - mean) / stdev
df_test[col] = (df_test[col] - mean) / stdev
df_predict[col] = (df_predict[col] - mean) / stdev
Test set fraction: 0.5
Classes and functions to make LSTM
Class for PyTorch dataloader
In [7]: class TimeSeriesDataset(Dataset):
def __init__(self, dataframe, target, features, sequence_length=5):
print('data shape',[Link])
# save which are the features and target data and sequence length
[Link] = features
[Link] = target
self.sequence_length = sequence_length
# load data from dataframe into tensor
y = [Link](dataframe[[Link]].values).float()
X = [Link](dataframe[[Link]].values).float()
# remove padding from data and save data in class
self.y = y[sequence_length:]
self.X = X[sequence_length:,:]
# make extra dataseries for padding
[Link] = y[:sequence_length*2]
[Link] = X[:sequence_length*2,:]
self.sequence_length = sequence_length
def __len__(self):
# return length shape
return [Link][0]
def __getitem__(self, i):
# select data from the dataset without padding and return data
if i > self.sequence_length:
Xr = self.X[i-self.sequence_length:i,:]
yr = self.y[i-1]
return Xr , yr
# select from the padding dataset and return data
else:
Xr = [Link][i:i+self.sequence_length,:]
yr = [Link][i+self.sequence_length - 1]
return Xr, yr
Images about LSTM and class for LSTM model
In [8]: class LSTM([Link]):
def __init__(self, num_features, hidden_units):
super().__init__()
# save needed info to initialize layers
self.num_features = num_features
self.hidden_units = hidden_units
self.num_layers = 1
# first LSTM layer of model
self.lstm1 = [Link](input_size=num_features,
hidden_size=hidden_units,
batch_first=True,
num_layers=self.num_layers)
# second LSTM layer of model
self.lstm2 = [Link](input_size=hidden_units,
hidden_size=hidden_units,
batch_first=True,
num_layers=self.num_layers)
# dropout layer
[Link] = [Link](p=0.1)
# linear layer
[Link] = [Link](in_features=self.hidden_units, out_features=1
def forward(self, x):
batch_size = [Link][0]
# initialize c0 and h0
c0 = [Link](self.num_layers, batch_size,
self.hidden_units).requires_grad_()
h0 = [Link](self.num_layers, batch_size,
self.hidden_units).requires_grad_()
# load data into layers as written in the paper and return the output
# data
x, (_, _) = self.lstm1(x, (h0, c0))
x = [Link](x)
_, (ht,_) = self.lstm2(x, (h0, c0))
out = [Link](ht[0]).flatten()
return out
Functions to train the model, test the model and predict
with the model (functions used below are from the
tutorial)
In [21]: def train_model(data_loader, model, loss_function, optimizer):
# initialize variables
num_batches = len(data_loader)
total_loss = 0
# iterate through data from dataloader and run model and calculate loss
[Link]()
for X, y in data_loader:
output = model(X)
# print("input : output : act", [Link], [Link], [Link] )
loss = loss_function(output, y)
optimizer.zero_grad()
[Link]()
[Link]()
total_loss += [Link]()
avg_loss = total_loss / num_batches
print(f'Train loss: {avg_loss}')
return avg_loss
def test_model(data_loader, model, loss_function):
# initialize variables
num_batches = len(data_loader)
total_loss = 0
# iterate through data from dataloader and test model and calculate loss
[Link]()
with torch.no_grad():
for X, y in data_loader:
output = model(X)
total_loss += loss_function(output, y).item()
avg_loss = total_loss / num_batches
print(f'Test loss: {avg_loss}')
return avg_loss
def predict(data_loader, model):
# initiaze output data
output = [Link]([])
# iterate through data from data loader and make prediction
[Link]()
with torch.no_grad():
for X, _ in data_loader:
y_pred = model(X)
output = [Link]((output, y_pred), 0)
return output
Make train and test datasets using the
above TimeSeriesDataset class and
PyTorch DataLoader class
In [23]: def make_model(df_train, df_test, target, features, sequence_length, batch_si
num_hidden_units, learning_rate):
# make train and test dataset
train_dataset = TimeSeriesDataset(df_train,
target=target,
features=features,
sequence_length=sequence_length)
test_dataset = TimeSeriesDataset(df_test,
target=target,
features=features,
sequence_length=sequence_length)
# load dataset in DataLoader
train_loader = DataLoader(train_dataset,
batch_size=batch_size,
shuffle=True)
test_loader = DataLoader(test_dataset,
batch_size=batch_size,
shuffle=False)
train_eval_loader = DataLoader(train_dataset,
batch_size=batch_size,
shuffle=False)
# check if the features and targets are made accordingly with the right s
# and sizes(trust us, they are)
X, y = next(iter(train_loader))
print('Features shape:', [Link])
print('Target shape:', [Link])
# initialize model
model = LSTM(num_features=len(features), hidden_units=num_hidden_units)
loss_function = [Link]()
optimizer = [Link]([Link](), lr=learning_rate,)
return train_loader, test_loader, train_eval_loader, model, loss_function
Run model to reproduce hydrograph
from paper
In [ ]:
In [24]: # make model with needed objects to make prediction
train_loader, test_loader, train_eval_loader, model, loss_function, optimizer
make_model(df_train, df_test, target, features, sequence_length, batch_si
num_hidden_units, learning_rate)
data shape (2192, 6)
data shape (2192, 6)
Features shape: [Link]([512, 365, 5])
Target shape: [Link]([512])
In [12]: df_train
Out[12]: W730 W840 W1060 W1290 W1520 WL
Time
2015-01-01
-0.539357 -0.553056 -0.664890 -0.618936 -0.611800 -1.202521
[Link]+00:00
2015-01-02
-0.539357 0.871736 -0.647604 -0.618936 -0.272422 -1.202521
[Link]+00:00
2015-01-03
-0.501376 2.317129 -0.331862 -0.567792 0.044648 -1.202521
[Link]+00:00
2015-01-04
-0.002564 -0.353223 0.190200 -0.274484 -0.554082 -1.202521
[Link]+00:00
2015-01-05
-0.358497 -0.601264 -0.664890 -0.491635 -0.611800 -1.202521
[Link]+00:00
... ... ... ... ... ... ...
2020-12-27
-0.539357 -0.601264 -0.664890 -0.618936 -0.611800 -0.766822
[Link]+00:00
2020-12-28
-0.539357 -0.601264 -0.664890 -0.618936 -0.611800 -0.781591
[Link]+00:00
2020-12-29
-0.539357 -0.601264 -0.664890 -0.618936 -0.611800 -0.792668
[Link]+00:00
2020-12-30
-0.539357 -0.601264 -0.664890 -0.618936 -0.611800 -0.792668
[Link]+00:00
2020-12-31
-0.539357 -0.601264 -0.664890 -0.618936 -0.611800 -0.792668
[Link]+00:00
2192 rows × 6 columns
In [13]: for x, y in train_loader:
print([Link], [Link])
[Link](y)
break
[Link]([512, 365, 5]) [Link]([512])
In [25]: # run untrained test
print('Untrained test\n--------')
test_loss = test_model(test_loader, model, loss_function)
test_loss_epochs = [i for i in range(1,epochs+1)]
test_loss_epochs.insert(0,'Random weigths')
print()
# add prediction with untrained test to list and define the epochs we
# want to save
test_predictions = [predict(test_loader, model).numpy()]
# # train model and calculate training and test loss
for epoch_i in range(1,epochs+1):
print(f'Epoch {epoch_i}\n-----')
# epoch
train_loss = train_model(train_loader, model, loss_function, optimizer=op
test_loss = test_model(test_loader, model, loss_function)
# predict with model if we want the prediciton of that epoch
if epoch_i in epochs_to_save:
test_predictions.append(predict(test_loader, model).numpy())
print()
Untrained test
--------
Test loss: 1.1015282273292542
Epoch 1
-----
Train loss: 0.691096194088459
Test loss: 0.34186528995633125
Epoch 2
-----
Train loss: 0.2744169943034649
Test loss: 0.18098129332065582
Epoch 3
-----
Train loss: 0.1807936504483223
Test loss: 0.13457677513360977
Epoch 4
-----
Train loss: 0.17126549407839775
Test loss: 0.15358439274132252
Epoch 5
-----
Train loss: 0.14124586805701256
Test loss: 0.09722915291786194
Epoch 6
-----
Train loss: 0.10843918099999428
Test loss: 0.11747029051184654
Epoch 7
-----
Train loss: 0.09564841352403164
Test loss: 0.08832251373678446
Epoch 8
-----
Train loss: 0.07941098511219025
Test loss: 0.08081972133368254
Epoch 9
-----
Train loss: 0.07603576220571995
Test loss: 0.06189220305532217
Epoch 10
-----
Train loss: 0.06176964845508337
Test loss: 0.06323724891990423
Epoch 11
-----
Train loss: 0.057312702760100365
Test loss: 0.06580948363989592
Epoch 12
-----
Train loss: 0.06235352158546448
Test loss: 0.08148911595344543
Epoch 13
-----
Train loss: 0.08701557479798794
Test loss: 0.06657484639436007
Epoch 14
-----
Train loss: 0.0685692522674799
Test loss: 0.05807188153266907
Epoch 15
-----
Train loss: 0.06320013292133808
Test loss: 0.05849305912852287
Epoch 16
-----
Train loss: 0.059312948025763035
Test loss: 0.04643315635621548
Epoch 17
-----
Train loss: 0.0473786648362875
Test loss: 0.04054863564670086
Epoch 18
-----
Train loss: 0.039496853947639465
Test loss: 0.03358832001686096
Epoch 19
-----
Train loss: 0.03696837928146124
Test loss: 0.042382373474538326
Epoch 20
-----
Train loss: 0.03853471204638481
Test loss: 0.04651129525154829
Epoch 21
-----
Train loss: 0.04689805116504431
Test loss: 0.031654486898332834
Epoch 22
-----
Train loss: 0.041970094200223684
Test loss: 0.032593599520623684
Epoch 23
-----
Train loss: 0.04380444251000881
Test loss: 0.04070279514417052
Epoch 24
-----
Train loss: 0.0469727274030447
Test loss: 0.034582564141601324
Epoch 25
-----
Train loss: 0.03291946602985263
Test loss: 0.02794454852119088
Epoch 26
-----
Train loss: 0.03019129764288664
Test loss: 0.02578092087060213
Epoch 27
-----
Train loss: 0.02792856330052018
Test loss: 0.024922854732722044
Epoch 28
-----
Train loss: 0.02494414569810033
Test loss: 0.02187663083896041
Epoch 29
-----
Train loss: 0.02483906550332904
Test loss: 0.01955495635047555
Epoch 30
-----
Train loss: 0.02175398776307702
Test loss: 0.02047473704442382
Epoch 31
-----
Train loss: 0.02241233643144369
Test loss: 0.01830902020446956
Epoch 32
-----
Train loss: 0.02072978811338544
Test loss: 0.02043104451149702
Epoch 33
-----
Train loss: 0.021412630565464497
Test loss: 0.018795558251440525
Epoch 34
-----
Train loss: 0.020462784450501204
Test loss: 0.019370753783732653
Epoch 35
-----
Train loss: 0.020967957098037004
Test loss: 0.01704179123044014
Epoch 36
-----
Train loss: 0.019973588176071644
Test loss: 0.018898777663707733
Epoch 37
-----
Train loss: 0.018547479063272476
Test loss: 0.018438390223309398
Epoch 38
-----
Train loss: 0.01985554164275527
Test loss: 0.01616179570555687
Epoch 39
-----
Train loss: 0.016931472811847925
Test loss: 0.0164082667324692
Epoch 40
-----
Train loss: 0.018010655418038368
Test loss: 0.015737075358629227
Epoch 41
-----
Train loss: 0.01703188964165747
Test loss: 0.015757250133901834
Epoch 42
-----
Train loss: 0.017022629966959357
Test loss: 0.016437577083706856
Epoch 43
-----
Train loss: 0.01748147141188383
Test loss: 0.015140331350266933
Epoch 44
-----
Train loss: 0.01613947213627398
Test loss: 0.016928663477301598
Epoch 45
-----
Train loss: 0.017461356706917286
Test loss: 0.01551959477365017
Epoch 46
-----
Train loss: 0.01764287194237113
Test loss: 0.014674758305773139
Epoch 47
-----
Train loss: 0.0158578185364604
Test loss: 0.012765411986038089
Epoch 48
-----
Train loss: 0.019019876839593053
Test loss: 0.014482288621366024
Epoch 49
-----
Train loss: 0.018121839500963688
Test loss: 0.013502749614417553
Epoch 50
-----
Train loss: 0.016163869760930538
Test loss: 0.011864851461723447
In [26]: # put prediction data in dataframe
epoch_list = [f'epoch {str(e)}' for e in epochs_to_save]
epoch_list.insert(0,'Random weights')
model_output_data_dict = dict(zip(epoch_list, test_predictions))
df_out = [Link](index=df_test.index[sequence_length:], data=model_outpu
for col in df_out.columns:
df_out[col] = df_out[col] * target_stdev + target_mean
# make sexy plot
random_weights = 'Random weights'
epoch_cols = df_out.[Link]([random_weights])
fig,ax = [Link](figsize=(15,10))
df[target][test_start:test_end].iloc[sequence_length:].plot(ax=ax, label='Obs
df_out[random_weights].plot(ax=ax, alpha=0.8, color='Grey')
df_out[epoch_cols].plot(ax=ax, alpha=0.8, colormap='YlOrRd')
ax.set_xlabel('Date [day]')
ax.set_ylabel('Discharge [mm / day]')
ax.set_title('Reproduced Evolution of hydrograph over number of training epoc
[Link](loc='upper right');
Make model for a prediction and with
losses
In [27]: # make model with needed objects to make prediction
train_loader, test_loader, train_eval_loader, model, loss_function, optimizer
make_model(df_train, df_predict, target, features, sequence_length, batch
num_hidden_units, learning_rate)
data shape (2192, 6)
data shape (2192, 6)
Features shape: [Link]([512, 365, 5])
Target shape: [Link]([512])
In [28]: # run untrained test and make loss lists
print('Untrained test\n--------')
train_losses = []
train_loss_epochs = [i for i in range(1,epochs+1)]
test_losses = [test_model(test_loader, model, loss_function)]
test_loss_epochs = [i for i in range(0,epochs+1)]
print()
# add prediction with untrained test to list and define the epochs we
# want to save
test_predictions = [predict(test_loader, model).numpy()]
# # train model and calculate training and test loss
for epoch_i in range(1,epochs+1):
print(f'Epoch {epoch_i}\n-----')
# epoch
train_loss = train_model(train_loader, model, loss_function, optimizer=op
test_loss = test_model(test_loader, model, loss_function)
train_losses.append(train_loss)
test_losses.append(test_loss)
Untrained test
--------
Test loss: 1.1022280901670456
Epoch 1
-----
Train loss: 0.8048611804842949
Test loss: 0.32835526764392853
Epoch 2
-----
Train loss: 0.31855907291173935
Test loss: 0.18016764894127846
Epoch 3
-----
Train loss: 0.18531503155827522
Test loss: 0.1625302042812109
Epoch 4
-----
Train loss: 0.13459753058850765
Test loss: 0.136846661567688
Epoch 5
-----
Train loss: 0.11209174245595932
Test loss: 0.10650548338890076
Epoch 6
-----
Train loss: 0.10110868886113167
Test loss: 0.07836340926587582
Epoch 7
-----
Train loss: 0.07752809673547745
Test loss: 0.0718389330431819
Epoch 8
-----
Train loss: 0.07289406843483448
Test loss: 0.07362210191786289
Epoch 9
-----
Train loss: 0.06482424587011337
Test loss: 0.06276528630405664
Epoch 10
-----
Train loss: 0.05806788895279169
Test loss: 0.049016037955880165
Epoch 11
-----
Train loss: 0.04903759714215994
Test loss: 0.043455285020172596
Epoch 12
-----
Train loss: 0.04204488452523947
Test loss: 0.03448483208194375
Epoch 13
-----
Train loss: 0.03607444278895855
Test loss: 0.02939627692103386
Epoch 14
-----
Train loss: 0.03259630734100938
Test loss: 0.028220731299370527
Epoch 15
-----
Train loss: 0.028714860789477825
Test loss: 0.023154616355895996
Epoch 16
-----
Train loss: 0.02813507243990898
Test loss: 0.023941838182508945
Epoch 17
-----
Train loss: 0.024062695913016796
Test loss: 0.021737852599471807
Epoch 18
-----
Train loss: 0.02226743008941412
Test loss: 0.019107633735984564
Epoch 19
-----
Train loss: 0.021037204191088676
Test loss: 0.01835397514514625
Epoch 20
-----
Train loss: 0.02024810668081045
Test loss: 0.0200261774007231
Epoch 21
-----
Train loss: 0.020070136059075594
Test loss: 0.01953724818304181
Epoch 22
-----
Train loss: 0.02081514662131667
Test loss: 0.01919945585541427
Epoch 23
-----
Train loss: 0.01884919498115778
Test loss: 0.01677158777602017
Epoch 24
-----
Train loss: 0.018627350218594074
Test loss: 0.015540724154561758
Epoch 25
-----
Train loss: 0.019436991773545742
Test loss: 0.015352707589045167
Epoch 26
-----
Train loss: 0.018692481331527233
Test loss: 0.015250000404193997
Epoch 27
-----
Train loss: 0.015896357595920563
Test loss: 0.01411851984448731
Epoch 28
-----
Train loss: 0.016760543920099735
Test loss: 0.01842940622009337
Epoch 29
-----
Train loss: 0.01684057293459773
Test loss: 0.013957456452772021
Epoch 30
-----
Train loss: 0.016454364405944943
Test loss: 0.013928697677329183
Epoch 31
-----
Train loss: 0.016560723539441824
Test loss: 0.01557435910217464
Epoch 32
-----
Train loss: 0.014777921373024583
Test loss: 0.013648539315909147
Epoch 33
-----
Train loss: 0.014464801410213113
Test loss: 0.014617457520216703
Epoch 34
-----
Train loss: 0.015246885130181909
Test loss: 0.012015167623758316
Epoch 35
-----
Train loss: 0.014583155512809753
Test loss: 0.013940806966274977
Epoch 36
-----
Train loss: 0.01460384065285325
Test loss: 0.012950338423252106
Epoch 37
-----
Train loss: 0.014014459913596511
Test loss: 0.0116625907830894
Epoch 38
-----
Train loss: 0.01460436056368053
Test loss: 0.011680243071168661
Epoch 39
-----
Train loss: 0.013286851812154055
Test loss: 0.012641430599614978
Epoch 40
-----
Train loss: 0.014244649093598127
Test loss: 0.01214320631697774
Epoch 41
-----
Train loss: 0.013054462848231196
Test loss: 0.012477455427870154
Epoch 42
-----
Train loss: 0.013976335059851408
Test loss: 0.01212047878652811
Epoch 43
-----
Train loss: 0.013530088821426034
Test loss: 0.012281304458156228
Epoch 44
-----
Train loss: 0.01295680203475058
Test loss: 0.012196385068818927
Epoch 45
-----
Train loss: 0.012649151729419827
Test loss: 0.01216863957233727
Epoch 46
-----
Train loss: 0.012619383865967393
Test loss: 0.01372487423941493
Epoch 47
-----
Train loss: 0.014892063103616238
Test loss: 0.012613677885383368
Epoch 48
-----
Train loss: 0.014035939937457442
Test loss: 0.012619436485692859
Epoch 49
-----
Train loss: 0.013463571667671204
Test loss: 0.011700624600052834
Epoch 50
-----
Train loss: 0.012781243771314621
Test loss: 0.011239474173635244
In [30]: # predict data with model
y_pred_train = predict(train_eval_loader, model).numpy()
y_pred_test = predict(test_loader, model).numpy()
# make placeholder for model output data
y_pred_col = 'LSTM model discharge prediction'
# load data in dataframe
df_train_out = [Link](index=df_train.index[sequence_length:],
data={target:df_train[target][sequence_length:],
y_pred_col:y_pred_train})
df_test_out = [Link](index=df_predict.index[sequence_length:],
data={target:df_predict[target][sequence_length:],
y_pred_col:y_pred_test})
# de-normalize data
for col in df_train_out.columns:
df_train_out[col] = df_train_out[col] * target_stdev + target_mean
for col in df_test_out.columns:
df_test_out[col] = df_test_out[col] * target_stdev + target_mean
# calculate RMSE for both datasets
RMSE_test = ((df_test_out[target] - df_test_out[y_pred_col]) ** 2).mean() **
RMSE_train = ((df_train_out[target] - df_train_out[y_pred_col]) ** 2).mean()
# plot
fig,ax = [Link](figsize=(20,10))
df_train_out.plot(ax=ax)
ax.set_ylabel('Discharge m3/d');
ax.set_xlabel('Date [days]');
ax.set_title(f'Training data model evaluation measured discharge and LSTM mod
[Link]()
# plot
fig,ax = [Link](figsize=(20,10))
df_test_out.plot(ax=ax)
ax.set_ylabel('Discharge m3/d');
ax.set_xlabel('Date [days]');
ax.set_title(f'Testing data model evaluation measured discharge and LSTM mode
[Link]()
/var/folders/q0/nf8282h91h168d4l7tk8w_mr0000gn/T/ipykernel_48439/307
[Link]: UserWarning: Matplotlib is currently using module://ma
tplotlib_inline.backend_inline, which is a non-GUI backend, so canno
t show the figure.
[Link]()
/var/folders/q0/nf8282h91h168d4l7tk8w_mr0000gn/T/ipykernel_48439/307
[Link]: UserWarning: Matplotlib is currently using module://ma
tplotlib_inline.backend_inline, which is a non-GUI backend, so canno
t show the figure.
[Link]()
In [42]: [Link](figsize=(15,10))
[Link](train_loss_epochs, train_losses, label='Training losses')
[Link](test_loss_epochs, test_losses, label='Testing losses')
[Link](f'Losses with training data from {train_start} to {train_end} and t
[Link]('Loss')
[Link]('Epochs')
[Link](loc='upper right');
In [ ]:
In [ ]: