papermill_step2   A
last analyzed

Complexity

Total Complexity 0

Size/Duplication

Total Lines 164
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 0
eloc 93
dl 0
loc 164
rs 10
c 0
b 0
f 0
1
# ---
2
# jupyter:
3
#   jupytext:
4
#     cell_metadata_json: true
5
#     formats: ipynb,py:percent
6
#     notebook_metadata_filter: language_info
7
#     text_representation:
8
#       extension: .py
9
#       format_name: percent
10
#       format_version: '1.3'
11
#       jupytext_version: 1.5.2
12
#   kernelspec:
13
#     display_name: Python 3
14
#     language: python
15
#     name: python3
16
#   language_info:
17
#     codemirror_mode:
18
#       name: ipython
19
#       version: 3
20
#     file_extension: .py
21
#     mimetype: text/x-python
22
#     name: python
23
#     nbconvert_exporter: python
24
#     pygments_lexer: ipython3
25
#     version: 3.8.4
26
# ---
27
28
# %% {"tags": ["parameters"]}
# Default notebook parameters.
# The "parameters" cell tag lets papermill inject overrides at execution time.
run_date = '2018-11-18'  # anchor date (YYYY-MM-DD) for the analysis window
source_id = 'sensor1'    # identifier of the source whose daily CSVs are loaded
nb_days = 32             # number of days of history to read, ending at run_date
34
35
# %%
36
from datetime import datetime, timedelta
37
import matplotlib.pyplot as plt
38
import numpy as np
39
import itertools
40
import os
41
import pandas as pd
42
from pylab import rcParams
43
import scrapbook as sb
44
import statsmodels.api as sm
45
46
# turn off interactive plotting to avoid double plotting
47
plt.ioff()
48
49
# %%
# Load up to nb_days daily CSVs for this source, walking backwards from run_date.
# Files live under <data_dir>/<YYYY-MM>/<YYYY-MM-DD>-<source_id>.csv.
data_dir = "../data/input/step1"
data = None  # stays None when no file at all could be loaded
run_datetime = datetime.strptime(run_date, '%Y-%m-%d')

# Accumulate frames in a list and concatenate once at the end: calling
# pd.concat inside the loop copies the growing frame each iteration (quadratic).
frames = []
for i in range(nb_days):
    deltatime = run_datetime - timedelta(i)
    month_partition = deltatime.strftime("%Y-%m")  # monthly partition directory
    delta = deltatime.strftime('%Y-%m-%d')
    file = os.path.join(data_dir, month_partition, delta + "-" + source_id + ".csv")
    if os.path.exists(file):
        print("Loading " + file)
        frames.append(pd.read_csv(file))
    else:
        # Missing days are tolerated (best effort): report and keep going.
        print("Failed to load " + file)
if frames:
    data = pd.concat(frames)
67
68
69
# %%
# Quick sanity check of the concatenated raw data (None if no files were found).
print(data)

# %%
# Parse the 'date' column. pd.to_datetime with an explicit format is vectorized
# (C level) and replaces the row-by-row datetime.strptime apply; the resulting
# datetime64 column also indexes cleanly for the asfreq() call later on.
data['date'] = pd.to_datetime(data['date'], format="%Y-%m-%d %H:%M:%S")
print(data['date'].describe())
data.describe()
76
77
78
# %%
# Re-index the frame on its timestamp column and regularise it onto a fixed
# 5-minute grid (timestamps absent from the data become NaN rows).
data = (
    data
    .sort_values('date')
    .set_index('date', drop=True)
    .asfreq(freq="5min")
)
data.head(5)
82
83
# %% [markdown]
84
# https://towardsdatascience.com/an-end-to-end-project-on-time-series-analysis-and-forecasting-with-python-4835e6bf050b
85
#
86
#
87
# https://www.statsmodels.org/dev/generated/statsmodels.tsa.seasonal.seasonal_decompose.html
88
89
# %%
90
rcParams['figure.figsize'] = 18, 8
91
decomposition = sm.tsa.seasonal_decompose(data['mydata'], model='additive', freq=288)
92
fig = decomposition.plot()
93
94
95
# %%
# Build the (p, d, q) grid for the SARIMAX search: each order component in {0, 1}.
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
# Seasonal orders reuse the same grid with a fixed seasonal period of 12
# (was recomputing list(itertools.product(p, d, q)) a second time for no reason).
# NOTE(review): the data is on a 5-minute grid and the decomposition above uses
# a daily period of 288 — confirm 12 is the intended seasonal period here.
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in pdq]

print('Examples of parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))
105
106
# %%
# Grid-search every (order, seasonal_order) combination and record its AIC.
scores = {
    "AIC": [],
    "param": [],
    "param_seasonal": []
}
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(
                data['mydata'],
                order=param,
                seasonal_order=param_seasonal,
                # Relax constraints so every grid point can be evaluated.
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            results = mod.fit()
        except Exception as err:
            # Some combinations fail to converge or are invalid for the data;
            # skip them, but say so. (The previous bare `except:` also trapped
            # KeyboardInterrupt/SystemExit and hid every failure silently.)
            print('ARIMA{}x{} failed: {}'.format(param, param_seasonal, err))
            continue
        scores['AIC'].append(results.aic)
        scores['param'].append(param)
        scores['param_seasonal'].append(param_seasonal)
        print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
scores = pd.DataFrame.from_dict(scores)
scores.sort_values('AIC').head(5)
131
132
# %%
# Refit the best (lowest-AIC) combination from the grid search and print its
# coefficient table. `best` is the winning row as a numpy array:
# [AIC, param, param_seasonal].
best = scores.sort_values('AIC').head(1).values[0]
mod = sm.tsa.statespace.SARIMAX(data['mydata'],
                                order=best[1],
                                seasonal_order=best[2],
                                # Use the same constraints as the grid search so the
                                # refit model is the one whose AIC actually won
                                # (the search ran with enforce_stationarity=False;
                                # refitting with True scored a different model).
                                enforce_stationarity=False,
                                enforce_invertibility=False)
results = mod.fit()
print(results.summary().tables[1])
141
142
# %%
143
results.plot_diagnostics(figsize=(16, 8))
144
plt.show()
145
146
# %%
# One-step-ahead in-sample prediction starting one day before run_date, plotted
# against the last three days of observations with the confidence band shaded.
pred = results.get_prediction(start=(run_datetime - timedelta(1)), dynamic=False)
pred_ci = pred.conf_int()

fig, ax = plt.subplots()
observed = data[data.index > (run_datetime - timedelta(3))]['mydata']
ax.plot(observed, label='observed')
ax.plot(pred.predicted_mean, label='One-step ahead Forecast', alpha=.7)
lower = pred_ci.iloc[:, 0]
upper = pred_ci.iloc[:, 1]
ax.fill_between(pred_ci.index, lower, upper, color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('mydata')
title = 'Results of ARIMA{}x{}12 - AIC:{} on {}'.format(best[1], best[2], round(best[0]), run_date)
ax.set(title=title)
fig.legend()
# Publish the figure via scrapbook for downstream notebook consumers.
sb.glue('arima_results_fig', fig, display=True)
161
162
# %%
163
pred.save("../data/output/step2/prediction_model_" + run_date + "-" + source_id)
164
165
# %%
166
167
168
169