Passed
Push — master ( 5927b1...ccc1cd )
by Christophe
01:43 queued 38s
created

papermill_step2   A

Complexity

Total Complexity 0

Size/Duplication

Total Lines 156
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 0
eloc 91
dl 0
loc 156
rs 10
c 0
b 0
f 0
1
# ---
2
# jupyter:
3
#   jupytext:
4
#     text_representation:
5
#       extension: .py
6
#       format_name: percent
7
#       format_version: '1.1'
8
#       jupytext_version: 0.8.5
9
#   kernelspec:
10
#     display_name: Python 3
11
#     language: python
12
#     name: python3
13
#   language_info:
14
#     codemirror_mode:
15
#       name: ipython
16
#       version: 3
17
#     file_extension: .py
18
#     mimetype: text/x-python
19
#     name: python
20
#     nbconvert_exporter: python
21
#     pygments_lexer: ipython3
22
#     version: 3.6.6
23
# ---
24
25
# %% {"tags": ["parameters"]}
26
# Our default parameters
27
# This cell has a "parameters" tag, means that it defines the parameters for use in the notebook
28
run_date = '2018-11-18'
29
source_id = 'sensor1'
30
nb_days = 32
31
32
# %%
33
import numpy as np
34
import pandas as pd
35
import papermill as pm
36
import matplotlib.pyplot as plt
37
from datetime import datetime, timedelta
38
import os
39
import statsmodels.api as sm
40
from pylab import rcParams
41
import itertools
42
43
# turn off interactive plotting to avoid double plotting
44
plt.ioff()
45
46
# %%
47
data_dir = "../data/input/step1" 
48
data = None
49
run_datetime = datetime.strptime(run_date, '%Y-%m-%d')
50
for i in range(nb_days):    
51
    deltatime = run_datetime - timedelta(i)
52
    month_partition = deltatime.strftime("%Y-%m")
53
    delta = datetime.strftime(deltatime, '%Y-%m-%d')    
54
    file = os.path.join(data_dir, month_partition, delta + "-" + source_id + ".csv")
55
    if os.path.exists(file):
56
        print("Loading " + file)
57
        new = pd.read_csv(file)
58
        if data is not None:
59
            data = pd.concat([data, new])
60
        else:
61
            data = new
62
63
64
# %%
65
data['date'] = data['date'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
66
print(data['date'].describe())
67
data.describe()
68
69
70
# %%
71
data = data.sort_values('date').set_index('date', drop=True)
72
data = data.asfreq(freq="5min")
73
data.head(5)
74
75
# %% [markdown]
76
# https://towardsdatascience.com/an-end-to-end-project-on-time-series-analysis-and-forecasting-with-python-4835e6bf050b
77
#
78
#
79
# https://www.statsmodels.org/dev/generated/statsmodels.tsa.seasonal.seasonal_decompose.html
80
81
# %%
82
rcParams['figure.figsize'] = 18, 8
83
decomposition = sm.tsa.seasonal_decompose(data['mydata'], model='additive', freq=288)
84
fig = decomposition.plot()
85
86
87
# %%
88
p = d = q = range(0, 2)
89
pdq = list(itertools.product(p, d, q))
90
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
91
92
print('Examples of parameter combinations for Seasonal ARIMA...')
93
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
94
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
95
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
96
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))
97
98
# %%
99
scores = {
100
    "AIC" : [],
101
    "param" : [],
102
    "param_seasonal" : []
103
}
104
for param in pdq:
105
    for param_seasonal in seasonal_pdq:
106
        try:
107
            mod = sm.tsa.statespace.SARIMAX(
108
                data['mydata'],
109
                order=param,
110
                seasonal_order=param_seasonal,
111
                enforce_stationarity=False,
112
                enforce_invertibility=False
113
            )
114
            results = mod.fit()
115
            scores['AIC'].append(results.aic)
116
            scores['param'].append(param)
117
            scores['param_seasonal'].append(param_seasonal)
118
            print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
119
        except:
120
            continue
121
scores = pd.DataFrame.from_dict(scores)
122
scores.sort_values('AIC').head(5)
123
124
# %%
125
best = scores.sort_values('AIC').head(1).values[0]
126
mod = sm.tsa.statespace.SARIMAX(data['mydata'],
127
                                order=best[1],
128
                                seasonal_order=best[2],
129
                                enforce_stationarity=True,
130
                                enforce_invertibility=False)
131
results = mod.fit()
132
print(results.summary().tables[1])
133
134
# %%
135
results.plot_diagnostics(figsize=(16, 8))
136
plt.show()
137
138
# %%
139
pred = results.get_prediction(start=(run_datetime - timedelta(1)), dynamic=False)
140
pred_ci = pred.conf_int()
141
142
fig, ax = plt.subplots()
143
ax.plot(data[data.index > (run_datetime - timedelta(3))]['mydata'], label='observed')
144
ax.plot(pred.predicted_mean, label='One-step ahead Forecast', alpha=.7)
145
ax.fill_between(pred_ci.index,
146
                pred_ci.iloc[:, 0],
147
                pred_ci.iloc[:, 1], color='k', alpha=.2)
148
ax.set_xlabel('Date')
149
ax.set_ylabel('mydata')
150
ax.set(title='Results of ARIMA{}x{}12 - AIC:{} on {}'.format(best[1], best[2], round(best[0]), run_date))
151
fig.legend()
152
pm.display('arima_results_fig', fig)
153
154
# %%
155
pred.save("../data/output/step2/prediction_model_" + run_date + "-" + source_id)
156
157
# %%
158
159
160
161