Completed
Pull Request — master (#867)
by Joe
02:09
created

tests.risk.ensure_latest_answer_key()   B

Complexity

Conditions 6

Size

Total Lines 48

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 6
dl 0
loc 48
rs 7.6529
1
#
2
# Copyright 2014 Quantopian, Inc.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
import datetime
16
import hashlib
17
import os
18
19
import numpy as np
20
import pandas as pd
21
import pytz
22
import xlrd
23
import requests
24
25
from six.moves import map
26
27
28
def col_letter_to_index(col_letter):
29
    # Only supports single letter,
30
    # but answer key doesn't need multi-letter, yet.
31
    index = 0
32
    for i, char in enumerate(reversed(col_letter)):
33
        index += ((ord(char) - 65) + 1) * pow(26, i)
34
    return index
35
36
DIR = os.path.dirname(os.path.realpath(__file__))
37
38
ANSWER_KEY_CHECKSUMS_PATH = os.path.join(DIR, 'risk-answer-key-checksums')
39
ANSWER_KEY_CHECKSUMS = open(ANSWER_KEY_CHECKSUMS_PATH, 'r').read().splitlines()
40
41
ANSWER_KEY_FILENAME = 'risk-answer-key.xlsx'
42
43
ANSWER_KEY_PATH = os.path.join(DIR, ANSWER_KEY_FILENAME)
44
45
ANSWER_KEY_BUCKET_NAME = 'zipline-test_data'
46
47
ANSWER_KEY_DL_TEMPLATE = """
48
https://s3.amazonaws.com/zipline-test-data/risk/{md5}/risk-answer-key.xlsx
49
""".strip()
50
51
LATEST_ANSWER_KEY_URL = ANSWER_KEY_DL_TEMPLATE.format(
52
    md5=ANSWER_KEY_CHECKSUMS[-1])
53
54
55
def answer_key_signature():
56
    with open(ANSWER_KEY_PATH, 'rb') as f:
57
        md5 = hashlib.md5()
58
        buf = f.read(1024)
59
        md5.update(buf)
60
        while buf != b"":
61
            buf = f.read(1024)
62
            md5.update(buf)
63
    return md5.hexdigest()
64
65
66
def ensure_latest_answer_key():
67
    """
68
    Get the latest answer key from a publically available location.
69
70
    Logic for determining what and when to download is as such:
71
72
    - If there is no local spreadsheet file, then get the lastest answer key,
73
    as defined by the last row in the checksum file.
74
    - If there is a local spreadsheet file:
75
    -- If the spreadsheet's checksum is in the checksum file:
76
    --- If the spreadsheet's checksum does not match the latest, then grab the
77
    the latest checksum and replace the local checksum file.
78
    --- If the spreadsheet's checksum matches the latest, then skip download,
79
    and use the local spreadsheet as a cached copy.
80
    -- If the spreadsheet's checksum is not in the checksum file, then leave
81
    the local file alone, assuming that the local xls's md5 is not in the list
82
    due to local modifications during development.
83
84
    It is possible that md5's could collide, if that is ever case, we should
85
    then find an alternative naming scheme.
86
87
    The spreadsheet answer sheet is not kept in SCM, as every edit would
88
    increase the repo size by the file size, since it is treated as a binary.
89
    """
90
91
    answer_key_dl_checksum = None
92
93
    local_answer_key_exists = os.path.exists(ANSWER_KEY_PATH)
94
    if local_answer_key_exists:
95
        local_hash = answer_key_signature()
96
97
        if local_hash in ANSWER_KEY_CHECKSUMS:
98
            # Assume previously downloaded version.
99
            # Check for latest.
100
            if local_hash != ANSWER_KEY_CHECKSUMS[-1]:
101
                # More recent checksum, download
102
                answer_key_dl_checksum = ANSWER_KEY_CHECKSUMS[-1]
103
            else:
104
                # Assume local copy that is being developed on
105
                answer_key_dl_checksum = None
106
    else:
107
        answer_key_dl_checksum = ANSWER_KEY_CHECKSUMS[-1]
108
109
    if answer_key_dl_checksum:
110
        res = requests.get(
111
            ANSWER_KEY_DL_TEMPLATE.format(md5=answer_key_dl_checksum))
112
        with open(ANSWER_KEY_PATH, 'wb') as f:
113
            f.write(res.content)
114
115
# Get latest answer key on load.
116
ensure_latest_answer_key()
117
118
119
class DataIndex(object):
120
    """
121
    Coordinates for the spreadsheet, using the values as seen in the notebook.
122
    The python-excel libraries use 0 index, while the spreadsheet in a GUI
123
    uses a 1 index.
124
    """
125
    def __init__(self, sheet_name, col, row_start, row_end,
126
                 value_type='float'):
127
        self.sheet_name = sheet_name
128
        self.col = col
129
        self.row_start = row_start
130
        self.row_end = row_end
131
        self.value_type = value_type
132
133
    @property
134
    def col_index(self):
135
        return col_letter_to_index(self.col) - 1
136
137
    @property
138
    def row_start_index(self):
139
        return self.row_start - 1
140
141
    @property
142
    def row_end_index(self):
143
        return self.row_end - 1
144
145
    def __str__(self):
146
        return "'{sheet_name}'!{col}{row_start}:{col}{row_end}".format(
147
            sheet_name=self.sheet_name,
148
            col=self.col,
149
            row_start=self.row_start,
150
            row_end=self.row_end
151
        )
152
153
154
class AnswerKey(object):
155
156
    INDEXES = {
157
        'RETURNS': DataIndex('Sim Period', 'D', 4, 255),
158
159
        'BENCHMARK': {
160
            'Dates': DataIndex('s_p', 'A', 4, 254, value_type='date'),
161
            'Returns': DataIndex('s_p', 'H', 4, 254)
162
        },
163
164
        # Below matches the inconsistent capitalization in spreadsheet
165
        'BENCHMARK_PERIOD_RETURNS': {
166
            'Monthly': DataIndex('s_p', 'R', 8, 19),
167
            '3-Month': DataIndex('s_p', 'S', 10, 19),
168
            '6-month': DataIndex('s_p', 'T', 13, 19),
169
            'year': DataIndex('s_p', 'U', 19, 19),
170
        },
171
172
        'BENCHMARK_PERIOD_VOLATILITY': {
173
            'Monthly': DataIndex('s_p', 'V', 8, 19),
174
            '3-Month': DataIndex('s_p', 'W', 10, 19),
175
            '6-month': DataIndex('s_p', 'X', 13, 19),
176
            'year': DataIndex('s_p', 'Y', 19, 19),
177
        },
178
179
        'ALGORITHM_PERIOD_RETURNS': {
180
            'Monthly': DataIndex('Sim Period', 'Z', 23, 34),
181
            '3-Month': DataIndex('Sim Period', 'AA', 25, 34),
182
            '6-month': DataIndex('Sim Period', 'AB', 28, 34),
183
            'year': DataIndex('Sim Period', 'AC', 34, 34),
184
        },
185
186
        'ALGORITHM_PERIOD_VOLATILITY': {
187
            'Monthly': DataIndex('Sim Period', 'AH', 23, 34),
188
            '3-Month': DataIndex('Sim Period', 'AI', 25, 34),
189
            '6-month': DataIndex('Sim Period', 'AJ', 28, 34),
190
            'year': DataIndex('Sim Period', 'AK', 34, 34),
191
        },
192
193
        'ALGORITHM_PERIOD_SHARPE': {
194
            'Monthly': DataIndex('Sim Period', 'AL', 23, 34),
195
            '3-Month': DataIndex('Sim Period', 'AM', 25, 34),
196
            '6-month': DataIndex('Sim Period', 'AN', 28, 34),
197
            'year': DataIndex('Sim Period', 'AO', 34, 34),
198
        },
199
200
        'ALGORITHM_PERIOD_BETA': {
201
            'Monthly': DataIndex('Sim Period', 'AP', 23, 34),
202
            '3-Month': DataIndex('Sim Period', 'AQ', 25, 34),
203
            '6-month': DataIndex('Sim Period', 'AR', 28, 34),
204
            'year': DataIndex('Sim Period', 'AS', 34, 34),
205
        },
206
207
        'ALGORITHM_PERIOD_ALPHA': {
208
            'Monthly': DataIndex('Sim Period', 'AT', 23, 34),
209
            '3-Month': DataIndex('Sim Period', 'AU', 25, 34),
210
            '6-month': DataIndex('Sim Period', 'AV', 28, 34),
211
            'year': DataIndex('Sim Period', 'AW', 34, 34),
212
        },
213
214
        'ALGORITHM_PERIOD_BENCHMARK_VARIANCE': {
215
            'Monthly': DataIndex('Sim Period', 'BJ', 23, 34),
216
            '3-Month': DataIndex('Sim Period', 'BK', 25, 34),
217
            '6-month': DataIndex('Sim Period', 'BL', 28, 34),
218
            'year': DataIndex('Sim Period', 'BM', 34, 34),
219
        },
220
221
        'ALGORITHM_PERIOD_COVARIANCE': {
222
            'Monthly': DataIndex('Sim Period', 'BF', 23, 34),
223
            '3-Month': DataIndex('Sim Period', 'BG', 25, 34),
224
            '6-month': DataIndex('Sim Period', 'BH', 28, 34),
225
            'year': DataIndex('Sim Period', 'BI', 34, 34),
226
        },
227
228
        'ALGORITHM_PERIOD_DOWNSIDE_RISK': {
229
            'Monthly': DataIndex('Sim Period', 'BN', 23, 34),
230
            '3-Month': DataIndex('Sim Period', 'BO', 25, 34),
231
            '6-month': DataIndex('Sim Period', 'BP', 28, 34),
232
            'year': DataIndex('Sim Period', 'BQ', 34, 34),
233
        },
234
235
        'ALGORITHM_PERIOD_SORTINO': {
236
            'Monthly': DataIndex('Sim Period', 'BR', 23, 34),
237
            '3-Month': DataIndex('Sim Period', 'BS', 25, 34),
238
            '6-month': DataIndex('Sim Period', 'BT', 28, 34),
239
            'year': DataIndex('Sim Period', 'BU', 34, 34),
240
        },
241
242
        'ALGORITHM_RETURN_VALUES': DataIndex(
243
            'Sim Cumulative', 'D', 4, 254),
244
245
        'ALGORITHM_CUMULATIVE_VOLATILITY': DataIndex(
246
            'Sim Cumulative', 'P', 4, 254),
247
248
        'ALGORITHM_CUMULATIVE_SHARPE': DataIndex(
249
            'Sim Cumulative', 'R', 4, 254),
250
251
        'CUMULATIVE_DOWNSIDE_RISK': DataIndex(
252
            'Sim Cumulative', 'U', 4, 254),
253
254
        'CUMULATIVE_SORTINO': DataIndex(
255
            'Sim Cumulative', 'V', 4, 254),
256
257
        'CUMULATIVE_INFORMATION': DataIndex(
258
            'Sim Cumulative', 'AA', 4, 254),
259
260
        'CUMULATIVE_BETA': DataIndex(
261
            'Sim Cumulative', 'AD', 4, 254),
262
263
        'CUMULATIVE_ALPHA': DataIndex(
264
            'Sim Cumulative', 'AE', 4, 254),
265
266
        'CUMULATIVE_MAX_DRAWDOWN': DataIndex(
267
            'Sim Cumulative', 'AH', 4, 254),
268
269
    }
270
271
    def __init__(self):
272
        self.workbook = xlrd.open_workbook(ANSWER_KEY_PATH)
273
274
        self.sheets = {}
275
        self.sheets['Sim Period'] = self.workbook.sheet_by_name('Sim Period')
276
        self.sheets['Sim Cumulative'] = self.workbook.sheet_by_name(
277
            'Sim Cumulative')
278
        self.sheets['s_p'] = self.workbook.sheet_by_name('s_p')
279
280
        for name, index in self.INDEXES.items():
281
            if isinstance(index, dict):
282
                subvalues = {}
283
                for subkey, subindex in index.items():
284
                    subvalues[subkey] = self.get_values(subindex)
285
                setattr(self, name, subvalues)
286
            else:
287
                setattr(self, name, self.get_values(index))
288
289
    def parse_date_value(self, value):
290
        return xlrd.xldate_as_tuple(value, 0)
291
292
    def parse_float_value(self, value):
293
        return value if value != '' else np.nan
294
295
    def get_raw_values(self, data_index):
296
        return self.sheets[data_index.sheet_name].col_values(
297
            data_index.col_index,
298
            data_index.row_start_index,
299
            data_index.row_end_index + 1)
300
301
    @property
302
    def value_type_to_value_func(self):
303
        return {
304
            'float': self.parse_float_value,
305
            'date': self.parse_date_value,
306
        }
307
308
    def get_values(self, data_index):
309
        value_parser = self.value_type_to_value_func[data_index.value_type]
310
        return [value for value in
311
                map(value_parser, self.get_raw_values(data_index))]
312
313
314
ANSWER_KEY = AnswerKey()
315
316
BENCHMARK_DATES = ANSWER_KEY.BENCHMARK['Dates']
317
BENCHMARK_RETURNS = ANSWER_KEY.BENCHMARK['Returns']
318
DATES = [datetime.datetime(*x, tzinfo=pytz.UTC) for x in BENCHMARK_DATES]
319
BENCHMARK = pd.Series(dict(zip(DATES, BENCHMARK_RETURNS)))
320
ALGORITHM_RETURNS = pd.Series(
321
    dict(zip(DATES, ANSWER_KEY.ALGORITHM_RETURN_VALUES)))
322
RETURNS_DATA = pd.DataFrame({'Benchmark Returns': BENCHMARK,
323
                             'Algorithm Returns': ALGORITHM_RETURNS})
324
RISK_CUMULATIVE = pd.DataFrame({
325
    'volatility': pd.Series(dict(zip(
326
        DATES, ANSWER_KEY.ALGORITHM_CUMULATIVE_VOLATILITY))),
327
    'sharpe': pd.Series(dict(zip(
328
        DATES, ANSWER_KEY.ALGORITHM_CUMULATIVE_SHARPE))),
329
    'downside_risk': pd.Series(dict(zip(
330
        DATES, ANSWER_KEY.CUMULATIVE_DOWNSIDE_RISK))),
331
    'sortino': pd.Series(dict(zip(
332
        DATES, ANSWER_KEY.CUMULATIVE_SORTINO))),
333
    'information': pd.Series(dict(zip(
334
        DATES, ANSWER_KEY.CUMULATIVE_INFORMATION))),
335
    'alpha': pd.Series(dict(zip(
336
        DATES, ANSWER_KEY.CUMULATIVE_ALPHA))),
337
    'beta': pd.Series(dict(zip(
338
        DATES, ANSWER_KEY.CUMULATIVE_BETA))),
339
    'max_drawdown': pd.Series(dict(zip(
340
        DATES, ANSWER_KEY.CUMULATIVE_MAX_DRAWDOWN))),
341
})
342