# Copyright 2015 Quantopian, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import (
    ABCMeta,
    abstractmethod,
)
import bcolz
import json
import os
from bcolz import ctable
from datetime import datetime
import numpy as np
from numpy import float64
from os.path import join
import pandas as pd
from pandas import read_csv
from six import with_metaclass

from zipline.finance.trading import TradingEnvironment
from zipline.utils import tradingcalendar

MINUTES_PER_DAY = 390

_writer_env = TradingEnvironment()

METADATA_FILENAME = 'metadata.json'


def write_metadata(directory, first_trading_day):
    metadata_path = os.path.join(directory, METADATA_FILENAME)

    metadata = {
        'first_trading_day': str(first_trading_day.date())
    }

    with open(metadata_path, 'w') as fp:
        json.dump(metadata, fp)


class BcolzMinuteBarWriter(with_metaclass(ABCMeta)):
    """
    Class capable of writing minute OHLCV data to disk into bcolz format.
    """
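    # Concrete writers (e.g. the DataFrame- and CSV-backed subclasses below)
    # only need to supply gen_frames(); write() drives everything else. A
    # purely hypothetical in-memory subclass, for illustration only:
    #
    #     class DictMinuteBarWriter(BcolzMinuteBarWriter):
    #         def __init__(self, frames, first_trading_day):
    #             # frames: {asset_id: DataFrame indexed by UTC minute}
    #             self._frames = frames
    #             self._first_trading_day = first_trading_day
    #
    #         def gen_frames(self, assets):
    #             for asset in assets:
    #                 yield asset, self._frames[asset]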
    @property
    def first_trading_day(self):
        return self._first_trading_day

    @abstractmethod
    def gen_frames(self, assets):
        """
        Return an iterator of pairs of (asset_id, pd.DataFrame).
        """
        raise NotImplementedError()

    def write(self, directory, assets, sid_path_func=None):
        _iterator = self.gen_frames(assets)

        return self._write_internal(directory, _iterator,
                                    sid_path_func=sid_path_func)

    @staticmethod
    def full_minutes_for_days(env, dt1, dt2):
        start_date = env.normalize_date(dt1)
        end_date = env.normalize_date(dt2)

        all_minutes = []

        for day in env.days_in_range(start_date, end_date):
            minutes_in_day = pd.date_range(
                start=pd.Timestamp(
                    datetime(
                        year=day.year,
                        month=day.month,
                        day=day.day,
                        hour=9,
                        minute=31),
                    tz='US/Eastern').tz_convert('UTC'),
                periods=MINUTES_PER_DAY,
                freq="min"
            )

            all_minutes.append(minutes_in_day)

        # flatten
        return pd.DatetimeIndex(
            np.concatenate(all_minutes), copy=False, tz='UTC'
        )

    def _write_internal(self, directory, iterator, sid_path_func=None):
        first_trading_day = self.first_trading_day

        write_metadata(directory, first_trading_day)

        first_open = pd.Timestamp(
            datetime(
                year=first_trading_day.year,
                month=first_trading_day.month,
                day=first_trading_day.day,
                hour=9,
                minute=31
            ), tz='US/Eastern').tz_convert('UTC')

        for asset_id, df in iterator:
            if sid_path_func is None:
                path = join(directory, "{0}.bcolz".format(asset_id))
            else:
                path = sid_path_func(directory, asset_id)

            os.makedirs(path)

            minutes = self.full_minutes_for_days(_writer_env,
                                                 first_open, df.index[-1])
            minutes_count = len(minutes)

            dt_col = np.zeros(minutes_count, dtype=np.uint32)
            open_col = np.zeros(minutes_count, dtype=np.uint32)
            high_col = np.zeros(minutes_count, dtype=np.uint32)
            low_col = np.zeros(minutes_count, dtype=np.uint32)
            close_col = np.zeros(minutes_count, dtype=np.uint32)
            vol_col = np.zeros(minutes_count, dtype=np.uint32)

            for row in df.iterrows():
                dt = row[0]
                idx = minutes.searchsorted(dt)

                dt_col[idx] = dt.value / 1e9
                open_col[idx] = row[1].loc["open"]
                high_col[idx] = row[1].loc["high"]
                low_col[idx] = row[1].loc["low"]
                close_col[idx] = row[1].loc["close"]
                vol_col[idx] = row[1].loc["volume"]

            ctable(
                columns=[
                    open_col,
                    high_col,
                    low_col,
                    close_col,
                    vol_col,
                    dt_col
                ],
                names=[
                    "open",
                    "high",
                    "low",
                    "close",
                    "volume",
                    "dt"
                ],
                rootdir=path,
                mode='w'
            )


class MinuteBarWriterFromDataFrames(BcolzMinuteBarWriter):
    _csv_dtypes = {
        'open': float64,
        'high': float64,
        'low': float64,
        'close': float64,
        'volume': float64,
    }

    def __init__(self, first_trading_day):
        self._first_trading_day = first_trading_day

    def gen_frames(self, assets):
        for asset in assets:
            df = assets[asset]
            yield asset, df.set_index("minute")


class MinuteBarWriterFromCSVs(BcolzMinuteBarWriter):
    """
    BcolzMinuteBarWriter constructed from a map of CSVs to assets.

    Parameters
    ----------
    asset_map: dict
        A map from asset_id -> path to csv with data for that asset.

    CSVs should have the following columns:
        minute : datetime64
        open : float64
        high : float64
        low : float64
        close : float64
        volume : int64
    """
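    # A minimal usage sketch; the paths, sids, and date below are purely
    # illustrative and not part of this module:
    #
    #     asset_map = {1: '/tmp/minutes/1.csv', 2: '/tmp/minutes/2.csv'}
    #     writer = MinuteBarWriterFromCSVs(
    #         asset_map,
    #         first_trading_day=pd.Timestamp('2002-01-02', tz='UTC'),
    #     )
    #     writer.write('/tmp/minute_bcolz', assets=[1, 2])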
    _csv_dtypes = {
        'open': float64,
        'high': float64,
        'low': float64,
        'close': float64,
        'volume': float64,
    }

    def __init__(self, asset_map, first_trading_day):
        self._asset_map = asset_map
        self._first_trading_day = first_trading_day

    def gen_frames(self, assets):
        """
        Read CSVs as DataFrames from our asset map.
        """
        dtypes = self._csv_dtypes

        for asset in assets:
            path = self._asset_map.get(asset)
            if path is None:
                raise KeyError("No path supplied for asset %s" % asset)
            df = read_csv(path, parse_dates=['minute'], dtype=dtypes)
            df = df.set_index("minute").tz_localize("UTC")

            yield asset, df


class BcolzMinuteBarReader(object):

    def __init__(self, rootdir, sid_path_func=None):
        self.rootdir = rootdir

        metadata = self._get_metadata()

        self.first_trading_day = pd.Timestamp(
            metadata['first_trading_day'], tz='UTC')
        mask = tradingcalendar.trading_days.slice_indexer(
            self.first_trading_day)
        # TODO: Read/write calendar to match daily, so that calendar is not
        # 'hardcoded'.
        self.trading_days = tradingcalendar.trading_days[mask]
        self._sid_path_func = sid_path_func

        self._carrays = {
            'open': {},
            'high': {},
            'low': {},
            'close': {},
            'volume': {},
            'sid': {},
            'dt': {},
        }

    def _get_metadata(self):
        with open(os.path.join(self.rootdir, METADATA_FILENAME)) as fp:
            return json.load(fp)

    def _get_ctable(self, asset):
        sid = int(asset)
        if self._sid_path_func is not None:
            path = self._sid_path_func(self.rootdir, sid)
        else:
            path = "{0}/{1}.bcolz".format(self.rootdir, sid)

        return bcolz.open(path, mode='r')

    def _find_position_of_minute(self, minute_dt):
        """
        Internal method that returns the position of the given minute in the
        list of every trading minute since market open of the first trading
        day.

        IMPORTANT: This method assumes every day is 390 minutes long, even
        early closes. Our minute bcolz files are generated like this to
        support fast lookup.

        ex. this method would return 1 for 1/2/2002 9:32 AM Eastern, if
        1/2/2002 is the first trading day of the dataset.

        Parameters
        ----------
        minute_dt: pd.Timestamp
            The minute whose position should be calculated.

        Returns
        -------
        The position of the given minute in the list of all trading minutes
        since market open on the first trading day.
        """
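        # Worked example under the fixed 390-minutes-per-day layout described
        # above: 9:45 AM Eastern on the third trading day of the data
        # (day_idx == 2) is 14 minutes past the 9:31 open, so its position is
        # 390 * 2 + 14 == 794.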
        day = minute_dt.date()
        day_idx = self.trading_days.searchsorted(day)
        if day_idx < 0:
            return -1

        day_open = pd.Timestamp(
            datetime(
                year=day.year,
                month=day.month,
                day=day.day,
                hour=9,
                minute=31),
            tz='US/Eastern').tz_convert('UTC')

        minutes_offset = int((minute_dt - day_open).total_seconds()) / 60

        return int((MINUTES_PER_DAY * day_idx) + minutes_offset)

    def _open_minute_file(self, field, asset):
        sid_str = str(int(asset))

        try:
            carray = self._carrays[field][sid_str]
        except KeyError:
            carray = self._carrays[field][sid_str] = \
                self._get_ctable(asset)[field]

        return carray