zipline.data.MinuteBarWriterFromDataFrames.gen_frames() - Code Metrics - Inspection of "Q 2.0" - quantopian/zipline - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#858)

by Eddie

created 2015-12-11 16:10 UTC

gen_frames() A

↳ Parent: zipline.data.MinuteBarWriterFromDataFrames

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	2
dl	0
loc	4
rs	10

from abc import (
    ABCMeta,
    abstractmethod,
)
import bcolz
import json
import os
from bcolz import ctable
from datetime import datetime
import numpy as np
from numpy import float64
from os.path import join
import pandas as pd
from pandas import read_csv
from six import with_metaclass

from zipline.finance.trading import TradingEnvironment
from zipline.utils import tradingcalendar

# Number of minute slots per trading day.  NOTE(review): the writer and
# reader in this module spell this value out as the literal 390 rather than
# referencing this constant.
MINUTES_PER_DAY = 390

# Trading environment shared by all writers; instantiated once when this
# module is imported and used by ``_write_internal`` to enumerate trading
# days.
_writer_env = TradingEnvironment()

# Name of the JSON file, stored in the data root directory, that records the
# first trading day (written by ``write_metadata``, read back by
# ``BcolzMinuteBarReader._get_metadata``).
METADATA_FILENAME = 'metadata.json'


def write_metadata(directory, first_trading_day):
    """
    Record the first trading day as JSON metadata inside ``directory``.

    Parameters
    ----------
    directory : str
        Directory in which the ``METADATA_FILENAME`` file is written.  The
        file is opened with mode ``'w'``, so existing content is replaced.
    first_trading_day : object exposing ``date()``
        ``str(first_trading_day.date())`` is stored under the
        ``'first_trading_day'`` key.
    """
    target = os.path.join(directory, METADATA_FILENAME)
    payload = {'first_trading_day': str(first_trading_day.date())}

    with open(target, 'w') as handle:
        handle.write(json.dumps(payload))


class BcolzMinuteBarWriter(with_metaclass(ABCMeta)):
    """
    Class capable of writing minute OHLCV data to disk into bcolz format.

    Subclasses provide the data by implementing ``gen_frames`` and must set
    ``self._first_trading_day``, which backs the ``first_trading_day``
    property.
    """
    @property
    def first_trading_day(self):
        """First trading day of the data set, as stored by the subclass."""
        return self._first_trading_day

    @abstractmethod
    def gen_frames(self, assets):
        """
        Return an iterator of pairs of (asset_id, pd.dataframe).
        """
        raise NotImplementedError()

    def write(self, directory, assets, sid_path_func=None):
        """
        Write one bcolz table per asset below ``directory``.

        Parameters
        ----------
        directory : str
            Root directory; the metadata file is written directly into it.
        assets
            Passed unchanged to ``gen_frames``.
        sid_path_func : callable, optional
            ``sid_path_func(directory, asset_id)`` returns the table path
            for an asset.  When omitted, ``<directory>/<asset_id>.bcolz`` is
            used.
        """
        frames = self.gen_frames(assets)

        return self._write_internal(directory, frames,
                                    sid_path_func=sid_path_func)

    @staticmethod
    def full_minutes_for_days(env, dt1, dt2):
        """
        Build the minute grid covering every trading day from dt1 to dt2.

        Every day returned by ``env.days_in_range`` contributes exactly 390
        one-minute stamps starting at 9:31 US/Eastern, so a day's minutes
        always occupy a fixed-width slice of the result.

        Parameters
        ----------
        env
            Object providing ``normalize_date`` and ``days_in_range``.
        dt1, dt2
            Bounds of the range; both are normalized through ``env`` first.

        Returns
        -------
        pd.DatetimeIndex
            All minutes of all days, in UTC.
        """
        start_date = env.normalize_date(dt1)
        end_date = env.normalize_date(dt2)

        all_minutes = []

        for day in env.days_in_range(start_date, end_date):
            # Build the open in Eastern time so the UTC offset follows DST.
            day_open = pd.Timestamp(
                datetime(
                    year=day.year,
                    month=day.month,
                    day=day.day,
                    hour=9,
                    minute=31),
                tz='US/Eastern').tz_convert('UTC')

            all_minutes.append(
                pd.date_range(start=day_open, periods=390, freq="min")
            )

        # flatten
        return pd.DatetimeIndex(
            np.concatenate(all_minutes), copy=False, tz='UTC'
        )

    def _write_internal(self, directory, iterator, sid_path_func=None):
        """
        Write the metadata file and then one ctable per (asset_id, frame).

        Each table has the columns open, high, low, close, volume and dt,
        with one slot per minute from the open of the first trading day up
        to the last day present in the frame.  Slots without a matching row
        stay zero.

        Parameters
        ----------
        directory : str
            Root directory for the metadata file and the default table paths.
        iterator
            Iterable of ``(asset_id, frame)`` pairs.  Each frame is indexed
            by timestamp and has ``open``, ``high``, ``low``, ``close`` and
            ``volume`` columns.
        sid_path_func : callable, optional
            ``sid_path_func(directory, asset_id)`` returns the table path.
        """
        first_trading_day = self.first_trading_day

        write_metadata(directory, first_trading_day)

        first_open = pd.Timestamp(
            datetime(
                year=first_trading_day.year,
                month=first_trading_day.month,
                day=first_trading_day.day,
                hour=9,
                minute=31
            ), tz='US/Eastern').tz_convert('UTC')

        for asset_id, df in iterator:
            if sid_path_func is None:
                path = join(directory, "{0}.bcolz".format(asset_id))
            else:
                path = sid_path_func(directory, asset_id)

            # The table below is opened with mode='w' (overwrite), so an
            # already existing directory must not abort the write.
            if not os.path.isdir(path):
                os.makedirs(path)

            # NOTE(review): df.index[-1] is taken as the last day to cover,
            # which presumably relies on the frame being sorted by time.
            minutes = self.full_minutes_for_days(_writer_env,
                                                 first_open, df.index[-1])
            minutes_count = len(minutes)

            # All columns are uint32; values assigned below are cast to that
            # dtype.
            dt_col = np.zeros(minutes_count, dtype=np.uint32)
            open_col = np.zeros(minutes_count, dtype=np.uint32)
            high_col = np.zeros(minutes_count, dtype=np.uint32)
            low_col = np.zeros(minutes_count, dtype=np.uint32)
            close_col = np.zeros(minutes_count, dtype=np.uint32)
            vol_col = np.zeros(minutes_count, dtype=np.uint32)

            for dt, bar in df.iterrows():
                # NOTE(review): searchsorted yields an insertion point, so a
                # timestamp missing from the grid lands on the next slot.
                idx = minutes.searchsorted(dt)

                # dt.value is in nanoseconds; store whole epoch seconds.
                dt_col[idx] = dt.value // 10 ** 9
                open_col[idx] = bar.loc["open"]
                high_col[idx] = bar.loc["high"]
                low_col[idx] = bar.loc["low"]
                close_col[idx] = bar.loc["close"]
                vol_col[idx] = bar.loc["volume"]

            ctable(
                columns=[
                    open_col,
                    high_col,
                    low_col,
                    close_col,
                    vol_col,
                    dt_col
                ],
                names=[
                    "open",
                    "high",
                    "low",
                    "close",
                    "volume",
                    "dt"
                ],
                rootdir=path,
                mode='w'
            )


class MinuteBarWriterFromDataFrames(BcolzMinuteBarWriter):
    """
    BcolzMinuteBarWriter fed from DataFrames that are already in memory.

    Parameters
    ----------
    first_trading_day
        Stored as ``_first_trading_day`` and exposed through the inherited
        ``first_trading_day`` property.
    """
    # Column -> dtype table; every column maps to float64.
    _csv_dtypes = dict.fromkeys(
        ('open', 'high', 'low', 'close', 'volume'),
        float64,
    )

    def __init__(self, first_trading_day):
        self._first_trading_day = first_trading_day

    def gen_frames(self, assets):
        """
        Yield ``(asset, frame)`` for each key of ``assets``.

        ``assets`` is iterated for its keys and indexed with each key to
        obtain a frame, which is re-indexed on its ``minute`` column before
        being yielded.
        """
        for sid in assets:
            frame = assets[sid].set_index("minute")
            yield sid, frame


class MinuteBarWriterFromCSVs(BcolzMinuteBarWriter):
    """
    BcolzMinuteBarWriter that loads each asset's bars from a CSV file.

    Parameters
    ----------
    asset_map: dict
        Maps asset_id -> path of the CSV holding that asset's data.
    first_trading_day
        Stored as ``_first_trading_day`` and exposed through the inherited
        ``first_trading_day`` property.

    Each CSV is expected to provide these columns:
        minute : parsed as a date and used as the index
        open, high, low, close, volume : read as float64
            (see ``_csv_dtypes``)
    """
    # dtype requested from ``read_csv`` for each value column.
    _csv_dtypes = dict.fromkeys(
        ('open', 'high', 'low', 'close', 'volume'),
        float64,
    )

    def __init__(self, asset_map, first_trading_day):
        self._first_trading_day = first_trading_day
        self._asset_map = asset_map

    def gen_frames(self, assets):
        """
        Lazily load one DataFrame per requested asset from its CSV.

        Yields ``(asset, frame)`` where ``frame`` is indexed by the
        ``minute`` column localized to UTC.

        Raises
        ------
        KeyError
            If the asset map has no path (or maps to ``None``) for an asset.
        """
        column_types = self._csv_dtypes

        for sid in assets:
            csv_path = self._asset_map.get(sid)
            if csv_path is None:
                raise KeyError("No path supplied for asset %s" % sid)

            raw = read_csv(csv_path, parse_dates=['minute'],
                           dtype=column_types)
            indexed = raw.set_index("minute")

            yield sid, indexed.tz_localize("UTC")


class BcolzMinuteBarReader(object):
    """
    Reader for per-asset minute bar bcolz tables stored below ``rootdir``.

    Parameters
    ----------
    rootdir : str
        Directory holding the metadata file and the per-asset tables.
    sid_path_func : callable, optional
        ``sid_path_func(rootdir, sid)`` returns the table path for an asset.
        When omitted, ``<rootdir>/<sid>.bcolz`` is used.
    """

    def __init__(self, rootdir, sid_path_func=None):
        self.rootdir = rootdir

        metadata = self._get_metadata()

        self.first_trading_day = pd.Timestamp(
            metadata['first_trading_day'], tz='UTC')
        mask = tradingcalendar.trading_days.slice_indexer(
            self.first_trading_day)
        # TODO: Read/write calendar to match daily, so that calendar is not
        # 'hardcoded'.
        self.trading_days = tradingcalendar.trading_days[mask]
        self._sid_path_func = sid_path_func

        # Per-field cache of opened columns, keyed by the sid as a string.
        self._carrays = {
            'open': {},
            'high': {},
            'low': {},
            'close': {},
            'volume': {},
            'sid': {},
            'dt': {},
        }

    def _get_metadata(self):
        """Load and return the JSON metadata stored in ``rootdir``."""
        with open(os.path.join(self.rootdir, METADATA_FILENAME)) as fp:
            return json.load(fp)

    def _get_ctable(self, asset):
        """Open, read-only, the bcolz table holding ``asset``'s bars."""
        sid = int(asset)
        if self._sid_path_func is not None:
            path = self._sid_path_func(self.rootdir, sid)
        else:
            # Built the same way as the writer's default path.
            path = os.path.join(self.rootdir, "{0}.bcolz".format(sid))

        return bcolz.open(path, mode='r')

    def _find_position_of_minute(self, minute_dt):
        """
        Internal method that returns the position of the given minute in the
        list of every trading minute since market open on the first trading
        day.

        IMPORTANT: This method assumes every day is 390 minutes long, even
        early closes.  Our minute bcolz files are generated like this to
        support fast lookup.

        ex. 9:31 AM Eastern on the first trading day is position 0, so this
        method returns 1 for 9:32 AM Eastern on that day.

        Parameters
        ----------
        minute_dt: pd.Timestamp
            The minute whose position should be calculated.

        Returns
        -------
        int
            The position of the given minute in the list of all trading
            minutes since market open on the first trading day.
        """
        day = minute_dt.date()
        # Look the day up as a Timestamp in the calendar's own timezone
        # rather than as a bare ``datetime.date``.
        # NOTE(review): a day missing from ``trading_days`` resolves to its
        # insertion point, i.e. the slot of the next listed day.
        day_idx = self.trading_days.searchsorted(
            pd.Timestamp(day, tz=self.trading_days.tz))
        # searchsorted returns an insertion point, which is never negative;
        # this guard is kept purely as a defensive check.
        if day_idx < 0:
            return -1

        day_open = pd.Timestamp(
            datetime(
                year=day.year,
                month=day.month,
                day=day.day,
                hour=9,
                minute=31),
            tz='US/Eastern').tz_convert('UTC')

        # Floor division keeps the offset integral on Python 2 and 3 alike.
        minutes_offset = int((minute_dt - day_open).total_seconds()) // 60

        return int((390 * day_idx) + minutes_offset)

    def _open_minute_file(self, field, asset):
        """
        Return the ``field`` column for ``asset``, opening it on first use.

        Opened columns are cached in ``self._carrays`` so later lookups for
        the same field and asset skip reopening the table.
        """
        sid_str = str(int(asset))

        try:
            carray = self._carrays[field][sid_str]
        except KeyError:
            carray = self._carrays[field][sid_str] = \
                self._get_ctable(asset)[field]

        return carray


1			from abc import (
2			ABCMeta,
3			abstractmethod,
4			)
5			import bcolz
6			import json
7			import os
8			from bcolz import ctable
9			from datetime import datetime
10			import numpy as np
11			from numpy import float64
12			from os.path import join
13			import pandas as pd
14			from pandas import read_csv
15			from six import with_metaclass
16
17			from zipline.finance.trading import TradingEnvironment
18			from zipline.utils import tradingcalendar
19
20			MINUTES_PER_DAY = 390
21
22			_writer_env = TradingEnvironment()
23
24			METADATA_FILENAME = 'metadata.json'
25
26
27			def write_metadata(directory, first_trading_day):
28			metadata_path = os.path.join(directory, METADATA_FILENAME)
29
30			metadata = {
31			'first_trading_day': str(first_trading_day.date())
32			}
33
34			with open(metadata_path, 'w') as fp:
35			json.dump(metadata, fp)
36
37
38			class BcolzMinuteBarWriter(with_metaclass(ABCMeta)):
39			"""
40			Class capable of writing minute OHLCV data to disk into bcolz format.
41			"""
42			@property
43			def first_trading_day(self):
44			return self._first_trading_day
45
46			@abstractmethod
47			def gen_frames(self, assets):
48			"""
49			Return an iterator of pairs of (asset_id, pd.dataframe).
50			"""
51			raise NotImplementedError()
52
53			def write(self, directory, assets, sid_path_func=None):
54			_iterator = self.gen_frames(assets)
55
56			return self._write_internal(directory, _iterator,
57			sid_path_func=sid_path_func)
58
59			@staticmethod
60			def full_minutes_for_days(env, dt1, dt2):
61			start_date = env.normalize_date(dt1)
62			end_date = env.normalize_date(dt2)
63
64			all_minutes = []
65
66			for day in env.days_in_range(start_date, end_date):
67			minutes_in_day = pd.date_range(
68			start=pd.Timestamp(
69			datetime(
70			year=day.year,
71			month=day.month,
72			day=day.day,
73			hour=9,
74			minute=31),
75			tz='US/Eastern').tz_convert('UTC'),
76			periods=390,
77			freq="min"
78			)
79
80			all_minutes.append(minutes_in_day)
81
82			# flatten
83			return pd.DatetimeIndex(
84			np.concatenate(all_minutes), copy=False, tz='UTC'
85			)
86
87			def _write_internal(self, directory, iterator, sid_path_func=None):
88			first_trading_day = self.first_trading_day
89
90			write_metadata(directory, first_trading_day)
91
92			first_open = pd.Timestamp(
93			datetime(
94			year=first_trading_day.year,
95			month=first_trading_day.month,
96			day=first_trading_day.day,
97			hour=9,
98			minute=31
99			), tz='US/Eastern').tz_convert('UTC')
100
101			for asset_id, df in iterator:
102			if sid_path_func is None:
103			path = join(directory, "{0}.bcolz".format(asset_id))
104			else:
105			path = sid_path_func(directory, asset_id)
106
107			os.makedirs(path)
108
109			minutes = self.full_minutes_for_days(_writer_env,
110			first_open, df.index[-1])
111			minutes_count = len(minutes)
112
113			dt_col = np.zeros(minutes_count, dtype=np.uint32)
114			open_col = np.zeros(minutes_count, dtype=np.uint32)
115			high_col = np.zeros(minutes_count, dtype=np.uint32)
116			low_col = np.zeros(minutes_count, dtype=np.uint32)
117			close_col = np.zeros(minutes_count, dtype=np.uint32)
118			vol_col = np.zeros(minutes_count, dtype=np.uint32)
119
120			for row in df.iterrows():
121			dt = row[0]
122			idx = minutes.searchsorted(dt)
123
124			dt_col[idx] = dt.value / 1e9
125			open_col[idx] = row[1].loc["open"]
126			high_col[idx] = row[1].loc["high"]
127			low_col[idx] = row[1].loc["low"]
128			close_col[idx] = row[1].loc["close"]
129			vol_col[idx] = row[1].loc["volume"]
130
131			ctable(
132			columns=[
133			open_col,
134			high_col,
135			low_col,
136			close_col,
137			vol_col,
138			dt_col
139			],
140			names=[
141			"open",
142			"high",
143			"low",
144			"close",
145			"volume",
146			"dt"
147			],
148			rootdir=path,
149			mode='w'
150			)
151
152
153			class MinuteBarWriterFromDataFrames(BcolzMinuteBarWriter):
154			_csv_dtypes = {
155			'open': float64,
156			'high': float64,
157			'low': float64,
158			'close': float64,
159			'volume': float64,
160			}
161
162			def __init__(self, first_trading_day):
163			self._first_trading_day = first_trading_day
164
165			def gen_frames(self, assets):
166			for asset in assets:
167			df = assets[asset]
168			yield asset, df.set_index("minute")
169
170
171			class MinuteBarWriterFromCSVs(BcolzMinuteBarWriter):
172			"""
173			BcolzMinuteBarWriter constructed from a map of CSVs to assets.
174
175			Parameters
176			----------
177			asset_map: dict
178			A map from asset_id -> path to csv with data for that asset.
179
180			CSVs should have the following columns:
181			minute : datetime64
182			open : float64
183			high : float64
184			low : float64
185			close : float64
186			volume : int64
187			"""
188			_csv_dtypes = {
189			'open': float64,
190			'high': float64,
191			'low': float64,
192			'close': float64,
193			'volume': float64,
194			}
195
196			def __init__(self, asset_map, first_trading_day):
197			self._asset_map = asset_map
198			self._first_trading_day = first_trading_day
199
200			def gen_frames(self, assets):
201			"""
202			Read CSVs as DataFrames from our asset map.
203			"""
204			dtypes = self._csv_dtypes
205
206			for asset in assets:
207			path = self._asset_map.get(asset)
208			if path is None:
209			raise KeyError("No path supplied for asset %s" % asset)
210			df = read_csv(path, parse_dates=['minute'], dtype=dtypes)
211			df = df.set_index("minute").tz_localize("UTC")
212
213			yield asset, df
214
215
216			class BcolzMinuteBarReader(object):
217
218			def __init__(self, rootdir, sid_path_func=None):
219			self.rootdir = rootdir
220
221			metadata = self._get_metadata()
222
223			self.first_trading_day = pd.Timestamp(
224			metadata['first_trading_day'], tz='UTC')
225			mask = tradingcalendar.trading_days.slice_indexer(
226			self.first_trading_day)
227			# TODO: Read/write calendar to match daily, so that calendar is not
228			# 'hardcoded'.
229			self.trading_days = tradingcalendar.trading_days[mask]
230			self._sid_path_func = sid_path_func
231
232			self._carrays = {
233			'open': {},
234			'high': {},
235			'low': {},
236			'close': {},
237			'volume': {},
238			'sid': {},
239			'dt': {},
240			}
241
242			def _get_metadata(self):
243			with open(os.path.join(self.rootdir, METADATA_FILENAME)) as fp:
244			return json.load(fp)
245
246			def _get_ctable(self, asset):
247			sid = int(asset)
248			if self._sid_path_func is not None:
249			path = self._sid_path_func(self.rootdir, sid)
250			else:
251			path = "{0}/{1}.bcolz".format(self.rootdir, sid)
252
253			return bcolz.open(path, mode='r')
254
255			def _find_position_of_minute(self, minute_dt):
256			"""
257			Internal method that returns the position of the given minute in the
258			list of every trading minute since market open on 1/2/2002.
259
260			IMPORTANT: This method assumes every day is 390 minutes long, even
261			early closes. Our minute bcolz files are generated like this to
262			support fast lookup.
263
264			ex. this method would return 2 for 1/2/2002 9:32 AM Eastern.
265
266			Parameters
267			----------
268			minute_dt: pd.Timestamp
269			The minute whose position should be calculated.
270
271			Returns
272			-------
273			The position of the given minute in the list of all trading minutes
274			since market open on 1/2/2002.
275			"""
276			day = minute_dt.date()
277			day_idx = self.trading_days.searchsorted(day)
278			if day_idx < 0:
279			return -1
280
281			day_open = pd.Timestamp(
282			datetime(
283			year=day.year,
284			month=day.month,
285			day=day.day,
286			hour=9,
287			minute=31),
288			tz='US/Eastern').tz_convert('UTC')
289
290			minutes_offset = int((minute_dt - day_open).total_seconds()) / 60
291
292			return int((390 * day_idx) + minutes_offset)
293
294			def _open_minute_file(self, field, asset):
295			sid_str = str(int(asset))
296
297			try:
298			carray = self._carrays[field][sid_str]
299			except KeyError:
300			carray = self._carrays[field][sid_str] = \
301			self._get_ctable(asset)[field]
302
303			return carray
304

quantopian / zipline

Pull Request — master (#858)

gen_frames() A

Complexity

Size

Duplication

Duplication Side-by-Side

Filter issues like