from abc import (
    ABCMeta,
    abstractmethod,
)
from datetime import datetime
import json
import os
from os.path import join

from bcolz import ctable
import numpy as np
from numpy import float64
import pandas as pd
from pandas import read_csv
from six import with_metaclass

from zipline.finance.trading import TradingEnvironment
from zipline.utils import tradingcalendar

# Number of minutes in a standard US equity trading session (9:31-16:00 ET).
MINUTES_PER_DAY = 390

# Shared trading environment used to enumerate trading days when writing.
_writer_env = TradingEnvironment()

METADATA_FILENAME = 'metadata.json'

def write_metadata(directory, first_trading_day):
    metadata_path = os.path.join(directory, METADATA_FILENAME)

    metadata = {
        'first_trading_day': str(first_trading_day.date())
    }

    with open(metadata_path, 'w') as fp:
        json.dump(metadata, fp)

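# Illustrative sketch, not part of the original module: round-trip the
# metadata file written by write_metadata above. The helper name, the
# temporary directory, and the example timestamp are placeholders chosen
# for demonstration.
def _example_metadata_roundtrip():
    import tempfile

    tmpdir = tempfile.mkdtemp()
    write_metadata(tmpdir, pd.Timestamp('2015-01-02'))
    with open(os.path.join(tmpdir, METADATA_FILENAME)) as fp:
        # Expected contents: {"first_trading_day": "2015-01-02"}
        return json.load(fp)
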

class BcolzMinuteBarWriter(with_metaclass(ABCMeta)):
    """
    Class capable of writing minute OHLCV data to disk into bcolz format.

    (A read-back sketch following this class illustrates the resulting
    on-disk column layout.)
    """
    @property
    def first_trading_day(self):
        return self._first_trading_day

    @abstractmethod
    def gen_frames(self, assets):
        """
        Return an iterator of (asset_id, pd.DataFrame) pairs.
        """
        raise NotImplementedError()

    def write(self, directory, assets, sid_path_func=None):
        _iterator = self.gen_frames(assets)

        return self._write_internal(directory, _iterator,
                                    sid_path_func=sid_path_func)

    @staticmethod
    def full_minutes_for_days(env, dt1, dt2):
        start_date = env.normalize_date(dt1)
        end_date = env.normalize_date(dt2)

        all_minutes = []

        for day in env.days_in_range(start_date, end_date):
            # The session's 390 minutes, starting at 9:31 US/Eastern,
            # expressed in UTC.
            minutes_in_day = pd.date_range(
                start=pd.Timestamp(
                    datetime(
                        year=day.year,
                        month=day.month,
                        day=day.day,
                        hour=9,
                        minute=31),
                    tz='US/Eastern').tz_convert('UTC'),
                periods=MINUTES_PER_DAY,
                freq="min"
            )

            all_minutes.append(minutes_in_day)

        # Flatten the per-day indexes into a single DatetimeIndex.
        return pd.DatetimeIndex(
            np.concatenate(all_minutes), copy=False, tz='UTC'
        )

    def _write_internal(self, directory, iterator, sid_path_func=None):
        first_trading_day = self.first_trading_day

        write_metadata(directory, first_trading_day)

        first_open = pd.Timestamp(
            datetime(
                year=first_trading_day.year,
                month=first_trading_day.month,
                day=first_trading_day.day,
                hour=9,
                minute=31
            ), tz='US/Eastern').tz_convert('UTC')

        for asset_id, df in iterator:
            if sid_path_func is None:
                path = join(directory, "{0}.bcolz".format(asset_id))
            else:
                path = sid_path_func(directory, asset_id)

            os.makedirs(path)

            minutes = self.full_minutes_for_days(_writer_env,
                                                 first_open, df.index[-1])
            minutes_count = len(minutes)

            # One slot per session minute; minutes with no data remain zero.
            dt_col = np.zeros(minutes_count, dtype=np.uint32)
            open_col = np.zeros(minutes_count, dtype=np.uint32)
            high_col = np.zeros(minutes_count, dtype=np.uint32)
            low_col = np.zeros(minutes_count, dtype=np.uint32)
            close_col = np.zeros(minutes_count, dtype=np.uint32)
            vol_col = np.zeros(minutes_count, dtype=np.uint32)

            for row in df.iterrows():
                dt = row[0]
                idx = minutes.searchsorted(dt)

                # Store the minute as a Unix timestamp in seconds alongside
                # the OHLCV values for that slot.
                dt_col[idx] = dt.value / 1e9
                open_col[idx] = row[1].loc["open"]
                high_col[idx] = row[1].loc["high"]
                low_col[idx] = row[1].loc["low"]
                close_col[idx] = row[1].loc["close"]
                vol_col[idx] = row[1].loc["volume"]

            ctable(
                columns=[
                    open_col,
                    high_col,
                    low_col,
                    close_col,
                    vol_col,
                    dt_col,
                ],
                names=[
                    "open",
                    "high",
                    "low",
                    "close",
                    "volume",
                    "dt",
                ],
                rootdir=path,
                mode='w'
            )

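# Illustrative sketch, not part of the original module: open one asset's
# bcolz directory produced by BcolzMinuteBarWriter._write_internal and
# recover the rows that were actually populated. The helper name is
# hypothetical; the '{sid}.bcolz' path mirrors the writer's default when no
# sid_path_func is supplied.
def _example_read_raw_ctable(directory, asset_id):
    import bcolz

    table = bcolz.open(join(directory, "{0}.bcolz".format(asset_id)),
                       mode='r')
    # Minutes with no data were left as zeros in every column, so a non-zero
    # 'dt' entry marks a populated slot.
    dts = table['dt'][:]
    traded = dts != 0
    return pd.DataFrame(
        {name: table[name][:][traded]
         for name in ('open', 'high', 'low', 'close', 'volume')},
        index=pd.to_datetime(dts[traded], unit='s', utc=True),
    )
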

class MinuteBarWriterFromDataFrames(BcolzMinuteBarWriter):
    _csv_dtypes = {
        'open': float64,
        'high': float64,
        'low': float64,
        'close': float64,
        'volume': float64,
    }

    def __init__(self, first_trading_day):
        self._first_trading_day = first_trading_day

    def gen_frames(self, assets):
        for asset in assets:
            df = assets[asset]
            yield asset, df.set_index("minute")


class MinuteBarWriterFromCSVs(BcolzMinuteBarWriter):
    """
    BcolzMinuteBarWriter constructed from a map of CSVs to assets.

    Parameters
    ----------
    asset_map : dict
        A map from asset_id -> path to csv with data for that asset.

    CSVs should have the following columns:
        minute : datetime64
        open : float64
        high : float64
        low : float64
        close : float64
        volume : int64

    (A usage sketch follows this class definition.)
    """
    _csv_dtypes = {
        'open': float64,
        'high': float64,
        'low': float64,
        'close': float64,
        'volume': float64,
    }

    def __init__(self, asset_map, first_trading_day):
        self._asset_map = asset_map
        self._first_trading_day = first_trading_day

    def gen_frames(self, assets):
        """
        Read CSVs as DataFrames from our asset map.
        """
        dtypes = self._csv_dtypes

        for asset in assets:
            path = self._asset_map.get(asset)
            if path is None:
                raise KeyError("No path supplied for asset %s" % asset)
            df = read_csv(path, parse_dates=['minute'], dtype=dtypes)
            df = df.set_index("minute").tz_localize("UTC")

            yield asset, df

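# Illustrative usage sketch, not part of the original module: write minute
# bars for two assets from CSV files. The asset ids, CSV paths, and output
# directory are placeholders, and the output directory is assumed to exist.
def _example_write_from_csvs():
    asset_map = {
        1: '/path/to/minute_csvs/1.csv',
        2: '/path/to/minute_csvs/2.csv',
    }
    writer = MinuteBarWriterFromCSVs(
        asset_map,
        first_trading_day=pd.Timestamp('2015-01-02', tz='UTC'),
    )
    # Produces <output_dir>/1.bcolz, <output_dir>/2.bcolz and metadata.json.
    writer.write('/path/to/output_dir', assets=[1, 2])
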

class BcolzMinuteBarReader(object):

    def __init__(self, rootdir, sid_path_func=None):
        self.rootdir = rootdir

        metadata = self._get_metadata()

        self.first_trading_day = pd.Timestamp(
            metadata['first_trading_day'], tz='UTC')
        # Trading days covered by the data, from the first trading day on.
        mask = tradingcalendar.trading_days.slice_indexer(
            self.first_trading_day)
        self.trading_days = tradingcalendar.trading_days[mask]
        self.sid_path_func = sid_path_func

    def _get_metadata(self):
        with open(os.path.join(self.rootdir, METADATA_FILENAME)) as fp:
            return json.load(fp)

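# Illustrative sketch, not part of the original module: open a directory
# written by one of the writers above and inspect the attributes that
# BcolzMinuteBarReader derives from metadata.json.
def _example_open_reader(rootdir):
    reader = BcolzMinuteBarReader(rootdir)
    # first_trading_day comes from metadata.json; trading_days is the
    # trading calendar sliced from that day forward.
    return reader.first_trading_day, reader.trading_days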