zipline.pipeline.loaders.blaze.BlazeEarningsCalendarLoader.load_adjusted_array() - Code Metrics - Inspection of "Adds support for different typed adjusted arrays a..." - quantopian/zipline - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#905)

unknown

created 2015-12-09 17:48 UTC

load_adjusted_array() B

↳ Parent: zipline.pipeline.loaders.blaze.BlazeEarningsCalendarLoader

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	5
dl	0
loc	47
rs	8.1672

1 Method

Rating	Name	Duplication	Size	Complexity
A	zipline.pipeline.loaders.blaze.BlazeEarningsCalendarLoader.mkseries()	0	5	1

import blaze as bz
from datashape import istabular
from odo import odo
import pandas as pd
from six import iteritems
from toolz import valmap

from .core import TS_FIELD_NAME, SID_FIELD_NAME
from zipline.pipeline.loaders.base import PipelineLoader
from zipline.pipeline.loaders.earnings import EarningsCalendarLoader


# Name of the field in the loaded expression that holds the date on which
# earnings will be announced.
ANCMT_FIELD_NAME = 'announcement_date'


class BlazeEarningsCalendarLoader(PipelineLoader):
    """A pipeline loader for the ``EarningsCalendar`` dataset that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    resources : any, optional
        The resources to use when computing ``expr``. If expr is already
        bound to resources this can be omitted.
    compute_kwargs : dict, optional
        Currently accepted but not used by this loader.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime64,
           {ANCMT_FIELD_NAME}: datetime64,
       }}

    Where each row of the table is a record including the sid to identify the
    company, the timestamp where we learned about the announcement, and the
    date when the earnings will be announced.

    If the '{TS_FIELD_NAME}' field is not included it is assumed that we
    start the backtest with knowledge of all announcements.
    """
    # NOTE: the docstring is a format template; literal braces in it must be
    # doubled.
    __doc__ = __doc__.format(
        TS_FIELD_NAME=TS_FIELD_NAME,
        SID_FIELD_NAME=SID_FIELD_NAME,
        ANCMT_FIELD_NAME=ANCMT_FIELD_NAME,
    )

    _expected_fields = frozenset({
        TS_FIELD_NAME,
        SID_FIELD_NAME,
        ANCMT_FIELD_NAME,
    })

    def __init__(self,
                 expr,
                 resources=None,
                 compute_kwargs=None,
                 odo_kwargs=None):
        """Validate ``expr``, bind ``resources`` into it and store the
        projection onto the fields this loader reads.

        Raises
        ------
        ValueError
            If the dshape of ``expr`` is not tabular, or if ``resources`` is
            not a dict and ``expr`` does not have exactly one leaf to bind
            it to.
        """
        dshape = expr.dshape

        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        expected_fields = self._expected_fields
        self._has_ts = has_ts = TS_FIELD_NAME in dshape.measure.dict
        if not has_ts:
            # This field is optional. Set difference returns a new set, so
            # the result has to be rebound; otherwise the projection below
            # would still request the missing timestamp field.
            expected_fields = expected_fields - {TS_FIELD_NAME}

        # bind the resources into the expression
        if resources is None:
            resources = {}
        elif not isinstance(resources, dict):
            # A bare resource can only be bound unambiguously when the
            # expression has exactly one leaf.
            leaves = expr._leaves()
            if len(leaves) != 1:
                raise ValueError('no data resources found')

            resources = {leaves[0]: resources}

        self._expr = expr[list(expected_fields)]._subs({
            k: bz.Data(v, dshape=k.dshape) for k, v in iteritems(resources)
        })
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}

    def _query_raw(self, dates):
        """Materialize the rows needed to serve ``dates`` as a DataFrame."""
        expr = self._expr
        odo_kwargs = self._odo_kwargs or {}

        if not self._has_ts:
            # Without a timestamp field there is nothing to filter on: all
            # announcements are treated as known from the start.
            return odo(expr, pd.DataFrame, **odo_kwargs)

        # For every sid take the latest timestamp at or before the first
        # date; the earliest of those is the lower bound of the query.
        filtered = expr[expr[TS_FIELD_NAME] <= dates[0]]
        lower = odo(
            bz.by(
                filtered[SID_FIELD_NAME],
                timestamp=filtered[TS_FIELD_NAME].max(),
            ).timestamp.min(),
            pd.Timestamp,
            **odo_kwargs
        )
        if pd.isnull(lower):
            # If there is no lower date, just query for data in the date
            # range. It must all be null anyways.
            lower = dates[0]

        return odo(
            expr[
                (expr[TS_FIELD_NAME] >= lower) &
                (expr[TS_FIELD_NAME] <= dates[-1])
            ],
            pd.DataFrame,
            **odo_kwargs
        )

    def load_adjusted_array(self, columns, dates, assets, mask):
        """Query the expression for the rows covering ``dates``, group them
        by sid and delegate to an in-memory ``EarningsCalendarLoader``.

        The arguments are forwarded unchanged to
        ``EarningsCalendarLoader.load_adjusted_array``, whose result is
        returned.
        """
        raw = self._query_raw(dates)

        # Keep only rows whose sid was requested. ``isin`` is False for a
        # null sid unless ``assets`` itself contains a null, so those rows
        # are dropped as well.
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(sids[~sids.isin(assets)].index, inplace=True)

        gb = raw.groupby(SID_FIELD_NAME)
        if self._has_ts:
            def mkseries(idx, raw_loc=raw.loc):
                # Series of announcement dates indexed by the timestamp at
                # which each announcement became known.
                vs = raw_loc[idx, [TS_FIELD_NAME, ANCMT_FIELD_NAME]].values
                return pd.Series(
                    index=pd.DatetimeIndex(vs[:, 0]),
                    data=vs[:, 1],
                )
        else:
            def mkseries(idx, raw_loc=raw.loc):
                # No timestamps available: pass only the announcement dates.
                return pd.DatetimeIndex(raw_loc[idx, ANCMT_FIELD_NAME])

        return EarningsCalendarLoader(
            dates,
            valmap(mkseries, gb.groups),
        ).load_adjusted_array(columns, dates, assets, mask)


1			import blaze as bz
2			from datashape import istabular
3			from odo import odo
4			import pandas as pd
5			from six import iteritems
6			from toolz import valmap
7
8			from .core import TS_FIELD_NAME, SID_FIELD_NAME
9			from zipline.pipeline.loaders.base import PipelineLoader
10			from zipline.pipeline.loaders.earnings import EarningsCalendarLoader
11
12
13			ANCMT_FIELD_NAME = 'announcement_date'
14
15
16			class BlazeEarningsCalendarLoader(PipelineLoader):
17			"""A pipeline loader for the ``EarningsCalendar`` dataset that loads
18			data from a blaze expression.
19
20			Parameters
21			----------
22			expr : Expr
23			The expression representing the data to load.
24			resources : any, optional
25			The resources to use when computing ``expr``. If expr is already
26			bound to resources this can be omitted.
27			odo_kwargs : dict, optional
28			Extra keyword arguments to pass to odo when executing the expression.
29
30			Notes
31			-----
32			The expression should have a tabular dshape of::
33
34			Dim * {{
35			{SID_FIELD_NAME}: int64,
36			{TS_FIELD_NAME}: datetime64,
37			{ANCMT_FIELD_NAME}: datetime64,
38			}}
39
40			Where each row of the table is a record including the sid to identify the
41			company, the timestamp where we learned about the announcement, and the
42			date when the earnings will be announced.
43
44			If the '{TS_FIELD_NAME}' field is not included it is assumed that we
45			start the backtest with knowledge of all announcements.
46			"""
47			__doc__ = __doc__.format(
48			TS_FIELD_NAME=TS_FIELD_NAME,
49			SID_FIELD_NAME=SID_FIELD_NAME,
50			ANCMT_FIELD_NAME=ANCMT_FIELD_NAME,
51			)
52
53			_expected_fields = frozenset({
54			TS_FIELD_NAME,
55			SID_FIELD_NAME,
56			ANCMT_FIELD_NAME,
57			})
58
59			def __init__(self,
60			expr,
61			resources=None,
62			compute_kwargs=None,
63			odo_kwargs=None):
64			dshape = expr.dshape
65
66			if not istabular(dshape):
67			raise ValueError(
68			'expression dshape must be tabular, got: %s' % dshape,
69			)
70
71			expected_fields = self._expected_fields
72			self._has_ts = has_ts = TS_FIELD_NAME in dshape.measure.dict
73			if not has_ts:
74			# This field is optional.
75			expected_fields - {TS_FIELD_NAME}
76
77			# bind the resources into the expression
78			if resources is None:
79			resources = {}
80			elif not isinstance(resources, dict):
81			leaves = expr._leaves()
82			if len(leaves) != 1:
83			raise ValueError('no data resources found')
84
85			resources = {leaves[0]: resources}
86
87			self._expr = expr[list(expected_fields)]._subs({
88			k: bz.Data(v, dshape=k.dshape) for k, v in iteritems(resources)
89			})
90			self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
91
92			def load_adjusted_array(self, columns, dates, assets, mask):
93			expr = self._expr
94			filtered = expr[expr[TS_FIELD_NAME] <= dates[0]]
95			lower = odo(
96			bz.by(
97			filtered[SID_FIELD_NAME],
98			timestamp=filtered[TS_FIELD_NAME].max(),
99			).timestamp.min(),
100			pd.Timestamp,
101			**self._odo_kwargs or {}
102			)
103			if lower is pd.NaT:
104			# If there is no lower date, just query for data in the date
105			# range. It must all be null anyways.
106			lower = dates[0]
107
108			raw = odo(
109			expr[
110			(expr[TS_FIELD_NAME] >= lower) &
111			(expr[TS_FIELD_NAME] <= dates[-1])
112			],
113			pd.DataFrame,
114			**self._odo_kwargs or {}
115			)
116
117			sids = raw.loc[:, SID_FIELD_NAME]
118			raw.drop(
119			sids[~(sids.isin(assets) | sids.notnull())].index,
120			inplace=True
121			)
122
123			gb = raw.groupby(SID_FIELD_NAME)
124			if self._has_ts:
125			def mkseries(idx, raw_loc=raw.loc):
126			vs = raw_loc[idx, [TS_FIELD_NAME, ANCMT_FIELD_NAME]].values
127			return pd.Series(
128			index=pd.DatetimeIndex(vs[:, 0]),
129			data=vs[:, 1],
130			)
131			else:
132			def mkseries(idx, raw_loc=raw.loc):
133			return pd.DatetimeIndex(raw_loc[idx, ANCMT_FIELD_NAME])
134
135			return EarningsCalendarLoader(
136			dates,
137			valmap(mkseries, gb.groups),
138			).load_adjusted_array(columns, dates, assets, mask)
139

quantopian / zipline

Pull Request — master (#905)

load_adjusted_array() B

Complexity

Size

Duplication

1 Method

Duplication Side-by-Side

Filter issues like