Completed
Push — master ( efcb01...ce3727 )
by
unknown
01:22
created

zipline.pipeline.factors.binary_operator()   C

Complexity

Conditions 7

Size

Total Lines 38

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 7
dl 0
loc 38
rs 5.5
1
"""
2
factor.py
3
"""
4
from operator import attrgetter
5
from numbers import Number
6
7
from numpy import (
8
    apply_along_axis,
9
    float64,
10
    nan,
11
    inf,
12
)
13
from scipy.stats import rankdata
14
15
from zipline.errors import (
16
    UnknownRankMethod,
17
    UnsupportedDataType,
18
)
19
from zipline.lib.rank import rankdata_2d_ordinal
20
from zipline.pipeline.term import (
21
    CustomTermMixin,
22
    NotSpecified,
23
    RequiredWindowLengthMixin,
24
    SingleInputMixin,
25
    CompositeTerm,
26
)
27
from zipline.pipeline.expression import (
28
    BadBinaryOperator,
29
    COMPARISONS,
30
    is_comparison,
31
    MATH_BINOPS,
32
    method_name_for_op,
33
    NumericalExpression,
34
    NUMEXPR_MATH_FUNCS,
35
    UNARY_OPS,
36
)
37
from zipline.pipeline.filters import (
38
    NumExprFilter,
39
    PercentileFilter,
40
)
41
from zipline.utils.control_flow import nullctx
42
43
44
# Names of the tie-handling methods accepted by Rank.  Rank._validate rejects
# anything outside this set; 'ordinal' is handled by rankdata_2d_ordinal and
# the remaining names are forwarded to scipy.stats.rankdata.
_RANK_METHODS = frozenset(['average', 'min', 'max', 'dense', 'ordinal'])
45
46
47
def binop_return_type(op):
    """
    Pick the expression class produced by applying binary operator `op`.

    Comparison operators produce a NumExprFilter; every other operator
    produces a NumExprFactor.
    """
    return NumExprFilter if is_comparison(op) else NumExprFactor
52
53
54
def binary_operator(op):
    """
    Build a left-binding binary operator method for a Factor subclass.

    Parameters
    ----------
    op : str
        The operator symbol to implement (e.g. '==').

    Returns
    -------
    binary_operator : function
        A method taking ``(self, other)``, suitable for use as an
        implementation of methods like ``__add__``.  It raises
        BadBinaryOperator when `other` is not a supported operand.
    """
    # Looked up once per operator: when the right-hand operand is itself a
    # NumExprFactor we delegate to its reflected (commuted) method, which
    # knows how to merge inputs correctly.
    reflected_name = method_name_for_op(op, commute=True)
    commuted_method_getter = attrgetter(reflected_name)

    def binary_operator(self, other):
        # Resolved on every call rather than once in the factory, because
        # the classes returned by binop_return_type do not exist yet when
        # the factory runs inside the class body of Factor.
        result_cls = binop_return_type(op)

        # Case 1: we are already a numexpr expression; splice `other` into
        # our own expression string.
        if isinstance(self, NumExprFactor):
            lhs, rhs, merged_inputs = self.build_binary_op(op, other)
            expr = "({left}) {op} ({right})".format(
                left=lhs,
                op=op,
                right=rhs,
            )
            return result_cls(expr, merged_inputs)

        # Case 2: only the right-hand side is a numexpr expression; let its
        # reflected operator do the work with ourselves as the argument.
        if isinstance(other, NumExprFactor):
            return commuted_method_getter(other)(self)

        # Case 3: two plain Factors.  An operand combined with itself binds
        # a single input.
        if isinstance(other, Factor):
            if self is other:
                return result_cls("x_0 {op} x_0".format(op=op), (self,))
            return result_cls("x_0 {op} x_1".format(op=op), (self, other))

        # Case 4: a numeric constant is embedded directly in the expression.
        if isinstance(other, Number):
            return result_cls(
                "x_0 {op} ({constant})".format(op=op, constant=other),
                binds=(self,),
            )

        raise BadBinaryOperator(op, self, other)

    binary_operator.__doc__ = "Binary Operator: '%s'" % op
    return binary_operator
107
108
109
def reflected_binary_operator(op):
    """
    Build a right-binding (reflected) binary operator method for a Factor.

    Parameters
    ----------
    op : str
        The operator symbol to implement.  Must not be a comparison.

    Returns
    -------
    reflected_binary_operator : function
        A method taking ``(self, other)``, suitable for use as an
        implementation of methods like ``__radd__``.  It raises
        BadBinaryOperator when `other` is not a supported operand.
    """
    assert not is_comparison(op)

    def reflected_binary_operator(self, other):
        # `self` is the right-hand operand here, so the expression pieces
        # are emitted in swapped order.
        if isinstance(self, NumericalExpression):
            own_expr, their_expr, merged_inputs = self.build_binary_op(
                op, other
            )
            expr = "({left}) {op} ({right})".format(
                left=their_expr,
                right=own_expr,
                op=op,
            )
            return NumExprFactor(expr, merged_inputs)

        # Numbers are the only remaining case: for every other valid
        # operand the left-binding method of that operand is called instead.
        if isinstance(other, Number):
            return NumExprFactor(
                "{constant} {op} x_0".format(op=op, constant=other),
                binds=(self,),
            )

        raise BadBinaryOperator(op, other, self)

    return reflected_binary_operator
142
143
144
def unary_operator(op):
    """
    Build a unary operator method for Factors.

    Parameters
    ----------
    op : str
        The operator symbol to implement.  Only '-' is accepted.

    Returns
    -------
    unary_operator : function
        A method taking ``(self)`` and returning a NumExprFactor.

    Raises
    ------
    ValueError
        If `op` is not a supported unary operator.
    """
    # Negation is the only operator supported for every possible input type.
    supported_ops = {'-'}
    if op not in supported_ops:
        raise ValueError("Invalid unary operator %s." % op)

    def unary_operator(self):
        # NumExprFactor is referenced at call time because it is not yet
        # defined when this factory runs inside the class body of Factor.
        if not isinstance(self, NumericalExpression):
            # Plain term: bind ourselves as the single input.
            return NumExprFactor("{op}x_0".format(op=op), (self,))

        # Existing expression: wrap it and keep its inputs unchanged.
        return NumExprFactor(
            "{op}({expr})".format(op=op, expr=self._expr),
            self.inputs,
        )

    unary_operator.__doc__ = "Unary Operator: '%s'" % op
    return unary_operator
167
168
169
def function_application(func):
    """
    Build a method that applies a numexpr math function to a Factor.

    Parameters
    ----------
    func : str
        Name of the function to apply.  Must be in NUMEXPR_MATH_FUNCS.

    Returns
    -------
    mathfunc : function
        A method taking ``(self)`` and returning a NumExprFactor.

    Raises
    ------
    ValueError
        If `func` is not in NUMEXPR_MATH_FUNCS.
    """
    if func not in NUMEXPR_MATH_FUNCS:
        raise ValueError("Unsupported mathematical function '%s'" % func)

    def mathfunc(self):
        if not isinstance(self, NumericalExpression):
            # Plain term: bind ourselves as the single input.
            return NumExprFactor("{func}(x_0)".format(func=func), (self,))

        # Existing expression: wrap it and keep its inputs unchanged.
        return NumExprFactor(
            "{func}({expr})".format(func=func, expr=self._expr),
            self.inputs,
        )

    return mathfunc
186
187
188
class Factor(CompositeTerm):
    """
    Pipeline API expression producing numerically-valued outputs.

    Arithmetic and comparison operators, negation, and the numexpr math
    functions are attached to this class dynamically in the class body
    below; they build NumExprFactor / NumExprFilter instances.
    """
    dtype = float64

    # Dynamically add functions for creating NumExprFactor/NumExprFilter
    # instances.  Writing into the class-body namespace makes each generated
    # function a regular method of Factor.
    clsdict = locals()
    clsdict.update(
        {
            method_name_for_op(op): binary_operator(op)
            # Don't override __eq__ because it breaks comparisons on tuples of
            # Factors.
            for op in MATH_BINOPS.union(COMPARISONS - {'=='})
        }
    )
    # Reflected variants exist only for math operators;
    # reflected_binary_operator asserts that `op` is not a comparison.
    clsdict.update(
        {
            method_name_for_op(op, commute=True): reflected_binary_operator(op)
            for op in MATH_BINOPS
        }
    )
    clsdict.update(
        {
            '__neg__': unary_operator(op)
            for op in UNARY_OPS
        }
    )
    clsdict.update(
        {
            funcname: function_application(funcname)
            for funcname in NUMEXPR_MATH_FUNCS
        }
    )

    # Python 3 dispatches `/` to __truediv__/__rtruediv__; reuse the
    # generated division methods for both spellings.
    __truediv__ = clsdict['__div__']
    __rtruediv__ = clsdict['__rdiv__']

    # Explicitly-named equality comparison, provided because __eq__ is
    # deliberately not overridden above.
    eq = binary_operator('==')

    def rank(self, method='ordinal', ascending=True, mask=NotSpecified):
        """
        Construct a new Factor representing the sorted rank of each column
        within each row.

        Parameters
        ----------
        method : str, {'ordinal', 'min', 'max', 'dense', 'average'}
            The method used to assign ranks to tied elements. See
            `scipy.stats.rankdata` for a full description of the semantics for
            each ranking method. Default is 'ordinal'.
        ascending : bool, optional
            Whether to return sorted rank in ascending or descending order.
            Default is True.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, ranks are computed ignoring any asset/date
            pairs for which `mask` produces a value of False.

        Returns
        -------
        ranks : zipline.pipeline.factors.Rank
            A new factor that will compute the ranking of the data produced by
            `self`.

        Notes
        -----
        The default value for `method` is different from the default for
        `scipy.stats.rankdata`.  See that function's documentation for a full
        description of the valid inputs to `method`.

        Missing or non-existent data on a given day will cause an asset to be
        given a rank of NaN for that day.

        See Also
        --------
        scipy.stats.rankdata
        zipline.lib.rank
        zipline.pipeline.factors.Rank
        """
        # Descending order is implemented by ranking the negated factor.
        return Rank(self if ascending else -self, method=method, mask=mask)

    def top(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the top N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, top values are computed ignoring any
            asset/date pairs for which `mask` produces a value of False.

        Returns
        -------
        filter : zipline.pipeline.filters.Filter
        """
        # Rank in descending order and keep ranks 1 through N.
        return self.rank(ascending=False, mask=mask) <= N

    def bottom(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the bottom N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, bottom values are computed ignoring any
            asset/date pairs for which `mask` produces a value of False.

        Returns
        -------
        filter : zipline.pipeline.Filter
        """
        # Rank in ascending order and keep ranks 1 through N.
        return self.rank(ascending=True, mask=mask) <= N

    def percentile_between(self,
                           min_percentile,
                           max_percentile,
                           mask=NotSpecified):
        """
        Construct a new Filter representing entries from the output of this
        Factor that fall within the percentile range defined by min_percentile
        and max_percentile.

        Parameters
        ----------
        min_percentile : float [0.0, 100.0]
            Return True for assets falling above this percentile in the data.
        max_percentile : float [0.0, 100.0]
            Return True for assets falling below this percentile in the data.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing
            percentile thresholds.  If mask is supplied, percentile cutoffs
            are computed each day using only assets for which `mask` returns
            True, and assets not passing `mask` will produce False in the
            output of this filter as well.

        Returns
        -------
        out : zipline.pipeline.filters.PercentileFilter
            A new filter that will compute the specified percentile-range mask.

        See Also
        --------
        zipline.pipeline.filters.PercentileFilter
        """
        return PercentileFilter(
            self,
            min_percentile=min_percentile,
            max_percentile=max_percentile,
            mask=mask,
        )

    def isnan(self):
        """
        A Filter producing True for all values where this Factor is NaN.

        Returns
        -------
        nanfilter : zipline.pipeline.filters.Filter
        """
        # NaN is the only value that compares unequal to itself.
        return self != self

    def notnan(self):
        """
        A Filter producing True for values where this Factor is not NaN.

        Returns
        -------
        nanfilter : zipline.pipeline.filters.Filter
        """
        return ~self.isnan()

    def isfinite(self):
        """
        A Filter producing True for values where this Factor is anything but
        NaN, inf, or -inf.

        Returns
        -------
        finitefilter : zipline.pipeline.filters.Filter
        """
        # NaN fails both comparisons; +/-inf each fail one of them.
        return (-inf < self) & (self < inf)
369
370
371
class NumExprFactor(NumericalExpression, Factor):
    """
    A Factor whose values are computed by evaluating a numexpr expression.

    Parameters
    ----------
    expr : string
        An expression string suitable for passing to numexpr.  Every
        variable in `expr` should have the form "x_i", where i is the index
        of the corresponding factor input in `binds`.
    binds : tuple
        The factors to use as inputs to the expression.

    Notes
    -----
    Instances are normally created by applying numerical operators such as
    `+` and `-` to Factors.  Users should rarely need to construct a
    NumExprFactor directly.
    """
390
391
392
class Rank(SingleInputMixin, Factor):
    """
    A Factor representing the row-wise rank data of another Factor.

    Parameters
    ----------
    factor : zipline.pipeline.factors.Factor
        The factor on which to compute ranks.
    method : str, {'average', 'min', 'max', 'dense', 'ordinal'}
        The method used to assign ranks to tied elements.  See
        `scipy.stats.rankdata` for a full description of the semantics for each
        ranking method.
    mask : zipline.pipeline.Filter
        A Filter selecting the asset/date pairs to rank.  Cells where the
        mask is False receive a rank of NaN.

    See Also
    --------
    scipy.stats.rankdata : Underlying ranking algorithm.
    zipline.factors.Factor.rank : Method-style interface to same functionality.

    Notes
    -----
    Most users should call Factor.rank rather than directly construct an
    instance of this class.
    """
    window_length = 0
    dtype = float64

    def __new__(cls, factor, method, mask):
        # The ranked factor becomes this term's single input; `method` is
        # passed through to `_init` and `static_identity`.
        return super(Rank, cls).__new__(
            cls,
            inputs=(factor,),
            method=method,
            mask=mask,
        )

    def _init(self, method, *args, **kwargs):
        # Store the ranking method, then let the parent classes initialize
        # with the remaining arguments.
        self._method = method
        return super(Rank, self)._init(*args, **kwargs)

    @classmethod
    def static_identity(cls, method, *args, **kwargs):
        # Extend the parent's identity with the ranking method so that ranks
        # of the same input under different methods have distinct identities.
        return (
            super(Rank, cls).static_identity(*args, **kwargs),
            method,
        )

    def _validate(self):
        """
        Verify that the stored rank method is valid.

        Raises
        ------
        UnknownRankMethod
            If the method is not one of _RANK_METHODS.
        """
        if self._method not in _RANK_METHODS:
            raise UnknownRankMethod(
                method=self._method,
                choices=set(_RANK_METHODS),
            )
        return super(Rank, self)._validate()

    def _compute(self, arrays, dates, assets, mask):
        """
        For each row in the input, compute a like-shaped array of per-row
        ranks.

        Cells that are masked out, and cells whose input value is already
        NaN, receive a rank of NaN.
        """
        inv_mask = ~mask
        # Work on a copy so the caller's input array is never modified.
        data = arrays[0].copy()
        data[inv_mask] = nan

        # NaN is the only value unequal to itself, so this marks the
        # masked-out cells (set to NaN just above) as well as any cell that
        # was already NaN in the input.  It is computed before ranking so it
        # reflects exactly the data being ranked.
        missing = data != data

        # OPTIMIZATION: Fast path the default case with our own specialized
        # Cython implementation.
        if self._method == 'ordinal':
            result = rankdata_2d_ordinal(data)
        else:
            # FUTURE OPTIMIZATION:
            # Write a less general "apply to rows" method that doesn't do all
            # the extra work that apply_along_axis does.
            # NOTE(review): newer scipy releases add a `nan_policy` argument
            # to rankdata; confirm the NaN handling assumed here against the
            # installed version.
            result = apply_along_axis(rankdata, 1, data, method=self._method)

        # rankdata can return an integer array for methods other than
        # 'average'; NaN cannot be stored in an integer array, so make sure
        # the result is float64 before re-applying missing values.
        if result.dtype != float64:
            result = result.astype(float64)

        # rankdata will sort nan values into last place, but we want our
        # nans to propagate, so explicitly re-apply them at every missing
        # location, not only at the masked-out ones.
        result[missing] = nan
        return result

    def __repr__(self):
        return "{type}({input_}, method='{method}', mask={mask})".format(
            type=type(self).__name__,
            input_=self.inputs[0],
            method=self._method,
            mask=self.mask,
        )
478
479
480
class CustomFactor(RequiredWindowLengthMixin, CustomTermMixin, Factor):
    '''
    Base class for user-defined Factors.

    Parameters
    ----------
    inputs : iterable, optional
        An iterable of `BoundColumn` instances (e.g. USEquityPricing.close),
        describing the data to load and pass to `self.compute`.  If this
        argument is not passed to the CustomFactor constructor, we look for a
        class-level attribute named `inputs`.
    window_length : int, optional
        Number of rows to pass for each input.  If this
        argument is not passed to the CustomFactor constructor, we look for a
        class-level attribute named `window_length`.

    Notes
    -----
    Users implementing their own Factors should subclass CustomFactor and
    implement a method named `compute` with the following signature:

    .. code-block:: python

        def compute(self, today, assets, out, *inputs):
           ...

    On each simulation date, ``compute`` will be called with the current date,
    an array of sids, an output array, and an input array for each expression
    passed as inputs to the CustomFactor constructor.

    The specific types of the values passed to `compute` are as follows::

        today : np.datetime64[ns]
            Row label for the last row of all arrays passed as `inputs`.
        assets : np.array[int64, ndim=1]
            Column labels for `out` and `inputs`.
        out : np.array[float64, ndim=1]
            Output array of the same shape as `assets`.  `compute` should write
            its desired return values into `out`.
        *inputs : tuple of np.array
            Raw data arrays corresponding to the values of `self.inputs`.

    ``compute`` functions should expect to be passed NaN values for dates on
    which no data was available for an asset.  This may include dates on which
    an asset did not yet exist.

    For example, if a CustomFactor requires 10 rows of close price data, and
    asset A started trading on Monday June 2nd, 2014, then on Tuesday, June
    3rd, 2014, the column of input data for asset A will have 9 leading NaNs
    for the preceding days on which data was not yet available.

    Examples
    --------

    A CustomFactor with pre-declared defaults:

    .. code-block:: python

        class TenDayRange(CustomFactor):
            """
            Computes the difference between the highest high in the last 10
            days and the lowest low.

            Pre-declares high and low as default inputs and `window_length` as
            10.
            """

            inputs = [USEquityPricing.high, USEquityPricing.low]
            window_length = 10

            def compute(self, today, assets, out, highs, lows):
                from numpy import nanmin, nanmax

                highest_highs = nanmax(highs, axis=0)
                lowest_lows = nanmin(lows, axis=0)
                out[:] = highest_highs - lowest_lows


        # Doesn't require passing inputs or window_length because they're
        # pre-declared as defaults for the TenDayRange class.
        ten_day_range = TenDayRange()

    A CustomFactor without defaults:

    .. code-block:: python

        class MedianValue(CustomFactor):
            """
            Computes the median value of an arbitrary single input over an
            arbitrary window.

            Does not declare any defaults, so values for `window_length` and
            `inputs` must be passed explicitly on every construction.
            """

            def compute(self, today, assets, out, data):
                from numpy import nanmedian
                out[:] = nanmedian(data, axis=0)

        # Values for `inputs` and `window_length` must be passed explicitly to
        # MedianValue.
        median_close10 = MedianValue([USEquityPricing.close], window_length=10)
        median_low15 = MedianValue([USEquityPricing.low], window_length=15)
    '''
    # NOTE(review): presumably a no-op context manager used as a default by
    # CustomTermMixin around calls to `compute` -- confirm against the mixin.
    ctx = nullctx()

    def _validate(self):
        """
        Reject any dtype other than float64, then run the parent validation.

        Raises
        ------
        UnsupportedDataType
            If `self.dtype` is not float64.
        """
        if self.dtype != float64:
            raise UnsupportedDataType(dtype=self.dtype)
        return super(CustomFactor, self)._validate()
590