Completed
Pull Request — master (#905)
by
unknown
01:32
created

zipline.pipeline.factors.function_application()   B

Complexity

Conditions 4

Size

Total Lines 23

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 4
dl 0
loc 23
rs 8.7972

1 Method

Rating   Name   Duplication   Size   Complexity  
A zipline.pipeline.factors.mathfunc() 0 13 2
1
"""
2
factor.py
3
"""
4
from operator import attrgetter
5
from numbers import Number
6
7
from numpy import float64, inf
8
from toolz import curry
9
10
from zipline.errors import (
11
    UnknownRankMethod,
12
    UnsupportedDataType,
13
)
14
from zipline.lib.rank import masked_rankdata_2d
15
from zipline.pipeline.term import (
16
    CustomTermMixin,
17
    NotSpecified,
18
    RequiredWindowLengthMixin,
19
    SingleInputMixin,
20
    CompositeTerm,
21
)
22
from zipline.pipeline.expression import (
23
    BadBinaryOperator,
24
    COMPARISONS,
25
    is_comparison,
26
    MATH_BINOPS,
27
    method_name_for_op,
28
    NumericalExpression,
29
    NUMEXPR_MATH_FUNCS,
30
    UNARY_OPS,
31
    unary_op_name,
32
)
33
from zipline.pipeline.filters import (
34
    NumExprFilter,
35
    PercentileFilter,
36
)
37
from zipline.utils.control_flow import nullctx
38
from zipline.utils.numpy_utils import (
39
    bool_dtype,
40
    datetime64ns_dtype,
41
    float64_dtype,
42
)
43
from zipline.utils.preprocess import preprocess
44
45
46
# Names of the tie-handling methods accepted by ``Rank``.  ``Rank._validate``
# raises ``UnknownRankMethod`` for any method not in this set.
_RANK_METHODS = frozenset(['average', 'min', 'max', 'dense', 'ordinal'])
47
48
49
def numbers_to_float64(func, argname, argvalue):
    """
    Preprocessor that coerces numeric arguments to ``numpy.float64``.

    It is applied to the ``other`` argument of Factor's binary operator
    methods so that ``2 + Factor()`` and ``2.0 + Factor()`` behave the same.
    Any value that is not a ``numbers.Number`` is handed back untouched.

    ``func`` and ``argname`` are accepted but not used.
    """
    return float64(argvalue) if isinstance(argvalue, Number) else argvalue
59
60
61
@curry
def set_attribute(name, value):
    """
    Build a decorator that stores ``value`` under attribute ``name`` on the
    function it decorates.

    The decorated function itself is returned, so its behavior is unchanged.
    Because this factory is curried, supplying only ``name`` yields a
    one-argument factory that still expects ``value``.

    Usage
    -----
    >>> @set_attribute('__name__', 'foo')
    ... def bar():
    ...     return 3
    ...
    >>> bar()
    3
    >>> bar.__name__
    'foo'
    """
    def apply_attribute(wrapped):
        # Mutate in place and hand back the very same function object.
        setattr(wrapped, name, value)
        return wrapped

    return apply_attribute
83
84
85
# Decorators for setting the __name__ and __doc__ properties of a decorated
86
# function.
87
# Example:
88
with_name = set_attribute('__name__')
89
with_doc = set_attribute('__doc__')
90
91
92
def binop_return_type(op):
    """
    Choose the class used to wrap the result of binary operator ``op``.

    Comparison operators produce a ``NumExprFilter``; every other operator
    produces a ``NumExprFactor``.
    """
    return NumExprFilter if is_comparison(op) else NumExprFactor
97
98
99
def binop_return_dtype(op, left, right):
    """
    Determine the dtype produced by applying a binary operator.

    Parameters
    ----------
    op : str
        Operator symbol, (e.g. '+', '-', ...).
    left : numpy.dtype
        Dtype of left hand side.
    right : numpy.dtype
        Dtype of right hand side.

    Returns
    -------
    outdtype : numpy.dtype
        The dtype of the result of `left <op> right`: ``bool_dtype`` for
        comparisons, ``float64_dtype`` for everything else.

    Raises
    ------
    TypeError
        If ``op`` is a comparison and the operand dtypes differ, or if ``op``
        is not a comparison and either operand is not ``float64``.
    """
    if is_comparison(op):
        # Comparisons only require that both sides share a dtype.
        if left != right:
            message = (
                "Don't know how to compute {left} {op} {right}.\n"
                "Comparisons are only supported between Factors of equal "
                "dtypes."
            ).format(left=left, op=op, right=right)
            raise TypeError(message)
        return bool_dtype

    # Everything else is arithmetic, which is float64-only.
    if left != float64_dtype or right != float64_dtype:
        message = (
            "Don't know how to compute {left} {op} {right}.\n"
            "Arithmetic operators are only supported on Factors of "
            "dtype 'float64'."
        ).format(left=left.name, op=op, right=right.name)
        raise TypeError(message)

    return float64_dtype
137
138
139
def binary_operator(op):
    """
    Build the method implementing binary operator ``op`` on a Factor.

    The returned function takes ``(self, other)`` and is suitable for use as
    a method such as ``__add__``.  Its ``__name__`` is the method name for
    ``op`` and numeric ``other`` values are converted to ``float64`` before
    the body runs.
    """
    # If the right-hand operand is a NumExprFactor we delegate to its
    # reflected (commuted) method; this getter fetches that method by name.
    reflected_getter = attrgetter(method_name_for_op(op, commute=True))

    @preprocess(other=numbers_to_float64)
    @with_doc("Binary Operator: '%s'" % op)
    @with_name(method_name_for_op(op))
    def binary_operator(self, other):
        # Looked up per call: the classes binop_return_type hands back do not
        # exist yet when this factory runs inside Factor's class body.
        return_type = binop_return_type(op)

        if isinstance(self, NumExprFactor):
            # Splice ``other`` into our existing expression.
            self_expr, other_expr, new_inputs = self.build_binary_op(
                op, other,
            )
            combined = "({left}) {op} ({right})".format(
                left=self_expr,
                op=op,
                right=other_expr,
            )
            return return_type(
                combined,
                new_inputs,
                dtype=binop_return_dtype(op, self.dtype, other.dtype),
            )

        if isinstance(other, NumExprFactor):
            # The expression on the right knows how to merge inputs, so call
            # its reflected operator with ourselves as the argument.
            return reflected_getter(other)(self)

        if isinstance(other, Factor):
            # A factor combined with itself needs only a single input slot.
            if self is other:
                template, binds = "x_0 {op} x_0", (self,)
            else:
                template, binds = "x_0 {op} x_1", (self, other)
            return return_type(
                template.format(op=op),
                binds,
                dtype=binop_return_dtype(op, self.dtype, other.dtype),
            )

        if isinstance(other, Number):
            # ``other`` was already converted by the preprocessor, so it
            # carries a ``dtype`` attribute.
            return return_type(
                "x_0 {op} ({constant})".format(op=op, constant=other),
                binds=(self,),
                dtype=binop_return_dtype(op, self.dtype, other.dtype)
            )

        raise BadBinaryOperator(op, self, other)

    return binary_operator
199
200
201
def reflected_binary_operator(op):
    """
    Build the method implementing the reflected form of ``op`` on a Factor.

    The returned function takes ``(self, other)`` and is suitable for use as
    a method such as ``__radd__``: ``other`` is the left operand and ``self``
    the right.  Comparison operators are not accepted.
    """
    assert not is_comparison(op)

    @preprocess(other=numbers_to_float64)
    @with_name(method_name_for_op(op, commute=True))
    def reflected_binary_operator(self, other):
        if isinstance(self, NumericalExpression):
            self_expr, other_expr, new_inputs = self.build_binary_op(
                op, other
            )
            # Operands are swapped relative to the forward operator.
            combined = "({left}) {op} ({right})".format(
                left=other_expr,
                right=self_expr,
                op=op,
            )
            return NumExprFactor(
                combined,
                new_inputs,
                dtype=binop_return_dtype(op, other.dtype, self.dtype)
            )

        # Numbers are the only remaining case: for any other valid left
        # operand, that operand's own forward method is used instead.
        if isinstance(other, Number):
            return NumExprFactor(
                "{constant} {op} x_0".format(op=op, constant=other),
                binds=(self,),
                dtype=binop_return_dtype(op, other.dtype, self.dtype),
            )

        raise BadBinaryOperator(op, other, self)

    return reflected_binary_operator
238
239
240
def unary_operator(op):
    """
    Build the method implementing unary operator ``op`` on a Factor.

    Raises
    ------
    ValueError
        If ``op`` is not a supported unary operator.  Negation (``'-'``) is
        the only one currently supported.
    """
    if op not in {'-'}:
        raise ValueError("Invalid unary operator %s." % op)

    @with_doc("Unary Operator: '%s'" % op)
    @with_name(unary_op_name(op))
    def unary_operator(self):
        # Unary operators are defined for float64 factors only.
        if self.dtype != float64_dtype:
            message = (
                "Can't apply unary operator {op!r} to instance of "
                "{typename!r} with dtype {dtypename!r}.\n"
                "{op!r} is only supported for Factors of dtype "
                "'float64'."
            ).format(
                op=op,
                typename=type(self).__name__,
                dtypename=self.dtype.name,
            )
            raise TypeError(message)

        if isinstance(self, NumericalExpression):
            # Wrap the existing expression and reuse its inputs.
            expr = "{op}({expr})".format(op=op, expr=self._expr)
            inputs = self.inputs
        else:
            # Any other factor becomes the sole input of a new expression.
            expr = "{op}x_0".format(op=op)
            inputs = (self,)

        return NumExprFactor(expr, inputs, dtype=float64_dtype)

    return unary_operator
280
281
282
def function_application(func):
    """
    Build a Factor method that applies the numexpr math function ``func``.

    Raises
    ------
    ValueError
        If ``func`` is not one of ``NUMEXPR_MATH_FUNCS``.
    """
    if func not in NUMEXPR_MATH_FUNCS:
        raise ValueError("Unsupported mathematical function '%s'" % func)

    @with_name(func)
    def mathfunc(self):
        if isinstance(self, NumericalExpression):
            # Wrap the existing expression and reuse its inputs.
            expr = "{func}({expr})".format(func=func, expr=self._expr)
            inputs = self.inputs
        else:
            # Any other factor becomes the sole input of a new expression.
            expr = "{func}(x_0)".format(func=func)
            inputs = (self,)

        return NumExprFactor(expr, inputs, dtype=float64_dtype)

    return mathfunc
305
306
307
# Dtypes a Factor is allowed to have.  ``Factor._validate`` raises
# ``UnsupportedDataType`` for any dtype outside this set.
FACTOR_DTYPES = frozenset([datetime64ns_dtype, float64_dtype])
308
309
310
class Factor(CompositeTerm):
    """
    Pipeline API expression producing numerically-valued outputs.

    Notes
    -----
    The arithmetic and comparison operator methods, the unary operator
    methods, and one method per name in ``NUMEXPR_MATH_FUNCS`` are generated
    in the class body below rather than written out by hand.  ``==`` is not
    overloaded; use the ``eq`` method instead.
    """
    # Dynamically add functions for creating NumExprFactor/NumExprFilter
    # instances.
    # NOTE: this relies on ``locals()`` in a class body being the namespace
    # the class is built from, so entries added here become class attributes.
    clsdict = locals()
    clsdict.update(
        {
            method_name_for_op(op): binary_operator(op)
            # Don't override __eq__ because it breaks comparisons on tuples of
            # Factors.
            for op in MATH_BINOPS.union(COMPARISONS - {'=='})
        }
    )
    # Reflected variants exist for the math operators only;
    # reflected_binary_operator rejects comparison operators.
    clsdict.update(
        {
            method_name_for_op(op, commute=True): reflected_binary_operator(op)
            for op in MATH_BINOPS
        }
    )
    clsdict.update(
        {
            unary_op_name(op): unary_operator(op)
            for op in UNARY_OPS
        }
    )

    # One method per supported numexpr math function, named after it.
    clsdict.update(
        {
            funcname: function_application(funcname)
            for funcname in NUMEXPR_MATH_FUNCS
        }
    )

    # Expose the generated division methods under the true-division names
    # as well.
    __truediv__ = clsdict['__div__']
    __rtruediv__ = clsdict['__rdiv__']

    # Method-style equality comparison, provided because __eq__ is
    # deliberately not overridden above.
    eq = binary_operator('==')

    def _validate(self):
        """
        Run superclass validation, then check that ``self.dtype`` is one of
        ``FACTOR_DTYPES``.

        Raises
        ------
        UnsupportedDataType
            If ``self.dtype`` is not in ``FACTOR_DTYPES``.
        """
        # Do superclass validation first so that `NotSpecified` dtypes get
        # handled.
        retval = super(Factor, self)._validate()
        if self.dtype not in FACTOR_DTYPES:
            raise UnsupportedDataType(
                typename=type(self).__name__,
                dtype=self.dtype
            )
        return retval

    def rank(self, method='ordinal', ascending=True, mask=NotSpecified):
        """
        Construct a new Factor representing the sorted rank of each column
        within each row.

        Parameters
        ----------
        method : str, {'ordinal', 'min', 'max', 'dense', 'average'}
            The method used to assign ranks to tied elements. See
            `scipy.stats.rankdata` for a full description of the semantics for
            each ranking method. Default is 'ordinal'.
        ascending : bool, optional
            Whether to return sorted rank in ascending or descending order.
            Default is True.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, ranks are computed ignoring any asset/date
            pairs for which `mask` produces a value of False.

        Returns
        -------
        ranks : zipline.pipeline.factors.Rank
            A new factor that will compute the ranking of the data produced by
            `self`.

        Notes
        -----
        The default value for `method` is different from the default for
        `scipy.stats.rankdata`.  See that function's documentation for a full
        description of the valid inputs to `method`.

        Missing or non-existent data on a given day will cause an asset to be
        given a rank of NaN for that day.

        See Also
        --------
        scipy.stats.rankdata
        zipline.lib.rank
        zipline.pipeline.factors.Rank
        """
        return Rank(self, method=method, ascending=ascending, mask=mask)

    def top(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the top N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, top values are computed ignoring any
            asset/date pairs for which `mask` produces a value of False.

        Returns
        -------
        filter : zipline.pipeline.filters.Filter
        """
        # Rank in descending order so the largest values get ranks 1..N.
        return self.rank(ascending=False, mask=mask) <= N

    def bottom(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the bottom N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, bottom values are computed ignoring any
            asset/date pairs for which `mask` produces a value of False.

        Returns
        -------
        filter : zipline.pipeline.filters.Filter
        """
        # Rank in ascending order so the smallest values get ranks 1..N.
        return self.rank(ascending=True, mask=mask) <= N

    def percentile_between(self,
                           min_percentile,
                           max_percentile,
                           mask=NotSpecified):
        """
        Construct a new Filter representing entries from the output of this
        Factor that fall within the percentile range defined by min_percentile
        and max_percentile.

        Parameters
        ----------
        min_percentile : float [0.0, 100.0]
            Return True for assets falling above this percentile in the data.
        max_percentile : float [0.0, 100.0]
            Return True for assets falling below this percentile in the data.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing percentile
            thresholds.  If mask is supplied, percentile cutoffs are computed
            each day using only assets for which `mask` returns True, and
            assets not passing `mask` will produce False in the output of this
            filter as well.

        Returns
        -------
        out : zipline.pipeline.filters.PercentileFilter
            A new filter that will compute the specified percentile-range mask.

        See Also
        --------
        zipline.pipeline.filters.PercentileFilter
        """
        return PercentileFilter(
            self,
            min_percentile=min_percentile,
            max_percentile=max_percentile,
            mask=mask,
        )

    def isnan(self):
        """
        A Filter producing True for all values where this Factor is NaN.

        Returns
        -------
        nanfilter : zipline.pipeline.filters.Filter
        """
        # NaN is the only value that compares unequal to itself.
        return self != self

    def notnan(self):
        """
        A Filter producing True for values where this Factor is not NaN.

        Returns
        -------
        nanfilter : zipline.pipeline.filters.Filter
        """
        return ~self.isnan()

    def isfinite(self):
        """
        A Filter producing True for values where this Factor is anything but
        NaN, inf, or -inf.

        Returns
        -------
        finitefilter : zipline.pipeline.filters.Filter
        """
        # Comparisons against NaN are False, so NaN fails both bounds checks.
        return (-inf < self) & (self < inf)
505
506
507
class NumExprFactor(NumericalExpression, Factor):
    """
    Factor computed from a numexpr expression.

    Parameters
    ----------
    expr : string
       A string suitable for passing to numexpr.  All variables in 'expr'
       should be of the form "x_i", where i is the index of the corresponding
       factor input in 'binds'.
    binds : tuple
       A tuple of factors to use as inputs.
    dtype : numpy.dtype
       Dtype of the expression's output.  Every construction in this module
       passes it by keyword.

    Notes
    -----
    NumExprFactors are constructed by numerical operators like `+` and `-`.
    Users should rarely need to construct a NumExprFactor directly.
    """
    pass
526
527
528
class Rank(SingleInputMixin, Factor):
    """
    A Factor representing the row-wise rank data of another Factor.

    Parameters
    ----------
    factor : zipline.pipeline.factors.Factor
        The factor on which to compute ranks.
    method : str, {'average', 'min', 'max', 'dense', 'ordinal'}
        The method used to assign ranks to tied elements.  See
        `scipy.stats.rankdata` for a full description of the semantics for each
        ranking method.
    ascending : bool
        Whether to compute ranks in ascending or descending order.
    mask : zipline.pipeline.Filter
        A Filter representing assets to consider when computing ranks.

    See Also
    --------
    scipy.stats.rankdata : Underlying ranking algorithm.
    zipline.pipeline.factors.Factor.rank : Method-style interface to same
        functionality.

    Notes
    -----
    Most users should call Factor.rank rather than directly construct an
    instance of this class.
    """
    window_length = 0
    dtype = float64_dtype

    def __new__(cls, factor, method, ascending, mask):
        """
        Create a Rank of ``factor``, forwarding ``method``, ``ascending`` and
        ``mask`` to the superclass constructor with ``factor`` as the sole
        input.
        """
        return super(Rank, cls).__new__(
            cls,
            inputs=(factor,),
            method=method,
            ascending=ascending,
            mask=mask,
        )

    def _init(self, method, ascending, *args, **kwargs):
        """
        Store the rank-specific parameters on the instance, then delegate the
        remaining arguments to the superclass ``_init``.
        """
        self._method = method
        self._ascending = ascending
        return super(Rank, self)._init(*args, **kwargs)

    @classmethod
    def static_identity(cls, method, ascending, *args, **kwargs):
        """
        Extend the superclass identity with ``method`` and ``ascending``, so
        that ranks differing only in those parameters get different
        identities.
        """
        return (
            super(Rank, cls).static_identity(*args, **kwargs),
            method,
            ascending,
        )

    def _validate(self):
        """
        Verify that the stored rank method is valid.

        Raises
        ------
        UnknownRankMethod
            If the stored method is not one of ``_RANK_METHODS``.
        """
        if self._method not in _RANK_METHODS:
            raise UnknownRankMethod(
                method=self._method,
                choices=set(_RANK_METHODS),
            )
        return super(Rank, self)._validate()

    def _compute(self, arrays, dates, assets, mask):
        """
        For each row in the input, compute a like-shaped array of per-row
        ranks.

        Only ``arrays[0]`` and ``mask`` are used; ``dates`` and ``assets`` are
        accepted but ignored.
        """
        return masked_rankdata_2d(
            arrays[0],
            mask,
            self.inputs[0].missing_value,
            self._method,
            self._ascending,
        )

    def __repr__(self):
        # NOTE: ``ascending`` is not included in the repr.
        return "{type}({input_}, method='{method}', mask={mask})".format(
            type=type(self).__name__,
            input_=self.inputs[0],
            method=self._method,
            mask=self.mask,
        )
607
608
609
class CustomFactor(RequiredWindowLengthMixin, CustomTermMixin, Factor):
    '''
    Base class for user-defined Factors.

    Parameters
    ----------
    inputs : iterable, optional
        An iterable of `BoundColumn` instances (e.g. USEquityPricing.close),
        describing the data to load and pass to `self.compute`.  If this
        argument is not passed to the CustomFactor constructor, we look for a
        class-level attribute named `inputs`.
    window_length : int, optional
        Number of rows to pass for each input.  If this
        argument is not passed to the CustomFactor constructor, we look for a
        class-level attribute named `window_length`.

    Notes
    -----
    Users implementing their own Factors should subclass CustomFactor and
    implement a method named `compute` with the following signature:

    .. code-block:: python

        def compute(self, today, assets, out, *inputs):
           ...

    On each simulation date, ``compute`` will be called with the current date,
    an array of sids, an output array, and an input array for each expression
    passed as inputs to the CustomFactor constructor.

    The specific types of the values passed to `compute` are as follows::

        today : np.datetime64[ns]
            Row label for the last row of all arrays passed as `inputs`.
        assets : np.array[int64, ndim=1]
            Column labels for `out` and `inputs`.
        out : np.array[self.dtype, ndim=1]
            Output array of the same shape as `assets`.  `compute` should write
            its desired return values into `out`.
        *inputs : tuple of np.array
            Raw data arrays corresponding to the values of `self.inputs`.

    ``compute`` functions should expect to be passed NaN values for dates on
    which no data was available for an asset.  This may include dates on which
    an asset did not yet exist.

    For example, if a CustomFactor requires 10 rows of close price data, and
    asset A started trading on Monday June 2nd, 2014, then on Tuesday, June
    3rd, 2014, the column of input data for asset A will have 9 leading NaNs
    for the preceding days on which data was not yet available.

    Examples
    --------

    A CustomFactor with pre-declared defaults:

    .. code-block:: python

        class TenDayRange(CustomFactor):
            """
            Computes the difference between the highest high in the last 10
            days and the lowest low.

            Pre-declares high and low as default inputs and `window_length` as
            10.
            """

            inputs = [USEquityPricing.high, USEquityPricing.low]
            window_length = 10

            def compute(self, today, assets, out, highs, lows):
                from numpy import nanmin, nanmax

                highest_highs = nanmax(highs, axis=0)
                lowest_lows = nanmin(lows, axis=0)
                out[:] = highest_highs - lowest_lows


        # Doesn't require passing inputs or window_length because they're
        # pre-declared as defaults for the TenDayRange class.
        ten_day_range = TenDayRange()

    A CustomFactor without defaults:

    .. code-block:: python

        class MedianValue(CustomFactor):
            """
            Computes the median value of an arbitrary single input over an
            arbitrary window.

            Does not declare any defaults, so values for `window_length` and
            `inputs` must be passed explicitly on every construction.
            """

            def compute(self, today, assets, out, data):
                from numpy import nanmedian
                out[:] = nanmedian(data, axis=0)

        # Values for `inputs` and `window_length` must be passed explicitly to
        # MedianValue.
        median_close10 = MedianValue([USEquityPricing.close], window_length=10)
        median_low15 = MedianValue([USEquityPricing.low], window_length=15)
    '''
    # Default output dtype for user-defined factors.
    dtype = float64_dtype
    # NOTE(review): a do-nothing context by default; how ``ctx`` is consumed
    # is not visible in this module -- presumably by CustomTermMixin, confirm.
    ctx = nullctx()
715