Completed
Push — master ( a95b57...038136 )
by Alexandre M.
54s
created

hansel.list_subpaths()   B

Complexity

Conditions 6

Size

Total Lines 40

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 6
dl 0
loc 40
rs 7.5384
1
# -*- coding: utf-8 -*-
# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
"""
Utilities to make crumbs
"""
7
import fnmatch
import operator
import os
import os.path as op
import re
from copy import deepcopy
from functools import partial, reduce
from itertools import product

try:
    # `Mapping` lives in `collections.abc`; the alias in `collections`
    # was removed in Python 3.10.
    from collections.abc import Mapping
except ImportError:  # pragma: no cover - Python 2 fallback
    from collections import Mapping

from ._utils import _check_is_subset
18
19
20
def rm_dups(lst):
    """ Return the distinct elements of `lst` in ascending order.

    Parameters
    ----------
    lst: sequence of any
        The elements are put in a `set` and then sorted, so they must be
        hashable and comparable with each other.

    Returns
    -------
    fslst: list
        A new sorted list holding each distinct element of `lst` once.
    """
    unique = list(set(lst))
    unique.sort()
    return unique
33
34
35
def remove_ignored(ignore, strs):
    """ Return the items of `strs` that match none of the glob patterns in `ignore`.

    Parameters
    ----------
    ignore: sequence of str or None
        `fnmatch` (glob) patterns. An item matching any of them is dropped.
        If empty or None, nothing is dropped.

    strs: iterable of str
        The items to be filtered. It is not modified.

    Returns
    -------
    kept: list of str
        A new list with the items of `strs` that match no pattern, in their
        original order.
    """
    # Materialise the patterns once so that a one-shot iterable of patterns
    # can be checked against every item.
    patterns = tuple(ignore) if ignore else ()
    return [item for item in strs
            if not any(fnmatch.fnmatch(item, pat) for pat in patterns)]
43
44
45
def fnmatch_filter(pattern, items, *args):
    """ Return the items from `items` that match the glob expression in `pattern`.

    Parameters
    ----------
    pattern: str
        Shell-style wildcard (glob) pattern, as understood by `fnmatch.fnmatch`.

    items: list of str
        The items to be checked

    args: ignored
        Accepted only so this function has the same call signature as the
        other filter functions.

    Returns
    -------
    matches: list of str
        Matched items, in the order they appear in `items`.
    """
    def _matches(candidate):
        return fnmatch.fnmatch(candidate, pattern)

    return list(filter(_matches, items))
63
64
65
def regex_match_filter(pattern, items, *args):
    """ Return the items from `items` that match the regular expression in `pattern`.

    The expression is applied with `match`, i.e., it is anchored at the
    beginning of each item.

    Parameters
    ----------
    pattern: str
        Regular expression

    items: list of str
        The items to be checked

    args: re.compile arguments
        Extra positional arguments forwarded to `re.compile` (e.g., flags).

    Returns
    -------
    matches: list of str
        Matched items, in the order they appear in `items`.
    """
    matcher = re.compile(pattern, *args).match
    return list(filter(matcher, items))
84
85
86
def list_children(path, just_dirs=False):
    """ Return the immediate elements (files and folders) in `path`.

    Parameters
    ----------
    path: str
        An existing file or folder path.

    just_dirs: bool
        If True will return only folders.

    Returns
    -------
    paths: list of str
        For a folder: the names (not full paths) of its entries, as given by
        `os.listdir`, restricted to folders if `just_dirs` is True.
        For a file: `[path]`, or an empty list if `just_dirs` is True.

    Raises
    ------
    IOError
        If `path` does not exist.
    """
    if not op.exists(path):
        raise IOError("Expected an existing path, but could not"
                      " find {}.".format(path))

    if op.isfile(path):
        # a file has nothing below it: it is its own only "child"
        return [] if just_dirs else [path]

    entries = os.listdir(path)
    if not just_dirs:
        return entries

    return [entry for entry in entries if op.isdir(op.join(path, entry))]
121
122
123
def list_subpaths(path, just_dirs=False, ignore=None, pattern=None,
                  filter_func=fnmatch_filter, filter_args=None):
    """ Return the immediate elements (files and folders) within `path`,
    optionally dropping ignored items and keeping only those matching `pattern`.

    Parameters
    ----------
    path: str

    just_dirs: bool
        If True will return only folders.

    ignore: sequence of str
        Sequence of glob patterns to ignore from the listing.
        Skipped if empty or None.

    pattern: str
        Expression that the result items must match. It is interpreted by
        `filter_func`. Skipped if empty or None.

    filter_func: func
        The function to match the patterns.
        Must have as arguments: (pattern, paths) and return
        a subset of paths.

    filter_args: filter func arguments
        Extra positional arguments for the filter function.

    Returns
    -------
    paths: list of str
    """
    paths = list_children(path, just_dirs=just_dirs)

    if ignore:
        paths = remove_ignored(ignore, paths)

    if not pattern:
        return paths

    extra_args = () if filter_args is None else filter_args
    return filter_func(pattern, paths, *extra_args)
163
164
165
def list_intersection(list1, list2):
    """ Return a list of elements that are the intersection between the set of elements
    of `list1` and `list2`.
    This will keep the same order of the elements in `list1`.

    Parameters
    ----------
    list1: iterable of any
        Determines the order (and multiplicity) of the result.

    list2: container of any
        Checked with the `in` operator for each element of `list1`.

    Returns
    -------
    common: list
        The elements of `list1` that are also in `list2`.
    """
    # Build a real list: a generator would always be truthy and could only
    # be consumed once, which contradicts the documented return type.
    return [item for item in list1 if item in list2]
171
172
173
def _intersect_crumb_args(crumb1, crumb2):
    """ Return the `arg_names` that are the intersection between the arguments
    of `crumb1` and `crumb2`, as computed by `list_intersection`.
    This will keep the same order as the arguments are in `all_args` function from `crumb1`.
    """
    args_of_first = crumb1.all_args()
    args_of_second = crumb2.all_args()
    return list_intersection(args_of_first, args_of_second)
179
180
181
def _get_matching_items(list1, list2, items=None):
    """ If `items` is None, return the items that are in both
    `list1` and `list2`. Otherwise return `items` itself, provided
    `_check_is_subset` accepts it against both lists.
    Keep the order in `list1` or in `items`.

    Returns
    -------
    arg_names:
        The matching items: the result of `list_intersection` when `items` is
        None, `items` when both subset checks pass, or an empty list when
        a subset check raises `KeyError`.

    Raises
    ------
    Any exception other than `KeyError` raised by `_check_is_subset` is
    propagated unchanged.
    NOTE(review): presumably a ValueError when an element of `items` is missing
    from `list1` or `list2` -- confirm against `_check_is_subset`.
    """
    if items is None:
        return list_intersection(list1, list2)

    try:
        _check_is_subset(items, list1)
        _check_is_subset(items, list2)
    except KeyError:
        return []

    return items
211
212
213
def joint_value_map(crumb, arg_names, check_exists=True):
    """ Return the crumb argument values of the given `arg_names` as (arg_name, arg_value) tuples.

    Parameters
    ----------
    crumb: hansel.Crumb
        Indexed with each argument name to obtain its values. When
        `check_exists` is True, also used through `replace` and `exists`.

    arg_names: sequence of str
        Names of the crumb arguments to be mapped.

    check_exists: bool
        Only used if there is more than one name in `arg_names`.
        If True will return only the combinations of argument values that fill a crumb to an existing path.
        Otherwise it won't check if they exist and returns one list of 2-tuples per argument name.

    Returns
    -------
    values_map: list
        I call values_map what is called `record` in pandas. Each 2-tuple has the shape (arg_name, arg_value).
        - one name in `arg_names`: list of 2-tuples;
        - `check_exists` is False: list with one list of 2-tuples per argument name;
        - `check_exists` is True: list of tuples of 2-tuples, one per existing combination, in no particular order.
    """
    pairs_per_arg = [[(name, value) for value in crumb[name]]
                     for name in arg_names]

    if len(arg_names) == 1:
        return pairs_per_arg[0]

    if not check_exists:
        return list(pairs_per_arg)

    # fill a crumb for every distinct combination before checking any of them
    candidates = set(product(*pairs_per_arg))
    filled = [(combo, crumb.replace(**dict(combo))) for combo in candidates]
    return [combo for combo, filled_crumb in filled if filled_crumb.exists()]
243
244
245
def intersection(crumb1, crumb2, on=None):
    """ Return an 'inner join' of both given Crumbs, i.e., the argument value
    maps (see `joint_value_map`) that are common to both crumbs.

    If `on` is None, will use all the common arguments names of both crumbs.
    Otherwise will use only the elements of `on`. All its items must be in both crumbs.

    Parameters
    ----------
    crumb1: hansel.Crumb

    crumb2: hansel.Crumb

    on: list of str
        Crumb argument names common to both input crumbs.

    Returns
    -------
    inner_join: list
        Sorted list of the entries of `joint_value_map` (computed with
        `check_exists=True`) that are present for both crumbs.

    Raises
    ------
    KeyError:
        If no matching argument names are found between both crumbs.

    Any exception raised by `_get_matching_items` for `on` is propagated.

    Notes
    -----
    Use with care, ideally the argument matches should be in the same order in both crumbs.

    # TODO: this function can still be more efficient.
    """
    args1 = list(crumb1.all_args())
    args2 = list(crumb2.all_args())
    arg_names = list(_get_matching_items(args1, args2, items=on))

    if not arg_names:
        raise KeyError("Could not find matching arguments between "
                       "{} and  {} limited by {}.".format(args1, args2, on))

    maps1 = set(joint_value_map(crumb1, arg_names, check_exists=True))
    maps2 = set(joint_value_map(crumb2, arg_names, check_exists=True))

    return sorted(maps1 & maps2)
299
300
301
class ParameterGrid(object):
    """
    Picked from sklearn: https://github.com/scikit-learn/scikit-learn

    Grid of parameters with a discrete number of values for each.
    Can be used to iterate over parameter value combinations with the
    Python built-in function iter.

    Parameters
    ----------
    param_grid : dict of string to sequence, or sequence of such
        The parameter grid to explore, as a dictionary mapping
        parameters to sequences of allowed values.
        An empty dict signifies default parameters.
        A sequence of dicts signifies a sequence of grids to search, and is
        useful to avoid exploring parameter combinations that make no sense
        or have no effect. See the examples below.

    Examples
    --------
    >>> param_grid = {'a': [1, 2], 'b': [True, False]}
    >>> list(ParameterGrid(param_grid)) == (
    ...    [{'a': 1, 'b': True}, {'a': 1, 'b': False},
    ...     {'a': 2, 'b': True}, {'a': 2, 'b': False}])
    True
    >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
    >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},
    ...                               {'kernel': 'rbf', 'gamma': 1},
    ...                               {'kernel': 'rbf', 'gamma': 10}]
    True
    >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}
    True
    >>> ParameterGrid(grid)[-1] == {'kernel': 'rbf', 'gamma': 10}
    True
    """

    def __init__(self, param_grid):
        if isinstance(param_grid, Mapping):
            # wrap dictionary in a singleton list to support either dict
            # or list of dicts
            param_grid = [param_grid]
        self.param_grid = param_grid

    def __iter__(self):
        """Iterate over the points in the grid.

        Returns
        -------
        params : iterator over dict of string to any
            Yields dictionaries mapping each parameter to one of its
            allowed values.
        """
        for p in self.param_grid:
            # Always sort the keys of a dictionary, for reproducibility
            items = sorted(p.items())
            if not items:
                yield {}
            else:
                keys, values = zip(*items)
                for v in product(*values):
                    yield dict(zip(keys, v))

    def __len__(self):
        """Number of points on the grid."""
        # An empty sub-grid stands for a single point (the default parameters).
        return sum(reduce(operator.mul, (len(v) for v in p.values()), 1) if p else 1
                   for p in self.param_grid)

    def __getitem__(self, ind):
        """Get the parameters that would be ``ind``th in iteration

        Parameters
        ----------
        ind : int
            The iteration index. Negative values count from the end, as for
            a list.

        Returns
        -------
        params : dict of string to any
            Equal to list(self)[ind]

        Raises
        ------
        IndexError
            If `ind` is outside the range of the grid.
        """
        if ind < 0:
            # Translate to the equivalent non-negative index; without this the
            # divmod arithmetic below silently picks a wrong point.
            ind += len(self)
            if ind < 0:
                raise IndexError('ParameterGrid index out of range')

        # This is used to make discrete sampling without replacement memory
        # efficient.
        for sub_grid in self.param_grid:
            # XXX: could memoize information used here
            if not sub_grid:
                if ind == 0:
                    return {}
                else:
                    ind -= 1
                    continue

            # Reverse so most frequent cycling parameter comes first
            keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
            sizes = [len(v_list) for v_list in values_lists]
            total = reduce(operator.mul, sizes, 1)

            if ind >= total:
                # Try the next grid
                ind -= total
            else:
                out = {}
                # Decompose `ind` as a mixed-radix number, one digit per key.
                for key, v_list, n in zip(keys, values_lists, sizes):
                    ind, offset = divmod(ind, n)
                    out[key] = v_list[offset]
                return out

        raise IndexError('ParameterGrid index out of range')
408