hansel.list_subpaths() - Code Metrics - Inspection of "Merge branch 'join'" - alexsavio/hansel - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( a95b57...038136 )

by Alexandre M.

created 2016-02-11 10:21 UTC

hansel.list_subpaths() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	6
dl	0
loc	40
rs	7.5384

# -*- coding: utf-8 -*-
# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
"""
Utilities to make crumbs
"""
import fnmatch
import operator
import os
import os.path as op
import re
from   collections import Mapping
from   copy        import deepcopy
from   functools   import partial, reduce
from   itertools   import product

from ._utils import _check_is_subset


def rm_dups(lst):
    """ Return the distinct elements of `lst` in ascending order.

    Parameters
    ----------
    lst: iterable
        Its elements are put in a `set` and then sorted, so they must be
        hashable and comparable with each other.

    Returns
    -------
    fslst: list
        A new list holding every distinct element of `lst` exactly once,
        sorted.
    """
    unique = set(lst)
    return sorted(unique)


def remove_ignored(ignore, strs):
    """ Return the items of `strs` that match none of the `ignore` patterns.

    Parameters
    ----------
    ignore: iterable of str
        `fnmatch` (glob) patterns. An item is dropped as soon as it matches
        any of them.

    strs: iterable of str
        The candidate items. It is never modified.

    Returns
    -------
    kept: list of str
        Always a new list (also when `ignore` is empty), with the surviving
        items in their original order.
    """
    # Materialise the patterns once: `ignore` may be a one-shot iterator and
    # it is consulted again for every item.
    patterns = tuple(ignore)
    return [item for item in strs
            if not any(fnmatch.fnmatch(item, ign) for ign in patterns)]


def fnmatch_filter(pattern, items, *args):
    """ Select the elements of `items` accepted by the glob `pattern`.

    Parameters
    ----------
    pattern: str
        A `fnmatch` (shell-style glob) expression.

    items: iterable of str
        The candidates to be tested.

    args: ignored
        Accepted only so the signature is interchangeable with the other
        filter functions.

    Returns
    -------
    matches: list of str
        The elements of `items` for which `fnmatch.fnmatch` succeeds,
        in their original order.
    """
    def _accepted(candidate):
        return fnmatch.fnmatch(candidate, pattern)

    return list(filter(_accepted, items))


def regex_match_filter(pattern, items, *args):
    """ Select the elements of `items` that the regular expression `pattern`
    matches from their beginning.

    Parameters
    ----------
    pattern: str
        Regular expression.

    items: iterable of str
        The candidates to be tested.

    args: re.compile arguments
        Forwarded after `pattern` to `re.compile` (e.g. flags).

    Returns
    -------
    matches: list of str
        The elements of `items` for which the compiled pattern's `match`
        succeeds, in their original order.
    """
    regex = re.compile(pattern, *args)
    # `match` returns either None or a (truthy) match object, so it can be
    # used directly as the filter predicate.
    return list(filter(regex.match, items))


def list_children(path, just_dirs=False):
    """ List what is directly inside `path`.

    Parameters
    ----------
    path: str
        An existing file or folder.

    just_dirs: bool
        If True only folders are returned.

    Returns
    -------
    paths: list of str
        For a folder: the entry names given by `os.listdir` (names only,
        not joined with `path`), restricted to sub-folders when `just_dirs`
        is True.
        For a file: `[path]`, or `[]` when `just_dirs` is True.

    Raises
    ------
    IOError
        If `path` does not exist.
    """
    if not op.exists(path):
        raise IOError("Expected an existing path, but could not"
                      " find {}.".format(path))

    # A plain file has no children: report the file itself unless only
    # folders were requested.
    if op.isfile(path):
        return [] if just_dirs else [path]

    entries = os.listdir(path)
    if not just_dirs:
        return entries

    return [name for name in entries if op.isdir(op.join(path, name))]


def list_subpaths(path, just_dirs=False, ignore=None, pattern=None,
                  filter_func=fnmatch_filter, filter_args=None):
    """ List what is directly inside `path`, optionally discarding ignored
    entries and keeping only those selected by `pattern`.

    Parameters
    ----------
    path: str
        Forwarded to `list_children`.

    just_dirs: bool
        If True will return only folders.

    ignore: sequence of str
        Glob patterns; matching entries are dropped through `remove_ignored`.
        Skipped when None or empty.

    pattern: str
        Expression handed to `filter_func`; with the default filter this is
        a `fnmatch` glob. No filtering is done when None or empty.

    filter_func: func
        Called as `filter_func(pattern, paths, *filter_args)`; its return
        value is returned as the result.

    filter_args: filter func arguments
        Extra positional arguments for `filter_func`. None means no extra
        arguments.

    Returns
    -------
    paths: list of str
    """
    paths = list_children(path, just_dirs=just_dirs)

    if ignore:
        paths = remove_ignored(ignore, paths)

    # Nothing to select with: hand back the (possibly pruned) listing.
    if not pattern:
        return paths

    extra_args = () if filter_args is None else filter_args
    return filter_func(pattern, paths, *extra_args)


def list_intersection(list1, list2):
    """ Return the elements of `list1` that are also present in `list2`.

    Parameters
    ----------
    list1: iterable
        Its order (and any repeated elements) is kept in the result.

    list2: iterable
        Only used for membership tests. It is copied into a list first so
        that a one-shot iterator is not exhausted by the first lookup.

    Returns
    -------
    common: list
        A real list, so it can be tested for emptiness and iterated more
        than once.
    """
    # A list (not a set) is used for the lookup because the elements are
    # only required to support `==`, not hashing.
    lookup = list(list2)
    return [item for item in list1 if item in lookup]


def _intersect_crumb_args(crumb1, crumb2):
    """ Return the argument names shared by `crumb1` and `crumb2`.

    The result is whatever `list_intersection` yields for the two
    `all_args()` results, so the names come in the order in which
    `crumb1.all_args()` provides them.
    """
    first_args = crumb1.all_args()
    second_args = crumb2.all_args()
    return list_intersection(first_args, second_args)


def _get_matching_items(list1, list2, items=None):
    """ Pick the items common to `list1` and `list2`.

    Without `items`, the result is `list_intersection(list1, list2)`, which
    follows the order of `list1`. With `items`, they are validated against
    both lists through `_check_is_subset` and returned unchanged (the very
    same object) when the validation passes.

    Returns
    -------
    arg_names: iterable
        The matching items. An empty list if `_check_is_subset` raised a
        KeyError for either list.

    Raises
    ------
    Any exception other than KeyError raised by `_check_is_subset` is
    propagated untouched.
    NOTE(review): the previous docstring announced a ValueError for items
    missing from either list; confirm against `_check_is_subset`.
    """
    if items is None:
        return list_intersection(list1, list2)

    try:
        for reference in (list1, list2):
            _check_is_subset(items, reference)
    except KeyError:
        # A KeyError from the check is reported as "nothing matches".
        return []

    return items


def joint_value_map(crumb, arg_names, check_exists=True):
    """ Build the (arg_name, arg_value) pairs of `crumb` for `arg_names`.

    Parameters
    ----------
    crumb: object
        Must support `crumb[arg_name]` (iterable of values) and
        `crumb.replace(**kwargs)`, whose result must offer `exists()`.

    arg_names: sequence of str
        The argument names to look up. Must support `len`.

    check_exists: bool
        Only relevant when there are several `arg_names`, see below.

    Returns
    -------
    values_map: list
        The shape depends on the inputs:
        - one single name: the list of `(arg_name, arg_value)` 2-tuples of
          that argument; `check_exists` is not consulted.
        - otherwise, `check_exists` False: one list of 2-tuples per name
          (the per-argument lists, not their combinations).
        - otherwise, `check_exists` True: the combinations (tuples holding
          one 2-tuple per name) for which `crumb.replace(...).exists()` is
          true. They are taken from a `set`, so their order is arbitrary.
    """
    per_arg_pairs = [[(name, value) for value in crumb[name]]
                     for name in arg_names]

    if len(arg_names) == 1:
        return per_arg_pairs[0]

    if not check_exists:
        return list(per_arg_pairs)

    # Fill the crumb with every distinct combination first, then keep the
    # combinations whose filled crumb exists.
    candidates = [(combo, crumb.replace(**dict(combo)))
                  for combo in set(product(*per_arg_pairs))]
    return [combo for combo, filled in candidates if filled.exists()]


def intersection(crumb1, crumb2, on=None):
    """ Return an 'inner join' of both given Crumbs: the argument value maps
    that both crumbs have in common for their shared arguments.

    If `on` is None, all the argument names common to both crumbs are used.
    Otherwise only the elements of `on` are used.

    Parameters
    ----------
    crumb1: hansel.Crumb

    crumb2: hansel.Crumb

    on: list of str
        Crumb argument names common to both input crumbs.

    Returns
    -------
    inner_join: list
        The sorted entries that `joint_value_map(..., check_exists=True)`
        produces for both `crumb1` and `crumb2`.

    Raises
    ------
    KeyError:
        If no matching argument names are found between the crumbs
        (limited by `on`, if given).

    Notes
    -----
    Use with care, ideally the argument matches should be in the same order
    in both crumbs.

    # TODO: this function can still be more efficient.
    """
    args1 = list(crumb1.all_args())
    args2 = list(crumb2.all_args())
    arg_names = list(_get_matching_items(args1, args2, items=on))

    if not arg_names:
        raise KeyError("Could not find matching arguments between "
                       "{} and  {} limited by {}.".format(args1, args2, on))

    # One set of value maps per crumb, computed in argument order.
    value_sets = [set(joint_value_map(crumb, arg_names, check_exists=True))
                  for crumb in (crumb1, crumb2)]
    common = value_sets[0] & value_sets[1]

    return sorted(common)


class ParameterGrid(object):
    """
    Picked from sklearn: https://github.com/scikit-learn/scikit-learn

    Grid of parameters with a discrete number of values for each.
    Can be used to iterate over parameter value combinations with the
    Python built-in function iter, to count them with len, and to access
    them by position (negative positions count from the end).

    Parameters
    ----------
    param_grid : dict of string to sequence, or sequence of such
        The parameter grid to explore, as a dictionary mapping
        parameter names to sequences of allowed values.
        An empty dict stands for one single point without parameters.
        A sequence of dicts signifies a sequence of grids to search, and is
        useful to avoid exploring parameter combinations that make no sense
        or have no effect. See the examples below.

    Examples
    --------
    >>> param_grid = {'a': [1, 2], 'b': [True, False]}
    >>> list(ParameterGrid(param_grid)) == (
    ...    [{'a': 1, 'b': True}, {'a': 1, 'b': False},
    ...     {'a': 2, 'b': True}, {'a': 2, 'b': False}])
    True
    >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
    >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},
    ...                               {'kernel': 'rbf', 'gamma': 1},
    ...                               {'kernel': 'rbf', 'gamma': 10}]
    True
    >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}
    True
    >>> ParameterGrid(grid)[-1] == {'kernel': 'rbf', 'gamma': 10}
    True
    """

    def __init__(self, param_grid):
        if isinstance(param_grid, Mapping):
            # wrap dictionary in a singleton list to support either dict
            # or list of dicts
            param_grid = [param_grid]
        self.param_grid = param_grid

    def __iter__(self):
        """Iterate over the points in the grid.

        Returns
        -------
        params : iterator over dict of string to any
            Yields dictionaries mapping each parameter to one of its
            allowed values.
        """
        for p in self.param_grid:
            # Always sort the keys of a dictionary, for reproducibility
            items = sorted(p.items())
            if not items:
                yield {}
            else:
                keys, values = zip(*items)
                for v in product(*values):
                    yield dict(zip(keys, v))

    def __len__(self):
        """Number of points on the grid."""
        # The initial value 1 makes an empty sub-grid count as one point,
        # matching the single `{}` that iteration yields for it.
        return sum(reduce(operator.mul, (len(v) for v in p.values()), 1)
                   for p in self.param_grid)

    def __getitem__(self, ind):
        """Get the parameters that would be ``ind``th in iteration

        Parameters
        ----------
        ind : int
            The iteration index. A negative value counts from the end,
            as for a list.

        Returns
        -------
        params : dict of string to any
            Equal to list(self)[ind]

        Raises
        ------
        IndexError
            If ``ind`` is outside ``[-len(self), len(self))``.
        """
        if ind < 0:
            # Translate to the equivalent non-negative position; without
            # this the divmod arithmetic below would silently pick a point
            # from the first sub-grid.
            ind += len(self)
            if ind < 0:
                raise IndexError('ParameterGrid index out of range')

        # This is used to make discrete sampling without replacement memory
        # efficient.
        for sub_grid in self.param_grid:
            # XXX: could memoize information used here
            if not sub_grid:
                if ind == 0:
                    return {}
                ind -= 1
                continue

            # Reverse so most frequent cycling parameter comes first
            keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
            sizes = [len(v_list) for v_list in values_lists]
            total = reduce(operator.mul, sizes, 1)

            if ind >= total:
                # Try the next grid
                ind -= total
            else:
                out = {}
                # Decode `ind` as a mixed-radix number, one digit per key.
                for key, v_list, n in zip(keys, values_lists, sizes):
                    ind, offset = divmod(ind, n)
                    out[key] = v_list[offset]
                return out

        raise IndexError('ParameterGrid index out of range')


1			# -*- coding: utf-8 -*-
2			# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
3			# vi: set ft=python sts=4 ts=4 sw=4 et:
4			"""
5			Utilities to make crumbs
6			"""
7			import fnmatch
8			import operator
9			import os
10			import os.path as op
11			import re
12			from collections import Mapping
13			from copy import deepcopy
14			from functools import partial, reduce
15			from itertools import product
16
17			from ._utils import _check_is_subset
18
19
20			def rm_dups(lst):
21			""" Return a sorted lst of non-duplicated elements from `lst`.
22
23			Parameters
24			----------
25			lst: sequence of any
26
27			Returns
28			-------
29			fslst:
30			Filtered and sorted `lst` with non duplicated elements of `lst`.
31			"""
32			return sorted(list(set(lst)))
33
34
35			def remove_ignored(ignore, strs):
36			""" Remove from `strs` the matches to the `fnmatch` (glob) patterns and
37			return the result in a list."""
38			nustrs = deepcopy(strs)
39			for ign in ignore:
40			nustrs = [item for item in nustrs if not fnmatch.fnmatch(item, ign)]
41
42			return nustrs
43
44
45			def fnmatch_filter(pattern, items, *args):
46			""" Return the items from `items` that match the fnmatch expression in `pattern`.
47			Parameters
48			----------
49			pattern: str
50			Regular expression
51
52			items: list of str
53			The items to be checked
54
55			args: ignored
56
57			Returns
58			-------
59			matches: list of str
60			Matched items
61			"""
62			return [item for item in items if fnmatch.fnmatch(item, pattern)]
63
64
65			def regex_match_filter(pattern, items, *args):
66			""" Return the items from `items` that match the regular expression in `pattern`.
67			Parameters
68			----------
69			pattern: str
70			Regular expression
71
72			items: list of str
73			The items to be checked
74
75			args: re.compile arguments
76
77			Returns
78			-------
79			matches: list of str
80			Matched items
81			"""
82			test = re.compile(pattern, *args)
83			return [s for s in items if test.match(s)]
84
85
86			def list_children(path, just_dirs=False):
87			""" Return the immediate elements (files and folders) in `path`.
88			Parameters
89			----------
90			path: str
91
92			just_dirs: bool
93			If True will return only folders.
94
95			ignore: sequence of str
96			Sequence of glob patterns to ignore from the listing.
97
98			re: str
99			Regular expression that the result items must match.
100
101			Returns
102			-------
103			paths: list of str
104			"""
105			if not op.exists(path):
106			raise IOError("Expected an existing path, but could not"
107			" find {}.".format(path))
108
109			if op.isfile(path):
110			if just_dirs:
111			vals = []
112			else:
113			vals = [path]
114			else:
115			if just_dirs: # this means we have to list only folders
116			vals = [d for d in os.listdir(path) if op.isdir(op.join(path, d))]
117			else: # this means we have to list files
118			vals = os.listdir(path)
119
120			return vals
121
122
123			def list_subpaths(path, just_dirs=False, ignore=None, pattern=None,
124			filter_func=fnmatch_filter, filter_args=None):
125			""" Return the immediate elements (files and folders) within `path`.
126			Parameters
127			----------
128			path: str
129
130			just_dirs: bool
131			If True will return only folders.
132
133			ignore: sequence of str
134			Sequence of glob patterns to ignore from the listing.
135
136			pattern: str
137			Regular expression that the result items must match.
138
139			filter_func: func
140			The function to match the patterns.
141			Must have as arguments: (pattern, paths) and return
142			a subset of paths.
143
144			filter_args: filter func arguments
145			Arguments for the filter function.
146
147			Returns
148			-------
149			paths: list of str
150			"""
151			paths = list_children(path, just_dirs=just_dirs)
152
153			if ignore and ignore is not None:
154			paths = remove_ignored(ignore, paths)
155
156			if pattern and pattern is not None:
157			if filter_args is None:
158			filter_args = ()
159
160			paths = filter_func(pattern, paths, *filter_args)
161
162			return paths
163
164
165			def list_intersection(list1, list2):
166			""" Return a list of elements that are the intersection between the set of elements
167			of `list1` and `list2`·
168			This will keep the same order of the elements in `list1`.
169			"""
170			return (arg_name for arg_name in list1 if arg_name in list2)
171
172
173			def _intersect_crumb_args(crumb1, crumb2):
174			""" Return a list of `arg_names` that are the intersection between the arguments
175			of `crumb1` and `crumb2`·
176			This will keep the same order as the arguments are in `all_args` function from `crumb1`.
177			"""
178			return list_intersection(crumb1.all_args(), crumb2.all_args())
179
180
181			def _get_matching_items(list1, list2, items=None):
182			""" If `items` is None, Return a list of items that are in
183			`list1` and `list2`. Otherwise will return the elements of `items` if
184			they are in both lists.
185			Keep the order in `list1` or in `items`.
186
187			Returns
188			-------
189			arg_names: list
190			The matching items.
191
192			Raises
193			------
194			ValueError:
195			If an element of items does not exists in either `list1` or `list2`.
196			"""
197			if items is None:
198			arg_names = list_intersection(list1, list2)
199			else:
200			try:
201			_check_is_subset(items, list1)
202			_check_is_subset(items, list2)
203			except KeyError:
204			arg_names = []
205			except:
206			raise
207			else:
208			arg_names = items
209
210			return arg_names
211
212
213			def joint_value_map(crumb, arg_names, check_exists=True):
214			""" Return a list of tuples of crumb argument values of the given `arg_names`.
215			Parameters
216			----------
217			arg_name: str
218
219			check_exists: bool
220			If True will return only a values_map with sets of crumb arguments that fill a crumb to an existing path.
221			Otherwise it won't check if they exist and return all possible combinations.
222
223			Returns
224			-------
225			values_map: list of lists of 2-tuples
226			I call values_map what is called `record` in pandas. It is a list of lists of 2-tuples, where each 2-tuple
227			has the shape (arg_name, arg_value).
228			"""
229			values_map = []
230			for arg_name in arg_names:
231			values_map.append(list((arg_name, arg_value) for arg_value in crumb[arg_name]))
232
233			if len(arg_names) == 1:
234			return values_map[0]
235			else:
236			if not check_exists:
237			values_map_checked = values_map[:]
238			else:
239			args_crumbs = [(args, crumb.replace(**dict(args))) for args in set(product(*values_map))]
240			values_map_checked = [args for args, cr in args_crumbs if cr.exists()]
241
242			return values_map_checked
243
244
245			def intersection(crumb1, crumb2, on=None):
246			""" Return an 'inner join' of both given Crumbs, i.e., will return a list of
247			Crumbs with common values for the common arguments of both crumbs.
248
249			If `on` is None, will use all the common arguments names of both crumbs.
250			Otherwise will use only the elements of `on`. All its items must be in both crumbs.
251
252			Returns
253			-------
254			arg_names: list
255			The matching items.
256
257			Parameters
258			----------
259			crumb1: hansel.Crumb
260
261			crumb2: hansel.Crumb
262
263			on: list of str
264			Crumb argument names common to both input crumbs.
265
266			Raises
267			------
268			ValueError:
269			If an element of `on` does not exists in either `list1` or `list2`.
270
271			KeyError:
272			If the result is empty.
273
274			Returns
275			-------
276			inner_join: list[hansel.Crumb]
277
278			Notes
279			-----
280			Use with care, ideally the argument matches should be in the same order in both crumbs.
281
282			Both crumbs must have at least one matching identifier argument and one
283			of those must be the one in `id_colname`.
284
285			# TODO: this function can still be more efficient.
286			"""
287			arg_names = list(_get_matching_items(list(crumb1.all_args()), list(crumb2.all_args()), items=on))
288
289			if not arg_names:
290			raise KeyError("Could not find matching arguments between "
291			"{} and {} limited by {}.".format(list(crumb1.all_args()), list(crumb2.all_args()), on))
292
293			maps1 = set(joint_value_map(crumb1, arg_names, check_exists=True))
294			maps2 = set(joint_value_map(crumb2, arg_names, check_exists=True))
295
296			intersect = maps1.intersection(maps2)
297
298			return sorted(list(intersect))
299
300
301			class ParameterGrid(object):
302			"""
303			Picked from sklearn: https://github.com/scikit-learn/scikit-learn
304
305			Grid of parameters with a discrete number of values for each.
306			Can be used to iterate over parameter value combinations with the
307			Python built-in function iter.
308
309			Read more in the :ref:`User Guide <grid_search>`.
310			Parameters
311			----------
312			param_grid : dict of string to sequence, or sequence of such
313			The parameter grid to explore, as a dictionary mapping estimator
314			parameters to sequences of allowed values.
315			An empty dict signifies default parameters.
316			A sequence of dicts signifies a sequence of grids to search, and is
317			useful to avoid exploring parameter combinations that make no sense
318			or have no effect. See the examples below.
319			Examples
320			--------
321			>>> from sklearn.grid_search import ParameterGrid
322			>>> param_grid = {'a': [1, 2], 'b': [True, False]}
323			>>> list(ParameterGrid(param_grid)) == (
324			... [{'a': 1, 'b': True}, {'a': 1, 'b': False},
325			... {'a': 2, 'b': True}, {'a': 2, 'b': False}])
326			True
327			>>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
328			>>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},
329			... {'kernel': 'rbf', 'gamma': 1},
330			... {'kernel': 'rbf', 'gamma': 10}]
331			True
332			>>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}
333			True
334			"""
335
336			def __init__(self, param_grid):
337			if isinstance(param_grid, Mapping):
338			# wrap dictionary in a singleton list to support either dict
339			# or list of dicts
340			param_grid = [param_grid]
341			self.param_grid = param_grid
342
343			def __iter__(self):
344			"""Iterate over the points in the grid.
345			Returns
346			-------
347			params : iterator over dict of string to any
348			Yields dictionaries mapping each estimator parameter to one of its
349			allowed values.
350			"""
351			for p in self.param_grid:
352			# Always sort the keys of a dictionary, for reproducibility
353			items = sorted(p.items())
354			if not items:
355			yield {}
356			else:
357			keys, values = zip(*items)
358			for v in product(*values):
359			params = dict(zip(keys, v))
360			yield params
361
362			def __len__(self):
363			"""Number of points on the grid."""
364			# Product function that can handle iterables (np.product can't).
365			product = partial(reduce, operator.mul)
366			return sum(product(len(v) for v in p.values()) if p else 1
367			for p in self.param_grid)
368
369			def __getitem__(self, ind):
370			"""Get the parameters that would be ``ind``th in iteration
371			Parameters
372			----------
373			ind : int
374			The iteration index
375			Returns
376			-------
377			params : dict of string to any
378			Equal to list(self)[ind]
379			"""
380			# This is used to make discrete sampling without replacement memory
381			# efficient.
382			for sub_grid in self.param_grid:
383			# XXX: could memoize information used here
384			if not sub_grid:
385			if ind == 0:
386			return {}
387			else:
388			ind -= 1
389			continue
390
391			# Reverse so most frequent cycling parameter comes first
392			keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
393			sizes = [len(v_list) for v_list in values_lists]
394			product = partial(reduce, operator.mul)
395			total = product(sizes)
396
397			if ind >= total:
398			# Try the next grid
399			ind -= total
400			else:
401			out = {}
402			for key, v_list, n in zip(keys, values_lists, sizes):
403			ind, offset = divmod(ind, n)
404			out[key] = v_list[offset]
405			return out
406
407			raise IndexError('ParameterGrid index out of range')
408

alexsavio / hansel

Push — master ( a95b57...038136 )

hansel.list_subpaths() B

Complexity

Size

Duplication

Duplication Side-by-Side

Filter issues like