Failed Conditions
Pull Request — master (#2076)
by Abdeali
02:11
created

coalib/parsing/Globbing.py (9 issues)

1
import os
2
import platform
3
import re
4
5
from coalib.misc.Decorators import yield_once
6
from coalib.misc.Constants import GLOBBING_SPECIAL_CHARS
7
8
9
def _end_of_set_index(string, start_index):
10
    """
11
    Returns the position of the appropriate closing bracket for a glob set in
12
    string.
13
14
    :param string:      Glob string with wildcards
15
    :param start_index: Index at which the set starts, meaning the position
16
                        right behind the opening bracket
17
    :return:            Position of appropriate closing bracket
18
    """
19
    length = len(string)
20
    closing_index = start_index
21
    if closing_index < length and string[closing_index] == '!':
22
        closing_index += 1
23
24
    if closing_index < length:  # the set cannot be closed by a bracket here
25
        closing_index += 1
26
27
    while closing_index < length and string[closing_index] != ']':
28
        closing_index += 1
29
30
    return closing_index
31
32
33
def glob_escape(input_string):
    """
    Escapes the given string with ``[c]`` pattern. Examples:

    >>> from coalib.parsing.Globbing import glob_escape
    >>> glob_escape('test (1)')
    'test [(]1[)]'
    >>> glob_escape('test folder?')
    'test folder[?]'
    >>> glob_escape('test*folder')
    'test[*]folder'

    :param input_string: String that is to be escaped with ``[ ]``.
    :return:             Escaped string in which all the special glob characters
                         ``()[]|?*`` are escaped.
    """
    # Build a character class from the special characters and wrap every
    # occurrence in brackets, which turns it into a one-element glob set.
    special_char_regex = "(?P<char>[" + re.escape(GLOBBING_SPECIAL_CHARS) + "])"
    return re.sub(special_char_regex, "[\\g<char>]", input_string)
51
52
53
def _position_is_bracketed(string, position):
    """
    Tests whether the char at string[position] is inside a valid pair of
    brackets (and therefore loses its special meaning)

    :param string:   Glob string with wildcards
    :param position: Position of a char in string
    :return:         Whether or not the char is inside a valid set of brackets
    """
    # allow negative positions and trim too long ones
    position = len(string[:position])

    index, length = 0, len(string)
    while index < position:
        char = string[index]
        index += 1
        if char != '[':
            continue

        closing_index = _end_of_set_index(string, index)
        if closing_index >= length:
            # The set opened here is never closed, so it is not a valid pair
            # of brackets and nothing behind it can be bracketed.
            return False
        if index <= position < closing_index:
            return True
        # Resume scanning right behind the closing bracket.
        index = closing_index + 1
    return False
78
79
80
def _boundary_of_alternatives_indices(pattern):
    """
    Determines the location of a set of alternatives in a glob pattern.
    Alternatives are defined by a matching set of non-bracketed parentheses.

    :param pattern: Glob pattern with wildcards.
    :return:        Indices of the innermost set of matching non-bracketed
                    parentheses in a tuple. The Index of a missing parenthesis
                    will be passed as None. The first index points right
                    behind the opening parenthesis, the second one at the
                    closing parenthesis.
    """
    # Taking the leftmost closing parenthesis and the rightmost opening
    # parenthesis left of it ensures that the parentheses belong together and
    # the pattern is parsed correctly from the most nested section outwards.
    end_pos = None
    for match in re.finditer('\\)', pattern):
        if not _position_is_bracketed(pattern, match.start()):
            end_pos = match.start()
            break  # break to get leftmost

    start_pos = None
    # When no closing parenthesis was found, ``pattern[:None]`` is the whole
    # pattern, so every opening parenthesis is considered.
    for match in re.finditer('\\(', pattern[:end_pos]):
        if not _position_is_bracketed(pattern, match.start()):
            start_pos = match.end()
            # no break to get rightmost

    return start_pos, end_pos
106
107
108
@yield_once
def _iter_choices(pattern):
    """
    Iterate through each choice of an alternative. Splits pattern on '|'s if
    they are not bracketed.

    :param pattern: String of choices separated by '|'s
    :return:        Iterator that yields parts of string separated by
                    non-bracketed '|'s
    """
    start_pos = 0
    split_pos_list = [match.start() for match in re.finditer('\\|', pattern)]
    # The end of the pattern acts as a final separator so that the last
    # choice is yielded as well.
    split_pos_list.append(len(pattern))
    for end_pos in split_pos_list:
        if not _position_is_bracketed(pattern, end_pos):
            yield pattern[start_pos: end_pos]
            start_pos = end_pos + 1
125
126
127
@yield_once
def _iter_alternatives(pattern):
    """
    Iterates through all glob patterns that can be obtained by combination of
    all choices for each alternative

    :param pattern: Glob pattern with wildcards
    :return:        Iterator that yields all glob patterns without alternatives
                    that can be created from the given pattern containing them.
    """
    start_pos, end_pos = _boundary_of_alternatives_indices(pattern)

    if None in (start_pos, end_pos):
        # No complete pair of parentheses is left, nothing to expand.
        yield pattern
    else:
        # iterate through choices inside of parenthesis (separated by '|'):
        for choice in _iter_choices(pattern[start_pos: end_pos]):
            # put glob expression back together with alternative; the
            # offsets of one drop the parentheses themselves:
            variant = pattern[:start_pos-1] + choice + pattern[end_pos+1:]

            # iterate through alternatives outside of parenthesis
            # (pattern can have more alternatives elsewhere)
            for glob_pattern in _iter_alternatives(variant):
                yield glob_pattern
151
152
153
def translate(pattern):
    """
    Translates a pattern into a regular expression.

    The resulting expression starts with the global flags ``(?ms)`` and ends
    with ``\\Z`` so that it has to match a whole string.

    :param pattern: Glob pattern with wildcards
    :return:        Regular expression with the same meaning
    """
    index, length = 0, len(pattern)
    # Global inline flags have to be the very first thing in the expression:
    # putting them anywhere else is deprecated since Python 3.6 and an error
    # since Python 3.11.
    parts = ['(?ms)']
    while index < length:
        char = pattern[index]
        index += 1
        if char == '*':
            # '**' matches everything
            if index < length and pattern[index] == '*':
                parts.append('.*')
                # Consume the second '*' as well. Translating it on its own
                # would only append a redundant pattern behind '.*'.
                index += 1
            # on Windows, '*' matches everything but the filesystem
            # separators '/' and '\'.
            elif platform.system() == 'Windows':  # pragma: nocover (Windows)
                parts.append('[^/\\\\]*')
            # on all other (~Unix-) platforms, '*' matches everything but the
            # filesystem separator, most likely '/'.
            else:
                parts.append('[^' + re.escape(os.sep) + ']*')
        elif char == '?':
            parts.append('.')
        elif char == '[':
            closing_index = _end_of_set_index(pattern, index)
            if closing_index >= length:
                # The set is never closed: treat the bracket as a literal.
                parts.append('\\[')
            else:
                sequence = pattern[index:closing_index].replace('\\', '\\\\')
                index = closing_index + 1
                if sequence[0] == '!':
                    # Glob negation becomes regex negation.
                    sequence = '^' + sequence[1:]
                elif sequence[0] in ('^', '['):
                    # A leading '^' must not negate the set and a leading '['
                    # must not be read as the start of a nested set.
                    sequence = '\\' + sequence
                parts.append('[' + sequence + ']')
        else:
            parts.append(re.escape(char))
    parts.append('\\Z')
    return ''.join(parts)
194
195
196
def fnmatch(name, patterns):
    """
    Tests whether name matches pattern

    :param name:     File or directory name
    :param patterns: Glob string with wildcards or list of globs
    :return:         Boolean: Whether or not name is matched by pattern.
                     An empty list of patterns matches every name.

    Glob Syntax:

    -  '[seq]':         Matches any character in seq. Cannot be empty. Any
                        special character loses its special meaning in a set.
    -  '[!seq]':        Matches any character not in seq. Cannot be empty. Any
                        special character loses its special meaning in a set.
    -  '(seq_a|seq_b)': Matches either sequence_a or sequence_b as a whole.
                        More than two or just one sequence can be given.
    -  '?':             Matches any single character.
    -  '*':             Matches everything but os.sep.
    -  '**':            Matches everything.
    """
    if isinstance(patterns, str):
        patterns = [patterns]
    if len(patterns) == 0:
        return True

    name = os.path.normcase(name)
    for pattern in patterns:
        # Every alternative is a plain glob that is checked on its own.
        for pat in _iter_alternatives(pattern):
            pat = os.path.expanduser(pat)
            pat = os.path.normcase(pat)
            match = re.compile(translate(pat)).match
            if match(name) is not None:
                return True
    return False
230
231
232
def _absolute_flat_glob(pattern):
233
    """
234
    Glob function for a pattern that do not contain wildcards.
235
236
    :pattern: File or directory path
237
    :return:  Iterator that yields at most one valid file or dir name
238
    """
239
    dirname, basename = os.path.split(pattern)
240
241
    if basename:
242
        if os.path.exists(pattern):
243
            yield pattern
244
    else:
245
        # Patterns ending with a slash should match only directories
246
        if os.path.isdir(dirname):
247
            yield pattern
248
    return
249
250
251
def _iter_relative_dirs(dirname):
252
    """
253
    Recursively iterates subdirectories of all levels from dirname
254
255
    :param dirname: Directory name
256
    :return:        Iterator that yields files and directory from the given dir
257
                    and all it's (recursive) subdirectories
258
    """
259
    if not dirname:
260
        dirname = os.curdir
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable os does not seem to be defined.
Loading history...
261
    try:
262
        files_or_dirs = os.listdir(dirname)
263
    except os.error:
264
        return
265
    for file_or_dir in files_or_dirs:
266
        yield file_or_dir
267
        path = os.path.join(dirname, file_or_dir)
268
        for sub_file_or_dir in _iter_relative_dirs(path):
269
            yield os.path.join(file_or_dir, sub_file_or_dir)
270
271
272
def relative_wildcard_glob(dirname, pattern):
    """
    Non-recursive glob for one directory. Accepts wildcards.

    :param dirname: Directory name. An empty name stands for the current
                    directory.
    :param pattern: Glob pattern with wildcards
    :return:        List of files in the dir of dirname that match the
                    pattern. The list is empty if the directory cannot be
                    listed.
    """
    if not dirname:
        dirname = os.curdir
    try:
        names = os.listdir(dirname)
    except OSError:
        return []
    pattern = os.path.normcase(pattern)
    # Compile once, outside of the loop over the directory entries.
    match = re.compile(translate(pattern)).match
    # The names are compared in their normalized form but returned unchanged.
    return [name for name in names if match(os.path.normcase(name))]
293
294
295
def relative_flat_glob(dirname, basename):
    """
    Non-recursive glob for one directory. Does not accept wildcards.

    :param dirname:  Directory name
    :param basename: Basename of a file in dir of dirname
    :return:         List containing Basename if the file exists, an empty
                     list otherwise
    """
    if os.path.exists(os.path.join(dirname, basename)):
        return [basename]
    return []
306
307
308
def relative_recursive_glob(dirname, pattern):
    """
    Recursive Glob for one directory and all its (nested) subdirectories.
    Accepts only '**' as pattern.

    :param dirname: Directory name
    :param pattern: The recursive wildcard '**'
    :return:        Iterator that yields all the (nested) subdirectories of the
                    given dir. If dirname is not empty, an empty relative
                    path standing for dirname itself is yielded first.
    """
    assert pattern == '**'
    if dirname:
        # Empty slice of the pattern: '**' also matches "no further path
        # component", i.e. the directory itself.
        yield pattern[:0]
    for relative_dir in _iter_relative_dirs(dirname):
        yield relative_dir
323
324
325
# Matches the first character that starts a wildcard: '*', '?' or '['.
wildcard_check_pattern = re.compile('([*?[])')


def has_wildcard(pattern):
    """
    Checks whether pattern has any wildcards.

    Only ``*``, ``?`` and ``[`` are looked for; the parentheses and ``|`` of
    alternatives are not treated as wildcards here.

    :param pattern: Glob pattern that may contain wildcards
    :return:        Boolean: Whether or not there are wildcards in pattern
    """
    return wildcard_check_pattern.search(pattern) is not None
337
338
339
def iglob(pattern):
    """
    Iterates all filesystem paths that get matched by the glob pattern.
    Syntax is equal to that of fnmatch.

    Every alternative the pattern expands to is evaluated, one after another.

    :param pattern: Glob pattern with wildcards
    :return:        Iterator that yields all file names that match pattern
    """
    for pat in _iter_alternatives(pattern):
        pat = os.path.expanduser(pat)
        pat = os.path.normcase(pat)
        dirname, basename = os.path.split(pat)
        if not has_wildcard(pat):
            for file in _absolute_flat_glob(pat):
                yield file
            # Go on with the next alternative. Returning here would end the
            # whole generator and silently drop the remaining alternatives.
            continue

        if basename == '**':
            relative_glob_function = relative_recursive_glob
        elif has_wildcard(basename):
            relative_glob_function = relative_wildcard_glob
        else:
            relative_glob_function = relative_flat_glob

        if not dirname:
            for file in relative_glob_function(dirname, basename):
                yield file
            # Same as above: only this alternative is finished.
            continue

        # Prevent an infinite recursion if a drive or UNC path contains
        # wildcard characters (i.e. r'\\?\C:').
        if dirname != pat and has_wildcard(dirname):
            dirs = iglob(dirname)
        else:
            dirs = [dirname]

        for matched_dir in dirs:
            for name in relative_glob_function(matched_dir, basename):
                yield os.path.join(matched_dir, name)
378
379
380
def glob(pattern):
    """
    Collects all filesystem paths that get matched by the glob pattern.
    Syntax is equal to that of fnmatch.

    :param pattern: Glob pattern with wildcards
    :return:        List of all file names that match pattern
    """
    return list(iglob(pattern))
389