Completed
Pull Request — master (#2296)
by Lasse
03:11 queued 01:21
created

_compile_patterns()   A

Complexity

Conditions 3

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
c 1
b 0
f 0
dl 0
loc 6
rs 9.4285
1
import os
2
import platform
3
import re
4
from functools import lru_cache
5
6
from coala_decorators.decorators import yield_once
7
from coalib.misc.Constants import GLOBBING_SPECIAL_CHARS
8
9
10
def _end_of_set_index(string, start_index):
11
    """
12
    Returns the position of the appropriate closing bracket for a glob set in
13
    string.
14
15
    :param string:      Glob string with wildcards
16
    :param start_index: Index at which the set starts, meaning the position
17
                        right behind the opening bracket
18
    :return:            Position of appropriate closing bracket
19
    """
20
    length = len(string)
21
    closing_index = start_index
22
    if closing_index < length and string[closing_index] == '!':
23
        closing_index += 1
24
25
    if closing_index < length:  # the set cannot be closed by a bracket here
26
        closing_index += 1
27
28
    while closing_index < length and string[closing_index] != ']':
29
        closing_index += 1
30
31
    return closing_index
32
33
34
def glob_escape(input_string):
    """
    Escapes the given string by wrapping each special glob character in a
    one-element set (``[c]``). Examples:

    >>> from coalib.parsing.Globbing import glob_escape
    >>> glob_escape('test (1)')
    'test [(]1[)]'
    >>> glob_escape('test folder?')
    'test folder[?]'
    >>> glob_escape('test*folder')
    'test[*]folder'

    :param input_string: String that is to be escaped with ``[ ]``.
    :return:             Escaped string in which all the special glob characters
                         ``()[]|?*`` are escaped.
    """
    # Character class built from the project-wide list of special characters.
    special_char_regex = ("(?P<char>[" +
                          re.escape(GLOBBING_SPECIAL_CHARS) +
                          "])")
    bracketed_char = "[\\g<char>]"
    return re.sub(special_char_regex, bracketed_char, input_string)
52
53
54
def _position_is_bracketed(string, position):
55
    """
56
    Tests whether the char at string[position] is inside a valid pair of
57
    brackets (and therefore loses its special meaning)
58
59
    :param string:   Glob string with wildcards
60
    :param position: Position of a char in string
61
    :return:         Whether or not the char is inside a valid set of brackets
62
    """
63
    # allow negative positions and trim too long ones
64
    position = len(string[:position])
65
66
    index, length = 0, len(string)
67
    while index < position:
68
        char = string[index]
69
        index += 1
70
        if char == '[':
71
            closing_index = _end_of_set_index(string, index)
72
            if closing_index < length:
73
                if index <= position < closing_index:
74
                    return True
75
                index = closing_index + 1
76
            else:
77
                return False
78
    return False
79
80
81
def _boundary_of_alternatives_indices(pattern):
82
    """
83
    Determines the location of a set of alternatives in a glob pattern.
84
    Alternatives are defined by a matching set of non-bracketed parentheses.
85
86
    :param pattern: Glob pattern with wildcards.
87
    :return:        Indices of the innermost set of matching non-bracketed
88
                    parentheses in a tuple. The Index of a missing parenthesis
89
                    will be passed as None.
90
    """
91
    # Taking the leftmost closing parenthesis and the rightmost opening
92
    # parenthesis left of it ensures that the parentheses belong together and
93
    # the pattern is parsed correctly from the most nested section outwards.
94
    end_pos = None
95
    for match in re.finditer('\\)', pattern):
96
        if not _position_is_bracketed(pattern, match.start()):
97
            end_pos = match.start()
98
            break  # break to get leftmost
99
100
    start_pos = None
101
    for match in re.finditer('\\(', pattern[:end_pos]):
102
        if not _position_is_bracketed(pattern, match.start()):
103
            start_pos = match.end()
104
            # no break to get rightmost
105
106
    return start_pos, end_pos
107
108
109
@yield_once
def _iter_choices(pattern):
    """
    Yields every choice of an alternative by splitting the pattern on each
    '|' that is not bracketed.

    :param pattern: String of choices separated by '|'s
    :return:        Iterator that yields parts of string separated by
                    non-bracketed '|'s
    """
    # Every '|' is a candidate split point; the end of the pattern closes the
    # last choice.
    candidate_ends = [found.start() for found in re.finditer('\\|', pattern)]
    candidate_ends.append(len(pattern))

    chunk_start = 0
    for chunk_end in candidate_ends:
        if _position_is_bracketed(pattern, chunk_end):
            # A '|' inside a set is a literal and stays part of the choice.
            continue
        yield pattern[chunk_start: chunk_end]
        chunk_start = chunk_end + 1
126
127
128
@yield_once
def _iter_alternatives(pattern):
    """
    Iterates through all glob patterns that can be obtained by combining
    all choices of each alternative.

    :param pattern: Glob pattern with wildcards
    :return:        Iterator that yields all glob patterns without alternatives
                    that can be created from the given pattern containing them.
    """
    start_pos, end_pos = _boundary_of_alternatives_indices(pattern)

    if start_pos is None or end_pos is None:
        # No complete pair of parentheses left: nothing to expand.
        yield pattern
        return

    # Text around the parenthesized group (the parentheses are dropped).
    prefix = pattern[:start_pos-1]
    suffix = pattern[end_pos+1:]

    for choice in _iter_choices(pattern[start_pos: end_pos]):
        # The rebuilt pattern may contain further alternatives elsewhere, so
        # expand it recursively.
        yield from _iter_alternatives(prefix + choice + suffix)
152
153
154
def translate(pattern):
    """
    Translates a glob pattern into a regular expression.

    The returned expression starts with the global flags ``(?ms)`` and ends
    with ``\\Z``. The flags are placed at the very beginning because the
    ``re`` module rejects global inline flags anywhere else (deprecated since
    Python 3.6, an error since Python 3.11).

    :param pattern: Glob pattern with wildcards
    :return:        Regular expression with the same meaning
    """
    index, length = 0, len(pattern)
    parts = []
    while index < length:
        char = pattern[index]
        index += 1
        if char == '*':
            # '**' matches everything. The second '*' is translated on the
            # next iteration as a plain '*', which is redundant but harmless.
            if index < length and pattern[index] == '*':
                parts.append('.*')
            # on Windows, '*' matches everything but the filesystem
            # separators '/' and '\'.
            elif platform.system() == 'Windows':  # pragma: nocover (Windows)
                parts.append('[^/\\\\]*')
            # on all other (~Unix-) platforms, '*' matches everything but the
            # filesystem separator, most likely '/'.
            else:
                parts.append('[^' + re.escape(os.sep) + ']*')
        elif char == '?':
            parts.append('.')
        elif char == '[':
            closing_index = _end_of_set_index(pattern, index)
            if closing_index >= length:
                # Unterminated set: treat the bracket as a literal character.
                parts.append('\\[')
            else:
                sequence = pattern[index:closing_index].replace('\\', '\\\\')
                index = closing_index+1
                if sequence[0] == '!':
                    # Glob negation '!' corresponds to '^' in a regex set.
                    sequence = '^' + sequence[1:]
                elif sequence[0] in ('^', '['):
                    # A leading '^' would negate the set and a leading '['
                    # triggers a "possible nested set" FutureWarning in
                    # ``re``; escape both so they are matched literally.
                    sequence = '\\' + sequence
                parts.append('[' + sequence + ']')
        else:
            parts.append(re.escape(char))
    return '(?ms)' + ''.join(parts) + '\\Z'
195
196
197
def fnmatch(name, patterns):
    """
    Tests whether name matches at least one of the given patterns.

    :param name:     File or directory name
    :param patterns: Glob string with wildcards or list of globs
    :return:         Boolean: Whether or not name is matched by pattern.
                     True if no pattern is given at all.

    Glob Syntax:

    -  '[seq]':         Matches any character in seq. Cannot be empty. Any
                        special character loses its special meaning in a set.
    -  '[!seq]':        Matches any character not in seq. Cannot be empty. Any
                        special character loses its special meaning in a set.
    -  '(seq_a|seq_b)': Matches either sequence_a or sequence_b as a whole.
                        More than two or just one sequence can be given.
    -  '?':             Matches any single character.
    -  '*':             Matches everything but os.sep.
    -  '**':            Matches everything.
    """
    # Normalize to a tuple so that the patterns can serve as a cache key.
    if isinstance(patterns, str):
        patterns = (patterns,)
    else:
        patterns = tuple(patterns)

    if not patterns:
        return True

    normalized_name = os.path.normcase(name)

    for matcher in _compile_patterns(patterns):
        if matcher(normalized_name):
            return True
    return False
225
226
227
@lru_cache()
228
def _compile_patterns(patterns):
229
    return tuple(re.compile(translate(os.path.normcase(
230
                     os.path.expanduser(pat)))).match
231
                 for pattern in patterns
232
                 for pat in _iter_alternatives(pattern))
233
234
235
def _absolute_flat_glob(pattern):
236
    """
237
    Glob function for a pattern that do not contain wildcards.
238
239
    :pattern: File or directory path
240
    :return:  Iterator that yields at most one valid file or dir name
241
    """
242
    dirname, basename = os.path.split(pattern)
243
244
    if basename:
245
        if os.path.exists(pattern):
246
            yield pattern
247
    else:
248
        # Patterns ending with a slash should match only directories
249
        if os.path.isdir(dirname):
250
            yield pattern
251
    return
252
253
254
def _iter_relative_dirs(dirname):
255
    """
256
    Recursively iterates subdirectories of all levels from dirname
257
258
    :param dirname: Directory name
259
    :return:        Iterator that yields files and directory from the given dir
260
                    and all it's (recursive) subdirectories
261
    """
262
    if not dirname:
263
        dirname = os.curdir
264
    try:
265
        files_or_dirs = os.listdir(dirname)
266
    except os.error:
267
        return
268
    for file_or_dir in files_or_dirs:
269
        yield file_or_dir
270
        path = os.path.join(dirname, file_or_dir)
271
        for sub_file_or_dir in _iter_relative_dirs(path):
272
            yield os.path.join(file_or_dir, sub_file_or_dir)
273
274
275
def relative_wildcard_glob(dirname, pattern):
    """
    Glob for the entries of one directory. Accepts wildcards. If the pattern
    contains '**', entries of all nested subdirectories are considered too.

    :param dirname: Directory name; an empty value stands for the current
                    directory
    :param pattern: Glob pattern with wildcards
    :return:        List of files in the dir of dirname that match the pattern
    """
    directory = dirname or os.curdir
    try:
        if '**' in pattern:
            candidates = list(_iter_relative_dirs(directory))
        else:
            candidates = os.listdir(directory)
    except OSError:
        return []

    matches = re.compile(translate(os.path.normcase(pattern))).match
    # Compare case-normalized names but report them as found on disk.
    return [name for name in candidates if matches(os.path.normcase(name))]
299
300
301
def relative_flat_glob(dirname, basename):
    """
    Non-recursive glob for one directory. Does not accept wildcards.

    :param dirname:  Directory name
    :param basename: Basename of a file in dir of dirname
    :return:         List containing basename if the file exists, an empty
                     list otherwise
    """
    candidate = os.path.join(dirname, basename)
    return [basename] if os.path.exists(candidate) else []
312
313
314
def relative_recursive_glob(dirname, pattern):
    """
    Recursive glob for one directory and all its (nested) subdirectories.
    Accepts only '**' as pattern.

    :param dirname: Directory name
    :param pattern: The recursive wildcard '**'
    :return:        Iterator that yields all the (nested) entries of the given
                    dir; if dirname is not empty, an empty string (standing
                    for dirname itself) is yielded first
    """
    assert pattern == '**'
    if dirname:
        # Empty slice of the pattern: '**' also matches "no sub path at all".
        yield pattern[:0]
    yield from _iter_relative_dirs(dirname)
329
330
331
# Matches any of the characters that start a wildcard: '*', '?' or '['.
wildcard_check_pattern = re.compile('([*?[])')


def has_wildcard(pattern):
    """
    Tells whether pattern contains at least one wildcard character.

    :param pattern: Glob pattern that may contain wildcards
    :return:        Boolean: Whether or not there are wildcards in pattern
    """
    return wildcard_check_pattern.search(pattern) is not None
343
344
345
def iglob(pattern):
    """
    Iterates all filesystem paths that get matched by the glob pattern.
    Syntax is equal to that of fnmatch.

    Every alternative the pattern expands to (``(a|b)`` syntax) is globbed in
    turn; an alternative that yields nothing does not stop the remaining
    alternatives from being processed.

    :param pattern: Glob pattern with wildcards
    :return:        Iterator that yields all file names that match pattern
    """
    for pat in _iter_alternatives(pattern):
        pat = os.path.normcase(os.path.expanduser(pat))
        dirname, basename = os.path.split(pat)

        if not has_wildcard(pat):
            # Plain path without wildcards. Use `continue`, not `return`, so
            # that the remaining alternatives are still globbed.
            yield from _absolute_flat_glob(pat)
            continue

        # Select how the last path component is matched inside a directory.
        if basename == '**':
            relative_glob_function = relative_recursive_glob
        elif has_wildcard(basename):
            relative_glob_function = relative_wildcard_glob
        else:
            relative_glob_function = relative_flat_glob

        if not dirname:
            # Pattern without a directory part. Again move on to the next
            # alternative instead of terminating the whole generator.
            yield from relative_glob_function(dirname, basename)
            continue

        # Prevent an infinite recursion if a drive or UNC path contains
        # wildcard characters (i.e. r'\\?\C:').
        if dirname != pat and has_wildcard(dirname):
            dirs = iglob(dirname)
        else:
            dirs = [dirname]

        for directory in dirs:
            for name in relative_glob_function(directory, basename):
                yield os.path.join(directory, name)
384
385
386
def glob(pattern):
    """
    Collects all filesystem paths that get matched by the glob pattern.
    Syntax is equal to that of fnmatch.

    :param pattern: Glob pattern with wildcards
    :return:        List of all file names that match pattern
    """
    return [path for path in iglob(pattern)]
395