Completed
Pull Request — master (#2292)
by Lasse
01:51
created

_compile_pattern()   A

Complexity

Conditions 2

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
c 1
b 0
f 0
dl 0
loc 5
rs 9.4285
1
import os
2
import platform
3
import re
4
from functools import lru_cache
5
6
from coala_decorators.decorators import yield_once
7
from coalib.misc.Constants import GLOBBING_SPECIAL_CHARS
8
9
10
def _end_of_set_index(string, start_index):
    """
    Finds the closing bracket that terminates a glob set (``[...]``).

    :param string:      Glob string with wildcards
    :param start_index: Index of the first character inside the set, i.e.
                        the position directly after the opening bracket
    :return:            Index of the closing bracket. If the set is never
                        closed, the returned index is not a valid position
                        inside string (it is at least ``len(string)``).
    """
    size = len(string)
    cursor = start_index

    # A leading '!' negates the set and is never its closing bracket.
    if cursor < size and string[cursor] == '!':
        cursor += 1

    # The first member of a set is always taken literally, even if it is a
    # ']', so it is skipped unconditionally.
    if cursor < size:
        cursor += 1

    # Scan forward to the first ']' after the mandatory first member.
    while cursor < size:
        if string[cursor] == ']':
            break
        cursor += 1

    return cursor
32
33
34
def glob_escape(input_string):
    """
    Escapes the given string with ``[c]`` pattern. Examples:

    >>> from coalib.parsing.Globbing import glob_escape
    >>> glob_escape('test (1)')
    'test [(]1[)]'
    >>> glob_escape('test folder?')
    'test folder[?]'
    >>> glob_escape('test*folder')
    'test[*]folder'

    :param input_string: String that is to be escaped with ``[ ]``.
    :return:             Copy of the string in which every character listed
                         in ``GLOBBING_SPECIAL_CHARS`` is wrapped in a
                         one-element set so that it is matched literally.
    """
    # One character class holding all special characters, captured by name
    # so the replacement can put the matched character back between brackets.
    special_char_regex = ("(?P<char>[" +
                          re.escape(GLOBBING_SPECIAL_CHARS) +
                          "])")
    bracketed = "[\\g<char>]"
    return re.sub(special_char_regex, bracketed, input_string)
52
53
54
def _position_is_bracketed(string, position):
    """
    Checks whether the character at ``string[position]`` lies inside a
    properly closed glob set (and therefore has no special meaning).

    :param string:   Glob string with wildcards
    :param position: Position of a char in string
    :return:         True if the position is enclosed by a valid pair of
                     brackets, False otherwise
    """
    # Slicing normalizes the position: negative values are resolved from the
    # end and values past the end are clamped to len(string).
    position = len(string[:position])
    total = len(string)

    cursor = 0
    while cursor < position:
        opens_set = string[cursor] == '['
        cursor += 1
        if not opens_set:
            continue

        # cursor now points at the first character inside the set.
        set_end = _end_of_set_index(string, cursor)
        if set_end >= total:
            # The set is never closed, so nothing after it is bracketed.
            return False
        if cursor <= position < set_end:
            return True
        # Resume scanning right behind the closing bracket.
        cursor = set_end + 1

    return False
79
80
81
def _boundary_of_alternatives_indices(pattern):
    """
    Locates the innermost set of alternatives in a glob pattern.
    Alternatives are delimited by a matching pair of parentheses that are
    not part of a bracketed set.

    :param pattern: Glob pattern with wildcards.
    :return:        Tuple ``(start, end)`` where ``start`` is the index right
                    behind the opening parenthesis and ``end`` is the index
                    of the closing parenthesis. A parenthesis that could not
                    be found is reported as None.
    """
    # The leftmost unbracketed ')' combined with the rightmost unbracketed
    # '(' in front of it is guaranteed to be a matching pair, and it is the
    # most deeply nested one, so patterns get resolved from the inside out.
    end_pos = next(
        (closing.start()
         for closing in re.finditer('\\)', pattern)
         if not _position_is_bracketed(pattern, closing.start())),
        None)

    # With end_pos being None the slice covers the whole pattern.
    start_pos = None
    for opening in re.finditer('\\(', pattern[:end_pos]):
        if _position_is_bracketed(pattern, opening.start()):
            continue
        # Keep overwriting so that the rightmost candidate wins.
        start_pos = opening.end()

    return start_pos, end_pos
107
108
109
@yield_once
def _iter_choices(pattern):
    """
    Splits the content of an alternative into its individual choices. The
    pattern is cut at every '|' that is not located inside a bracketed set.

    :param pattern: String of choices separated by '|'s
    :return:        Iterator that yields parts of string separated by
                    non-bracketed '|'s
    """
    # Every '|' is a candidate boundary; the end of the pattern is appended
    # so that the last choice gets emitted as well.
    boundaries = [separator.start()
                  for separator in re.finditer('\\|', pattern)]
    boundaries.append(len(pattern))

    choice_start = 0
    for boundary in boundaries:
        if _position_is_bracketed(pattern, boundary):
            # A '|' inside a set is a literal character, not a separator.
            continue
        yield pattern[choice_start: boundary]
        choice_start = boundary + 1
126
127
128
@yield_once
def _iter_alternatives(pattern):
    """
    Expands all alternatives of a glob pattern, yielding every pattern that
    can be obtained by combining the choices of each alternative.

    :param pattern: Glob pattern with wildcards
    :return:        Iterator that yields all glob patterns without alternatives
                    that can be created from the given pattern containing them.
    """
    start_pos, end_pos = _boundary_of_alternatives_indices(pattern)

    if start_pos is None or end_pos is None:
        # No complete pair of parentheses left: nothing to expand.
        yield pattern
        return

    # start_pos points right behind '(' and end_pos at ')', so the
    # parentheses themselves are cut out of the surrounding text.
    prefix = pattern[:start_pos-1]
    suffix = pattern[end_pos+1:]

    for choice in _iter_choices(pattern[start_pos: end_pos]):
        # The rebuilt pattern may still contain further alternatives
        # elsewhere, so it is expanded recursively.
        yield from _iter_alternatives(prefix + choice + suffix)
152
153
154
def translate(pattern):
    """
    Translates a glob pattern into a regular expression.

    :param pattern: Glob pattern with wildcards
    :return:        Regular expression (as a string) with the same meaning.
                    The expression starts with the inline flags ``(?ms)`` and
                    is anchored at the end with ``\\Z``.
    """
    index, length = 0, len(pattern)
    # The inline flags have to lead the expression: global flags anywhere
    # else are deprecated since Python 3.6 and rejected by ``re.compile``
    # from Python 3.11 on.
    regex = '(?ms)'
    while index < length:
        char = pattern[index]
        index += 1
        if char == '*':
            # '**' matches everything. The second '*' is not consumed here;
            # the next iteration handles it and, unless yet another '*'
            # follows, appends the single-star expression, which cannot
            # match anything the preceding '.*' does not already match.
            if index < length and pattern[index] == '*':
                regex += '.*'
            # on Windows, '*' matches everything but the filesystem
            # separators '/' and '\'.
            elif platform.system() == 'Windows':  # pragma: nocover (Windows)
                regex += '[^/\\\\]*'
            # on all other (~Unix-) platforms, '*' matches everything but the
            # filesystem separator, most likely '/'.
            else:
                regex += '[^' + re.escape(os.sep) + ']*'
        elif char == '?':
            regex += '.'
        elif char == '[':
            closing_index = _end_of_set_index(pattern, index)
            if closing_index >= length:
                # Unclosed set: the bracket is an ordinary character.
                regex += '\\['
            else:
                # Backslashes have no special meaning in a glob set, so they
                # are escaped for the regex character class.
                sequence = pattern[index:closing_index].replace('\\', '\\\\')
                index = closing_index+1
                if sequence[0] == '!':
                    # Glob negation '!' corresponds to regex negation '^'.
                    sequence = '^' + sequence[1:]
                elif sequence[0] == '^':
                    # A leading literal '^' must not negate the regex set.
                    sequence = '\\' + sequence
                regex += '[' + sequence + ']'
        else:
            regex = regex + re.escape(char)
    return regex + '\\Z'
195
196
197
def fnmatch(name, patterns):
    """
    Tests whether name matches at least one of the given patterns.

    :param name:     File or directory name
    :param patterns: Glob string with wildcards or list of globs
    :return:         Boolean: Whether or not name is matched by pattern.
                     An empty collection of patterns matches every name.

    Glob Syntax:

    -  '[seq]':         Matches any character in seq. Cannot be empty. Any
                        special character loses its special meaning in a set.
    -  '[!seq]':        Matches any character not in seq. Cannot be empty. Any
                        special character loses its special meaning in a set.
    -  '(seq_a|seq_b)': Matches either sequence_a or sequence_b as a whole.
                        More than two or just one sequence can be given.
    -  '?':             Matches any single character.
    -  '*':             Matches everything but os.sep.
    -  '**':            Matches everything.
    """
    # Accept a single glob string as well as any iterable of globs.
    if isinstance(patterns, str):
        patterns = (patterns,)
    else:
        patterns = tuple(patterns)

    if not patterns:
        return True

    name = os.path.normcase(name)

    # Every pattern expands to one matcher per alternative; the first hit
    # decides.
    for pattern in patterns:
        for match in _compile_pattern(pattern):
            if match(name):
                return True
    return False
227
228
229
@lru_cache()
def _compile_pattern(pattern):
    """
    Compiles a glob pattern into regex match functions. Results are cached
    per pattern string.

    :param pattern: Glob pattern with wildcards, may contain alternatives
    :return:        Tuple holding one bound ``match`` method of a compiled
                    regular expression for each alternative of the pattern
    """
    matchers = []
    for alternative in _iter_alternatives(pattern):
        # Expand '~' first, then normalize the case the same way the names
        # to be matched are normalized.
        normalized = os.path.normcase(os.path.expanduser(alternative))
        matchers.append(re.compile(translate(normalized)).match)
    return tuple(matchers)
234
235
236
def _absolute_flat_glob(pattern):
    """
    Glob function for a pattern that does not contain wildcards.

    :param pattern: File or directory path
    :return:        Iterator that yields at most one valid file or dir name
    """
    dirname, basename = os.path.split(pattern)

    # An empty basename means the pattern ends with a separator; such
    # patterns should match only directories.
    if basename:
        found = os.path.exists(pattern)
    else:
        found = os.path.isdir(dirname)

    if found:
        yield pattern
253
254
255
def _iter_relative_dirs(dirname):
    """
    Walks dirname recursively and yields everything found below it.

    :param dirname: Directory name; an empty value means the current
                    directory
    :return:        Iterator that yields the files and directories of the
                    given dir and all its (recursive) subdirectories, as
                    paths relative to dirname
    """
    root = dirname or os.curdir
    try:
        entries = os.listdir(root)
    except OSError:
        # Not a directory or not readable: nothing to yield.
        return

    for entry in entries:
        yield entry
        # Descend into the entry; for plain files the listing fails and the
        # recursion ends silently.
        for nested in _iter_relative_dirs(os.path.join(root, entry)):
            yield os.path.join(entry, nested)
274
275
276
def relative_wildcard_glob(dirname, pattern):
    """
    Glob for the entries of one directory. Accepts wildcards. If the pattern
    contains '**', the entries of all (nested) subdirectories are considered
    as well.

    :param dirname: Directory name; an empty value means the current
                    directory
    :param pattern: Glob pattern with wildcards
    :return:        List of names (relative to dirname) that match the
                    pattern; empty if the directory cannot be listed
    """
    directory = dirname or os.curdir
    try:
        if '**' in pattern:
            candidates = list(_iter_relative_dirs(directory))
        else:
            candidates = os.listdir(directory)
    except OSError:
        return []

    # Pattern and names are case-normalized for the comparison, but the
    # names are returned the way they were listed.
    matches = re.compile(translate(os.path.normcase(pattern))).match
    return [name for name in candidates if matches(os.path.normcase(name))]
300
301
302
def relative_flat_glob(dirname, basename):
    """
    Non-recursive glob for one directory. Does not accept wildcards.

    :param dirname:  Directory name
    :param basename: Basename of a file in dir of dirname
    :return:         List containing basename if the joined path exists,
                     otherwise an empty list
    """
    candidate = os.path.join(dirname, basename)
    return [basename] if os.path.exists(candidate) else []
313
314
315
def relative_recursive_glob(dirname, pattern):
    """
    Recursive glob for one directory and everything nested below it.
    Accepts only '**' as pattern.

    :param dirname: Directory name
    :param pattern: The recursive wildcard '**'
    :return:        Iterator that yields the entries found recursively below
                    the given dir; if dirname is non-empty, an empty string
                    is yielded first
    """
    assert pattern == '**'
    if dirname:
        # Empty slice of the pattern: an empty string of the pattern's type.
        yield pattern[:0]
    yield from _iter_relative_dirs(dirname)
330
331
332
# Matches any of the glob wildcard characters '*', '?' and '['.
wildcard_check_pattern = re.compile('([*?[])')


def has_wildcard(pattern):
    """
    Tells whether the pattern contains at least one of the wildcard
    characters '*', '?' or '['.

    :param pattern: Glob pattern that may contain wildcards
    :return:        Boolean: Whether or not there are wildcards in pattern
    """
    return wildcard_check_pattern.search(pattern) is not None
344
345
346
def iglob(pattern):
    """
    Iterates all filesystem paths that get matched by the glob pattern.
    Syntax is equal to that of fnmatch.

    :param pattern: Glob pattern with wildcards
    :return:        Iterator that yields all file names that match pattern
    """
    for pat in _iter_alternatives(pattern):
        pat = os.path.expanduser(pat)
        pat = os.path.normcase(pat)
        dirname, basename = os.path.split(pat)
        if not has_wildcard(pat):
            yield from _absolute_flat_glob(pat)
            # Move on to the next alternative. Returning here would drop
            # every remaining alternative of the pattern (e.g. the second
            # choice of '(a|b).txt').
            continue

        if basename == '**':
            relative_glob_function = relative_recursive_glob
        elif has_wildcard(basename):
            relative_glob_function = relative_wildcard_glob
        else:
            relative_glob_function = relative_flat_glob

        if not dirname:
            # Pattern without a directory part: results are relative names.
            yield from relative_glob_function(dirname, basename)
            # As above: the remaining alternatives still have to be handled.
            continue

        # Prevent an infinite recursion if a drive or UNC path contains
        # wildcard characters (i.e. r'\\?\C:').
        if dirname != pat and has_wildcard(dirname):
            dirs = iglob(dirname)
        else:
            dirs = [dirname]

        for directory in dirs:
            for name in relative_glob_function(directory, basename):
                yield os.path.join(directory, name)
385
386
387
def glob(pattern):
    """
    Collects all filesystem paths that get matched by the glob pattern.
    Syntax is equal to that of fnmatch.

    :param pattern: Glob pattern with wildcards
    :return:        List of all file names that match pattern
    """
    matched_paths = list(iglob(pattern))
    return matched_paths
396