Completed
Pull Request — master (#2292)
by Lasse
01:51
created

_compile_pattern()   A

Complexity

Conditions 3

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
c 0
b 0
f 0
dl 0
loc 6
rs 9.4285
1
import os
2
import platform
3
import re
4
from functools import lru_cache
5
6
from coala_decorators.decorators import yield_once
7
from coalib.misc.Constants import GLOBBING_SPECIAL_CHARS
8
9
10
def _end_of_set_index(string, start_index):
    """
    Returns the position of the appropriate closing bracket for a glob set in
    string.

    :param string:      Glob string with wildcards
    :param start_index: Index at which the set starts, meaning the position
                        right behind the opening bracket
    :return:            Position of appropriate closing bracket. If the set
                        is never closed, the returned index is not smaller
                        than ``len(string)``.
    """
    length = len(string)
    closing_index = start_index

    # A leading '!' negates the set and is skipped before looking for members.
    if closing_index < length and string[closing_index] == '!':
        closing_index += 1

    # The first member is skipped unconditionally: the set cannot be closed
    # by a bracket here, so a ']' at this position is taken literally.
    if closing_index < length:
        closing_index += 1

    # Advance to the first ']' after that, or to the end of the string.
    while closing_index < length and string[closing_index] != ']':
        closing_index += 1

    return closing_index
32
33
34
def glob_escape(input_string):
    """
    Escapes the given string with ``[c]`` pattern. Examples:

    >>> from coalib.parsing.Globbing import glob_escape
    >>> glob_escape('test (1)')
    'test [(]1[)]'
    >>> glob_escape('test folder?')
    'test folder[?]'
    >>> glob_escape('test*folder')
    'test[*]folder'

    :param input_string: String that is to be escaped with ``[ ]``.
    :return:             Escaped string in which all the special glob characters
                         ``()[]|?*`` are escaped.
    """
    # Build a character class out of the special characters and wrap every
    # occurrence in brackets, which makes it a one-element glob set.
    special_chars = '(?P<char>[' + re.escape(GLOBBING_SPECIAL_CHARS) + '])'
    return re.sub(special_chars, '[\\g<char>]', input_string)
52
53
54
def _position_is_bracketed(string, position):
    """
    Tests whether the char at string[position] is inside a valid pair of
    brackets (and therefore loses its special meaning)

    :param string:   Glob string with wildcards
    :param position: Position of a char in string
    :return:         Whether or not the char is inside a valid set of brackets
    """
    # Slicing normalizes the position: negative values are resolved relative
    # to the end and too large ones are trimmed to len(string).
    position = len(string[:position])

    index, length = 0, len(string)
    while index < position:
        char = string[index]
        index += 1
        if char != '[':
            continue

        closing_index = _end_of_set_index(string, index)
        if closing_index >= length:
            # The set is never closed, so nothing behind it is bracketed.
            return False
        if index <= position < closing_index:
            return True
        # Continue scanning right behind the closing bracket.
        index = closing_index + 1

    return False
79
80
81
def _boundary_of_alternatives_indices(pattern):
    """
    Determines the location of a set of alternatives in a glob pattern.
    Alternatives are defined by a matching set of non-bracketed parentheses.

    :param pattern: Glob pattern with wildcards.
    :return:        Indices of the innermost set of matching non-bracketed
                    parentheses in a tuple. The first index points right
                    behind the opening parenthesis, the second one at the
                    closing parenthesis. The Index of a missing parenthesis
                    will be passed as None.
    """
    # Taking the leftmost closing parenthesis and the rightmost opening
    # parenthesis left of it ensures that the parentheses belong together and
    # the pattern is parsed correctly from the most nested section outwards.
    end_pos = None
    for match in re.finditer('\\)', pattern):
        if not _position_is_bracketed(pattern, match.start()):
            end_pos = match.start()
            break  # break to get leftmost

    start_pos = None
    # If no closing parenthesis was found, end_pos is None and the slice
    # covers the whole pattern.
    for match in re.finditer('\\(', pattern[:end_pos]):
        if not _position_is_bracketed(pattern, match.start()):
            start_pos = match.end()
            # no break to get rightmost

    return start_pos, end_pos
107
108
109
@yield_once
def _iter_choices(pattern):
    """
    Iterate through each choice of an alternative. Splits pattern on '|'s if
    they are not bracketed.

    :param pattern: String of choices separated by '|'s
    :return:        Iterator that yields parts of string separated by
                    non-bracketed '|'s
    """
    start_pos = 0
    split_pos_list = [match.start() for match in re.finditer('\\|', pattern)]
    # The end of the pattern terminates the last choice like a '|' would.
    split_pos_list.append(len(pattern))
    for end_pos in split_pos_list:
        if not _position_is_bracketed(pattern, end_pos):
            yield pattern[start_pos:end_pos]
            start_pos = end_pos + 1
126
127
128
@yield_once
def _iter_alternatives(pattern):
    """
    Iterates through all glob patterns that can be obtained by combination of
    all choices for each alternative

    :param pattern: Glob pattern with wildcards
    :return:        Iterator that yields all glob patterns without alternatives
                    that can be created from the given pattern containing them.
    """
    start_pos, end_pos = _boundary_of_alternatives_indices(pattern)

    if None in (start_pos, end_pos):
        # No complete pair of parentheses left: nothing to expand.
        yield pattern
    else:
        # start_pos points right behind '(' and end_pos at ')', so the
        # parentheses themselves are dropped from the surrounding parts.
        prefix = pattern[:start_pos - 1]
        suffix = pattern[end_pos + 1:]

        # iterate through choices inside of parenthesis (separated by '|'):
        for choice in _iter_choices(pattern[start_pos:end_pos]):
            # put glob expression back together with alternative:
            variant = prefix + choice + suffix

            # iterate through alternatives outside of parenthesis
            # (pattern can have more alternatives elsewhere)
            for glob_pattern in _iter_alternatives(variant):
                yield glob_pattern
152
153
154
def translate(pattern):
    """
    Translates a pattern into a regular expression.

    :param pattern: Glob pattern with wildcards
    :return:        Regular expression with the same meaning
    """
    # The platform does not change while translating, so the expression for
    # a single '*' is determined once instead of for every asterisk.
    if platform.system() == 'Windows':  # pragma: nocover (Windows)
        # on Windows, '*' matches everything but the filesystem
        # separators '/' and '\'.
        single_star = '[^/\\\\]*'
    else:
        # on all other (~Unix-) platforms, '*' matches everything but the
        # filesystem separator, most likely '/'.
        single_star = '[^' + re.escape(os.sep) + ']*'

    index, length = 0, len(pattern)
    parts = []
    while index < length:
        char = pattern[index]
        index += 1
        if char == '*':
            if index < length and pattern[index] == '*':
                # '**' matches everything. The second asterisk is not
                # consumed here; the next iteration handles it as a plain
                # '*', which does not change what the expression matches.
                parts.append('.*')
            else:
                parts.append(single_star)
        elif char == '?':
            parts.append('.')
        elif char == '[':
            closing_index = _end_of_set_index(pattern, index)
            if closing_index >= length:
                # No closing bracket: the '[' is an ordinary character.
                parts.append('\\[')
            else:
                sequence = pattern[index:closing_index].replace('\\', '\\\\')
                index = closing_index + 1
                if sequence[0] == '!':
                    # Glob negation '!' becomes regex negation '^'.
                    sequence = '^' + sequence[1:]
                elif sequence[0] == '^':
                    # A literal leading '^' must not negate the regex set.
                    sequence = '\\' + sequence
                parts.append('[' + sequence + ']')
        else:
            parts.append(re.escape(char))

    # Global inline flags have to be at the very start of the expression;
    # anywhere else they are deprecated since Python 3.6 and rejected by
    # re.compile since Python 3.11.
    return '(?ms)' + ''.join(parts) + '\\Z'
195
196
197
def fnmatch(name, patterns):
    """
    Tests whether name matches pattern

    :param name:     File or directory name
    :param patterns: Glob string with wildcards or an iterable of globs
    :return:         Boolean: Whether or not name is matched by pattern.
                     An empty collection of patterns matches every name.

    Glob Syntax:

    -  '[seq]':         Matches any character in seq. Cannot be empty. Any
                        special character loses its special meaning in a set.
    -  '[!seq]':        Matches any character not in seq. Cannot be empty. Any
                        special character loses its special meaning in a set.
    -  '(seq_a|seq_b)': Matches either sequence_a or sequence_b as a whole.
                        More than two or just one sequence can be given.
    -  '?':             Matches any single character.
    -  '*':             Matches everything but os.sep.
    -  '**':            Matches everything.
    """
    # The compiled patterns are cached by their argument, which therefore
    # has to be hashable: everything is normalized to a tuple of globs.
    if isinstance(patterns, str):
        patterns = (patterns,)
    elif not isinstance(patterns, tuple):
        patterns = tuple(patterns)

    if not patterns:
        return True

    name = os.path.normcase(name)

    return any(match(name) is not None
               for match in _compile_pattern(patterns))
231
232
233
@lru_cache()
def _compile_pattern(patterns):
    """
    Compiles glob patterns into match functions of regular expressions. The
    result is cached, so ``patterns`` has to be hashable.

    :param patterns: Tuple of glob patterns with wildcards
    :return:         Tuple with one bound ``match`` method per glob that
                     results from expanding the alternatives of each pattern
    """
    return tuple(
        re.compile(translate(os.path.normcase(os.path.expanduser(pat)))).match
        for pattern in patterns
        for pat in _iter_alternatives(pattern))
239
240
241
def _absolute_flat_glob(pattern):
    """
    Glob function for a pattern that does not contain wildcards.

    :param pattern: File or directory path
    :return:        Iterator that yields at most one valid file or dir name
    """
    dirname, basename = os.path.split(pattern)

    if basename:
        if os.path.exists(pattern):
            yield pattern
    else:
        # Patterns ending with a slash should match only directories
        if os.path.isdir(dirname):
            yield pattern
258
259
260
def _iter_relative_dirs(dirname):
    """
    Recursively iterates subdirectories of all levels from dirname

    :param dirname: Directory name
    :return:        Iterator that yields files and directory from the given dir
                    and all its (recursive) subdirectories, as paths relative
                    to dirname
    """
    if not dirname:
        dirname = os.curdir
    try:
        files_or_dirs = os.listdir(dirname)
    except OSError:
        # Not a directory, not existing or not readable: nothing to yield.
        return
    for file_or_dir in files_or_dirs:
        yield file_or_dir
        path = os.path.join(dirname, file_or_dir)
        # Plain files end the recursion through the OSError raised above.
        for sub_file_or_dir in _iter_relative_dirs(path):
            yield os.path.join(file_or_dir, sub_file_or_dir)
279
280
281
def relative_wildcard_glob(dirname, pattern):
    """
    Non-recursive glob for one directory. Accepts wildcards.

    :param dirname: Directory name
    :param pattern: Glob pattern with wildcards
    :return:        List of files in the dir of dirname that match the pattern
    """
    if not dirname:
        dirname = os.curdir
    try:
        if '**' in pattern:
            # '**' may match across separators, so the candidates have to
            # include the entries of all nested subdirectories.
            names = list(_iter_relative_dirs(dirname))
        else:
            names = os.listdir(dirname)
    except OSError:
        return []

    match = re.compile(translate(os.path.normcase(pattern))).match
    # Names are compared case-normalized but returned unchanged.
    return [name for name in names if match(os.path.normcase(name))]
305
306
307
def relative_flat_glob(dirname, basename):
    """
    Non-recursive glob for one directory. Does not accept wildcards.

    :param dirname:  Directory name
    :param basename: Basename of a file in dir of dirname
    :return:         List containing Basename if the file exists
    """
    if os.path.exists(os.path.join(dirname, basename)):
        return [basename]
    return []
318
319
320
def relative_recursive_glob(dirname, pattern):
    """
    Recursive Glob for one directory and all its (nested) subdirectories.
    Accepts only '**' as pattern.

    :param dirname: Directory name
    :param pattern: The recursive wildcard '**'
    :return:        Iterator that yields all the (nested) subdirectories of the
                    given dir
    """
    assert pattern == '**'
    if dirname:
        # An empty relative path is yielded first, so that the given
        # directory itself is part of the result.
        yield pattern[:0]
    for relative_dir in _iter_relative_dirs(dirname):
        yield relative_dir
335
336
337
# Matches the first character that starts a glob wildcard: '*', '?' or '['.
wildcard_check_pattern = re.compile(r'([*?[])')
338
339
340
def has_wildcard(pattern):
    """
    Checks whether pattern has any wildcards.

    :param pattern: Glob pattern that may contain wildcards
    :return:        Boolean: Whether or not there are wildcards in pattern
    """
    return wildcard_check_pattern.search(pattern) is not None
349
350
351
def iglob(pattern):
    """
    Iterates all filesystem paths that get matched by the glob pattern.
    Syntax is equal to that of fnmatch.

    :param pattern: Glob pattern with wildcards
    :return:        Iterator that yields all file names that match pattern
    """
    for pat in _iter_alternatives(pattern):
        pat = os.path.expanduser(pat)
        pat = os.path.normcase(pat)
        dirname, basename = os.path.split(pat)

        if not has_wildcard(pat):
            for path in _absolute_flat_glob(pat):
                yield path
            # Only this alternative is done. Returning here would silently
            # drop all remaining alternatives of the pattern.
            continue

        if basename == '**':
            relative_glob_function = relative_recursive_glob
        elif has_wildcard(basename):
            relative_glob_function = relative_wildcard_glob
        else:
            relative_glob_function = relative_flat_glob

        if not dirname:
            for path in relative_glob_function(dirname, basename):
                yield path
            # As above: go on with the next alternative instead of stopping.
            continue

        # Prevent an infinite recursion if a drive or UNC path contains
        # wildcard characters (i.e. r'\\?\C:').
        if dirname != pat and has_wildcard(dirname):
            dirs = iglob(dirname)
        else:
            dirs = [dirname]

        for matched_dir in dirs:
            for name in relative_glob_function(matched_dir, basename):
                yield os.path.join(matched_dir, name)
390
391
392
def glob(pattern):
    """
    Collects all filesystem paths that get matched by the glob pattern.
    Syntax is equal to that of fnmatch.

    :param pattern: Glob pattern with wildcards
    :return:        List of all file names that match pattern
    """
    return list(iglob(pattern))
401