Failed Conditions
Pull Request — master (#1099)
by Mischa
02:17
created

coalib.parsing.StringProcessing.unescaped_rstrip()   A

Complexity

Conditions 3

Size

Total Lines 13

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 3
dl 0
loc 13
rs 9.4286
1
import re
2
3
from coalib.parsing.StringProcessing import InBetweenMatch
4
from coalib.parsing.StringProcessing.Filters import limit, trim_empty_matches
5
6
7
def search_for(pattern, string, flags=0, max_match=0, use_regex=False):
    """
    Finds the occurrences of a pattern inside a string.

    :param pattern:   The pattern to look for.
    :param string:    The string that gets searched.
    :param flags:     Extra flags handed over to the regex processor.
    :param max_match: The maximum number of matches to perform. If 0 or less
                      is provided, the number of matches is not limited.
    :param use_regex: Whether ``pattern`` is a regex (True) or a simple string
                      that has to be matched literally (False).
    :return:          An iterator returning MatchObject's.
    """
    # A plain string is escaped so the regex engine matches it verbatim.
    regex = pattern if use_regex else re.escape(pattern)
    occurrences = re.finditer(regex, string, flags)

    return limit(occurrences, max_match)
24
25
26
def unescaped_search_for(pattern,
                         string,
                         flags=0,
                         max_match=0,
                         use_regex=False):
    """
    Finds the occurrences of a pattern inside a string, skipping every
    occurrence that is escaped (preceded by an odd number of backslashes).

    :param pattern:   The pattern to look for unescaped.
    :param string:    The string that gets searched.
    :param flags:     Extra flags handed over to the regex processor.
    :param max_match: The maximum number of matches to perform. If 0 or less
                      is provided, the number of matches is not limited.
    :param use_regex: Whether ``pattern`` is a regex (True) or a simple string
                      that has to be matched literally (False).
    :return:          An iterator returning MatchObject's.
    """
    # The inner search is not limited (0): escaped occurrences must not count
    # against max_match, so the limit is applied after filtering.
    candidates = search_for(pattern, string, flags, 0, use_regex)
    unescaped = (match
                 for match in candidates
                 if not position_is_escaped(string, match.start()))

    for match in limit(unescaped, max_match):
        yield match
50
51
52
def _split(string,
53
           max_split,
54
           remove_empty_matches,
55
           matching_function,
56
           *args,
57
           **kwargs):
58
    """
59
    Splits a string using a given matching-function that matches the separator.
60
61
    This function implements general features needed from the split functions
62
    in this module (the max-split and remove-empty-matches features).
63
64
    :param string:               The string where to split.
65
    :param max_split:            Defines the maximum number of splits. If 0 or
66
                                 less is provided, the number of splits is not
67
                                 limited.
68
    :param remove_empty_matches: Defines whether empty entries should
69
                                 be removed from the result.
70
    :param matching_function:    The matching function. It must return
71
                                 MatchObject's containing the matched
72
                                 split-separator.
73
    :param args:                 Positional arguments to invoke the
74
                                 matching_function with.
75
    :param kwargs:               Key-value arguments to invoke the
76
                                 matching_function with.
77
    """
78
    last_end_pos = 0
79
80
    for match in matching_function(*args, **kwargs):
81
        split_string = string[last_end_pos : match.start()]
82
        last_end_pos = match.end()
83
84
        if not remove_empty_matches or len(split_string) != 0:
85
            yield split_string
86
87
            max_split -= 1
88
            if max_split == 0:
89
                break  # only reachable when max_split > 0
90
91
    # Append the rest of the string.
92
    if not remove_empty_matches or len(string) > last_end_pos:
93
        yield string[last_end_pos:]
94
95
96
def split(pattern,
          string,
          max_split=0,
          remove_empty_matches=False,
          use_regex=False):
    r"""
    Splits a string at every occurrence of the given pattern. The newline
    character (\n) is not a natural split pattern (unless you specify it
    yourself). Escape sequences are ignored by this function.

    :param pattern:              The pattern that defines where to split.
    :param string:               The string that gets split.
    :param max_split:            Defines the maximum number of splits. If 0 or
                                 less is provided, the number of splits is not
                                 limited.
    :param remove_empty_matches: Defines whether empty entries should
                                 be removed from the result.
    :param use_regex:            Whether the split pattern is a regex (True)
                                 or a simple string (False).
    :return:                     An iterator returning the split up strings.
    """
    # Arguments for search_for(): no extra regex flags and no match limit,
    # since limiting is done by _split() itself.
    search_args = (pattern, string, 0, 0, use_regex)

    return _split(string,
                  max_split,
                  remove_empty_matches,
                  search_for,
                  *search_args)
126
127
128
def unescaped_split(pattern,
                    string,
                    max_split=0,
                    remove_empty_matches=False,
                    use_regex=False):
    r"""
    Splits a string at every unescaped occurrence of the given pattern. The
    newline character (\n) is not a natural split pattern (unless you specify
    it yourself). Escaped split-patterns are left untouched, only unescaped
    ones split the string.

    :param pattern:              The pattern that defines where to split.
    :param string:               The string that gets split.
    :param max_split:            Defines the maximum number of splits. If 0 or
                                 less is provided, the number of splits is not
                                 limited.
    :param remove_empty_matches: Defines whether empty entries should
                                 be removed from the result.
    :param use_regex:            Whether the split pattern is a regex (True)
                                 or a simple string (False).
    :return:                     An iterator returning the split up strings.
    """
    # Arguments for unescaped_search_for(): no extra regex flags and no match
    # limit, since limiting is done by _split() itself.
    search_args = (pattern, string, 0, 0, use_regex)

    return _split(string,
                  max_split,
                  remove_empty_matches,
                  unescaped_search_for,
                  *search_args)
159
160
161
def search_in_between(begin,
                      end,
                      string,
                      max_matches=0,
                      remove_empty_matches=False,
                      use_regex=False):
    r"""
    Finds the strings enclosed between a begin- and an end-sequence. Enclosed
    newlines (\n) become part of the result. Escape sequences are not handled.

    :param begin:                The pattern that defines where a match
                                 starts.
    :param end:                  The pattern that defines where a match ends.
    :param string:               The string that gets searched.
    :param max_matches:          Defines the maximum number of matches. If 0 or
                                 less is provided, the number of matches is not
                                 limited.
    :param remove_empty_matches: Defines whether empty entries should
                                 be removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Whether the begin and end patterns are
                                 regexes (True) or simple strings (False).
    :return:                     An iterator returning InBetweenMatch objects
                                 that hold information about the matched begin,
                                 inside and end string matched.
    """
    if use_regex:
        # The begin sequence has to be compiled to learn how many capturing
        # groups it brings along; they shift the indices of the groups below.
        begin_pattern_groups = re.compile(begin).groups
    else:
        begin, end = re.escape(begin), re.escape(end)
        # Escaped patterns cannot contain capturing groups.
        begin_pattern_groups = 0

    # Group 1 always spans the whole begin sequence. The groups of interest
    # following it are offset by the capturing groups inside of `begin`.
    inside_group = begin_pattern_groups + 2
    end_group = begin_pattern_groups + 3

    # Regex layout:
    # (begin) The begin sequence.
    # (.*?)   The enclosed string, matched lazily so the very next occurring
    #         end sequence terminates the match.
    # (end)   The end sequence.
    regex = "".join(("(", begin, ")(.*?)(", end, ")"))

    found = re.finditer(regex, string, re.DOTALL)

    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group,))

    for match in limit(found, max_matches):
        yield InBetweenMatch.from_values(match.group(1),
                                         match.start(1),
                                         match.group(inside_group),
                                         match.start(inside_group),
                                         match.group(end_group),
                                         match.start(end_group))
224
225
226
def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False,
                                use_regex=False):
    r"""
    Finds the strings enclosed between a begin- and an end-sequence. Enclosed
    newlines (\n) become part of the result.
    Escaped begin- and end-sequences are skipped, only unescaped ones delimit
    a match.
    CAUTION: Using the escape character '\' inside the begin- or end-sequence
             can lead to strange results. The backslash can interfere with
             the escaping regex-sequence used internally to match the
             enclosed string.

    :param begin:                The pattern that defines where a match
                                 starts.
    :param end:                  The pattern that defines where a match ends.
    :param string:               The string that gets searched.
    :param max_matches:          Defines the maximum number of matches. If 0 or
                                 less is provided, the number of matches is not
                                 limited.
    :param remove_empty_matches: Defines whether empty entries should
                                 be removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Whether the begin and end patterns are
                                 regexes (True) or simple strings (False).
    :return:                     An iterator returning InBetweenMatch objects
                                 built from the matched begin, inside and end
                                 strings.
    """
    if use_regex:
        # The begin sequence has to be compiled to learn how many capturing
        # groups it brings along; they shift the indices of the groups below.
        begin_pattern_groups = re.compile(begin).groups
    else:
        begin, end = re.escape(begin), re.escape(end)
        # Escaped patterns cannot contain capturing groups.
        begin_pattern_groups = 0

    # Group 1 always spans the whole begin sequence. The groups of interest
    # following it are offset by the capturing groups inside of `begin`.
    inside_group = begin_pattern_groups + 2
    escapes_group = begin_pattern_groups + 3
    end_group = begin_pattern_groups + 4

    # Regex layout:
    # (?<!\\)(?:\\\\)*   Makes sure the following sequence is unescaped: the
    #                    look-behind forbids a single backslash in front,
    #                    the second part consumes pairs of backslashes. So
    #                    only an even number of backslashes may precede.
    # (begin)            The begin sequence.
    # (.*?)              The enclosed string, matched lazily so the very next
    #                    unescaped end sequence terminates the match.
    # (?<!\\)((?:\\\\)*) The unescaping sequence again, but this time the
    #                    backslash pairs are captured as they belong to the
    #                    enclosed string.
    # (end)              The end sequence.
    regex = "".join((r"(?<!\\)(?:\\\\)*(",
                     begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(",
                     end,
                     ")"))

    found = re.finditer(regex, string, re.DOTALL)

    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group, escapes_group))

    for match in limit(found, max_matches):
        # The captured backslash pairs directly in front of the end sequence
        # are part of the enclosed string.
        inside = match.group(inside_group) + match.group(escapes_group)

        yield InBetweenMatch.from_values(match.group(1),
                                         match.start(1),
                                         inside,
                                         match.start(inside_group),
                                         match.group(end_group),
                                         match.start(end_group))
308
309
310
def escape(string, escape_chars, escape_with="\\"):
    """
    Escapes all chars given inside the given string.

    Every entry of ``escape_chars`` is processed one after another on the
    result of the previous replacement. So a char that appears more than once
    in ``escape_chars`` (or that is part of ``escape_with`` and listed later)
    gets escaped again.

    :param string:       The string where to escape characters.
    :param escape_chars: The string or Iterable that contains the characters
                         to escape. Each char inside this string will be
                         escaped in the order given. Duplicate chars are
                         allowed.
    :param escape_with:  The string that should be used as escape sequence.
    :return:             The escaped string.
    """
    # `char` instead of `chr` to not shadow the builtin chr().
    for char in escape_chars:
        string = string.replace(char, escape_with + char)

    return string
326
327
328
def unescape(string):
    """
    Trims off all escape characters from the given string.

    A backslash followed by any character (including a newline) is replaced
    by that character alone. A single backslash at the very end of the string
    that escapes nothing is removed.

    :param string: The string to unescape.
    :return:       The string without escape characters.
    """
    regex = r"\\(.)|\\$"

    # Group 1 is None when the trailing-backslash alternative matched. Return
    # an explicit empty string then instead of relying on how re.sub() treats
    # a None replacement.
    # `flags` is passed by keyword: handing it over positionally is deprecated.
    return re.sub(regex,
                  lambda match: match.group(1) or "",
                  string,
                  flags=re.DOTALL)
337
338
339
def position_is_escaped(string, position=None):
    """
    Checks whether a char at a specific position of the string is preceded by
    an odd number of backslashes.

    :param string:   Arbitrary string
    :param position: Position of character in string that should be checked.
                     If None (the default), the position right behind the
                     end of the string is checked. Positions beyond the end
                     of the string behave the same way.
    :return:         True if the character is escaped, False otherwise
    """
    # Slicing provides a sane default behaviour (None and out-of-range
    # positions) and prevents IndexErrors.
    prefix = string[:position]

    # The backslashes directly in front of the position are exactly the ones
    # rstrip() removes from the end of the prefix.
    preceding_backslashes = len(prefix) - len(prefix.rstrip("\\"))

    return preceding_backslashes % 2 == 1
357
358
359
def unescaped_rstrip(string):
    """
    Removes trailing whitespaces from the given string while respecting
    escape characters: a whitespace that is escaped by a backslash is kept.

    :param string: The string where to strip whitespaces from.
    :return:       The right-stripped string.
    """
    stripped = string.rstrip()
    whitespace_was_removed = len(stripped) < len(string)

    # `len(string)` lies at or behind the end of `stripped`, so the check
    # inspects the backslashes the stripped string ends with.
    if whitespace_was_removed and position_is_escaped(stripped, len(string)):
        # The first removed whitespace was escaped, put it back.
        return stripped + string[len(stripped)]

    return stripped
372
373
374
def unescaped_strip(string):
    """
    Removes leading and trailing whitespaces from the given string. On the
    right side escape characters are taken into account (see
    unescaped_rstrip()), the left side is stripped plainly.

    :param string: The string where to strip whitespaces from.
    :return:       The stripped string.
    """
    right_stripped = unescaped_rstrip(string)

    return right_stripped.lstrip()
383
384
385
def _nested_search_in_between(begin, end, string):
386
    """
387
    Searches for a string enclosed between a specified begin- and end-sequence.
388
    Matches infinite times.
389
390
    This is a function specifically designed to be invoked from
391
    nested_search_in_between().
392
393
    :param begin:  A regex pattern that defines where to start matching.
394
    :param end:    A regex pattern that defines where to end matching.
395
    :param string: The string where to search in.
396
    :return:       An iterator returning the matched strings.
397
    """
398
    # Regex explanation:
399
    # 1. (begin) A capturing group that matches the begin sequence.
400
    # 2. (end)   A capturing group that matches the end sequence. Because the
401
    #            1st group is lazy (matches as few times as possible) the next
402
    #            occurring end-sequence is matched.
403
    # The '|' in the regex matches either the first or the second part.
404
    regex = "(" + begin + ")|(" + end + ")"
405
406
    left_match = None
407
    nesting_level = 0
408
    for match in re.finditer(regex, string, re.DOTALL):
409
        if match.group(1) is not None:
410
            if nesting_level == 0:
411
                # Store the match of the first nesting level to be able to
412
                # return the string until the next fitting end sequence.
413
                left_match = match
414
            nesting_level += 1
415
        else:
416
            # The second group matched. This is the only alternative if group 1
417
            # didn't, otherwise no match would be performed. No need to compile
418
            # the begin and end sequences to get the number of capturing groups
419
            # in them.
420
            if nesting_level > 0:
421
                nesting_level -= 1
422
423
            if nesting_level == 0 and left_match != None:
424
                yield InBetweenMatch.from_values(
425
                    left_match.group(),
426
                    left_match.start(),
427
                    string[left_match.end() : match.start()],
428
                    left_match.end(),
429
                    match.group(),
430
                    match.start())
431
432
                left_match = None
433
434
435
def nested_search_in_between(begin,
                             end,
                             string,
                             max_matches=0,
                             remove_empty_matches=False,
                             use_regex=False):
    r"""
    Finds the strings enclosed between a begin- and an end-sequence. Enclosed
    newlines (\n) become part of the result. Escape sequences are not handled,
    but nesting is supported.

    Nested sequences are ignored during the match, so only the first nesting
    level is returned. To acquire deeper levels, invoke this function again
    on the returned value.

    Using the same begin- and end-sequence won't match anything.

    :param begin:                The pattern that defines where a match
                                 starts.
    :param end:                  The pattern that defines where a match ends.
    :param string:               The string that gets searched.
    :param max_matches:          Defines the maximum number of matches. If 0 or
                                 less is provided, the number of matches is not
                                 limited.
    :param remove_empty_matches: Defines whether empty entries should
                                 be removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Whether the begin and end patterns are
                                 regexes (True) or simple strings (False).
    :return:                     An iterator returning the matches produced
                                 by _nested_search_in_between().
    """
    if not use_regex:
        # Simple strings have to be matched literally by the regex engine.
        begin, end = re.escape(begin), re.escape(end)

    found = _nested_search_in_between(begin, end, string)

    if remove_empty_matches:
        # Drop every match whose enclosed part is the empty string.
        found = (match for match in found if str(match.inside) != "")

    return limit(found, max_matches)
479