_lex()   F
last analyzed

Complexity

Conditions 29

Size

Total Lines 120

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 29
dl 0
loc 120
rs 2

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex methods like _lex() often do a lot of different things. To break such a method down, we need to identify a cohesive piece of work within it. A common approach to finding such a piece is to look for statements that share the same data, or branches that handle the same category of input.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# Sadly, re isn't available in rpython.
2
from rpython.rlib.rsre import rsre_core
3
from rpython.rlib.rsre.rpy import get_code
4
from rpython.rlib.rbigint import rbigint as RBigInt
5
6
from trifle_types import (
7
    OpenParen, CloseParen, OpenCurlyParen, CloseCurlyParen,
8
    Integer, Float, Fraction,
9
    TrifleExceptionInstance,
10
    Symbol, Keyword, List,
11
    String, Bytestring, Character,
12
    TRUE, FALSE, NULL)
13
from errors import LexFailed, division_by_zero, lex_failed
14
15
16
# Note this is an incomplete list and is purely to give us convenient
# constants.
WHITESPACE = 'whitespace'
COMMENT = 'comment'
OPEN_PAREN = 'open-paren'
CLOSE_PAREN = 'close-paren'
OPEN_CURLY_PAREN = 'open-curly-paren'
CLOSE_CURLY_PAREN = 'close-curly-paren'
INTEGER = 'integer'
FRACTION = 'fraction'
SYMBOL = 'symbol'
KEYWORD = 'keyword'
STRING = 'string'
CHARACTER = 'character'
BYTESTRING = 'bytestring'
FLOAT = 'float'
BOOLEAN = 'boolean'
NULL_TYPE = 'null type'
ATOM = 'atom'
HASH_LITERAL = 'hash_literal'

# Tokens are used to split strings into coarse categories.
# Order matters: split_tokens tries these in order and the first match wins.
TOKENS = [
    (WHITESPACE, get_code(r"(,|\s)+")),
    (COMMENT, get_code(";[^\n]*")),
    (OPEN_PAREN, get_code(r"\(")),
    (CLOSE_PAREN, get_code(r"\)")),
    (OPEN_CURLY_PAREN, get_code(r"\{")),
    (CLOSE_CURLY_PAREN, get_code(r"\}")),

    # ATOM is deliberately coarse: it covers integers, floats, fractions,
    # symbols and keywords, which _lex later distinguishes via LEXEMES.
    (ATOM, get_code('[:a-zA-Z0-9*/+?!<>=_.-]+')),
    (STRING, get_code(r'"([^"\\]|\\\\|\\n|\\")*\"')),
    (CHARACTER, get_code(r"'([^'\\]|\\\\|\\n|\\')'")),

    (BYTESTRING, get_code(r'#bytes\("[a-zA-Z0-9\\]*"\)')),

    (HASH_LITERAL, get_code('#[a-zA-Z]*')),
]
54
55
# After splitting, we lex properly.
# Each raw token from split_tokens is matched against these in order;
# the first match wins in _lex, so order is significant.
LEXEMES = [
    (OPEN_PAREN, get_code(r"\(")),
    (CLOSE_PAREN, get_code(r"\)")),
    (OPEN_CURLY_PAREN, get_code(r"\{")),
    (CLOSE_CURLY_PAREN, get_code(r"\}")),

    (STRING, get_code(r'"([^"\\]|\\\\|\\n|\\")*\"$')),
    (BYTESTRING, get_code(r'#bytes\("[a-zA-Z0-9\\]*"\)')),

    # Either: '\\', '\n', '\'' or a simple character between quotes: 'x'
    (CHARACTER, get_code(r"'([^'\\]|\\\\|\\n|\\')'")),

    # Underscores are permitted as digit separators in numeric literals.
    (FLOAT, get_code(r"-?[0-9_]+\.[0-9_]+$")),

    # TODO: support 0x123, 0o123
    (INTEGER, get_code('-?[0-9][0-9_]*$')),

    (FRACTION, get_code('-?[0-9_]+/[0-9_]+$')),

    (BOOLEAN, get_code('(#true|#false)$')),
    (NULL_TYPE, get_code('#null$')),

    # todoc: exactly what syntax we accept for symbols
    (SYMBOL, get_code('[a-zA-Z*/+?!<>=_-][a-zA-Z0-9*/+?!<>=_-]*$')),
    (KEYWORD, get_code(':[a-zA-Z*/+?!<>=_-][a-zA-Z0-9*/+?!<>=_-]*$')),
]

# NOTE(review): not referenced anywhere in this file's visible code --
# presumably used by another module, or dead; verify before removing.
DIGITS = u'0123456789'
84
85
86
def remove_char(string, unwanted_char):
    """Return a copy of string with every occurrence of unwanted_char
    removed.

    We build a list and join it rather than using str.replace or a
    comprehension, to keep RPython happy.

    """
    kept = []
    for current_char in string:
        if current_char == unwanted_char:
            continue
        kept.append(current_char)

    return "".join(kept)
97
98
def unescape_chars(string, quote_character):
    """Convert a string containing Trifle escape sequences (\\n, \\\\, and a
    backslash-escaped quote_character) to a list of Python unicode
    characters.  Unrecognised sequences pass through unchanged, one
    character at a time.

    >>> unescape_chars(u'a\\\\"b', u'"')
    [u'a', u'"', u'b']

    """
    chars = []

    while string:
        if string.startswith(u'\\n'):
            chars.append(u'\n')
            string = string[2:]
        elif string.startswith(u'\\\\'):
            chars.append(u'\\')
            string = string[2:]
        elif string.startswith(u'\\%s' % quote_character):
            chars.append(quote_character)
            string = string[2:]
        else:
            # Not an escape sequence: copy the character verbatim.
            chars.append(string[0])
            string = string[1:]

    return chars
123
124
125
def unescape_bytestring_chars(string):
    """Convert a string with Trifle bytestring escape sequences to a Python
    list of byte values (ints).

    Raises LexFailed on a malformed hexadecimal escape.

    >>> unescape_bytestring_chars(u'ab\\x00')
    [97, 98, 0]

    """
    # Characters legal after '\x' in a hexadecimal escape.  Hoisted out of
    # the loop: this list is invariant, so there is no need to rebuild it
    # on every iteration.
    valid_chars = [
        u'a', u'b', u'c', u'd', u'e', u'f',
        u'A', u'B', u'C', u'D', u'E', u'F',
        u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9'
    ]

    chars = []

    while string:
        if string.startswith(u"\\\\"):
            chars.append(ord('\\'))
            string = string[2:]

        # Convert hexadecimal escapes. E.g. \xFF -> 255
        # TODOC
        elif string.startswith(u'\\'):
            # A full escape is 4 characters: backslash, 'x', two hex digits.
            if len(string) < 4:
                # TODO: we should give examples of valid escape sequences.
                # (same for strings too)
                raise LexFailed(u"Invalid hexadecimal escape sequence: %s" % string)

            hexadecimal = string[1:4]

            if (hexadecimal[0] != u'x' or hexadecimal[1] not in valid_chars
                or hexadecimal[2] not in valid_chars):
                raise LexFailed(u"Invalid hexadecimal escape sequence: %s" % hexadecimal)

            chars.append(int(hexadecimal[1:].encode('utf-8'), 16))
            string = string[4:]

        else:
            char = string[0].encode('utf-8')
            # The [0] here is redundant, but RPython needs it to be
            # certain that we only pass a single character to
            # ord(). It can't see that one_char.encode('utf-8') has a length of 1.
            chars.append(ord(char[0]))
            string = string[1:]

    return chars
172
173
174
def split_tokens(text):
    """Given the raw text of a trifle program, split it into things that
    look like tokens.

    Whitespace and comments are discarded; everything else is returned
    as a list of raw token strings, in source order.

    """
    raw_tokens = []

    while text:
        matched = False

        for token_type, regexp in TOKENS:
            match = rsre_core.match(regexp, text)
            if not match:
                continue

            matched = True
            matched_text = text[:match.match_end]
            text = text[match.match_end:]

            # Whitespace and comments are insignificant, so drop them.
            if token_type not in [WHITESPACE, COMMENT]:
                raw_tokens.append(matched_text)

            break

        if not matched:
            # TODO: It would be nice to suggest where open
            # brackets/quotation marks started, to give the user a hint.
            raise LexFailed(u"Could not lex remainder: '%s'" % text)

    return raw_tokens
204
205
206
def _lex(tokens):
    """Given a Python list of unicodes that look roughly like tokens, lex
    them. Returns a Trifle exception if they aren't valid tokens.

    Each raw token is matched against LEXEMES in order; the first
    matching lexeme wins, so the order of LEXEMES is significant.
    On success, returns a Trifle List of lexed token objects.

    """
    lexed_tokens = []

    for token in tokens:
        found_match = False

        for lexeme_name, regexp in LEXEMES:
            match = rsre_core.match(regexp, token)
            if match:
                found_match = True

                if lexeme_name == OPEN_PAREN:
                    lexed_tokens.append(OpenParen())
                elif lexeme_name == CLOSE_PAREN:
                    lexed_tokens.append(CloseParen())

                elif lexeme_name == OPEN_CURLY_PAREN:
                    lexed_tokens.append(OpenCurlyParen())
                elif lexeme_name == CLOSE_CURLY_PAREN:
                    lexed_tokens.append(CloseCurlyParen())

                elif lexeme_name == BOOLEAN:
                    # The BOOLEAN regexp only matches #true or #false.
                    if token == u'#true':
                        lexed_tokens.append(TRUE)
                    else:
                        lexed_tokens.append(FALSE)
                elif lexeme_name == NULL_TYPE:
                    lexed_tokens.append(NULL)

                elif lexeme_name == INTEGER:
                    # Underscores are digit separators, e.g. 1_000.
                    integer_string = remove_char(token, "_")

                    # TODO: validate that the integer string is only numbers
                    lexed_tokens.append(Integer.fromstr(integer_string))

                elif lexeme_name == FLOAT:
                    float_string = remove_char(token, "_")
                    try:
                        lexed_tokens.append(Float(float(float_string)))
                    except ValueError:
                        return TrifleExceptionInstance(
                            lex_failed, u"Invalid float: '%s'" % token)

                elif lexeme_name == FRACTION:
                    # The FRACTION regexp guarantees exactly one '/', so
                    # splitting yields a numerator and a denominator.
                    fraction_string = remove_char(token, "_")
                    fraction_parts = fraction_string.split('/')
                    numerator = fraction_parts[0]
                    denominator = fraction_parts[1]

                    # TODO: validate that the fraction string is only numbers
                    numerator = RBigInt.fromstr(numerator)
                    denominator = RBigInt.fromstr(denominator)

                    if denominator.eq(RBigInt.fromint(0)):
                        return TrifleExceptionInstance(
                            division_by_zero,
                            u"Can't have fraction denominator of zero: '%s'" % token)

                    fraction = Fraction(numerator, denominator)

                    # If the fraction reduced to denominator 1 (e.g. 4/2),
                    # represent it as an Integer instead.
                    if fraction.denominator.eq(RBigInt.fromint(1)):
                        lexed_tokens.append(Integer(fraction.numerator))
                    else:
                        lexed_tokens.append(fraction)

                elif lexeme_name == SYMBOL:
                    lexed_tokens.append(Symbol(token))
                elif lexeme_name == KEYWORD:
                    # todoc
                    # Drop the leading ':' from the keyword token.
                    lexed_tokens.append(Keyword(token[1:]))
                elif lexeme_name == BYTESTRING:
                    # Strip the leading '#bytes("' (8 chars) and the
                    # trailing '")' (2 chars) from the token.
                    string_end = match.match_end - 2

                    # This is always true, but RPython doesn't support
                    # negative indexes on slices and can't prove the
                    # slice is non-negative.
                    if string_end >= 0:
                        contents = token[8:string_end]
                    else:
                        # Unreachable.
                        contents = u""

                    lexed_tokens.append(Bytestring(unescape_bytestring_chars(contents)))

                elif lexeme_name == STRING:
                    # Strip the surrounding double quotes.
                    string_end = match.match_end - 1

                    # This is always true, but RPython doesn't support
                    # negative indexes on slices and can't prove the
                    # slice is non-negative.
                    if string_end >= 0:

                        string_contents = token[1:string_end]

                        lexed_tokens.append(String(unescape_chars(string_contents, u'"')))
                elif lexeme_name == CHARACTER:

                    # TODO: use unescape_chars
                    if token == u"'\\n'":
                        lexed_tokens.append(Character(u'\n'))
                    elif token == u"'\\\\'":
                        lexed_tokens.append(Character(u'\\'))
                    elif token == u"'\\''":
                        lexed_tokens.append(Character(u"'"))
                    else:
                        # A plain character between quotes, e.g. 'x':
                        # index 1 is the character itself.
                        lexed_tokens.append(Character(token[1]))
                else:
                    # Every lexeme in LEXEMES should be handled above.
                    assert False, u"Unrecognised token '%s'" % token

                break

        if not found_match:
            return TrifleExceptionInstance(
                lex_failed, u"Could not lex token: '%s'" % token)

    return List(lexed_tokens)
326
327
328
def lex(text):
    """Lex the raw text of a Trifle program.

    Returns a Trifle List of lexed tokens on success, or a
    TrifleExceptionInstance on failure.

    """
    try:
        return _lex(split_tokens(text))
    except LexFailed as e:
        return TrifleExceptionInstance(lex_failed, e.message)
335