_lex()   F
last analyzed

Complexity

Conditions 29

Size

Total Lines 120

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 29
dl 0
loc 120
rs 2

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex methods like _lex() often do a lot of different things. To break such a method down, we need to identify a cohesive piece of work within it. A common approach to finding such a piece is to look for statements that share the same data, or branches that handle the same category of input.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# Sadly, re isn't available in rpython.
2
from rpython.rlib.rsre import rsre_core
3
from rpython.rlib.rsre.rpy import get_code
4
from rpython.rlib.rbigint import rbigint as RBigInt
5
6
from trifle_types import (
7
    OpenParen, CloseParen, OpenCurlyParen, CloseCurlyParen,
8
    Integer, Float, Fraction,
9
    TrifleExceptionInstance,
10
    Symbol, Keyword, List,
11
    String, Bytestring, Character,
12
    TRUE, FALSE, NULL)
13
from errors import LexFailed, division_by_zero, lex_failed
14
15
16
# Note this is an incomplete list and is purely to give us convenient
# constants.
WHITESPACE = 'whitespace'
COMMENT = 'comment'
OPEN_PAREN = 'open-paren'
CLOSE_PAREN = 'close-paren'
OPEN_CURLY_PAREN = 'open-curly-paren'
CLOSE_CURLY_PAREN = 'close-curly-paren'
INTEGER = 'integer'
FRACTION = 'fraction'
SYMBOL = 'symbol'
KEYWORD = 'keyword'
STRING = 'string'
CHARACTER = 'character'
BYTESTRING = 'bytestring'
FLOAT = 'float'
BOOLEAN = 'boolean'
NULL_TYPE = 'null type'
ATOM = 'atom'
HASH_LITERAL = 'hash_literal'

# Tokens are used to split strings into coarse categories.
# Order matters: split_tokens tries these in order and the first match wins.
TOKENS = [
    (WHITESPACE, get_code(r"(,|\s)+")),
    (COMMENT, get_code(";[^\n]*")),
    (OPEN_PAREN, get_code(r"\(")),
    (CLOSE_PAREN, get_code(r"\)")),
    (OPEN_CURLY_PAREN, get_code(r"\{")),
    (CLOSE_CURLY_PAREN, get_code(r"\}")),

    # ATOM is deliberately coarse: it covers integers, floats, fractions,
    # symbols and keywords, which _lex later distinguishes via LEXEMES.
    (ATOM, get_code('[:a-zA-Z0-9*/+?!<>=_.-]+')),
    (STRING, get_code(r'"([^"\\]|\\\\|\\n|\\")*\"')),
    (CHARACTER, get_code(r"'([^'\\]|\\\\|\\n|\\')'")),

    (BYTESTRING, get_code(r'#bytes\("[a-zA-Z0-9\\]*"\)')),

    (HASH_LITERAL, get_code('#[a-zA-Z]*')),
]
54
55
# After splitting, we lex properly.
# Each raw token from split_tokens is matched against these in order;
# the first match wins in _lex, so order is significant.
LEXEMES = [
    (OPEN_PAREN, get_code(r"\(")),
    (CLOSE_PAREN, get_code(r"\)")),
    (OPEN_CURLY_PAREN, get_code(r"\{")),
    (CLOSE_CURLY_PAREN, get_code(r"\}")),

    (STRING, get_code(r'"([^"\\]|\\\\|\\n|\\")*\"$')),
    (BYTESTRING, get_code(r'#bytes\("[a-zA-Z0-9\\]*"\)')),

    # Either: '\\', '\n', '\'' or a simple character between quotes: 'x'
    (CHARACTER, get_code(r"'([^'\\]|\\\\|\\n|\\')'")),

    # Underscores are permitted as digit separators in numeric literals.
    (FLOAT, get_code(r"-?[0-9_]+\.[0-9_]+$")),

    # TODO: support 0x123, 0o123
    (INTEGER, get_code('-?[0-9][0-9_]*$')),

    (FRACTION, get_code('-?[0-9_]+/[0-9_]+$')),

    (BOOLEAN, get_code('(#true|#false)$')),
    (NULL_TYPE, get_code('#null$')),

    # todoc: exactly what syntax we accept for symbols
    (SYMBOL, get_code('[a-zA-Z*/+?!<>=_-][a-zA-Z0-9*/+?!<>=_-]*$')),
    (KEYWORD, get_code(':[a-zA-Z*/+?!<>=_-][a-zA-Z0-9*/+?!<>=_-]*$')),
]

# NOTE(review): not referenced anywhere in this file's visible code --
# presumably used by another module, or dead; verify before removing.
DIGITS = u'0123456789'
84
85
86
def remove_char(string, unwanted_char):
    """Return a copy of string with every occurrence of unwanted_char
    removed.

    We build a list and join it rather than using str.replace or a
    comprehension, to keep RPython happy.

    """
    kept = []
    for current_char in string:
        if current_char == unwanted_char:
            continue
        kept.append(current_char)

    return "".join(kept)
97
98
def unescape_chars(string, quote_character):
    """Convert a string containing Trifle escape sequences (\\n, \\\\, and a
    backslash-escaped quote_character) to a list of Python unicode
    characters.  Unrecognised sequences pass through unchanged, one
    character at a time.

    >>> unescape_chars(u'a\\\\"b', u'"')
    [u'a', u'"', u'b']

    """
    chars = []

    while string:
        if string.startswith(u'\\n'):
            chars.append(u'\n')
            string = string[2:]
        elif string.startswith(u'\\\\'):
            chars.append(u'\\')
            string = string[2:]
        elif string.startswith(u'\\%s' % quote_character):
            chars.append(quote_character)
            string = string[2:]
        else:
            # Not an escape sequence: copy the character verbatim.
            chars.append(string[0])
            string = string[1:]

    return chars
123
124
125
def unescape_bytestring_chars(string):
    """Convert a string with Trifle bytestring escape sequences to a Python
    list of byte values (ints).

    Raises LexFailed on a malformed hexadecimal escape.

    >>> unescape_bytestring_chars(u'ab\\x00')
    [97, 98, 0]

    """
    # Characters legal after '\x' in a hexadecimal escape.  Hoisted out of
    # the loop: this list is invariant, so there is no need to rebuild it
    # on every iteration.
    valid_chars = [
        u'a', u'b', u'c', u'd', u'e', u'f',
        u'A', u'B', u'C', u'D', u'E', u'F',
        u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9'
    ]

    chars = []

    while string:
        if string.startswith(u"\\\\"):
            chars.append(ord('\\'))
            string = string[2:]

        # Convert hexadecimal escapes. E.g. \xFF -> 255
        # TODOC
        elif string.startswith(u'\\'):
            # A full escape is 4 characters: backslash, 'x', two hex digits.
            if len(string) < 4:
                # TODO: we should give examples of valid escape sequences.
                # (same for strings too)
                raise LexFailed(u"Invalid hexadecimal escape sequence: %s" % string)

            hexadecimal = string[1:4]

            if (hexadecimal[0] != u'x' or hexadecimal[1] not in valid_chars
                or hexadecimal[2] not in valid_chars):
                raise LexFailed(u"Invalid hexadecimal escape sequence: %s" % hexadecimal)

            chars.append(int(hexadecimal[1:].encode('utf-8'), 16))
            string = string[4:]

        else:
            char = string[0].encode('utf-8')
            # The [0] here is redundant, but RPython needs it to be
            # certain that we only pass a single character to
            # ord(). It can't see that one_char.encode('utf-8') has a length of 1.
            chars.append(ord(char[0]))
            string = string[1:]

    return chars
172
173
174
def split_tokens(text):
    """Given the raw text of a trifle program, split it into things that
    look like tokens.

    Whitespace and comments are discarded; everything else is returned
    as a list of raw token strings, in source order.

    """
    raw_tokens = []

    while text:
        matched = False

        for token_type, regexp in TOKENS:
            match = rsre_core.match(regexp, text)
            if not match:
                continue

            matched = True
            matched_text = text[:match.match_end]
            text = text[match.match_end:]

            # Whitespace and comments are insignificant, so drop them.
            if token_type not in [WHITESPACE, COMMENT]:
                raw_tokens.append(matched_text)

            break

        if not matched:
            # TODO: It would be nice to suggest where open
            # brackets/quotation marks started, to give the user a hint.
            raise LexFailed(u"Could not lex remainder: '%s'" % text)

    return raw_tokens
204
205
206
def _lex(tokens):
    """Given a Python list of unicodes that look roughly like tokens, lex
    them. Returns a Trifle exception if they aren't valid tokens.

    Each raw token is matched against LEXEMES in order; the first
    matching lexeme wins, so the order of LEXEMES is significant.
    On success, returns a Trifle List of lexed token objects.

    """
    lexed_tokens = []

    for token in tokens:
        found_match = False

        for lexeme_name, regexp in LEXEMES:
            match = rsre_core.match(regexp, token)
            if match:
                found_match = True

                if lexeme_name == OPEN_PAREN:
                    lexed_tokens.append(OpenParen())
                elif lexeme_name == CLOSE_PAREN:
                    lexed_tokens.append(CloseParen())

                elif lexeme_name == OPEN_CURLY_PAREN:
                    lexed_tokens.append(OpenCurlyParen())
                elif lexeme_name == CLOSE_CURLY_PAREN:
                    lexed_tokens.append(CloseCurlyParen())

                elif lexeme_name == BOOLEAN:
                    # The BOOLEAN regexp only matches #true or #false.
                    if token == u'#true':
                        lexed_tokens.append(TRUE)
                    else:
                        lexed_tokens.append(FALSE)
                elif lexeme_name == NULL_TYPE:
                    lexed_tokens.append(NULL)

                elif lexeme_name == INTEGER:
                    # Underscores are digit separators, e.g. 1_000.
                    integer_string = remove_char(token, "_")

                    # TODO: validate that the integer string is only numbers
                    lexed_tokens.append(Integer.fromstr(integer_string))

                elif lexeme_name == FLOAT:
                    float_string = remove_char(token, "_")
                    try:
                        lexed_tokens.append(Float(float(float_string)))
                    except ValueError:
                        return TrifleExceptionInstance(
                            lex_failed, u"Invalid float: '%s'" % token)

                elif lexeme_name == FRACTION:
                    # The FRACTION regexp guarantees exactly one '/', so
                    # splitting yields a numerator and a denominator.
                    fraction_string = remove_char(token, "_")
                    fraction_parts = fraction_string.split('/')
                    numerator = fraction_parts[0]
                    denominator = fraction_parts[1]

                    # TODO: validate that the fraction string is only numbers
                    numerator = RBigInt.fromstr(numerator)
                    denominator = RBigInt.fromstr(denominator)

                    if denominator.eq(RBigInt.fromint(0)):
                        return TrifleExceptionInstance(
                            division_by_zero,
                            u"Can't have fraction denominator of zero: '%s'" % token)

                    fraction = Fraction(numerator, denominator)

                    # If the fraction reduced to denominator 1 (e.g. 4/2),
                    # represent it as an Integer instead.
                    if fraction.denominator.eq(RBigInt.fromint(1)):
                        lexed_tokens.append(Integer(fraction.numerator))
                    else:
                        lexed_tokens.append(fraction)

                elif lexeme_name == SYMBOL:
                    lexed_tokens.append(Symbol(token))
                elif lexeme_name == KEYWORD:
                    # todoc
                    # Drop the leading ':' from the keyword token.
                    lexed_tokens.append(Keyword(token[1:]))
                elif lexeme_name == BYTESTRING:
                    # Strip the leading '#bytes("' (8 chars) and the
                    # trailing '")' (2 chars) from the token.
                    string_end = match.match_end - 2

                    # This is always true, but RPython doesn't support
                    # negative indexes on slices and can't prove the
                    # slice is non-negative.
                    if string_end >= 0:
                        contents = token[8:string_end]
                    else:
                        # Unreachable.
                        contents = u""

                    lexed_tokens.append(Bytestring(unescape_bytestring_chars(contents)))

                elif lexeme_name == STRING:
                    # Strip the surrounding double quotes.
                    string_end = match.match_end - 1

                    # This is always true, but RPython doesn't support
                    # negative indexes on slices and can't prove the
                    # slice is non-negative.
                    if string_end >= 0:

                        string_contents = token[1:string_end]

                        lexed_tokens.append(String(unescape_chars(string_contents, u'"')))
                elif lexeme_name == CHARACTER:

                    # TODO: use unescape_chars
                    if token == u"'\\n'":
                        lexed_tokens.append(Character(u'\n'))
                    elif token == u"'\\\\'":
                        lexed_tokens.append(Character(u'\\'))
                    elif token == u"'\\''":
                        lexed_tokens.append(Character(u"'"))
                    else:
                        # A plain character between quotes, e.g. 'x':
                        # index 1 is the character itself.
                        lexed_tokens.append(Character(token[1]))
                else:
                    # Every lexeme in LEXEMES should be handled above.
                    assert False, u"Unrecognised token '%s'" % token

                break

        if not found_match:
            return TrifleExceptionInstance(
                lex_failed, u"Could not lex token: '%s'" % token)

    return List(lexed_tokens)
326
327
328
def lex(text):
    """Lex the raw text of a Trifle program.

    Returns a Trifle List of lexed tokens on success, or a
    TrifleExceptionInstance on failure.

    """
    try:
        return _lex(split_tokens(text))
    except LexFailed as e:
        return TrifleExceptionInstance(lex_failed, e.message)
335