Completed
Pull Request — master (#141)
by Chris
11:04
created

bm_php2py._run_script()   F

Complexity

Conditions 54

Size

Total Lines 386
Code Lines 257

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 54
eloc 257
nop 0
dl 0
loc 386
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex classes like bm_php2py._run_script() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
4
# Copyright 2014-2018 by Christopher C. Little.
5
# This file is part of Abydos.
6
#
7
# Abydos is free software: you can redistribute it and/or modify
8
# it under the terms of the GNU General Public License as published by
9
# the Free Software Foundation, either version 3 of the License, or
10
# (at your option) any later version.
11
#
12
# Abydos is distributed in the hope that it will be useful,
13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
# GNU General Public License for more details.
16
#
17
# You should have received a copy of the GNU General Public License
18
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
19
20
21
"""bm_php2py.py.
22
23
This helper script converts Beider-Morse Phonetic Matching Algorithm (BMPM)
24
code from PHP to Python.
25
26
It assumes that the BMPM code is located at ../../bmpm (relative to this
27
directory in the abydos repository).
28
29
It reads the BMPM reference implementation and generates the file
30
../abydos/phonetic/_beider_morse_data.py.
31
32
The file _beider_morse.py may still need manual changes to be made after this
33
script is run.
34
"""
35
36
from __future__ import (
37
    absolute_import,
38
    division,
39
    print_function,
40
    unicode_literals,
41
)
42
43
import codecs
44
import re
45
import sys
46
from os import listdir
47
from os.path import isfile
48
49
# noinspection PyPackageRequirements
50
import chardet
51
52
53
# The list of languages from BMPM to support (might need to be updated or
# tuned as BMPM is updated)
_LANG_TUPLE = (
    'any',
    'arabic',
    'cyrillic',
    'czech',
    'dutch',
    'english',
    'french',
    'german',
    'greek',
    'greeklatin',
    'hebrew',
    'hungarian',
    'italian',
    'latvian',
    'polish',
    'portuguese',
    'romanian',
    'russian',
    'spanish',
    'turkish',
)

# Language name -> bit flag (2**index in _LANG_TUPLE). 'common' is not a
# flag; it maps to the quoted text "'common'" that is emitted verbatim.
_LANG_DICT = {name: 2 ** idx for idx, name in enumerate(_LANG_TUPLE)}
_LANG_DICT['common'] = "'common'"

# BMPM sub-directories that are converted, in output order.
_SUBDIRS = ('gen', 'sep', 'ash')

# Only PHP files whose names start with one of these prefixes are converted.
_PHP_PREFIXES = ('rules', 'approx', 'exact', 'hebrew', 'language', 'lang')

# Text written at the top of the generated module. This must NOT be a raw
# string: the "\n" sequences have to become real newlines in the output.
# NOTE(review): 'print_function,' and 'unicode_literals,' share one output
# line, exactly as the original literal's line continuation joins them.
_HEADER = (
    '# -*- coding: utf-8 -*-\n'
    '\n'
    '# Copyright 2014-2018 by Christopher C. Little.\n'
    '# This file is part of Abydos.\n'
    '#\n'
    "# This file is based on Alexander Beider and Stephen P. Morse's "
    'implementation\n'
    '# of the Beider-Morse Phonetic Matching (BMPM) System, available at\n'
    '# http://stevemorse.org/phonetics/bmpm.htm.\n'
    '#\n'
    '# Abydos is free software: you can redistribute it and/or modify\n'
    '# it under the terms of the GNU General Public License as published '
    'by\n'
    '# the Free Software Foundation, either version 3 of the License, or\n'
    '# (at your option) any later version.\n'
    '#\n'
    '# Abydos is distributed in the hope that it will be useful,\n'
    '# but WITHOUT ANY WARRANTY; without even the implied warranty of\n'
    '# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n'
    '# GNU General Public License for more details.\n'
    '#\n'
    '# You should have received a copy of the GNU General Public License\n'
    '# along with Abydos. If not, see <http://www.gnu.org/licenses/>.\n'
    '\n'
    '"""abydos.phonetic._beider_morse_data.\n'
    '\n'
    'Behind-the-scenes constants, rules, etc. for the Beider-Morse '
    'Phonentic\n'
    'Matching (BMPM) algorithm\n'
    '\n'
    'DO NOT EDIT - This document is automatically generated from the '
    'reference\n'
    'implementation in PHP.\n'
    '"""\n'
    '# pylint: disable=line-too-long\n'
    '\n'
    'from __future__ import (\n'
    '    absolute_import,\n'
    '    division,\n'
    '    print_function,    unicode_literals,\n'
    ')\n'
)


def _c2u(name):
    """Convert camelCase (used in PHP) to Python-standard snake_case.

    Src:
    https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case

    Parameters
    ----------
    name : str
        A function or variable name in camelCase

    Returns
    -------
    str
        The name in snake_case

    """
    partial = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', partial).lower()


def _const_name(subdir, php_name):
    """Return the constant name ``_<SUBDIR>_<PHP_NAME>`` for a PHP variable."""
    return '_' + subdir.upper() + '_' + _c2u(php_name).upper()


def _bmdata_slot(subdir, kind, lang):
    """Return the text ``BMDATA['<subdir>']['<kind>'][L_<lang>]``."""
    return "BMDATA['" + subdir + "']['" + kind + "'][L_" + lang + ']'


def _merged_assignment(subdir, match):
    """Render a PHP ``_merge($a, $b)`` assignment as ``slot = _A + _B``."""
    return (
        _bmdata_slot(subdir, match.group(1), _c2u(match.group(2)).upper())
        + ' = '
        + _const_name(subdir, match.group(3))
        + ' + '
        + _const_name(subdir, match.group(4))
    )


def _requote(elt):
    """Turn a double-quoted element into a single-quoted one.

    Elements that are not wrapped in double quotes (including the empty
    string produced by ``()`` or a trailing comma) are returned unchanged.
    """
    if elt.startswith('"') and elt.endswith('"'):
        return "'" + elt[1:-1].replace("'", "\\'") + "'"
    return elt


def _pythonize(line, fn='', subdir='gen', state=None):
    """Convert a line of BMPM code from PHP to Python.

    Parameters
    ----------
    line : str
        A line of code
    fn : str
        A filename (without the ``.php`` extension)
    subdir : str
        The file's subdirectory
    state : dict
        Mutable conversion state shared between calls, with the keys
        ``'nl'`` (the previous emitted line was blank) and ``'array_seen'``
        (an ``array`` has been seen in the current file). A fresh state is
        used when omitted.

    Returns
    -------
    str
        The code in Python, newline-terminated; ``'\\n'`` for the first of
        a run of blank results and ``''`` for the rest of the run

    """
    if state is None:
        state = {'nl': False, 'array_seen': False}

    if '$all' in line:
        return ''
    if 'make the sum of all languages be visible in the function' in line:
        return ''

    line = line.strip()

    if 'array' in line and not line.startswith('//'):
        state['array_seen'] = True

    line = re.sub('//+', '#', line)
    if line and re.search(r'array\("[^"]+?"\)', line):
        line = ''
    line = line.replace('array', '')
    line = re.sub(r'^\s*', '', line)
    line = re.sub(';$', '', line)
    line = re.sub('^include_.+', '', line)

    # $kind[LanguageIndex("lang", $languages)] = $someRules
    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        r'\$languages\)\] = \$([a-zA-Z]+)',
        lambda m: (
            _bmdata_slot(subdir, m.group(1), m.group(2).upper())
            + ' = '
            + _const_name(subdir, m.group(3))
        ),
        line,
    )

    # $kindLang = _merge($a, $b)
    line = re.sub(
        r'\$(approx|rules|exact|hebrew)([A-Za-z]+) = _merge'
        r'\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        lambda m: _merged_assignment(subdir, m),
        line,
    )

    # $kind[LanguageIndex("lang", $languages)] = _merge($a, $b)
    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        r'\$languages\)\] = _merge\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        lambda m: _merged_assignment(subdir, m),
        line,
    )

    # A leading PHP variable becomes a module-level constant name.
    line = re.sub(
        r'^\$([a-zA-Z]+)',
        lambda m: _const_name(subdir, m.group(1)),
        line,
    )

    # NOTE(review): the '$' in this pattern is unescaped, so it anchors at
    # end-of-string and the substitution never matches. It is kept
    # byte-for-byte because escaping it could change the generated data;
    # confirm the intent before fixing.
    for _ in range(len(_LANG_TUPLE)):
        line = re.sub(r'($[a-zA-Z]+) *\+ *($[a-zA-Z]+)', r'\1\+\2', line)

    # Remaining $language variables become L_<LANGUAGE> names ...
    line = re.sub(
        r'\$([a-zA-Z]+)',
        lambda m: (
            'L_' + m.group(1).upper()
            if m.group(1) in _LANG_DICT
            else '$' + m.group(1)
        ),
        line,
    )
    line = re.sub(r'\[\"\.\((L_[A-Z_+]+)\)\.\"\]', r'[\1]', line)

    # ... which are then replaced by their values, and sums are folded.
    line = re.sub(
        'L_([A-Z]+)', lambda m: str(_LANG_DICT[m.group(1).lower()]), line
    )
    for _ in range(4):
        line = re.sub(
            r'([0-9]+) *\+ *([0-9]+)',
            lambda m: str(int(m.group(1)) + int(m.group(2))),
            line,
        )

    if fn == 'lang':
        parts = line.split(',')
        if len(parts) >= 3:
            # Strip the PHP regex delimiters and title-case true/false.
            parts[0] = re.sub('/(.+?)/', r'\1', parts[0])
            parts[2] = parts[2].title()
            line = ','.join(parts)

    if 'languagenames' in fn:
        line = line.replace('"', "'")
        line = line.replace("','", "', '")
        if line and line[0] == "'":
            line = ' ' * 14 + line

    # Split the line into code and trailing comment.
    hashsign = line.find('#')
    if hashsign >= 0:
        code, comment = line[:hashsign], line[hashsign:]
    else:
        code, comment = line, ''

    code = code.rstrip()
    comment = comment.strip()
    if code:
        comment = '  ' + comment

    if '(' in code and ')' in code:
        prefix = code[: code.find('(') + 1]
        suffix = code[code.rfind(')'):]
        tuplecontent = code[len(prefix): len(code) - len(suffix)]
        elts = [_requote(elt.strip()) for elt in tuplecontent.split(',')]
        code = prefix + ', '.join(elts) + suffix

    line = code + comment
    line = re.sub('# *', '# ', line)

    if line:
        state['nl'] = False
        is_assignment = line[0] == '_' or line.startswith('BMDATA')
        if state['array_seen'] and not is_assignment:
            line = ' ' * 4 + line
        return line + '\n'
    if not state['nl']:
        state['nl'] = True
        return '\n'
    return ''


def _bmdata_preamble(subdir):
    """Return the BMDATA initialisation text for one BMPM subdirectory."""
    key = "BMDATA['" + subdir + "']"
    return (
        '\n' + key + ' = {}\n'
        + key + "['approx'] = {}\n"
        + key + "['exact'] = {}\n"
        + key + "['rules'] = {}\n"
        + key + "['hebrew'] = {}\n\n"
        + key + "['language_rules'] = _" + subdir.upper()
        + '_LANGUAGE_RULES\n'
        + key + "['languages'] = _" + subdir.upper() + '_LANGUAGES\n'
    )


def _convert_php_file(infilepath, fn, subdir, state, outfile, tail_parts):
    """Convert one PHP file, writing data to outfile and BMDATA to the tail.

    Conversion starts after the first line containing ``*/``, pauses at a
    line containing ``?>`` and stops at ``function Language``.
    """
    with open(infilepath, 'rb') as rawfile:
        infileenc = chardet.detect(rawfile.read())['encoding']

    with codecs.open(infilepath, 'r', infileenc) as infile:
        ignore = True
        for line in infile:
            if 'function Language' in line:
                break
            if not ignore:
                if re.search(r'\?>', line):
                    ignore = True
                else:
                    line = _pythonize(line, fn, subdir, state)
                    if line.startswith('BMDATA'):
                        tail_parts.append(line)
                    else:
                        outfile.write(line)
            if '*/' in line:
                ignore = False


def _write_raw_module(bmdir, outfilename):
    """Write the unpolished generated module (first pass)."""
    # One shared state object: 'nl' carries over between files, while
    # 'array_seen' is reset at the start of every file.
    state = {'nl': False, 'array_seen': False}
    tail_parts = ['\nBMDATA = {}\n']

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        outfile.write(_HEADER)

        outfile.write('L_NONE = 0\n')
        for idx, lang in enumerate(_LANG_TUPLE):
            outfile.write('L_' + lang.upper() + ' = 2**' + str(idx) + '\n')
        outfile.write('\n\n')

        for subdir in _SUBDIRS:
            tail_parts.append(_bmdata_preamble(subdir))
            subdir_path = bmdir + subdir + '/'
            phps = [
                f
                for f in sorted(listdir(subdir_path))
                if isfile(subdir_path + f) and f.endswith('.php')
            ]
            for infilename in phps:
                if not infilename.startswith(_PHP_PREFIXES):
                    continue
                state['array_seen'] = False
                print(subdir + '/' + infilename)  # noqa: T001
                outfile.write('# ' + subdir + '/' + infilename + '\n')
                _convert_php_file(
                    subdir_path + infilename,
                    infilename[:-4],
                    subdir,
                    state,
                    outfile,
                    tail_parts,
                )
                outfile.write('\n\n')

        # The BMDATA assignments refer to the constants above, so they go
        # at the end of the module.
        outfile.write(''.join(tail_parts))


def _tidy_module(outfilename):
    """Rewrite the generated module in place (second pass).

    Trailing whitespace is stripped, runs of blank lines are collapsed to
    one, a line ending in ``=`` is joined with the following line, and
    over-long lines are tagged with ``# noqa: E501``.
    """
    with codecs.open(outfilename, 'r', 'utf-8') as infile:
        outfilelines = infile.readlines()

    sep_lang = (
        "('any', 'french', 'hebrew', 'italian', 'portuguese', 'spanish')"
    )
    nl = False
    fixlanguagesarray = False

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        for line in outfilelines:
            line = line.rstrip()
            if not line:
                if not nl:
                    outfile.write('\n')
                nl = True
                continue

            if fixlanguagesarray:
                # Continuation of a line that ended with '='.
                line = ' ' + line.strip()
                fixlanguagesarray = False
            if len(line) > 79 or sep_lang in line:
                line += '  # noqa: E501'
            outfile.write(line)
            if line.endswith('='):
                fixlanguagesarray = True
            else:
                outfile.write('\n')
            nl = False


def _run_script():
    """Generate ``_beider_morse_data.py`` from the BMPM PHP sources.

    The BMPM directory is taken from the first command-line argument if
    given, and defaults to ``../../bmpm/``. The result is written to
    ``../abydos/phonetic/_beider_morse_data.py`` relative to the current
    working directory.
    """
    if len(sys.argv) > 1:
        bmdir = sys.argv[1].rstrip('/') + '/'
    else:
        bmdir = '../../bmpm/'

    outfilename = '../abydos/phonetic/_beider_morse_data.py'

    _write_raw_module(bmdir, outfilename)
    _tidy_module(outfilename)
439
440
441
if __name__ == '__main__':
    # Run the conversion only when executed as a script, not on import.
    _run_script()
443