bm_php2py.c2u()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 18
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 4
nop 1
dl 0
loc 18
rs 10
c 0
b 0
f 0
1
#!/usr/bin/env python3
2
# Copyright 2014-2020 by Christopher C. Little.
3
# This file is part of Abydos.
4
#
5
# Abydos is free software: you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation, either version 3 of the License, or
8
# (at your option) any later version.
9
#
10
# Abydos is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
14
#
15
# You should have received a copy of the GNU General Public License
16
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
17
18
19
"""bm_php2py.py.
20
21
This helper script converts Beider-Morse Phonetic Matching Algorithm (BMPM)
22
code from PHP to Python.
23
24
It assumes that the BMPM code is located at ../../bmpm (relative to this
25
directory in the abydos repository).
26
27
It reads the BMPM reference implementation and generates the file
28
../abydos/_beider_morse_data.py.
29
30
The file _beider_morse.py may still need manual changes to be made after this
31
script is run.
32
"""
33
34
import codecs
35
import re
36
import sys
37
from os import listdir
38
from os.path import isfile
39
40
# noinspection PyPackageRequirements
41
import chardet
42
43
# The list of languages from BMPM to support (might need to be updated or
# tuned as BMPM is updated). The position of a language in this tuple
# determines its bit flag (2 ** index), so the order is significant.
lang_tuple = (
    'any',
    'arabic',
    'cyrillic',
    'czech',
    'dutch',
    'english',
    'french',
    'german',
    'greek',
    'greeklatin',
    'hebrew',
    'hungarian',
    'italian',
    'latvian',
    'polish',
    'portuguese',
    'romanian',
    'russian',
    'spanish',
    'turkish',
)

# Maps each language name to its bit flag. 'common' is not a language flag;
# it maps to the quoted string that is substituted verbatim into the output.
lang_dict = {lang: 2 ** idx for idx, lang in enumerate(lang_tuple)}
lang_dict['common'] = "'common'"

# Conversion state shared between pythonize() and _run_script():
# nl         -- True once a blank line has been emitted (collapses blank runs)
# array_seen -- True once a PHP array has been seen in the current file
nl = False
array_seen = False

# tail_text -- BMDATA assignments collected for the end of the output file
# sd        -- the BMPM subdirectory currently being converted
tail_text = ''
sd = ''
78
79
80
def c2u(name):
    """Convert camelCase (used in PHP) to Python-standard snake_case.

    Src:
    https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case

    Parameters
    ----------
    name : str
        A function or variable name in camelCase

    Returns
    -------
    str
        The name in snake_case

    """
    # First pass: split before a capitalized word ("fooBar" -> "foo_Bar",
    # "HTTPResponse" -> "HTTP_Response").
    partly_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Second pass: split any remaining lowercase/digit-to-uppercase boundary,
    # then lowercase the whole name.
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', partly_split).lower()
98
99
100
def pythonize(line, fn='', subdir='gen'):
    """Convert a line of BMPM code from PHP to Python.

    The conversion is stateful. It sets the module-level flag ``array_seen``
    once a PHP ``array`` has been encountered (after which lines that do not
    start with ``_`` or ``BMDATA`` are indented by four spaces), and it uses
    the module-level flag ``nl`` to collapse runs of empty lines into one.
    The module-level ``sd`` is read as the subdirectory prefix for a PHP
    variable that starts a line.

    Parameters
    ----------
    line : str
        A line of code
    fn : str
        A filename (the caller passes it without its extension)
    subdir : str
        The file's subdirectory

    Returns
    -------
    str
        The code in Python, terminated by a newline; a single newline for
        the first of a run of empty lines; or the empty string for lines
        that are dropped.

    """
    global array_seen, nl, sd

    # Lines that only exist to build PHP's "all languages" sum are dropped.
    if '$all' in line:
        return ''
    if 'make the sum of all languages be visible in the function' in line:
        return ''

    line = line.strip()

    if 'array' in line and not line.startswith('//'):
        array_seen = True

    # PHP comments become Python comments.
    line = re.sub('//+', '#', line)
    # A one-element array of a single string is discarded entirely.
    if line and re.search(r'array\("[^"]+?"\)', line):
        line = ''
    line = line.replace('array', '')
    line = re.sub(r'^\s*', '', line)
    line = re.sub(';$', '', line)
    line = re.sub('^include_.+', '', line)

    def _data_slot(kind, lang):
        # Left-hand side of an assignment into the BMDATA table.
        return "BMDATA['" + subdir + "']['" + kind + "'][L_" + lang + ']'

    def _const(php_name):
        # Name of the generated module-level constant for a PHP variable.
        return '_' + subdir.upper() + '_' + c2u(php_name).upper()

    # $approx[LanguageIndex("x", $languages)] = $someRules
    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        + r'\$languages\)\] = \$([a-zA-Z]+)',
        lambda m: (
            _data_slot(m.group(1), m.group(2).upper())
            + ' = '
            + _const(m.group(3))
        ),
        line,
    )

    # $approxFoo = _merge($a, $b)
    line = re.sub(
        r'\$(approx|rules|exact|hebrew)([A-Za-z]+) = _merge'
        + r'\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        lambda m: (
            _data_slot(m.group(1), c2u(m.group(2)).upper())
            + ' = '
            + _const(m.group(3))
            + ' + '
            + _const(m.group(4))
        ),
        line,
    )

    # $approx[LanguageIndex("x", $languages)] = _merge($a, $b)
    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        + r'\$languages\)\] = _merge\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        lambda m: (
            _data_slot(m.group(1), c2u(m.group(2)).upper())
            + ' = '
            + _const(m.group(3))
            + ' + '
            + _const(m.group(4))
        ),
        line,
    )

    # A PHP variable at the start of the line is a definition; note that the
    # prefix comes from the module-level ``sd``, not the ``subdir`` argument.
    line = re.sub(
        r'^\$([a-zA-Z]+)',
        lambda m: '_' + sd.upper() + '_' + c2u(m.group(1)).upper(),
        line,
    )

    # Remove the spaces around '+' between PHP variables so that sums of
    # language flags can be recognised below. The '$' must be escaped (an
    # unescaped '$' is the end-of-string anchor and can never be followed by
    # letters); the lookahead leaves the right-hand operand unconsumed so
    # chains such as "$a + $b + $c" are handled in a single pass.
    line = re.sub(r'(\$[a-zA-Z]+) *\+ *(?=\$[a-zA-Z])', r'\1+', line)

    # Known language variables become L_<LANG>; anything else is left alone.
    line = re.sub(
        r'\$([a-zA-Z]+)',
        lambda m: (
            'L_' + m.group(1).upper()
            if m.group(1) in lang_dict
            else '$' + m.group(1)
        ),
        line,
    )
    # PHP string interpolation of a flag sum: [".(L_A+L_B)."] -> [L_A+L_B]
    line = re.sub(r'\[\"\.\((L_[A-Z_+]+)\)\.\"\]', r'[\1]', line)

    # Replace each language constant with its numeric flag value...
    line = re.sub(
        'L_([A-Z]+)', lambda m: str(lang_dict[m.group(1).lower()]), line
    )
    # ...and fold sums of numbers. Each pass folds non-overlapping pairs, so
    # several passes are needed for longer chains.
    for _ in range(4):
        line = re.sub(
            r'([0-9]+) *\+ *([0-9]+)',
            lambda m: str(int(m.group(1)) + int(m.group(2))),
            line,
        )

    if fn == 'lang':
        parts = line.split(',')
        if len(parts) >= 3:
            # Strip the PHP regex delimiters and turn true/false into
            # True/False.
            parts[0] = re.sub('/(.+?)/', r'\1', parts[0])
            parts[2] = parts[2].title()
            line = ','.join(parts)

    if 'languagenames' in fn:
        line = line.replace('"', "'")
        line = line.replace("','", "', '")
        if line and line[0] == "'":
            line = ' ' * 14 + line

    # Split the line into code and trailing comment at the first '#'.
    code, hashsign, remainder = line.partition('#')
    comment = (hashsign + remainder).strip()
    code = code.rstrip()
    if code.strip():
        comment = '  ' + comment

    if '(' in code and ')' in code:
        prefix = code[: code.find('(') + 1]
        suffix = code[code.rfind(')'):]
        tuplecontent = code[len(prefix): len(code) - len(suffix)]

        elts = []
        for elt in tuplecontent.split(','):
            elt = elt.strip()
            # startswith/endswith are safe on empty elements, which occur
            # for "()" and after a trailing comma.
            if elt.startswith('"') and elt.endswith('"'):
                elt = "'" + elt[1:-1].replace("'", "\\'") + "'"
            elts.append(elt)

        code = prefix + ', '.join(elts) + suffix

    line = code + comment
    line = re.sub('# *', '# ', line)

    if line:
        nl = False
        if array_seen and not (line[0] == '_' or line.startswith('BMDATA')):
            line = ' ' * 4 + line
        return line + '\n'
    if not nl:
        nl = True
        return '\n'
    return ''
289
290
291
def _run_script():
    """Generate the Beider-Morse data module from the BMPM PHP sources.

    The BMPM directory is taken from the first command-line argument if one
    is given, otherwise ``../../bmpm/`` is used. For each of the ``gen``,
    ``sep`` and ``ash`` subdirectories, every ``.php`` file whose name starts
    with one of the handled prefixes is converted line by line with
    ``pythonize``. Converted lines that start with ``BMDATA`` are collected
    in the module-level ``tail_text`` and written after everything else.

    The output file is then re-read and rewritten to collapse blank lines,
    join an assignment ending in ``=`` with the line that follows it, and
    append ``# noqa: E501`` to over-long lines.

    This function updates the module-level ``array_seen``, ``nl``, ``sd``
    and ``tail_text``.
    """
    global array_seen, nl, sd, tail_text

    if len(sys.argv) > 1:
        bmdir = sys.argv[1].rstrip('/') + '/'
    else:
        bmdir = '../../bmpm/'

    outfilename = '../abydos/phonetic/_beider_morse_data.py'

    # Only files whose names start with one of these are converted.
    handled_prefixes = (
        'rules',
        'approx',
        'exact',
        'hebrew',
        'language',
        'lang',
    )

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        outfile.write(
            '# Copyright 2014-2020 by \
Christopher C. Little.\n# This file is part of Abydos.\n#\n# This file is \
based on Alexander Beider and Stephen P. Morse\'s implementation\n# of the \
Beider-Morse Phonetic Matching (BMPM) System, available at\n# \
http://stevemorse.org/phonetics/bmpm.htm.\n#\n# Abydos is free software: \
you can redistribute it and/or modify\n# it under the terms of the GNU \
General Public License as published by\n# the Free Software Foundation, \
either version 3 of the License, or\n# (at your option) any later version.\n\
#\n# Abydos is distributed in the hope that it will be useful,\n# but WITHOUT \
ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or \
FITNESS FOR A PARTICULAR PURPOSE. See the\n# GNU General Public License for \
more details.\n#\n# You should have received a copy of the GNU General Public \
License\n# along with Abydos. If not, see <http://www.gnu.org/licenses/>.\n\n\
"""abydos.phonetic._beider_morse_data.\n\nBehind-the-scenes constants, \
rules, etc. for the Beider-Morse Phonentic\nMatching (BMPM) algorithm\n\nDO \
NOT EDIT - This document is automatically generated from the reference\n\
implementation in PHP.\n"""\n\nfrom \
__future__ import (\n    absolute_import,\n    division,\n    print_function,\
    unicode_literals,\n)\n'
        )

        outfile.write('L_NONE = 0\n')
        for idx, lang in enumerate(lang_tuple):
            outfile.write('L_' + lang.upper() + ' = 2**' + str(idx) + '\n')
        outfile.write('\n\n')

        tail_text += '\nBMDATA = {}  # type: ignore\n'

        for s in ('gen', 'sep', 'ash'):
            # pythonize() reads the current subdirectory from this global.
            sd = s
            slot = "BMDATA['" + s + "']"
            tail_text += '\n' + slot + ' = {}\n'
            tail_text += slot + "['approx'] = {}\n"
            tail_text += slot + "['exact'] = {}\n"
            tail_text += slot + "['rules'] = {}\n"
            tail_text += slot + "['hebrew'] = {}\n\n"
            tail_text += (
                slot + "['language_rules'] = _" + s.upper() + '_LANGUAGE_RULES\n'
            )
            tail_text += slot + "['languages'] = _" + s.upper() + '_LANGUAGES\n'

            subdir_path = bmdir + s + '/'
            phps = [
                f
                for f in sorted(listdir(subdir_path))
                if isfile(subdir_path + f) and f.endswith('.php')
            ]
            for infilename in phps:
                if not infilename.startswith(handled_prefixes):
                    continue

                # Indentation state is tracked per input file.
                array_seen = False
                infilepath = subdir_path + infilename
                with open(infilepath, 'rb') as rawfile:
                    detected = chardet.detect(rawfile.read())['encoding']
                # chardet reports None when it cannot decide; without an
                # encoding codecs.open() would yield bytes, so fall back to
                # UTF-8.
                infileenc = detected or 'utf-8'
                print(s + '/' + infilename)  # noqa: T001

                with codecs.open(infilepath, 'r', infileenc) as infile:
                    outfile.write('# ' + s + '/' + infilename + '\n')

                    # Everything up to the end of the leading PHP block
                    # comment ('*/') is skipped, as is everything from '?>'.
                    ignore = True
                    for line in infile:
                        if 'function Language' in line:
                            break
                        if not ignore:
                            if re.search(r'\?>', line):
                                ignore = True
                            else:
                                line = pythonize(line, infilename[:-4], s)
                                if line.startswith('BMDATA'):
                                    tail_text += line
                                else:
                                    outfile.write(line)
                        if '*/' in line:
                            ignore = False

                outfile.write('\n\n')

        outfile.write(tail_text)

    # Second pass: tidy the file that was just written.
    with codecs.open(outfilename, 'r', 'utf-8') as written:
        outfilelines = written.readlines()

    nl = False
    fixlanguagesarray = False

    sep_lang = (
        "('any', 'french', 'hebrew', 'italian', 'portuguese', 'spanish')"
    )

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        for line in outfilelines:
            line = line.rstrip()
            if not line:
                # Emit at most one blank line in a row.
                if not nl:
                    outfile.write('\n')
                nl = True
                continue

            if fixlanguagesarray:
                # Continuation of an assignment whose '=' ended the previous
                # line: it is written on the same output line.
                line = ' ' + line.strip()
                fixlanguagesarray = False
            if len(line) > 79 or sep_lang in line:
                line += '  # noqa: E501'
            outfile.write(line)
            if line.endswith('='):
                fixlanguagesarray = True
            else:
                outfile.write('\n')
            nl = False

        outfile.write(
            "\n\nif __name__ == '__main__':\n    import doctest\n\n"
            "    doctest.testmod()\n"
        )
435
436
437
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    _run_script()
439