Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

bm_php2py.c2u()   A

Complexity

Conditions 1

Size

Total Lines 18
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 4
nop 1
dl 0
loc 18
rs 10
c 0
b 0
f 0
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
4
# Copyright 2014-2018 by Christopher C. Little.
5
# This file is part of Abydos.
6
#
7
# Abydos is free software: you can redistribute it and/or modify
8
# it under the terms of the GNU General Public License as published by
9
# the Free Software Foundation, either version 3 of the License, or
10
# (at your option) any later version.
11
#
12
# Abydos is distributed in the hope that it will be useful,
13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
# GNU General Public License for more details.
16
#
17
# You should have received a copy of the GNU General Public License
18
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
19
20
"""bm_php2py.py.

This helper script converts Beider-Morse Phonetic Matching Algorithm (BMPM)
code from PHP to Python.

It assumes that the BMPM code is located at ../../bmpm (relative to this
directory in the abydos repository), unless a different directory is given
as the first command-line argument.

It reads the BMPM reference implementation and generates the file
../abydos/phonetic/_beider_morse_data.py.

The file _beider_morse.py may still need manual changes to be made after this
script is run.
"""
35
36
from __future__ import (
37
    absolute_import,
38
    division,
39
    print_function,
40
    unicode_literals,
41
)
42
43
import codecs
44
import re
45
import sys
46
from os import listdir
47
from os.path import isfile
48
49
# noinspection PyPackageRequirements
50
import chardet
51
52
# The list of languages from BMPM to support (might need to be updated or
# tuned as BMPM is updated). Order matters: each language's bit flag is
# derived from its position in this tuple.
lang_tuple = (
    'any',
    'arabic',
    'cyrillic',
    'czech',
    'dutch',
    'english',
    'french',
    'german',
    'greek',
    'greeklatin',
    'hebrew',
    'hungarian',
    'italian',
    'latvian',
    'polish',
    'portuguese',
    'romanian',
    'russian',
    'spanish',
    'turkish',
)

# Maps each language name to its bit flag (2**index in lang_tuple).
lang_dict = {lang: 2 ** idx for idx, lang in enumerate(lang_tuple)}
# 'common' is not a bit flag: it is emitted as the quoted string 'common'
# wherever pythonize() substitutes L_COMMON.
lang_dict['common'] = "'common'"

# Module-level state shared between pythonize() and _run_script().
# nl: True when the last emitted line was blank (used to collapse runs of
#     blank lines).
nl = False
# array_seen: True once a non-comment line containing 'array' has been seen
#     in the current input file; later lines are then indented.
array_seen = False

# tail_text: BMDATA assignments collected while converting, written at the
#     end of the generated file.
tail_text = ''
# sd: the BMPM subdirectory currently being converted.
sd = ''
87
88
89
def c2u(name):
    """Convert camelCase (used in PHP) to Python-standard snake_case.

    Src:
    https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case

    Parameters
    ----------
    name : str
        A function or variable name in camelCase

    Returns
    -------
    str
        The name in snake_case

    """
    # First pass: split before a capitalized word (an upper-case letter
    # followed by lower-case letters), e.g. 'HTTPResponse' -> 'HTTP_Response'.
    s1 = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Second pass: split between a lower-case letter or digit and the
    # upper-case letter that follows it, then lower-case the whole name.
    s1 = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
    return s1
107
108
109
def pythonize(line, fn='', subdir='gen'):
    """Convert a line of BMPM code from PHP to Python.

    The conversion is a fixed sequence of textual substitutions. It reads
    the module-level ``sd`` and ``lang_dict`` values and updates the
    module-level ``array_seen`` and ``nl`` flags.

    Parameters
    ----------
    line : str
        A line of code
    fn : str
        A filename
    subdir : str
        The file's subdirectory

    Returns
    -------
    str
        The code in Python, terminated by a newline. A lone newline is
        returned for the first blank line of a run of blank lines, and an
        empty string for suppressed lines and for further blank lines.

    """
    global array_seen, nl, sd

    if '$all' in line:
        return ''
    if 'make the sum of all languages be visible in the function' in line:
        return ''

    line = line.strip()

    if 'array' in line and not line.startswith('//'):
        array_seen = True

    line = re.sub('//+', '#', line)
    # line = re.sub('"\.\((\$.+?)\)\."', r'\1', line)
    if line and re.search(r'array\("[^"]+?"\)', line):
        # print("### " + line)
        line = ''
    line = line.replace('array', '')
    line = re.sub(r'^\s*', '', line)
    line = re.sub(';$', '', line)
    line = re.sub('^include_.+', '', line)

    # Name prefix of the generated per-subdirectory constants, e.g. '_GEN_'.
    var_prefix = '_' + subdir.upper() + '_'

    def _target(kind, lang):
        # Left-hand side of a BMDATA assignment, up to and including ' = '.
        return "BMDATA['" + subdir + "']['" + kind + "'][L_" + lang + '] = '

    def _merged(m):
        # '_merge($a, $b)' becomes the concatenation of the two constants.
        return (
            _target(m.group(1), c2u(m.group(2)).upper())
            + var_prefix
            + c2u(m.group(3)).upper()
            + ' + '
            + var_prefix
            + c2u(m.group(4)).upper()
        )

    # $approx[LanguageIndex("lang", $languages)] = $someRules
    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        + r'\$languages\)\] = \$([a-zA-Z]+)',
        lambda m: (
            _target(m.group(1), m.group(2).upper())
            + var_prefix
            + c2u(m.group(3)).upper()
        ),
        line,
    )

    # $approxLang = _merge($a, $b)
    line = re.sub(
        r'\$(approx|rules|exact|hebrew)([A-Za-z]+) = _merge'
        + r'\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        _merged,
        line,
    )

    # $approx[LanguageIndex("lang", $languages)] = _merge($a, $b)
    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        + r'\$languages\)\] = _merge\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        _merged,
        line,
    )

    # A PHP variable at the start of the line becomes a module constant
    # named after the subdirectory currently being processed (global sd).
    line = re.sub(
        r'^\$([a-zA-Z]+)',
        lambda m: '_' + sd.upper() + '_' + c2u(m.group(1)).upper(),
        line,
    )

    # Remove the spaces around '+' in sums of PHP variables so that the
    # bracket pattern below, which allows no spaces, can recognize them.
    # The '$' must be escaped (unescaped it is the end-of-string anchor and
    # the pattern can never match); the lookahead leaves the right-hand
    # variable unconsumed so chains such as '$a + $b + $c' collapse fully
    # in a single pass.
    line = re.sub(r'(\$[a-zA-Z]+) *\+ *(?=\$[a-zA-Z])', r'\1+', line)

    # Variables named after a known language become L_<LANGUAGE> constants;
    # any other variable is left untouched.
    line = re.sub(
        r'\$([a-zA-Z]+)',
        lambda m: (
            'L_' + m.group(1).upper()
            if m.group(1) in lang_dict
            else '$' + m.group(1)
        ),
        line,
    )
    # Unwrap PHP string concatenation: [".(L_X+L_Y)."] -> [L_X+L_Y]
    line = re.sub(r'\[\"\.\((L_[A-Z_+]+)\)\.\"\]', r'[\1]', line)

    # Replace each L_<LANGUAGE> constant by its value from lang_dict.
    line = re.sub(
        'L_([A-Z]+)', lambda m: str(lang_dict[m.group(1).lower()]), line
    )
    # Fold integer sums; each pass merges adjacent pairs, so several passes
    # are needed for longer chains.
    for _ in range(4):
        line = re.sub(
            r'([0-9]+) *\+ *([0-9]+)',
            lambda m: str(int(m.group(1)) + int(m.group(2))),
            line,
        )

    if fn == 'lang':
        parts = line.split(',')
        if len(parts) >= 3:
            # Strip the PHP regex delimiters from the first element.
            parts[0] = re.sub('/(.+?)/', r'\1', parts[0])
            # parts[1] = re.sub('\$', 'L_', parts[1])
            # parts[1] = re.sub(' *\+ *', '|', parts[1])
            # PHP true/false -> Python True/False
            parts[2] = parts[2].title()
            line = ','.join(parts)

    if 'languagenames' in fn:
        line = line.replace('"', "'")
        line = line.replace("','", "', '")
        if line and line[0] == "'":
            line = ' ' * 14 + line

    # fix upstream
    # line = line.replace('ë', 'ü')

    # Split the line into code and a trailing comment at the first '#'.
    code, hashsign, rest = line.partition('#')
    comment = (hashsign + rest).strip()
    code = code.rstrip()
    if code:
        # Two spaces between code and an inline comment.
        comment = '  ' + comment

    open_at = code.find('(')
    close_at = code.rfind(')')
    if open_at != -1 and open_at < close_at:
        prefix = code[: open_at + 1]
        suffix = code[close_at:]

        # NOTE(review): splitting on ',' also splits commas that occur
        # inside quoted elements; such elements are not re-quoted.
        elts = [elt.strip() for elt in code[open_at + 1 : close_at].split(',')]
        for idx, elt in enumerate(elts):
            # startswith/endswith are False for an empty element, where
            # indexing elt[0] would raise IndexError.
            if elt.startswith('"') and elt.endswith('"'):
                elts[idx] = "'" + elt[1:-1].replace("'", "\\'") + "'"

        code = prefix + ', '.join(elts) + suffix

    line = code + comment
    line = re.sub('# *', '# ', line)

    if line:
        nl = False
        if array_seen and not (line[0] == '_' or line.startswith('BMDATA')):
            line = ' ' * 4 + line
        return line + '\n'
    elif not nl:
        nl = True
        return '\n'
    else:
        return ''
298
299
300
def _run_script():
    """Generate _beider_morse_data.py from the BMPM PHP sources.

    The BMPM directory is taken from the first command-line argument if one
    is given, and defaults to ../../bmpm/ otherwise. The 'gen', 'sep' and
    'ash' subdirectories are scanned for .php files whose names start with
    one of the known prefixes; each is converted line by line with
    pythonize().

    The output file is written in two passes: first the raw conversion, with
    the collected BMDATA assignments appended at the end, then a clean-up
    pass that strips trailing whitespace, collapses runs of blank lines,
    joins a line ending in '=' with the line after it, and marks long lines
    with '# noqa: E501'.
    """
    global array_seen, nl, sd, tail_text

    if len(sys.argv) > 1:
        bmdir = sys.argv[1].rstrip('/') + '/'
    else:
        bmdir = '../../bmpm/'

    outfilename = '../abydos/phonetic/_beider_morse_data.py'

    # Only files whose names start with one of these are converted.
    prefixes = ('rules', 'approx', 'exact', 'hebrew', 'language', 'lang')

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        outfile.write(
            '# -*- coding: utf-8 -*-\n\n# Copyright 2014-2018 by \
Christopher C. Little.\n# This file is part of Abydos.\n#\n# This file is \
based on Alexander Beider and Stephen P. Morse\'s implementation\n# of the \
Beider-Morse Phonetic Matching (BMPM) System, available at\n# \
http://stevemorse.org/phonetics/bmpm.htm.\n#\n# Abydos is free software: \
you can redistribute it and/or modify\n# it under the terms of the GNU \
General Public License as published by\n# the Free Software Foundation, \
either version 3 of the License, or\n# (at your option) any later version.\n\
#\n# Abydos is distributed in the hope that it will be useful,\n# but WITHOUT \
ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or \
FITNESS FOR A PARTICULAR PURPOSE. See the\n# GNU General Public License for \
more details.\n#\n# You should have received a copy of the GNU General Public \
License\n# along with Abydos. If not, see <http://www.gnu.org/licenses/>.\n\n\
"""abydos.phonetic._beider_morse_data.\n\nBehind-the-scenes constants, \
rules, etc. for the Beider-Morse Phonentic\nMatching (BMPM) algorithm\n\nDO \
NOT EDIT - This document is automatically generated from the reference\n\
implementation in PHP.\n"""\n\nfrom \
__future__ import (\n    absolute_import,\n    division,\n    print_function,\n\
    unicode_literals,\n)\n'
        )

        outfile.write('L_NONE = 0\n')
        for i, lang in enumerate(lang_tuple):
            outfile.write('L_' + lang.upper() + ' = 2**' + str(i) + '\n')
        outfile.write('\n\n')

        tail_text += '\nBMDATA = {}\n'

        subdirs = ('gen', 'sep', 'ash')

        for s in subdirs:
            # pythonize() reads the current subdirectory from the global sd.
            sd = s
            entry = "BMDATA['" + s + "']"
            tail_text += '\n' + entry + ' = {}\n'
            tail_text += entry + "['approx'] = {}\n"
            tail_text += entry + "['exact'] = {}\n"
            tail_text += entry + "['rules'] = {}\n"
            tail_text += entry + "['hebrew'] = {}\n\n"
            tail_text += (
                entry
                + "['language_rules'] = _"
                + s.upper()
                + '_LANGUAGE_RULES\n'
            )
            tail_text += entry + "['languages'] = _" + s.upper() + '_LANGUAGES\n'

            phps = [
                f
                for f in sorted(listdir(bmdir + s + '/'))
                if (isfile(bmdir + s + '/' + f) and f.endswith('.php'))
            ]
            for infilename in phps:
                if not infilename.startswith(prefixes):
                    continue

                # Indentation state is tracked per input file.
                array_seen = False
                infilepath = bmdir + s + '/' + infilename
                # Detect the encoding from the raw bytes before decoding.
                with open(infilepath, 'rb') as rawfile:
                    infileenc = chardet.detect(rawfile.read())['encoding']
                print(s + '/' + infilename)  # noqa: T001

                with codecs.open(infilepath, 'r', infileenc) as infile:
                    outfile.write('# ' + s + '/' + infilename + '\n')

                    # Lines are skipped until one containing '*/' has been
                    # seen, and again after a line containing '?>'.
                    ignore = True
                    for line in infile:
                        if 'function Language' in line:
                            break
                        if not ignore:
                            if re.search(r'\?>', line):
                                ignore = True
                            else:
                                line = pythonize(line, infilename[:-4], s)
                                # BMDATA assignments are deferred to the
                                # end of the generated file.
                                if line.startswith('BMDATA'):
                                    tail_text += line
                                else:
                                    outfile.write(line)
                        if '*/' in line:
                            ignore = False

                outfile.write('\n\n')

        outfile.write(tail_text)

    # Second pass: re-read the generated file and rewrite it cleaned up.
    with codecs.open(outfilename, 'r', 'utf-8') as rawout:
        outfilelines = rawout.readlines()

    nl = False
    fixlanguagesarray = False

    sep_lang = (
        "('any', 'french', 'hebrew', 'italian', 'portuguese', 'spanish')"
    )

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        for line in outfilelines:
            line = line.rstrip()
            if line:
                if fixlanguagesarray:
                    # Continuation of a line that ended in '=': it is
                    # written on the same output line after one space.
                    line = ' ' + line.strip()
                    fixlanguagesarray = False
                if len(line) > 79 or sep_lang in line:
                    line += '  # noqa: E501'
                outfile.write(line)
                if not line.endswith('='):
                    outfile.write('\n')
                else:
                    fixlanguagesarray = True
                nl = False
            else:
                # Emit at most one blank line per run of blank lines.
                if not nl:
                    outfile.write('\n')
                nl = True

        outfile.write(
            "\n\nif __name__ == '__main__':\n    import doctest\n\n"
            '    doctest.testmod()\n'
        )
448
449
450
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    _run_script()
452