bm_php2py.c2u() - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

bm_php2py.c2u() A

↳ Parent: bm_php2py

Complexity

Conditions

Size

Total Lines	18
Code Lines	4

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	4
nop	1
dl	0
loc	18
rs	10
c	0
b	0
f	0

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.


"""bm_php2py.py.

This helper script converts Beider-Morse Phonetic Matching Algorithm (BMPM)
code from PHP to Python.

It assumes that the BMPM code is located at ../../bmpm (relative to this
directory in the abydos repository).

It reads the BMPM reference implementation and generates the file
../abydos/phonetic/_beider_morse_data.py.

The file _beider_morse.py may still need manual changes to be made after this
script is run.
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

import codecs
import re
import sys
from os import listdir
from os.path import isfile

# noinspection PyPackageRequirements
import chardet

# The list of languages from BMPM to support (might need to be updated or
# tuned as BMPM is updated)
lang_tuple = (
    'any',
    'arabic',
    'cyrillic',
    'czech',
    'dutch',
    'english',
    'french',
    'german',
    'greek',
    'greeklatin',
    'hebrew',
    'hungarian',
    'italian',
    'latvian',
    'polish',
    'portuguese',
    'romanian',
    'russian',
    'spanish',
    'turkish',
)

# Each language gets the bit flag 2**(its position in lang_tuple). Built with
# a comprehension so no loop variables are left behind at module level.
lang_dict = {name: 2 ** idx for idx, name in enumerate(lang_tuple)}
# 'common' is not a bit flag: it maps to the quoted literal text 'common'.
lang_dict['common'] = "'common'"

# Shared conversion state, mutated by pythonize() and _run_script().
# nl: True once a blank line has been emitted, so that consecutive blank
# lines are collapsed into one.
nl = False
# array_seen: set when a non-comment line containing 'array' is seen; reset
# by _run_script() at the start of each input file. Controls indentation.
array_seen = False

# tail_text: accumulates the BMDATA[...] assignments, which _run_script()
# writes after all of the converted rule tables.
tail_text = ''
# sd: the subdirectory currently being processed (set by _run_script());
# used by pythonize() to prefix converted variable names.
sd = ''


def c2u(name):
    """Convert camelCase (used in PHP) to Python-standard snake_case.

    Src:
    https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case

    Parameters
    ----------
    name : str
        A function or variable name in camelCase

    Returns
    -------
    str
        The name in snake_case

    """
    # Step 1: put an underscore in front of every capitalised word, i.e. an
    # uppercase letter followed by one or more lowercase letters.
    word_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Step 2: split any remaining lowercase/digit -> uppercase transition
    # (e.g. the start of an acronym).
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', word_split)
    return fully_split.lower()


def pythonize(line, fn='', subdir='gen'):
    """Convert a line of BMPM code from PHP to Python.

    In addition to returning the converted text, this function updates two
    module-level flags: ``array_seen`` (set once a non-comment line containing
    ``array`` is encountered) and ``nl`` (tracks whether the previous result
    was a blank line, so that runs of blank lines collapse to one). It also
    reads the module-level ``sd`` to prefix converted variable names.

    Parameters
    ----------
    line : str
        A line of code
    fn : str
        A filename
    subdir : str
        The file's subdirectory

    Returns
    -------
    str
        The code in Python, terminated by a newline; a bare newline for the
        first blank line after content; or the empty string when the line is
        dropped or would repeat a blank line.

    """
    global array_seen, nl, sd

    # Lines dealing with the PHP "$all" languages sum have no Python
    # counterpart and are dropped outright.
    if '$all' in line:
        return ''
    if 'make the sum of all languages be visible in the function' in line:
        return ''

    line = line.strip()

    if 'array' in line and not line.startswith('//'):
        array_seen = True

    # PHP '//' comments become Python '#' comments.
    line = re.sub('//+', '#', line)
    # A single-string array("...") line is discarded entirely.
    if line and re.search(r'array\("[^"]+?"\)', line):
        line = ''
    line = line.replace('array', '')
    line = re.sub(r'^\s*', '', line)
    line = re.sub(';$', '', line)
    line = re.sub('^include_.+', '', line)

    # $approx[LanguageIndex("x", $languages)] = $name
    #   -> BMDATA['<subdir>']['approx'][L_X] = _<SUBDIR>_<NAME>
    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        + r'\$languages\)\] = \$([a-zA-Z]+)',
        lambda m: (
            "BMDATA['"
            + subdir
            + "']['"
            + m.group(1)
            + "'][L_"
            + m.group(2).upper()
            + '] = _'
            + subdir.upper()
            + '_'
            + c2u(m.group(3)).upper()
        ),
        line,
    )

    # $approxXxx = _merge($a, $b)
    #   -> BMDATA['<subdir>']['approx'][L_XXX] = _<SUBDIR>_<A> + _<SUBDIR>_<B>
    line = re.sub(
        r'\$(approx|rules|exact|hebrew)([A-Za-z]+) = _merge'
        + r'\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        lambda m: (
            "BMDATA['"
            + subdir
            + "']['"
            + m.group(1)
            + "'][L_"
            + c2u(m.group(2)).upper()
            + '] = _'
            + subdir.upper()
            + '_'
            + c2u(m.group(3)).upper()
            + ' + _'
            + subdir.upper()
            + '_'
            + c2u(m.group(4)).upper()
        ),
        line,
    )

    # $approx[LanguageIndex("x", $languages)] = _merge($a, $b)
    #   -> BMDATA['<subdir>']['approx'][L_X] = _<SUBDIR>_<A> + _<SUBDIR>_<B>
    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        + r'\$languages\)\] = _merge\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        lambda m: (
            "BMDATA['"
            + subdir
            + "']['"
            + m.group(1)
            + "'][L_"
            + c2u(m.group(2)).upper()
            + '] = _'
            + subdir.upper()
            + '_'
            + c2u(m.group(3)).upper()
            + ' + _'
            + subdir.upper()
            + '_'
            + c2u(m.group(4)).upper()
        ),
        line,
    )

    # A leading $variable becomes a module-level constant named after the
    # subdirectory currently being processed (module-level ``sd``).
    line = re.sub(
        r'^\$([a-zA-Z]+)',
        lambda m: '_' + sd.upper() + '_' + c2u(m.group(1)).upper(),
        line,
    )

    # NOTE(review): a loop used to run here applying
    #   re.sub(r'($[a-zA-Z]+) *\+ *($[a-zA-Z]+)', r'\1\+\2', line)
    # once per language. Because '$' is unescaped it is an end-of-string
    # anchor, so the pattern could never match and the loop had no effect.
    # It has been removed rather than "repaired" to keep the generated output
    # unchanged; sums of language flags are still folded numerically below.

    # Remaining $name references that are known languages become L_NAME.
    line = re.sub(
        r'\$([a-zA-Z]+)',
        lambda m: (
            'L_' + m.group(1).upper()
            if m.group(1) in lang_dict
            else '$' + m.group(1)
        ),
        line,
    )
    # Unwrap PHP string interpolation: [".(L_X)."] -> [L_X]
    line = re.sub(r'\[\"\.\((L_[A-Z_+]+)\)\.\"\]', r'[\1]', line)

    # Replace each L_NAME with its numeric flag value...
    line = re.sub(
        'L_([A-Z]+)', lambda m: str(lang_dict[m.group(1).lower()]), line
    )
    # ...and fold "a + b" integer sums; each pass folds non-overlapping
    # pairs, so several passes are needed for longer chains.
    for _ in range(4):
        line = re.sub(
            r'([0-9]+) *\+ *([0-9]+)',
            lambda m: str(int(m.group(1)) + int(m.group(2))),
            line,
        )

    if fn == 'lang':
        parts = line.split(',')
        if len(parts) >= 3:
            # Strip the PHP regex delimiters and title-case the third field.
            parts[0] = re.sub('/(.+?)/', r'\1', parts[0])
            parts[2] = parts[2].title()
            line = ','.join(parts)

    if 'languagenames' in fn:
        line = line.replace('"', "'")
        line = line.replace("','", "', '")
        if line and line[0] == "'":
            line = ' ' * 14 + line

    # Split the line into code and trailing comment at the first '#'.
    comment = ''
    if '#' in line:
        hashsign = line.find('#')
        comment = line[hashsign:]
        code = line[:hashsign]
    else:
        code = line

    code = code.rstrip()
    comment = comment.strip()
    if not re.match(r'^\s*$', code):
        # Two spaces between code and an inline comment.
        comment = '  ' + comment

    if '(' in code and ')' in code:
        prefix = code[: code.find('(') + 1]
        suffix = code[code.rfind(')') :]
        tuplecontent = code[len(prefix) : len(code) - len(suffix)]

        elts = [elt.strip() for elt in tuplecontent.split(',')]
        for idx, elt in enumerate(elts):
            # Guard against empty elements (e.g. "()" or a trailing comma),
            # which previously raised IndexError on elt[0].
            if elt and elt[0] == '"' and elt[-1] == '"':
                # Re-quote with single quotes, escaping embedded ones.
                elts[idx] = "'" + elt[1:-1].replace("'", "\\'") + "'"
        tuplecontent = ', '.join(elts)

        code = prefix + tuplecontent + suffix

    line = code + comment
    line = re.sub('# *', '# ', line)

    if line:
        nl = False
        # Inside an array, everything except assignments to generated names
        # (leading '_') and BMDATA entries is indented as a tuple element.
        if array_seen and not (line[0] == '_' or line.startswith('BMDATA')):
            line = ' ' * 4 + line
        return line + '\n'
    elif not nl:
        nl = True
        return '\n'
    else:
        return ''


def _run_script():
    """Generate ``_beider_morse_data.py`` from the BMPM PHP sources.

    The BMPM directory is taken from the first command-line argument, or
    defaults to ``../../bmpm/``. For each of the ``gen``, ``sep`` and ``ash``
    subdirectories, every ``.php`` file whose name starts with one of the
    recognised prefixes is converted line by line with ``pythonize()``.
    Converted ``BMDATA[...]`` assignments are collected in the module-level
    ``tail_text`` and written after all rule tables. The output file is then
    re-read and rewritten to collapse blank lines, join a line ending in
    ``=`` with the line that follows it, and append ``# noqa: E501`` to long
    lines.

    All files are opened with context managers so that handles are closed
    (and the output flushed) even if conversion fails part-way.
    """
    global array_seen, nl, sd, tail_text

    if len(sys.argv) > 1:
        bmdir = sys.argv[1].rstrip('/') + '/'
    else:
        bmdir = '../../bmpm/'

    outfilename = '../abydos/phonetic/_beider_morse_data.py'
    subdirs = ('gen', 'sep', 'ash')
    # Only files whose names start with one of these are converted.
    prefixes = ('rules', 'approx', 'exact', 'hebrew', 'language', 'lang')

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        outfile.write(
            '# -*- coding: utf-8 -*-\n\n# Copyright 2014-2018 by \
Christopher C. Little.\n# This file is part of Abydos.\n#\n# This file is \
based on Alexander Beider and Stephen P. Morse\'s implementation\n# of the \
Beider-Morse Phonetic Matching (BMPM) System, available at\n# \
http://stevemorse.org/phonetics/bmpm.htm.\n#\n# Abydos is free software: \
you can redistribute it and/or modify\n# it under the terms of the GNU \
General Public License as published by\n# the Free Software Foundation, \
either version 3 of the License, or\n# (at your option) any later version.\n\
#\n# Abydos is distributed in the hope that it will be useful,\n# but WITHOUT \
ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or \
FITNESS FOR A PARTICULAR PURPOSE. See the\n# GNU General Public License for \
more details.\n#\n# You should have received a copy of the GNU General Public \
License\n# along with Abydos. If not, see <http://www.gnu.org/licenses/>.\n\n\
"""abydos.phonetic._beider_morse_data.\n\nBehind-the-scenes constants, \
rules, etc. for the Beider-Morse Phonentic\nMatching (BMPM) algorithm\n\nDO \
NOT EDIT - This document is automatically generated from the reference\n\
implementation in PHP.\n"""\n\nfrom \
__future__ import (\n    absolute_import,\n    division,\n\
    print_function,\n    unicode_literals,\n)\n'
        )

        # Language bit-flag constants.
        outfile.write('L_NONE = 0\n')
        for idx, lang in enumerate(lang_tuple):
            outfile.write('L_' + lang.upper() + ' = 2**' + str(idx) + '\n')
        outfile.write('\n\n')

        tail_text += '\nBMDATA = {}\n'

        for s in subdirs:
            sd = s  # pythonize() reads this to prefix generated names
            tail_text += '\nBMDATA[\'' + s + '\'] = {}\n'
            tail_text += 'BMDATA[\'' + s + '\'][\'approx\'] = {}\n'
            tail_text += 'BMDATA[\'' + s + '\'][\'exact\'] = {}\n'
            tail_text += 'BMDATA[\'' + s + '\'][\'rules\'] = {}\n'
            tail_text += 'BMDATA[\'' + s + '\'][\'hebrew\'] = {}\n\n'
            tail_text += (
                'BMDATA[\''
                + s
                + '\'][\'language_rules\'] = _'
                + s.upper()
                + '_LANGUAGE_RULES\n'
            )
            tail_text += (
                'BMDATA[\''
                + s
                + '\'][\'languages\'] = _'
                + s.upper()
                + '_LANGUAGES\n'
            )

            phps = [
                f
                for f in sorted(listdir(bmdir + s + '/'))
                if (isfile(bmdir + s + '/' + f) and f.endswith('.php'))
            ]
            for infilename in phps:
                if not infilename.startswith(prefixes):
                    continue

                array_seen = False
                infilepath = bmdir + s + '/' + infilename
                # Sniff the encoding from the raw bytes before decoding.
                with open(infilepath, 'rb') as rawfile:
                    infileenc = chardet.detect(rawfile.read())['encoding']
                print(s + '/' + infilename)  # noqa: T001

                with codecs.open(infilepath, 'r', infileenc) as infile:
                    outfile.write('# ' + s + '/' + infilename + '\n')

                    # Skip everything up to the end of the leading block
                    # comment, and again after a closing '?>' tag.
                    ignore = True
                    for line in infile:
                        if 'function Language' in line:
                            break
                        if not ignore:
                            if re.search(r'\?>', line):
                                ignore = True
                            else:
                                line = pythonize(line, infilename[:-4], s)
                                if line.startswith('BMDATA'):
                                    tail_text += line
                                else:
                                    outfile.write(line)
                        if '*/' in line:
                            ignore = False

                outfile.write('\n\n')

        outfile.write(tail_text)

    # Second pass: tidy the generated file in place.
    with codecs.open(outfilename, 'r', 'utf-8') as generated:
        outfilelines = generated.readlines()

    nl = False
    fixlanguagesarray = False

    sep_lang = (
        "('any', 'french', 'hebrew', 'italian', 'portuguese', 'spanish')"
    )

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        for line in outfilelines:
            line = line.rstrip()
            if line:
                if fixlanguagesarray:
                    # Continuation of a line that ended in '=': join it onto
                    # that line with a single space.
                    line = ' ' + line.strip()
                    fixlanguagesarray = False
                if len(line) > 79 or sep_lang in line:
                    line += '  # noqa: E501'
                outfile.write(line)
                if not line.endswith('='):
                    outfile.write('\n')
                else:
                    # Hold the newline so the next line is appended here.
                    fixlanguagesarray = True
                nl = False
            else:
                # Collapse runs of blank lines into a single blank line.
                if not nl:
                    outfile.write('\n')
                nl = True

        outfile.write(
            "\n\nif __name__ == '__main__':\n"
            '    import doctest\n\n'
            '    doctest.testmod()\n'
        )


if __name__ == '__main__':
    _run_script()


1			#!/usr/bin/env python
2			# -- coding: utf-8 --
3
4			# Copyright 2014-2018 by Christopher C. Little.
5			# This file is part of Abydos.
6			#
7			# Abydos is free software: you can redistribute it and/or modify
8			# it under the terms of the GNU General Public License as published by
9			# the Free Software Foundation, either version 3 of the License, or
10			# (at your option) any later version.
11			#
12			# Abydos is distributed in the hope that it will be useful,
13			# but WITHOUT ANY WARRANTY; without even the implied warranty of
14			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15			# GNU General Public License for more details.
16			#
17			# You should have received a copy of the GNU General Public License
18			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
19
20
21			"""bm_php2py.py.
22
23			This helper script converts Beider-Morse Phonetic Matching Algorithm (BMPM)
24			code from PHP to Python.
25
26			It assumes that the BMPM code is located at ../../bmpm (relative to this
27			directory in the abydos repository).
28
29			It reads the BMPM reference implementation and generates the file
30			../abydos/_beider_morse_data.py.
31
32			The file _beider_morse.py may still need manual changes to be made after this
33			script is run.
34			"""
35
36			from __future__ import (
37			absolute_import,
38			division,
39			print_function,
40			unicode_literals,
41			)
42
43			import codecs
44			import re
45			import sys
46			from os import listdir
47			from os.path import isfile
48
49			# noinspection PyPackageRequirements
50			import chardet
51
52			# The list of languages from BMPM to support (might need to be updated or
53			# tuned as BMPM is updated)
54			lang_tuple = (
55			'any',
56			'arabic',
57			'cyrillic',
58			'czech',
59			'dutch',
60			'english',
61			'french',
62			'german',
63			'greek',
64			'greeklatin',
65			'hebrew',
66			'hungarian',
67			'italian',
68			'latvian',
69			'polish',
70			'portuguese',
71			'romanian',
72			'russian',
73			'spanish',
74			'turkish',
75			)
76
77			lang_dict = {}
78			for i, l in enumerate(lang_tuple):
79			lang_dict[l] = 2 ** i
80			lang_dict['common'] = "'common'"
81
82			nl = False
83			array_seen = False
84
85			tail_text = ''
86			sd = ''
87
88
89			def c2u(name):
90			"""Convert camelCase (used in PHP) to Python-standard snake_case.
91
92			Src:
93			https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
94
95			Parameters
96			----------
97			name: A function or variable name in camelCase
98
99			Returns
100			-------
101			str: The name in snake_case
102
103			"""
104			s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
105			s1 = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
106			return s1
107
108
109			def pythonize(line, fn='', subdir='gen'):
110			"""Convert a line of BMPM code from PHP to Python.
111
112			Parameters
113			----------
114			line : str
115			A line of code
116			fn : str
117			A filename
118			subdir : str
119			The file's subdirectory
120
121			Returns
122			-------
123			The code in Python
124
125			"""
126			global array_seen, nl, sd
127
128			if '$all' in line:
129			return ''
130			if 'make the sum of all languages be visible in the function' in line:
131			return ''
132
133			line = line.strip()
134
135			if 'array' in line and not line.startswith('//'):
136			array_seen = True
137
138			line = re.sub('//+', '#', line)
139			# line = re.sub('"\.\((\$.+?)\)\."', r'\1', line)
140			if line and re.search(r'array\("[^"]+?"\)', line):
141			# print("### " + line)
142			line = ''
143			line = line.replace('array', '')
144			line = re.sub(r'^\s*', '', line)
145			line = re.sub(';$', '', line)
146			line = re.sub('^include_.+', '', line)
147
148			line = re.sub(
149			r'\$(approx\|rules\|exact)\[LanguageIndex\("([^"]+)", '
150			+ r'\$languages\)\] = \$([a-zA-Z]+)',
151			lambda m: (
152			"BMDATA['"
153			+ subdir
154			+ "']['"
155			+ m.group(1)
156			+ "'][L_"
157			+ m.group(2).upper()
158			+ '] = _'
159			+ subdir.upper()
160			+ '_'
161			+ c2u(m.group(3)).upper()
162			),
163			line,
164			)
165
166			line = re.sub(
167			r'\$(approx\|rules\|exact\|hebrew)([A-Za-z]+) = _merge'
168			+ r'\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
169			lambda m: (
170			"BMDATA['"
171			+ subdir
172			+ "']['"
173			+ m.group(1)
174			+ "'][L_"
175			+ c2u(m.group(2)).upper()
176			+ '] = _'
177			+ subdir.upper()
178			+ '_'
179			+ c2u(m.group(3)).upper()
180			+ ' + _'
181			+ subdir.upper()
182			+ '_'
183			+ c2u(m.group(4)).upper()
184			),
185			line,
186			)
187
188			line = re.sub(
189			r'\$(approx\|rules\|exact)\[LanguageIndex\("([^"]+)", '
190			+ r'\$languages\)\] = _merge\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
191			lambda m: (
192			"BMDATA['"
193			+ subdir
194			+ "']['"
195			+ m.group(1)
196			+ "'][L_"
197			+ c2u(m.group(2)).upper()
198			+ '] = _'
199			+ subdir.upper()
200			+ '_'
201			+ c2u(m.group(3)).upper()
202			+ ' + _'
203			+ subdir.upper()
204			+ '_'
205			+ c2u(m.group(4)).upper()
206			),
207			line,
208			)
209
210			line = re.sub(
211			r'^\$([a-zA-Z]+)',
212			lambda m: '_' + sd.upper() + '_' + c2u(m.group(1)).upper(),
213			line,
214			)
215
216			for _ in range(len(lang_tuple)):
217			line = re.sub(r'($[a-zA-Z]+) \+ ($[a-zA-Z]+)', r'\1\+\2', line)
218
219			line = re.sub(
220			r'\$([a-zA-Z]+)',
221			lambda m: (
222			'L_' + m.group(1).upper()
223			if m.group(1) in lang_dict
224			else '$' + m.group(1)
225			),
226			line,
227			)
228			line = re.sub(r'\[\"\.\((L_[A-Z_+]+)\)\.\"\]', r'[\1]', line)
229
230			line = re.sub(
231			'L_([A-Z]+)', lambda m: str(lang_dict[m.group(1).lower()]), line
232			)
233			for _ in range(4):
234			line = re.sub(
235			r'([0-9]+) \+ ([0-9]+)',
236			lambda m: str(int(m.group(1)) + int(m.group(2))),
237			line,
238			)
239
240			if fn == 'lang':
241			if len(line.split(',')) >= 3:
242			parts = line.split(',')
243			parts[0] = re.sub('/(.+?)/', r'\1', parts[0])
244			# parts[1] = re.sub('\$', 'L_', parts[1])
245			# parts[1] = re.sub(' \+ ', '\|', parts[1])
246			parts[2] = parts[2].title()
247			line = ','.join(parts)
248
249			if 'languagenames' in fn:
250			line = line.replace('"', "'")
251			line = line.replace("','", "', '")
252			if line and line[0] == "'":
253			line = ' ' * 14 + line
254
255			# fix upstream
256			# line = line.replace('ë', 'ü')
257
258			comment = ''
259			if '#' in line:
260			hashsign = line.find('#')
261			comment = line[hashsign:]
262			code = line[:hashsign]
263			else:
264			code = line
265
266			code = code.rstrip()
267			comment = comment.strip()
268			if not re.match(r'^\s*$', code):
269			comment = ' ' + comment
270
271			if '(' in code and ')' in code:
272			prefix = code[: code.find('(') + 1]
273			suffix = code[code.rfind(')') :]
274			tuplecontent = code[len(prefix) : len(code) - len(suffix)]
275
276			elts = tuplecontent.split(',')
277			for i in range(len(elts)):
278			elts[i] = elts[i].strip()
279			if elts[i][0] == '"' and elts[i][-1] == '"':
280			elts[i] = "'" + elts[i][1:-1].replace("'", "\\'") + "'"
281			tuplecontent = ', '.join(elts)
282
283			code = prefix + tuplecontent + suffix
284
285			line = code + comment
286			line = re.sub('# *', '# ', line)
287
288			if line:
289			nl = False
290			if array_seen and not (line[0] == '_' or line.startswith('BMDATA')):
291			line = ' ' * 4 + line
292			return line + '\n'
293			elif not nl:
294			nl = True
295			return '\n'
296			else:
297			return ''
298
299
300			def _run_script():
301			global array_seen, nl, sd, tail_text
302
303			if len(sys.argv) > 1:
304			bmdir = sys.argv[1].rstrip('/') + '/'
305			else:
306			bmdir = '../../bmpm/'
307
308			outfilename = '../abydos/phonetic/_beider_morse_data.py'
309			outfile = codecs.open(outfilename, 'w', 'utf-8')
310
311			outfile.write(
312			'# -- coding: utf-8 --\n\n# Copyright 2014-2018 by \
313			Christopher C. Little.\n# This file is part of Abydos.\n#\n# This file is \
314			based on Alexander Beider and Stephen P. Morse\'s implementation\n# of the \
315			Beider-Morse Phonetic Matching (BMPM) System, available at\n# \
316			http://stevemorse.org/phonetics/bmpm.htm.\n#\n# Abydos is free software: \
317			you can redistribute it and/or modify\n# it under the terms of the GNU \
318			General Public License as published by\n# the Free Software Foundation, \
319			either version 3 of the License, or\n# (at your option) any later version.\n\
320			#\n# Abydos is distributed in the hope that it will be useful,\n# but WITHOUT \
321			ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or \
322			FITNESS FOR A PARTICULAR PURPOSE. See the\n# GNU General Public License for \
323			more details.\n#\n# You should have received a copy of the GNU General Public \
324			License\n# along with Abydos. If not, see <http://www.gnu.org/licenses/>.\n\n\
325			"""abydos.phonetic._beider_morse_data.\n\nBehind-the-scenes constants, \
326			rules, etc. for the Beider-Morse Phonentic\nMatching (BMPM) algorithm\n\nDO \
327			NOT EDIT - This document is automatically generated from the reference\n\
328			implementation in PHP.\n"""\n\nfrom \
329			__future__ import (\n absolute_import,\n division,\n print_function,\
330			unicode_literals,\n)\n'
331			)
332
333			outfile.write('L_NONE = 0\n')
334			for i, l in enumerate(lang_tuple):
335			outfile.write('L_' + l.upper() + ' = 2**' + str(i) + '\n')
336			outfile.write('\n\n')
337
338			tail_text += '\nBMDATA = {}\n'
339
340			subdirs = ('gen', 'sep', 'ash')
341
342			for s in subdirs:
343			sd = s
344			tail_text += '\nBMDATA[\'' + s + '\'] = {}\n'
345			tail_text += 'BMDATA[\'' + s + '\'][\'approx\'] = {}\n'
346			tail_text += 'BMDATA[\'' + s + '\'][\'exact\'] = {}\n'
347			tail_text += 'BMDATA[\'' + s + '\'][\'rules\'] = {}\n'
348			tail_text += 'BMDATA[\'' + s + '\'][\'hebrew\'] = {}\n\n'
349			tail_text += (
350			'BMDATA[\''
351			+ s
352			+ '\'][\'language_rules\'] = _'
353			+ s.upper()
354			+ '_LANGUAGE_RULES\n'
355			)
356			tail_text += (
357			'BMDATA[\''
358			+ s
359			+ '\'][\'languages\'] = _'
360			+ s.upper()
361			+ '_LANGUAGES\n'
362			)
363
364			phps = [
365			f
366			for f in sorted(listdir(bmdir + s + '/'))
367			if (isfile(bmdir + s + '/' + f) and f.endswith('.php'))
368			]
369			for infilename in phps:
370			for pfx in (
371			'rules',
372			'approx',
373			'exact',
374			'hebrew',
375			'language',
376			'lang',
377			):
378			if infilename.startswith(pfx):
379			array_seen = False
380			infilepath = bmdir + s + '/' + infilename
381			infileenc = chardet.detect(open(infilepath, 'rb').read())[
382			'encoding'
383			]
384			print(s + '/' + infilename) # noqa: T001
385			infile = codecs.open(infilepath, 'r', infileenc)
386			# if infilename.startswith('lang'):
387			# tuplename = infilename[:-4]
388			# else:
389			# tuplename = pfx + '_' + infilename[len(pfx) : -4]
390			# indent = len(tuplename) + 21
391
392			outfile.write('# ' + s + '/' + infilename + '\n')
393
394			ignore = True
395			for line in infile:
396			if 'function Language' in line:
397			break
398			if not ignore:
399			if re.search(r'\?>', line):
400			ignore = True
401			else:
402			line = pythonize(line, infilename[:-4], s)
403			if line.startswith('BMDATA'):
404			tail_text += line
405			else:
406			outfile.write(line)
407			if '*/' in line:
408			ignore = False
409
410			outfile.write('\n\n')
411			break
412
413			outfile.write(tail_text)
414
415			outfile.close()
416			outfilelines = codecs.open(outfilename, 'r', 'utf-8').readlines()
417			outfile = codecs.open(outfilename, 'w', 'utf-8')
418			nl = False
419			fixlanguagesarray = False
420
421			sep_lang = (
422			"('any', 'french', 'hebrew', 'italian', 'portuguese', 'spanish')"
423			)
424
425			for line in outfilelines:
426			line = line.rstrip()
427			if line:
428			if fixlanguagesarray:
429			line = ' ' + line.strip()
430			fixlanguagesarray = False
431			if len(line) > 79 or sep_lang in line:
432			line += ' # noqa: E501'
433			outfile.write(line)
434			if not line.endswith('='):
435			outfile.write('\n')
436			else:
437			fixlanguagesarray = True
438			nl = False
439			else:
440			if not nl:
441			outfile.write('\n')
442			nl = True
443
444			outfile.write(
445			'\n\nif __name__ == \'__main__\':\n import doctest\n\n\
446			doctest.testmod()\n'
447			)
448
449
450			if __name__ == '__main__':
451			_run_script()
452

chrislit / abydos

Push — master ( f43547...71985b )

bm_php2py.c2u() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like