bm_php2py.c2u() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

bm_php2py.c2u() A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: bm_php2py

Complexity

Conditions

Size

Total Lines	18
Code Lines	4

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	4
nop	1
dl	0
loc	18
rs	10
c	0
b	0
f	0

#!/usr/bin/env python3
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.


"""bm_php2py.py.

This helper script converts Beider-Morse Phonetic Matching Algorithm (BMPM)
code from PHP to Python.

It assumes that the BMPM code is located at ../../bmpm (relative to this
directory in the abydos repository).

It reads the BMPM reference implementation and generates the file
../abydos/phonetic/_beider_morse_data.py.

The file _beider_morse.py may still need manual changes to be made after this
script is run.
"""

import codecs
import re
import sys
from os import listdir
from os.path import isfile

# noinspection PyPackageRequirements
import chardet

# The list of languages from BMPM to support (might need to be updated or
# tuned as BMPM is updated)
lang_tuple = (
    'any',
    'arabic',
    'cyrillic',
    'czech',
    'dutch',
    'english',
    'french',
    'german',
    'greek',
    'greeklatin',
    'hebrew',
    'hungarian',
    'italian',
    'latvian',
    'polish',
    'portuguese',
    'romanian',
    'russian',
    'spanish',
    'turkish',
)

# Each language gets its own bit (any = 1, arabic = 2, czech = 8, ...), so a
# set of languages can be written as the sum of its members' values.
lang_dict = {
    language: 2 ** position for position, language in enumerate(lang_tuple)
}
# 'common' is not a bit flag: it maps to the quoted literal that is pasted
# verbatim into the generated code.
lang_dict['common'] = "'common'"

# Conversion state shared between pythonize() and _run_script().
nl = False  # True once a blank line has just been emitted
array_seen = False  # True once the current file has mentioned 'array'

tail_text = ''  # BMDATA assignments, written after all rule tuples
sd = ''  # name of the subdirectory currently being converted


def c2u(name):
    """Convert camelCase (used in PHP) to Python-standard snake_case.

    Src:
    https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case

    Parameters
    ----------
    name : str
        A function or variable name in camelCase

    Returns
    -------
    str
        The name in snake_case

    """
    # First pass: put an underscore before every capitalised word
    # (an upper-case letter followed by lower-case letters).
    word_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Second pass: split the remaining lower/digit -> upper boundaries, such
    # as the one in front of an acronym.
    boundary_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', word_split)
    return boundary_split.lower()


def pythonize(line, fn='', subdir='gen'):
    """Convert a line of BMPM code from PHP to Python.

    The conversion is a fixed sequence of textual substitutions. Besides its
    arguments the function reads the module-level ``lang_dict`` and ``sd``
    (used to prefix a variable name at the start of the line) and updates
    two module-level flags: ``array_seen`` is set once a non-comment line
    mentions ``array`` (later lines are then indented as tuple members), and
    ``nl`` records that a blank line was just returned so that runs of blank
    lines collapse into one.

    Parameters
    ----------
    line : str
        A line of code
    fn : str
        A filename (without extension); ``'lang'`` and names containing
        ``'languagenames'`` receive extra handling
    subdir : str
        The file's subdirectory

    Returns
    -------
    str
        The code in Python, newline-terminated; a lone newline for the first
        blank line of a run; the empty string for a dropped line

    """
    global array_seen, nl, sd

    if '$all' in line:
        return ''
    if 'make the sum of all languages be visible in the function' in line:
        return ''

    line = line.strip()

    if 'array' in line and not line.startswith('//'):
        array_seen = True

    line = re.sub('//+', '#', line)
    # A line holding a complete one-string array is dropped entirely.
    if line and re.search(r'array\("[^"]+?"\)', line):
        line = ''
    line = line.replace('array', '')
    line = re.sub(r'^\s*', '', line)
    line = re.sub(';$', '', line)
    line = re.sub('^include_.+', '', line)

    def _const(name):
        # Name of the generated module-level constant for a PHP variable.
        return '_' + subdir.upper() + '_' + c2u(name).upper()

    def _slot(section, lang):
        # Left-hand side of a BMDATA registration line.
        return "BMDATA['" + subdir + "']['" + section + "'][L_" + lang + ']'

    def _merged(m):
        # Registration whose value is the concatenation of two rule sets.
        return (
            _slot(m.group(1), c2u(m.group(2)).upper())
            + ' = '
            + _const(m.group(3))
            + ' + '
            + _const(m.group(4))
        )

    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        + r'\$languages\)\] = \$([a-zA-Z]+)',
        lambda m: (
            _slot(m.group(1), m.group(2).upper())
            + ' = '
            + _const(m.group(3))
        ),
        line,
    )

    line = re.sub(
        r'\$(approx|rules|exact|hebrew)([A-Za-z]+) = _merge'
        + r'\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        _merged,
        line,
    )

    line = re.sub(
        r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
        + r'\$languages\)\] = _merge\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
        _merged,
        line,
    )

    line = re.sub(
        r'^\$([a-zA-Z]+)',
        lambda m: '_' + sd.upper() + '_' + c2u(m.group(1)).upper(),
        line,
    )

    # Remove the spaces around '+' between PHP variables so that language
    # sums such as "$french + $spanish" can be unwrapped below. The '$' must
    # be escaped (unescaped it is an end-of-string anchor and nothing ever
    # matches); the lookahead leaves the right-hand variable unconsumed, so
    # one pass collapses a chain of any length.
    line = re.sub(r'(\$[a-zA-Z]+) *\+ *(?=\$[a-zA-Z])', r'\1+', line)

    line = re.sub(
        r'\$([a-zA-Z]+)',
        lambda m: (
            'L_' + m.group(1).upper()
            if m.group(1) in lang_dict
            else '$' + m.group(1)
        ),
        line,
    )
    # Strip the PHP string-concatenation wrapper around a language sum.
    line = re.sub(r'\[\"\.\((L_[A-Z_+]+)\)\.\"\]', r'[\1]', line)

    line = re.sub(
        'L_([A-Z]+)', lambda m: str(lang_dict[m.group(1).lower()]), line
    )
    # Each pass adds adjacent pairs, so four passes fold up to 16 terms.
    for _ in range(4):
        line = re.sub(
            r'([0-9]+) *\+ *([0-9]+)',
            lambda m: str(int(m.group(1)) + int(m.group(2))),
            line,
        )

    if fn == 'lang':
        parts = line.split(',')
        if len(parts) >= 3:
            # Drop the PHP regex delimiters; true/false become True/False.
            parts[0] = re.sub('/(.+?)/', r'\1', parts[0])
            parts[2] = parts[2].title()
            line = ','.join(parts)

    if 'languagenames' in fn:
        line = line.replace('"', "'")
        line = line.replace("','", "', '")
        if line and line[0] == "'":
            line = ' ' * 14 + line

    # Split the line into code and trailing comment.
    code, hashsign, remainder = line.partition('#')
    comment = (hashsign + remainder).strip()
    code = code.rstrip()
    if code and comment:
        # Two spaces before an inline comment; nothing is appended when
        # there is no comment, so code lines carry no trailing whitespace.
        comment = '  ' + comment

    if '(' in code and ')' in code:
        prefix = code[: code.find('(') + 1]
        suffix = code[code.rfind(')') :]
        tuplecontent = code[len(prefix) : len(code) - len(suffix)]

        elts = []
        for elt in tuplecontent.split(','):
            elt = elt.strip()
            # startswith/endswith are safe on the empty elements produced
            # by "()" or a trailing comma, where indexing would raise.
            if elt.startswith('"') and elt.endswith('"'):
                elt = "'" + elt[1:-1].replace("'", "\\'") + "'"
            elts.append(elt)

        code = prefix + ', '.join(elts) + suffix

    line = code + comment
    line = re.sub('# *', '# ', line)

    if line:
        nl = False
        if array_seen and not (line[0] == '_' or line.startswith('BMDATA')):
            line = ' ' * 4 + line
        return line + '\n'
    elif not nl:
        nl = True
        return '\n'
    else:
        return ''


def _run_script():
    """Generate the Beider-Morse data module from the BMPM PHP sources.

    The BMPM directory is taken from the first command-line argument when
    one is given and defaults to ``../../bmpm/``. The output is written to
    ``../abydos/phonetic/_beider_morse_data.py``; both paths are relative to
    the current working directory.

    The work is done in two passes. The first writes a header, the language
    constants, the converted contents of every matching ``.php`` file in the
    ``gen``, ``sep`` and ``ash`` subdirectories, and finally the collected
    ``BMDATA`` registrations. The second re-reads that file to collapse
    blank lines, join a line ending in ``=`` with the line after it, and
    mark over-long lines with ``# noqa: E501``.
    """
    global array_seen, nl, sd, tail_text

    if len(sys.argv) > 1:
        bmdir = sys.argv[1].rstrip('/') + '/'
    else:
        bmdir = '../../bmpm/'

    outfilename = '../abydos/phonetic/_beider_morse_data.py'

    header = '# Copyright 2014-2020 by \
Christopher C. Little.\n# This file is part of Abydos.\n#\n# This file is \
based on Alexander Beider and Stephen P. Morse\'s implementation\n# of the \
Beider-Morse Phonetic Matching (BMPM) System, available at\n# \
http://stevemorse.org/phonetics/bmpm.htm.\n#\n# Abydos is free software: \
you can redistribute it and/or modify\n# it under the terms of the GNU \
General Public License as published by\n# the Free Software Foundation, \
either version 3 of the License, or\n# (at your option) any later version.\n\
#\n# Abydos is distributed in the hope that it will be useful,\n# but WITHOUT \
ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or \
FITNESS FOR A PARTICULAR PURPOSE. See the\n# GNU General Public License for \
more details.\n#\n# You should have received a copy of the GNU General Public \
License\n# along with Abydos. If not, see <http://www.gnu.org/licenses/>.\n\n\
"""abydos.phonetic._beider_morse_data.\n\nBehind-the-scenes constants, \
rules, etc. for the Beider-Morse Phonentic\nMatching (BMPM) algorithm\n\nDO \
NOT EDIT - This document is automatically generated from the reference\n\
implementation in PHP.\n"""\n\nfrom \
__future__ import (\n    absolute_import,\n    division,\n    print_function,\n\
    unicode_literals,\n)\n'

    # Only files whose names start with one of these are converted.
    prefixes = ('rules', 'approx', 'exact', 'hebrew', 'language', 'lang')

    # First pass: raw conversion of every PHP file.
    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        outfile.write(header)

        outfile.write('L_NONE = 0\n')
        for position, language in enumerate(lang_tuple):
            outfile.write(
                'L_' + language.upper() + ' = 2**' + str(position) + '\n'
            )
        outfile.write('\n\n')

        tail_text += '\nBMDATA = {}  # type: ignore\n'

        for s in ('gen', 'sep', 'ash'):
            sd = s
            tail_text += "\nBMDATA['" + s + "'] = {}\n"
            tail_text += "BMDATA['" + s + "']['approx'] = {}\n"
            tail_text += "BMDATA['" + s + "']['exact'] = {}\n"
            tail_text += "BMDATA['" + s + "']['rules'] = {}\n"
            tail_text += "BMDATA['" + s + "']['hebrew'] = {}\n\n"
            tail_text += (
                "BMDATA['"
                + s
                + "']['language_rules'] = _"
                + s.upper()
                + '_LANGUAGE_RULES\n'
            )
            tail_text += (
                "BMDATA['"
                + s
                + "']['languages'] = _"
                + s.upper()
                + '_LANGUAGES\n'
            )

            srcdir = bmdir + s + '/'
            phps = [
                f
                for f in sorted(listdir(srcdir))
                if (isfile(srcdir + f) and f.endswith('.php'))
            ]
            for infilename in phps:
                if not infilename.startswith(prefixes):
                    continue

                array_seen = False
                infilepath = srcdir + infilename
                with open(infilepath, 'rb') as rawfile:
                    # chardet reports None when it cannot decide; fall back
                    # to UTF-8 so the file is still opened in text mode.
                    infileenc = (
                        chardet.detect(rawfile.read())['encoding']
                        or 'utf-8'
                    )
                print(s + '/' + infilename)  # noqa: T001

                outfile.write('# ' + s + '/' + infilename + '\n')

                # Skip everything up to the end of the first block comment
                # and anything after a closing "?>".
                ignore = True
                with codecs.open(infilepath, 'r', infileenc) as infile:
                    for line in infile:
                        if 'function Language' in line:
                            break
                        if not ignore:
                            if re.search(r'\?>', line):
                                ignore = True
                            else:
                                line = pythonize(line, infilename[:-4], s)
                                if line.startswith('BMDATA'):
                                    # Registrations go after all tuples.
                                    tail_text += line
                                else:
                                    outfile.write(line)
                        if '*/' in line:
                            ignore = False

                outfile.write('\n\n')

        outfile.write(tail_text)

    # Second pass: tidy the generated file in place.
    with codecs.open(outfilename, 'r', 'utf-8') as generated:
        outfilelines = generated.readlines()

    nl = False
    fixlanguagesarray = False

    sep_lang = (
        "('any', 'french', 'hebrew', 'italian', 'portuguese', 'spanish')"
    )

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        for line in outfilelines:
            line = line.rstrip()
            if line:
                if fixlanguagesarray:
                    # Continuation of a line that ended in '='.
                    line = ' ' + line.strip()
                    fixlanguagesarray = False
                if len(line) > 79 or sep_lang in line:
                    line += '  # noqa: E501'
                outfile.write(line)
                if not line.endswith('='):
                    outfile.write('\n')
                else:
                    # Leave the line open so the next one is appended to it.
                    fixlanguagesarray = True
                nl = False
            else:
                if not nl:
                    outfile.write('\n')
                nl = True

        outfile.write(
            "\n\nif __name__ == '__main__':\n    import doctest\n\n"
            '    doctest.testmod()\n'
        )


# Run the conversion only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    _run_script()


1			#!/usr/bin/env python3
2			# Copyright 2014-2020 by Christopher C. Little.
3			# This file is part of Abydos.
4			#
5			# Abydos is free software: you can redistribute it and/or modify
6			# it under the terms of the GNU General Public License as published by
7			# the Free Software Foundation, either version 3 of the License, or
8			# (at your option) any later version.
9			#
10			# Abydos is distributed in the hope that it will be useful,
11			# but WITHOUT ANY WARRANTY; without even the implied warranty of
12			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13			# GNU General Public License for more details.
14			#
15			# You should have received a copy of the GNU General Public License
16			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
17
18
19			"""bm_php2py.py.
20
21			This helper script converts Beider-Morse Phonetic Matching Algorithm (BMPM)
22			code from PHP to Python.
23
24			It assumes that the BMPM code is located at ../../bmpm (relative to this
25			directory in the abydos repository).
26
27			It reads the BMPM reference implementation and generates the file
28			../abydos/_beider_morse_data.py.
29
30			The file _beider_morse.py may still need manual changes to be made after this
31			script is run.
32			"""
33
34			import codecs
35			import re
36			import sys
37			from os import listdir
38			from os.path import isfile
39
40			# noinspection PyPackageRequirements
41			import chardet
42
43			# The list of languages from BMPM to support (might need to be updated or
44			# tuned as BMPM is updated)
45			lang_tuple = (
46			'any',
47			'arabic',
48			'cyrillic',
49			'czech',
50			'dutch',
51			'english',
52			'french',
53			'german',
54			'greek',
55			'greeklatin',
56			'hebrew',
57			'hungarian',
58			'italian',
59			'latvian',
60			'polish',
61			'portuguese',
62			'romanian',
63			'russian',
64			'spanish',
65			'turkish',
66			)
67
68			lang_dict = {}
69			for i, l in enumerate(lang_tuple):
70			lang_dict[l] = 2 ** i
71			lang_dict['common'] = "'common'"
72
73			nl = False
74			array_seen = False
75
76			tail_text = ''
77			sd = ''
78
79
80			def c2u(name):
81			"""Convert camelCase (used in PHP) to Python-standard snake_case.
82
83			Src:
84			https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
85
86			Parameters
87			----------
88			name: A function or variable name in camelCase
89
90			Returns
91			-------
92			str: The name in snake_case
93
94			"""
95			s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
96			s1 = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
97			return s1
98
99
100			def pythonize(line, fn='', subdir='gen'):
101			"""Convert a line of BMPM code from PHP to Python.
102
103			Parameters
104			----------
105			line : str
106			A line of code
107			fn : str
108			A filename
109			subdir : str
110			The file's subdirectory
111
112			Returns
113			-------
114			The code in Python
115
116			"""
117			global array_seen, nl, sd
118
119			if '$all' in line:
120			return ''
121			if 'make the sum of all languages be visible in the function' in line:
122			return ''
123
124			line = line.strip()
125
126			if 'array' in line and not line.startswith('//'):
127			array_seen = True
128
129			line = re.sub('//+', '#', line)
130			# line = re.sub('"\.\((\$.+?)\)\."', r'\1', line)
131			if line and re.search(r'array\("[^"]+?"\)', line):
132			# print("### " + line)
133			line = ''
134			line = line.replace('array', '')
135			line = re.sub(r'^\s*', '', line)
136			line = re.sub(';$', '', line)
137			line = re.sub('^include_.+', '', line)
138
139			line = re.sub(
140			r'\$(approx\|rules\|exact)\[LanguageIndex\("([^"]+)", '
141			+ r'\$languages\)\] = \$([a-zA-Z]+)',
142			lambda m: (
143			"BMDATA['"
144			+ subdir
145			+ "']['"
146			+ m.group(1)
147			+ "'][L_"
148			+ m.group(2).upper()
149			+ '] = _'
150			+ subdir.upper()
151			+ '_'
152			+ c2u(m.group(3)).upper()
153			),
154			line,
155			)
156
157			line = re.sub(
158			r'\$(approx\|rules\|exact\|hebrew)([A-Za-z]+) = _merge'
159			+ r'\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
160			lambda m: (
161			"BMDATA['"
162			+ subdir
163			+ "']['"
164			+ m.group(1)
165			+ "'][L_"
166			+ c2u(m.group(2)).upper()
167			+ '] = _'
168			+ subdir.upper()
169			+ '_'
170			+ c2u(m.group(3)).upper()
171			+ ' + _'
172			+ subdir.upper()
173			+ '_'
174			+ c2u(m.group(4)).upper()
175			),
176			line,
177			)
178
179			line = re.sub(
180			r'\$(approx\|rules\|exact)\[LanguageIndex\("([^"]+)", '
181			+ r'\$languages\)\] = _merge\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
182			lambda m: (
183			"BMDATA['"
184			+ subdir
185			+ "']['"
186			+ m.group(1)
187			+ "'][L_"
188			+ c2u(m.group(2)).upper()
189			+ '] = _'
190			+ subdir.upper()
191			+ '_'
192			+ c2u(m.group(3)).upper()
193			+ ' + _'
194			+ subdir.upper()
195			+ '_'
196			+ c2u(m.group(4)).upper()
197			),
198			line,
199			)
200
201			line = re.sub(
202			r'^\$([a-zA-Z]+)',
203			lambda m: '_' + sd.upper() + '_' + c2u(m.group(1)).upper(),
204			line,
205			)
206
207			for _ in range(len(lang_tuple)):
208			line = re.sub(r'($[a-zA-Z]+) \+ ($[a-zA-Z]+)', r'\1\+\2', line)
209
210			line = re.sub(
211			r'\$([a-zA-Z]+)',
212			lambda m: (
213			'L_' + m.group(1).upper()
214			if m.group(1) in lang_dict
215			else '$' + m.group(1)
216			),
217			line,
218			)
219			line = re.sub(r'\[\"\.\((L_[A-Z_+]+)\)\.\"\]', r'[\1]', line)
220
221			line = re.sub(
222			'L_([A-Z]+)', lambda m: str(lang_dict[m.group(1).lower()]), line
223			)
224			for _ in range(4):
225			line = re.sub(
226			r'([0-9]+) \+ ([0-9]+)',
227			lambda m: str(int(m.group(1)) + int(m.group(2))),
228			line,
229			)
230
231			if fn == 'lang':
232			if len(line.split(',')) >= 3:
233			parts = line.split(',')
234			parts[0] = re.sub('/(.+?)/', r'\1', parts[0])
235			# parts[1] = re.sub('\$', 'L_', parts[1])
236			# parts[1] = re.sub(' \+ ', '\|', parts[1])
237			parts[2] = parts[2].title()
238			line = ','.join(parts)
239
240			if 'languagenames' in fn:
241			line = line.replace('"', "'")
242			line = line.replace("','", "', '")
243			if line and line[0] == "'":
244			line = ' ' * 14 + line
245
246			# fix upstream
247			# line = line.replace('ë', 'ü')
248
249			comment = ''
250			if '#' in line:
251			hashsign = line.find('#')
252			comment = line[hashsign:]
253			code = line[:hashsign]
254			else:
255			code = line
256
257			code = code.rstrip()
258			comment = comment.strip()
259			if not re.match(r'^\s*$', code):
260			comment = ' ' + comment
261
262			if '(' in code and ')' in code:
263			prefix = code[: code.find('(') + 1]
264			suffix = code[code.rfind(')') :]
265			tuplecontent = code[len(prefix) : len(code) - len(suffix)]
266
267			elts = tuplecontent.split(',')
268			for i in range(len(elts)):
269			elts[i] = elts[i].strip()
270			if elts[i][0] == '"' and elts[i][-1] == '"':
271			elts[i] = "'" + elts[i][1:-1].replace("'", "\\'") + "'"
272			tuplecontent = ', '.join(elts)
273
274			code = prefix + tuplecontent + suffix
275
276			line = code + comment
277			line = re.sub('# *', '# ', line)
278
279			if line:
280			nl = False
281			if array_seen and not (line[0] == '_' or line.startswith('BMDATA')):
282			line = ' ' * 4 + line
283			return line + '\n'
284			elif not nl:
285			nl = True
286			return '\n'
287			else:
288			return ''
289
290
291			def _run_script():
292			global array_seen, nl, sd, tail_text
293
294			if len(sys.argv) > 1:
295			bmdir = sys.argv[1].rstrip('/') + '/'
296			else:
297			bmdir = '../../bmpm/'
298
299			outfilename = '../abydos/phonetic/_beider_morse_data.py'
300			outfile = codecs.open(outfilename, 'w', 'utf-8')
301
302			outfile.write(
303			'# Copyright 2014-2020 by \
304			Christopher C. Little.\n# This file is part of Abydos.\n#\n# This file is \
305			based on Alexander Beider and Stephen P. Morse\'s implementation\n# of the \
306			Beider-Morse Phonetic Matching (BMPM) System, available at\n# \
307			http://stevemorse.org/phonetics/bmpm.htm.\n#\n# Abydos is free software: \
308			you can redistribute it and/or modify\n# it under the terms of the GNU \
309			General Public License as published by\n# the Free Software Foundation, \
310			either version 3 of the License, or\n# (at your option) any later version.\n\
311			#\n# Abydos is distributed in the hope that it will be useful,\n# but WITHOUT \
312			ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or \
313			FITNESS FOR A PARTICULAR PURPOSE. See the\n# GNU General Public License for \
314			more details.\n#\n# You should have received a copy of the GNU General Public \
315			License\n# along with Abydos. If not, see <http://www.gnu.org/licenses/>.\n\n\
316			"""abydos.phonetic._beider_morse_data.\n\nBehind-the-scenes constants, \
317			rules, etc. for the Beider-Morse Phonentic\nMatching (BMPM) algorithm\n\nDO \
318			NOT EDIT - This document is automatically generated from the reference\n\
319			implementation in PHP.\n"""\n\nfrom \
320			__future__ import (\n absolute_import,\n division,\n print_function,\
321			unicode_literals,\n)\n'
322			)
323
324			outfile.write('L_NONE = 0\n')
325			for i, l in enumerate(lang_tuple):
326			outfile.write('L_' + l.upper() + ' = 2**' + str(i) + '\n')
327			outfile.write('\n\n')
328
329			tail_text += '\nBMDATA = {} # type: ignore\n'
330
331			subdirs = ('gen', 'sep', 'ash')
332
333			for s in subdirs:
334			sd = s
335			tail_text += "\nBMDATA['" + s + "'] = {}\n"
336			tail_text += "BMDATA['" + s + "']['approx'] = {}\n"
337			tail_text += "BMDATA['" + s + "']['exact'] = {}\n"
338			tail_text += "BMDATA['" + s + "']['rules'] = {}\n"
339			tail_text += "BMDATA['" + s + "']['hebrew'] = {}\n\n"
340			tail_text += (
341			"BMDATA['"
342			+ s
343			+ "']['language_rules'] = _"
344			+ s.upper()
345			+ '_LANGUAGE_RULES\n'
346			)
347			tail_text += (
348			"BMDATA['" + s + "']['languages'] = _" + s.upper() + '_LANGUAGES\n'
349			)
350
351			phps = [
352			f
353			for f in sorted(listdir(bmdir + s + '/'))
354			if (isfile(bmdir + s + '/' + f) and f.endswith('.php'))
355			]
356			for infilename in phps:
357			for pfx in (
358			'rules',
359			'approx',
360			'exact',
361			'hebrew',
362			'language',
363			'lang',
364			):
365			if infilename.startswith(pfx):
366			array_seen = False
367			infilepath = bmdir + s + '/' + infilename
368			infileenc = chardet.detect(open(infilepath, 'rb').read())[
369			'encoding'
370			]
371			print(s + '/' + infilename) # noqa: T001
372			infile = codecs.open(infilepath, 'r', infileenc)
373			# if infilename.startswith('lang'):
374			# tuplename = infilename[:-4]
375			# else:
376			# tuplename = pfx + '_' + infilename[len(pfx) : -4]
377			# indent = len(tuplename) + 21
378
379			outfile.write('# ' + s + '/' + infilename + '\n')
380
381			ignore = True
382			for line in infile:
383			if 'function Language' in line:
384			break
385			if not ignore:
386			if re.search(r'\?>', line):
387			ignore = True
388			else:
389			line = pythonize(line, infilename[:-4], s)
390			if line.startswith('BMDATA'):
391			tail_text += line
392			else:
393			outfile.write(line)
394			if '*/' in line:
395			ignore = False
396
397			outfile.write('\n\n')
398			break
399
400			outfile.write(tail_text)
401
402			outfile.close()
403			outfilelines = codecs.open(outfilename, 'r', 'utf-8').readlines()
404			outfile = codecs.open(outfilename, 'w', 'utf-8')
405			nl = False
406			fixlanguagesarray = False
407
408			sep_lang = (
409			"('any', 'french', 'hebrew', 'italian', 'portuguese', 'spanish')"
410			)
411
412			for line in outfilelines:
413			line = line.rstrip()
414			if line:
415			if fixlanguagesarray:
416			line = ' ' + line.strip()
417			fixlanguagesarray = False
418			if len(line) > 79 or sep_lang in line:
419			line += ' # noqa: E501'
420			outfile.write(line)
421			if not line.endswith('='):
422			outfile.write('\n')
423			else:
424			fixlanguagesarray = True
425			nl = False
426			else:
427			if not nl:
428			outfile.write('\n')
429			nl = True
430
431			outfile.write(
432			"\n\nif __name__ == '__main__':\n import doctest\n\n\
433			doctest.testmod()\n"
434			)
435
436
437			if __name__ == '__main__':
438			_run_script()
439

chrislit / abydos

bm_php2py.c2u() A last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

bm_php2py.c2u() A
last analyzed 2020-12-31 20:10 UTC