Conditions | 54 |
Total Lines | 386 |
Code Lines | 257 |
Lines | 0 |
Ratio | 0 % |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, Preserve Whole Object.
Complex functions like bm_php2py._run_script() often do a lot of different things. To break such a function down, we need to identify a cohesive component within it. A common approach to find such a component is to look for variables/helper functions that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | #!/usr/bin/env python |
||
def _run_script():
    """Generate the Beider-Morse data module from the BMPM PHP sources.

    The BMPM directory is taken from ``sys.argv[1]`` when given and
    defaults to ``../../bmpm/``. For each of the ``gen``, ``sep`` and
    ``ash`` subdirectories, every ``.php`` file whose name starts with a
    known prefix is converted line by line and written to
    ``../abydos/phonetic/_beider_morse_data.py``. Converted lines that start
    with ``BMDATA`` are held back and appended after all other output.

    A second pass over the written file collapses runs of blank lines, folds
    the line following a trailing ``=`` onto that line, and appends a
    ``noqa: E501`` marker to over-long lines.
    """
    # The list of languages from BMPM to support (might need to be updated or
    # tuned as BMPM is updated)
    lang_tuple = (
        'any',
        'arabic',
        'cyrillic',
        'czech',
        'dutch',
        'english',
        'french',
        'german',
        'greek',
        'greeklatin',
        'hebrew',
        'hungarian',
        'italian',
        'latvian',
        'polish',
        'portuguese',
        'romanian',
        'russian',
        'spanish',
        'turkish',
    )

    # Each language is a distinct bit so that sets of languages can be summed.
    lang_dict = {lang: 2 ** bit for bit, lang in enumerate(lang_tuple)}
    lang_dict['common'] = "'common'"

    # State shared between this function and pythonize(). A dict is used
    # (rather than ``global``, which would bind module-level names instead of
    # these locals) so the per-file reset below is actually seen by
    # pythonize().
    state = {'nl': False, 'array_seen': False}

    # Converted BMDATA assignments, emitted after all rule tuples.
    tail_parts = []

    def c2u(name):
        """Convert camelCase (used in PHP) to Python-standard snake_case.

        Src:
        https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case

        Parameters
        ----------
        name: A function or variable name in camelCase

        Returns
        -------
        str: The name in snake_case

        """
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
        s1 = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
        return s1

    def pythonize(line, fn='', subdir='gen'):
        """Convert a line of BMPM code from PHP to Python.

        Reads and updates the shared ``state`` dict: ``array_seen`` is set
        once a non-comment line mentions ``array`` (later lines are then
        indented), and ``nl`` tracks whether a blank line was just emitted
        so that consecutive blank lines collapse to one.

        Parameters
        ----------
        line : str
            A line of code
        fn : str
            A filename
        subdir : str
            The file's subdirectory

        Returns
        -------
        The code in Python

        """
        if '$all' in line:
            return ''
        if 'make the sum of all languages be visible in the function' in line:
            return ''

        line = line.strip()

        if 'array' in line and not line.startswith('//'):
            state['array_seen'] = True

        line = re.sub('//+', '#', line)
        if line and re.search(r'array\("[^"]+?"\)', line):
            line = ''
        line = line.replace('array', '')
        line = re.sub(r'^\s*', '', line)
        line = re.sub(';$', '', line)
        line = re.sub('^include_.+', '', line)

        def const_name(php_name):
            # PHP variable name -> generated constant, e.g. _GEN_APPROX_ANY
            return '_' + subdir.upper() + '_' + c2u(php_name).upper()

        def bmdata_slot(kind, lang_key):
            # Left-hand side of a BMDATA assignment for this subdirectory
            return (
                "BMDATA['" + subdir + "']['" + kind + "'][L_" + lang_key + ']'
            )

        line = re.sub(
            r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
            + r'\$languages\)\] = \$([a-zA-Z]+)',
            lambda m: (
                bmdata_slot(m.group(1), m.group(2).upper())
                + ' = '
                + const_name(m.group(3))
            ),
            line,
        )

        line = re.sub(
            r'\$(approx|rules|exact|hebrew)([A-Za-z]+) = _merge'
            + r'\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
            lambda m: (
                bmdata_slot(m.group(1), c2u(m.group(2)).upper())
                + ' = '
                + const_name(m.group(3))
                + ' + '
                + const_name(m.group(4))
            ),
            line,
        )

        line = re.sub(
            r'\$(approx|rules|exact)\[LanguageIndex\("([^"]+)", '
            + r'\$languages\)\] = _merge\(\$([a-zA-Z]+), \$([a-zA-Z]+)\)',
            lambda m: (
                bmdata_slot(m.group(1), c2u(m.group(2)).upper())
                + ' = '
                + const_name(m.group(3))
                + ' + '
                + const_name(m.group(4))
            ),
            line,
        )

        # A leading PHP variable becomes a subdirectory-prefixed constant.
        # (Uses the ``subdir`` argument; previously this read the caller's
        # loop variable through the closure.)
        line = re.sub(
            r'^\$([a-zA-Z]+)', lambda m: const_name(m.group(1)), line
        )

        # NOTE: a loop substituting r'($[a-zA-Z]+) *\+ *($[a-zA-Z]+)' used to
        # sit here. With the ``$`` unescaped it could never match, so it was
        # a no-op and has been dropped; sums are folded numerically below.

        # Known language variables become L_* names; others are left alone.
        line = re.sub(
            r'\$([a-zA-Z]+)',
            lambda m: (
                'L_' + m.group(1).upper()
                if m.group(1) in lang_dict
                else '$' + m.group(1)
            ),
            line,
        )
        line = re.sub(r'\[\"\.\((L_[A-Z_+]+)\)\.\"\]', r'[\1]', line)

        # Replace each L_* name by its value, then fold sums of integers.
        line = re.sub(
            'L_([A-Z]+)', lambda m: str(lang_dict[m.group(1).lower()]), line
        )
        for _ in range(4):
            line = re.sub(
                r'([0-9]+) *\+ *([0-9]+)',
                lambda m: str(int(m.group(1)) + int(m.group(2))),
                line,
            )

        if fn == 'lang':
            parts = line.split(',')
            if len(parts) >= 3:
                parts[0] = re.sub('/(.+?)/', r'\1', parts[0])
                parts[2] = parts[2].title()
                line = ','.join(parts)

        if 'languagenames' in fn:
            line = line.replace('"', "'")
            line = line.replace("','", "', '")
            if line and line[0] == "'":
                line = ' ' * 14 + line

        # Split off a trailing comment so only the code part is requoted.
        code, hashsign, remainder = line.partition('#')
        comment = (hashsign + remainder).strip()
        code = code.rstrip()
        if code:
            comment = ' ' + comment

        open_idx = code.find('(')
        close_idx = code.rfind(')')
        # Require '(' to precede the last ')' so the slices cannot overlap.
        if -1 < open_idx < close_idx:
            head = code[: open_idx + 1]
            rest = code[close_idx:]

            def requote(elt):
                # "..." -> '...' with embedded single quotes escaped; the
                # slice comparisons make an empty element safe.
                if elt[:1] == '"' and elt[-1:] == '"':
                    return "'" + elt[1:-1].replace("'", "\\'") + "'"
                return elt

            inner = ', '.join(
                requote(elt.strip())
                for elt in code[open_idx + 1 : close_idx].split(',')
            )
            code = head + inner + rest

        line = code + comment
        line = re.sub('# *', '# ', line)

        if line:
            state['nl'] = False
            if state['array_seen'] and not (
                line[0] == '_' or line.startswith('BMDATA')
            ):
                line = ' ' * 4 + line
            return line + '\n'
        if not state['nl']:
            state['nl'] = True
            return '\n'
        return ''

    if len(sys.argv) > 1:
        bmdir = sys.argv[1].rstrip('/') + '/'
    else:
        bmdir = '../../bmpm/'

    outfilename = '../abydos/phonetic/_beider_morse_data.py'

    # Deliberately NOT a raw string: the escapes and the backslash-newline
    # continuations below must be interpreted, otherwise they are written to
    # the generated file literally.
    header = (
        '# -*- coding: utf-8 -*-\n\n# Copyright 2014-2018 by \
Christopher C. Little.\n# This file is part of Abydos.\n#\n# This file is \
based on Alexander Beider and Stephen P. Morse\'s implementation\n# of the \
Beider-Morse Phonetic Matching (BMPM) System, available at\n# \
http://stevemorse.org/phonetics/bmpm.htm.\n#\n# Abydos is free software: \
you can redistribute it and/or modify\n# it under the terms of the GNU \
General Public License as published by\n# the Free Software Foundation, \
either version 3 of the License, or\n# (at your option) any later version.\n\
#\n# Abydos is distributed in the hope that it will be useful,\n# but WITHOUT \
ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or \
FITNESS FOR A PARTICULAR PURPOSE. See the\n# GNU General Public License for \
more details.\n#\n# You should have received a copy of the GNU General Public \
License\n# along with Abydos. If not, see <http://www.gnu.org/licenses/>.\n\n\
"""abydos.phonetic._beider_morse_data.\n\nBehind-the-scenes constants, \
rules, etc. for the Beider-Morse Phonentic\nMatching (BMPM) algorithm\n\nDO \
NOT EDIT - This document is automatically generated from the reference\n\
implementation in PHP.\n"""\n# pylint: disable=line-too-long\n\nfrom \
__future__ import (\n absolute_import,\n division,\n print_function,\
unicode_literals,\n)\n'
    )

    subdirs = ('gen', 'sep', 'ash')
    # Only files whose names start with one of these are converted.
    file_prefixes = ('rules', 'approx', 'exact', 'hebrew', 'language', 'lang')

    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        outfile.write(header)

        outfile.write('L_NONE = 0\n')
        for bit, lang in enumerate(lang_tuple):
            outfile.write('L_' + lang.upper() + ' = 2**' + str(bit) + '\n')
        outfile.write('\n\n')

        tail_parts.append('\nBMDATA = {}\n')

        for s in subdirs:
            tail_parts.append("\nBMDATA['" + s + "'] = {}\n")
            tail_parts.append("BMDATA['" + s + "']['approx'] = {}\n")
            tail_parts.append("BMDATA['" + s + "']['exact'] = {}\n")
            tail_parts.append("BMDATA['" + s + "']['rules'] = {}\n")
            tail_parts.append("BMDATA['" + s + "']['hebrew'] = {}\n\n")
            tail_parts.append(
                "BMDATA['"
                + s
                + "']['language_rules'] = _"
                + s.upper()
                + '_LANGUAGE_RULES\n'
            )
            tail_parts.append(
                "BMDATA['"
                + s
                + "']['languages'] = _"
                + s.upper()
                + '_LANGUAGES\n'
            )

            phps = [
                f
                for f in sorted(listdir(bmdir + s + '/'))
                if (isfile(bmdir + s + '/' + f) and f.endswith('.php'))
            ]
            for infilename in phps:
                if not infilename.startswith(file_prefixes):
                    continue

                # Indentation tracking restarts with every input file.
                state['array_seen'] = False
                infilepath = bmdir + s + '/' + infilename
                with open(infilepath, 'rb') as rawfile:
                    infileenc = chardet.detect(rawfile.read())['encoding']
                print(s + '/' + infilename)  # noqa: T001

                outfile.write('# ' + s + '/' + infilename + '\n')

                with codecs.open(infilepath, 'r', infileenc) as infile:
                    # Skip everything until the first '*/' has been seen.
                    ignore = True
                    for line in infile:
                        if 'function Language' in line:
                            break
                        if not ignore:
                            if re.search(r'\?>', line):
                                ignore = True
                            else:
                                line = pythonize(line, infilename[:-4], s)
                                if line.startswith('BMDATA'):
                                    tail_parts.append(line)
                                else:
                                    outfile.write(line)
                        # Tested against the (possibly converted) line, as
                        # before.
                        if '*/' in line:
                            ignore = False

                outfile.write('\n\n')

        outfile.write(''.join(tail_parts))

    # Second pass: tidy blank lines and mark over-long lines.
    with codecs.open(outfilename, 'r', 'utf-8') as written:
        outfilelines = written.readlines()

    sep_lang = (
        "('any', 'french', 'hebrew', 'italian', 'portuguese', 'spanish')"
    )

    blank_written = False
    fixlanguagesarray = False
    with codecs.open(outfilename, 'w', 'utf-8') as outfile:
        for line in outfilelines:
            line = line.rstrip()
            if not line:
                if not blank_written:
                    outfile.write('\n')
                    blank_written = True
                continue

            if fixlanguagesarray:
                # Continue the previous line, which ended in '='.
                line = ' ' + line.strip()
                fixlanguagesarray = False
            if len(line) > 79 or sep_lang in line:
                line += ' # noqa: E501'
            outfile.write(line)
            if line.endswith('='):
                fixlanguagesarray = True
            else:
                outfile.write('\n')
            blank_written = False
||
439 | |||
443 |