Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.phonetic._beider_morse.bmpm()   A

Complexity

Conditions 1

Size

Total Lines 89
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 9
nop 6
dl 0
loc 89
ccs 2
cts 2
cp 1
crap 1
rs 9.95
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (1031/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# This file is based on Alexander Beider and Stephen P. Morse's implementation
7
# of the Beider-Morse Phonetic Matching (BMPM) System, available at
8
# http://stevemorse.org/phonetics/bmpm.htm.
9
#
10
# Abydos is free software: you can redistribute it and/or modify
11
# it under the terms of the GNU General Public License as published by
12
# the Free Software Foundation, either version 3 of the License, or
13
# (at your option) any later version.
14
#
15
# Abydos is distributed in the hope that it will be useful,
16
# but WITHOUT ANY WARRANTY; without even the implied warranty of
17
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
# GNU General Public License for more details.
19
#
20
# You should have received a copy of the GNU General Public License
21
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
22
23 1
"""abydos.phonetic._beider_morse.
24
25
Beider-Morse Phonetic Matching (BMPM) algorithm
26
"""
27
28 1
from __future__ import (
29
    absolute_import,
30
    division,
31
    print_function,
32
    unicode_literals,
33
)
34
35 1
from re import search
36 1
from unicodedata import normalize
37
38 1
from six import PY3, text_type
39 1
from six.moves import range
40
41 1
from ._beider_morse_data import (
42
    BMDATA,
43
    L_ANY,
44
    L_ARABIC,
45
    L_CYRILLIC,
46
    L_CZECH,
47
    L_DUTCH,
48
    L_ENGLISH,
49
    L_FRENCH,
50
    L_GERMAN,
51
    L_GREEK,
52
    L_GREEKLATIN,
53
    L_HEBREW,
54
    L_HUNGARIAN,
55
    L_ITALIAN,
56
    L_LATVIAN,
57
    L_NONE,
58
    L_POLISH,
59
    L_PORTUGUESE,
60
    L_ROMANIAN,
61
    L_RUSSIAN,
62
    L_SPANISH,
63
    L_TURKISH,
64
)
65 1
from ._phonetic import _Phonetic
66
67 1
__all__ = ['BeiderMorse', 'bmpm']
68
69
if PY3:
70
    long = int
0 ignored issues
show
Coding Style Naming introduced by
The name long does not conform to the class naming conventions ([A-Z_][a-zA-Z0-9]+$).

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
71
72 1
_LANG_DICT = {
73
    'any': L_ANY,
74
    'arabic': L_ARABIC,
75
    'cyrillic': L_CYRILLIC,
76
    'czech': L_CZECH,
77
    'dutch': L_DUTCH,
78
    'english': L_ENGLISH,
79
    'french': L_FRENCH,
80
    'german': L_GERMAN,
81
    'greek': L_GREEK,
82
    'greeklatin': L_GREEKLATIN,
83
    'hebrew': L_HEBREW,
84
    'hungarian': L_HUNGARIAN,
85
    'italian': L_ITALIAN,
86
    'latvian': L_LATVIAN,
87
    'polish': L_POLISH,
88
    'portuguese': L_PORTUGUESE,
89
    'romanian': L_ROMANIAN,
90
    'russian': L_RUSSIAN,
91
    'spanish': L_SPANISH,
92
    'turkish': L_TURKISH,
93
}
94
95 1
BMDATA['gen']['discards'] = {
96
    'da ',
97
    'dal ',
98
    'de ',
99
    'del ',
100
    'dela ',
101
    'de la ',
102
    'della ',
103
    'des ',
104
    'di ',
105
    'do ',
106
    'dos ',
107
    'du ',
108
    'van ',
109
    'von ',
110
    'd\'',
111
}
112 1
BMDATA['sep']['discards'] = {
113
    'al',
114
    'el',
115
    'da',
116
    'dal',
117
    'de',
118
    'del',
119
    'dela',
120
    'de la',
121
    'della',
122
    'des',
123
    'di',
124
    'do',
125
    'dos',
126
    'du',
127
    'van',
128
    'von',
129
}
130 1
BMDATA['ash']['discards'] = {'bar', 'ben', 'da', 'de', 'van', 'von'}
131
132
# format of rules array
133 1
_PATTERN_POS = 0
134 1
_LCONTEXT_POS = 1
135 1
_RCONTEXT_POS = 2
136 1
_PHONETIC_POS = 3
137
138
139 1
class BeiderMorse(_Phonetic):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
140
    """Beider-Morse Phonetic Matching.
141
142
    The Beider-Morse Phonetic Matching algorithm is described in
143
    :cite:`Beider:2008`.
144
    The reference implementation is licensed under GPLv3.
145
    """
146
147 1
    def _language(self, name, name_mode):
        """Guess which languages a name could belong to.

        Each language rule of the selected name mode is a triple of a
        regular expression, a language bitmask and an accept flag. Whenever
        the expression matches the name, the candidate mask is either
        restricted to the rule's languages (accept) or has the rule's
        languages removed from it (reject).

        Parameters
        ----------
        name : str
            The term to guess the language of
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)

        Returns
        -------
        int
            Bitmask of the languages still possible, or ``L_ANY`` if the
            rules left ``L_NONE``

        """
        cleaned = name.strip().lower()
        mode_data = BMDATA[name_mode]
        lang_rules = mode_data['language_rules']
        lang_mask = (
            sum(_LANG_DICT[lang] for lang in mode_data['languages']) - 1
        )

        remaining = lang_mask
        for letters, languages, accept in lang_rules:
            if search(letters, cleaned) is None:
                continue
            if accept:
                remaining &= languages
            else:
                # complement of the rule's languages, reduced modulo the
                # mask size so that it stays a non-negative value
                remaining &= (~languages) % (lang_mask + 1)

        return L_ANY if remaining == L_NONE else remaining
180
181 1
    def _redo_language(
        self, term, name_mode, rules, final_rules1, final_rules2, concat
    ):
        """Re-detect the language of a term, then encode it.

        Used for the individual pieces of a split multi-word term: the
        language is guessed afresh for the piece and the result is passed
        on to the phonetic encoder.

        Parameters
        ----------
        term : str
            The term to encode via Beider-Morse
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)
        rules : tuple
            The set of initial phonetic transform regexps
        final_rules1 : tuple
            The common set of final phonetic transform regexps
        final_rules2 : tuple
            The specific set of final phonetic transform regexps
        concat : bool
            A flag to indicate concatenation

        Returns
        -------
        str
            A Beider-Morse phonetic code

        """
        detected = self._language(term, name_mode)
        return self._phonetic(
            term,
            name_mode,
            rules,
            final_rules1,
            final_rules2,
            language_arg=detected,
            concat=concat,
        )
220
221 1
    def _phonetic(
        self,
        term,
        name_mode,
        rules,
        final_rules1,
        final_rules2,
        language_arg=0,
        concat=False,
    ):
        """Return the Beider-Morse encoding(s) of a term.

        The term is first reduced according to the name mode (leading
        particles are discarded or concatenated), multi-word terms are
        either joined or encoded word by word, and the remaining single
        term is run through the initial rules followed by both sets of
        final rules.

        Parameters
        ----------
        term : str
            The term to encode via Beider-Morse
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)
        rules : tuple
            The set of initial phonetic transform regexps
        final_rules1 : tuple
            The common set of final phonetic transform regexps
        final_rules2 : tuple
            The specific set of final phonetic transform regexps
        language_arg : int
            The language of the term
        concat : bool
            A flag to indicate concatenation

        Returns
        -------
        str
            A Beider-Morse phonetic code

        """
        term = term.replace('-', ' ').strip()

        if name_mode == 'gen':  # generic case
            # Discard and concatenate certain words if at the start of the
            # name. The prefixes are kept in a set, whose iteration order
            # for strings is not stable between interpreter runs, so they
            # are visited in sorted order: the outcome is then reproducible
            # and a prefix such as 'de ' is always tried before its
            # extension 'de la '.
            for pfx in sorted(BMDATA['gen']['discards']):
                if term.startswith(pfx):
                    remainder = term[len(pfx) :]
                    combined = pfx[:-1] + remainder
                    return (
                        self._redo_language(
                            remainder,
                            name_mode,
                            rules,
                            final_rules1,
                            final_rules2,
                            concat,
                        )
                        + '-'
                        + self._redo_language(
                            combined,
                            name_mode,
                            rules,
                            final_rules1,
                            final_rules2,
                            concat,
                        )
                    )

        words = term.split()  # the individual words in the name

        if name_mode == 'sep':  # Sephardic case
            # for each word in the name, delete portions of word preceding
            # apostrophe
            # ex: d'avila d'aguilar --> avila aguilar
            # also discard certain words in the name

            # note that we can never get a match on "de la" because we are
            # checking single words below
            # this is a bug, but I won't try to fix it now
            words2 = []
            for word in words:
                word = word[word.rfind('\'') + 1 :]
                if word not in BMDATA['sep']['discards']:
                    words2.append(word)
        elif (
            name_mode == 'ash'  # Ashkenazic case
            and len(words) > 1
            and words[0] in BMDATA['ash']['discards']
        ):
            # discard certain words if at the start of the name
            words2 = words[1:]
        else:
            words2 = list(words)

        if concat:
            # concatenate the separate words of a multi-word name
            # (normally used for exact matches)
            term = ' '.join(words2)
        elif len(words2) == 1:  # not a multi-word name
            term = words2[0]
        else:
            # encode each word in a multi-word name separately
            # (normally used for approx matches)
            return '-'.join(
                [
                    self._redo_language(
                        w, name_mode, rules, final_rules1, final_rules2, concat
                    )
                    for w in words2
                ]
            )

        term_length = len(term)

        # apply language rules to map to phonetic alphabet
        phonetic = ''
        skip = 0
        for i in range(term_length):
            if skip:
                # still inside the pattern consumed by the previous match
                skip -= 1
                continue

            # a character that no rule matches (e.g., space) consumes
            # exactly one position
            pattern_length = 1
            for rule in rules:
                pattern = rule[_PATTERN_POS]

                # check to see if next sequence in input matches the string
                # in the rule
                if not term.startswith(pattern, i):
                    continue

                lcontext = rule[_LCONTEXT_POS]
                rcontext = rule[_RCONTEXT_POS]

                # check that right context is satisfied
                if rcontext != '' and not search(
                    '^' + rcontext, term[i + len(pattern) :]
                ):
                    continue

                # check that left context is satisfied
                if lcontext != '' and not search(lcontext + '$', term[:i]):
                    continue

                # check for incompatible attributes
                candidate = self._apply_rule_if_compat(
                    phonetic, rule[_PHONETIC_POS], language_arg
                )
                # The below condition shouldn't ever be false
                if candidate is not None:  # pragma: no branch
                    phonetic = candidate
                    pattern_length = len(pattern)
                    break

            skip = pattern_length - 1

        # apply final rules on phonetic-alphabet,
        # doing a substitution of certain characters
        # final_rules1 are the common approx rules,
        # final_rules2 are approx rules for specific language
        phonetic = self._apply_final_rules(
            phonetic, final_rules1, language_arg, False
        )  # apply common rules
        phonetic = self._apply_final_rules(
            phonetic, final_rules2, language_arg, True
        )  # apply lang specific rules

        return phonetic
398
399 1
    def _apply_final_rules(self, phonetic, final_rules, language_arg, strip):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (21/15).
Loading history...
400
        """Apply a set of final rules to the phonetic encoding.
401
402
        Parameters
403
        ----------
404
        phonetic : str
405
            The term to which to apply the final rules
406
        final_rules : tuple
407
            The set of final phonetic transform regexps
408
        language_arg : int
409
            An integer representing the target language of the phonetic
410
            encoding
411
        strip : bool
412
            Flag to indicate whether to normalize the language attributes
413
414
        Returns
415
        -------
416
        str
417
            A Beider-Morse phonetic code
418
419
        """
420
        # optimization to save time
421 1
        if not final_rules:
422 1
            return phonetic
423
424
        # expand the result
425 1
        phonetic = self._expand_alternates(phonetic)
426 1
        phonetic_array = phonetic.split('|')
427
428 1
        for k in range(len(phonetic_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
429 1
            phonetic = phonetic_array[k]
430 1
            phonetic2 = ''
431 1
            phoneticx = self._normalize_lang_attrs(phonetic, True)
432
433 1
            i = 0
434 1
            while i < len(phonetic):
435 1
                found = False
436
437 1
                if phonetic[i] == '[':  # skip over language attribute
438 1
                    attrib_start = i
439 1
                    i += 1
440 1
                    while True:
441 1
                        if phonetic[i] == ']':
442 1
                            i += 1
443 1
                            phonetic2 += phonetic[attrib_start:i]
444 1
                            break
445 1
                        i += 1
446 1
                    continue
447
448 1
                for rule in final_rules:
449 1
                    pattern = rule[_PATTERN_POS]
450 1
                    pattern_length = len(pattern)
451 1
                    lcontext = rule[_LCONTEXT_POS]
452 1
                    rcontext = rule[_RCONTEXT_POS]
453
454 1
                    right = '^' + rcontext
455 1
                    left = lcontext + '$'
456
457
                    # check to see if next sequence in phonetic matches the
458
                    # string in the rule
459 1
                    if (pattern_length > len(phoneticx) - i) or phoneticx[
460
                        i : i + pattern_length
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
461
                    ] != pattern:
462 1
                        continue
463
464
                    # check that right context is satisfied
465 1
                    if rcontext != '':
466 1
                        if not search(right, phoneticx[i + pattern_length :]):
467 1
                            continue
468
469
                    # check that left context is satisfied
470 1
                    if lcontext != '':
471 1
                        if not search(left, phoneticx[:i]):
472 1
                            continue
473
474
                    # check for incompatible attributes
475 1
                    candidate = self._apply_rule_if_compat(
476
                        phonetic2, rule[_PHONETIC_POS], language_arg
477
                    )
478
                    # The below condition shouldn't ever be false
479 1
                    if candidate is not None:  # pragma: no branch
480 1
                        phonetic2 = candidate
481 1
                        found = True
482 1
                        break
483
484 1
                if not found:
485
                    # character in name for which there is no substitution in
486
                    # the table
487 1
                    phonetic2 += phonetic[i]
488 1
                    pattern_length = 1
489
490 1
                i += pattern_length
0 ignored issues
show
introduced by
The variable pattern_length does not seem to be defined for all execution paths.
Loading history...
491
492 1
            phonetic_array[k] = self._expand_alternates(phonetic2)
493
494 1
        phonetic = '|'.join(phonetic_array)
495 1
        if strip:
496 1
            phonetic = self._normalize_lang_attrs(phonetic, True)
497
498 1
        if '|' in phonetic:
499 1
            phonetic = '(' + self._remove_dupes(phonetic) + ')'
500
501 1
        return phonetic
502
503 1
    def _phonetic_number(self, phonetic):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
504
        """Remove bracketed text from the end of a string.
505
506
        Parameters
507
        ----------
508
        phonetic : str
509
            A Beider-Morse phonetic encoding
510
511
        Returns
512
        -------
513
        str
514
            A Beider-Morse phonetic code
515
516
        """
517 1
        if '[' in phonetic:
518 1
            return phonetic[: phonetic.find('[')]
519
520 1
        return phonetic  # experimental !!!!
521
522 1
    def _expand_alternates(self, phonetic):
523
        """Expand phonetic alternates separated by |s.
524
525
        Parameters
526
        ----------
527
        phonetic : str
528
            A Beider-Morse phonetic encoding
529
530
        Returns
531
        -------
532
        str
533
            A Beider-Morse phonetic code
534
535
        """
536 1
        alt_start = phonetic.find('(')
537 1
        if alt_start == -1:
538 1
            return self._normalize_lang_attrs(phonetic, False)
539
540 1
        prefix = phonetic[:alt_start]
541 1
        alt_start += 1  # get past the (
542 1
        alt_end = phonetic.find(')', alt_start)
543 1
        alt_string = phonetic[alt_start:alt_end]
544 1
        alt_end += 1  # get past the )
545 1
        suffix = phonetic[alt_end:]
546 1
        alt_array = alt_string.split('|')
547 1
        result = ''
548
549 1
        for i in range(len(alt_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
550 1
            alt = alt_array[i]
551 1
            alternate = self._expand_alternates(prefix + alt + suffix)
552 1
            if alternate != '' and alternate != '[0]':
553 1
                if result != '':
554 1
                    result += '|'
555 1
                result += alternate
556
557 1
        return result
558
559 1
    def _pnums_with_leading_space(self, phonetic):
560
        """Join prefixes & suffixes in cases of alternate phonetic values.
561
562
        Parameters
563
        ----------
564
        phonetic : str
565
            A Beider-Morse phonetic encoding
566
567
        Returns
568
        -------
569
        str
570
            A Beider-Morse phonetic code
571
572
        """
573 1
        alt_start = phonetic.find('(')
574 1
        if alt_start == -1:
575 1
            return ' ' + self._phonetic_number(phonetic)
576
577 1
        prefix = phonetic[:alt_start]
578 1
        alt_start += 1  # get past the (
579 1
        alt_end = phonetic.find(')', alt_start)
580 1
        alt_string = phonetic[alt_start:alt_end]
581 1
        alt_end += 1  # get past the )
582 1
        suffix = phonetic[alt_end:]
583 1
        alt_array = alt_string.split('|')
584 1
        result = ''
585 1
        for alt in alt_array:
586 1
            result += self._pnums_with_leading_space(prefix + alt + suffix)
587
588 1
        return result
589
590 1
    def _phonetic_numbers(self, phonetic):
591
        """Prepare & join phonetic numbers.
592
593
        Split phonetic value on '-', run through _pnums_with_leading_space,
594
        and join with ' '
595
596
        Parameters
597
        ----------
598
        phonetic : str
599
            A Beider-Morse phonetic encoding
600
601
        Returns
602
        -------
603
        str
604
            A Beider-Morse phonetic code
605
606
        """
607 1
        phonetic_array = phonetic.split('-')  # for names with spaces in them
608 1
        result = ' '.join(
609
            [self._pnums_with_leading_space(i)[1:] for i in phonetic_array]
610
        )
611 1
        return result
612
613 1
    def _remove_dupes(self, phonetic):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
614
        """Remove duplicates from a phonetic encoding list.
615
616
        Parameters
617
        ----------
618
        phonetic : str
619
            A Beider-Morse phonetic encoding
620
621
        Returns
622
        -------
623
        str
624
            A Beider-Morse phonetic code
625
626
        """
627 1
        alt_string = phonetic
628 1
        alt_array = alt_string.split('|')
629
630 1
        result = '|'
631 1
        for i in range(len(alt_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
632 1
            alt = alt_array[i]
633 1
            if alt and '|' + alt + '|' not in result:
634 1
                result += alt + '|'
635
636 1
        return result[1:-1]  # remove leading and trailing |
637
638 1
    def _normalize_lang_attrs(self, text, strip):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
639
        """Remove embedded bracketed attributes.
640
641
        This (potentially) bitwise-ands bracketed attributes together and adds
642
        to the end.
643
        This is applied to a single alternative at a time -- not to a
644
        parenthesized list.
645
        It removes all embedded bracketed attributes, logically-ands them
646
        together, and places them at the end.
647
        However if strip is true, this can indeed remove embedded bracketed
648
        attributes from a parenthesized list.
649
650
        Parameters
651
        ----------
652
        text : str
653
            A Beider-Morse phonetic encoding (in progress)
654
        strip : bool
655
            Remove the bracketed attributes (and throw away)
656
657
        Returns
658
        -------
659
        str
660
            A Beider-Morse phonetic code
661
662
        Raises
663
        ------
664
        ValueError
665
            No closing square bracket
666
667
        """
668 1
        uninitialized = -1  # all 1's
669 1
        attrib = uninitialized
670 1
        while '[' in text:
671 1
            bracket_start = text.find('[')
672 1
            bracket_end = text.find(']', bracket_start)
673 1
            if bracket_end == -1:
674 1
                raise ValueError(
675
                    'No closing square bracket: text=('
676
                    + text
677
                    + ') strip=('
678
                    + text_type(strip)
679
                    + ')'
680
                )
681 1
            attrib &= int(text[bracket_start + 1 : bracket_end])
682 1
            text = text[:bracket_start] + text[bracket_end + 1 :]
683
684 1
        if attrib == uninitialized or strip:
685 1
            return text
686 1
        elif attrib == 0:
687
            # means that the attributes were incompatible and there is no
688
            # alternative here
689 1
            return '[0]'
690 1
        return text + '[' + str(attrib) + ']'
691
692 1
    def _apply_rule_if_compat(self, phonetic, target, language_arg):
693
        """Apply a phonetic regex if compatible.
694
695
        tests for compatible language rules
696
697
        to do so, apply the rule, expand the results, and detect alternatives
698
            with incompatible attributes
699
700
        then drop each alternative that has incompatible attributes and keep
701
            those that are compatible
702
703
        if there are no compatible alternatives left, return false
704
705
        otherwise return the compatible alternatives
706
707
        apply the rule
708
709
        Parameters
710
        ----------
711
        phonetic : str
712
            The Beider-Morse phonetic encoding (so far)
713
        target : str
714
            A proposed addition to the phonetic encoding
715
        language_arg : int
716
            An integer representing the target language of the phonetic
717
            encoding
718
719
        Returns
720
        -------
721
        str
722
            A candidate encoding
723
724
        """
725 1
        candidate = phonetic + target
726 1
        if '[' not in candidate:  # no attributes so we need test no further
727 1
            return candidate
728
729
        # expand the result, converting incompatible attributes to [0]
730 1
        candidate = self._expand_alternates(candidate)
731 1
        candidate_array = candidate.split('|')
732
733
        # drop each alternative that has incompatible attributes
734 1
        candidate = ''
735 1
        found = False
736
737 1
        for i in range(len(candidate_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
738 1
            this_candidate = candidate_array[i]
739 1
            if language_arg != 1:
740 1
                this_candidate = self._normalize_lang_attrs(
741
                    this_candidate + '[' + str(language_arg) + ']', False
742
                )
743 1
            if this_candidate != '[0]':
744 1
                found = True
745 1
                if candidate:
746 1
                    candidate += '|'
747 1
                candidate += this_candidate
748
749
        # return false if no compatible alternatives remain
750 1
        if not found:
751 1
            return None
752
753
        # return the result of applying the rule
754 1
        if '|' in candidate:
755 1
            candidate = '(' + candidate + ')'
756 1
        return candidate
757
758 1
    def _language_index_from_code(self, code, name_mode):
        """Return the index value for a language code.

        ``L_ANY`` is returned when the code is below 1, above the sum of
        the language values of the name mode, or has more than one bit set.

        Parameters
        ----------
        code : int
            The language code to interpret
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)

        Returns
        -------
        int
            Language code index

        """
        if code < 1:  # code out of range
            return L_ANY

        upper_bound = sum(
            _LANG_DICT[lang] for lang in BMDATA[name_mode]['languages']
        )
        if code > upper_bound:  # code out of range
            return L_ANY

        # clearing the lowest set bit leaves zero only for a single language
        if (code & (code - 1)) != 0:
            return L_ANY  # choice was more than one language; use any
        return code
787
788 1
    def encode(
        self,
        word,
        language_arg=0,
        name_mode='gen',
        match_mode='approx',
        concat=False,
        filter_langs=False,
    ):
        """Return the Beider-Morse Phonetic Matching encoding(s) of a term.

        Parameters
        ----------
        word : str
            The word to transform
        language_arg : int or str
            The language(s) of the term, given either as an integer language
            code (or combination of codes) or as a comma-separated string of
            language names. Names are case-insensitive and whitespace around
            each name is ignored. A value of 0 or an empty string leaves the
            language to be autodetected. Supported names include:

                - ``any``
                - ``arabic``
                - ``cyrillic``
                - ``czech``
                - ``dutch``
                - ``english``
                - ``french``
                - ``german``
                - ``greek``
                - ``greeklatin``
                - ``hebrew``
                - ``hungarian``
                - ``italian``
                - ``latvian``
                - ``polish``
                - ``portuguese``
                - ``romanian``
                - ``russian``
                - ``spanish``
                - ``turkish``

        name_mode : str
            The name mode of the algorithm (only the first three characters,
            lowercased, are considered; unrecognized values fall back to
            ``gen``):

                - ``gen`` -- general (default)
                - ``ash`` -- Ashkenazi
                - ``sep`` -- Sephardic

        match_mode : str
            Matching mode: ``approx`` or ``exact`` (any value other than
            ``exact`` is treated as ``approx``)
        concat : bool
            Concatenation mode
        filter_langs : bool
            Filter out incompatible languages: when True, language names that
            are unknown or unavailable in the selected name mode are skipped
            instead of raising ValueError

        Returns
        -------
        str
            The Beider-Morse phonetic value(s), separated by spaces

        Raises
        ------
        ValueError
            Unknown language

        Examples
        --------
        >>> pe = BeiderMorse()
        >>> pe.encode('Christopher')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir
        xristofir xristYfir xristopi xritopir xritopi xristofi xritofir
        xritofi tzristopir tzristofir zristopir zristopi zritopir zritopi
        zristofir zristofi zritofir zritofi'
        >>> pe.encode('Niall')
        'nial niol'
        >>> pe.encode('Smith')
        'zmit'
        >>> pe.encode('Schmidt')
        'zmit stzmit'

        >>> pe.encode('Christopher', language_arg='German')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir
        xristofir xristYfir'
        >>> pe.encode('Christopher', language_arg='English')
        'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir
        xristafir xrQstafir'
        >>> pe.encode('Christopher', language_arg='German', name_mode='ash')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir
        xristofir xristYfir'

        >>> pe.encode('Christopher', language_arg='German', match_mode='exact')
        'xriStopher xriStofer xristopher xristofer'

        """
        word = normalize('NFC', text_type(word.strip().lower()))

        name_mode = name_mode.strip().lower()[:3]
        if name_mode not in {'ash', 'sep', 'gen'}:
            name_mode = 'gen'

        if match_mode != 'exact':
            match_mode = 'approx'

        # Translate the supplied language_arg value into an integer
        # representing a set of languages
        all_langs = (
            sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages']) - 1
        )
        lang_choices = 0
        if isinstance(language_arg, (int, float, long)):
            lang_choices = int(language_arg)
        elif language_arg != '' and isinstance(language_arg, (text_type, str)):
            for lang in text_type(language_arg).lower().split(','):
                # tolerate input such as 'german, english'
                lang = lang.strip()
                if lang in _LANG_DICT and (_LANG_DICT[lang] & all_langs):
                    # OR rather than add, so that a language named twice does
                    # not sum to a different language's code
                    lang_choices |= _LANG_DICT[lang]
                elif not filter_langs:
                    raise ValueError(
                        "Unknown '{}' language: '{}'".format(name_mode, lang)
                    )

        # Language choices are either all incompatible with the name mode or
        # no choices were given, so try to autodetect
        if lang_choices == 0:
            language_arg = self._language(word, name_mode)
        else:
            language_arg = lang_choices
        lang_index = self._language_index_from_code(language_arg, name_mode)

        mode_data = BMDATA[name_mode]
        rules = mode_data['rules'][lang_index]
        final_rules1 = mode_data[match_mode]['common']
        final_rules2 = mode_data[match_mode][lang_index]

        result = self._phonetic(
            word,
            name_mode,
            rules,
            final_rules1,
            final_rules2,
            language_arg,
            concat,
        )
        return self._phonetic_numbers(result)
934
935
936 1
def bmpm(
    word,
    language_arg=0,
    name_mode='gen',
    match_mode='approx',
    concat=False,
    filter_langs=False,
):
    """Return the Beider-Morse Phonetic Matching encoding(s) of a term.

    This is a wrapper for :py:meth:`BeiderMorse.encode`: it creates a fresh
    :py:class:`BeiderMorse` object and forwards every argument unchanged.

    Parameters
    ----------
    word : str
        The word to transform
    language_arg : int or str
        The language of the term; supported values include:

            - ``any``
            - ``arabic``
            - ``cyrillic``
            - ``czech``
            - ``dutch``
            - ``english``
            - ``french``
            - ``german``
            - ``greek``
            - ``greeklatin``
            - ``hebrew``
            - ``hungarian``
            - ``italian``
            - ``latvian``
            - ``polish``
            - ``portuguese``
            - ``romanian``
            - ``russian``
            - ``spanish``
            - ``turkish``

    name_mode : str
        The name mode of the algorithm:

            - ``gen`` -- general (default)
            - ``ash`` -- Ashkenazi
            - ``sep`` -- Sephardic

    match_mode : str
        Matching mode: ``approx`` or ``exact``
    concat : bool
        Concatenation mode
    filter_langs : bool
        Filter out incompatible languages

    Returns
    -------
    str
        The Beider-Morse phonetic value(s)

    Examples
    --------
    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi
    tzristopir tzristofir zristopir zristopi zritopir zritopi zristofir
    zristofi zritofir zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir
    xristafir xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'

    """
    encoder = BeiderMorse()
    return encoder.encode(
        word,
        language_arg=language_arg,
        name_mode=name_mode,
        match_mode=match_mode,
        concat=concat,
        filter_langs=filter_langs,
    )
1026
1027
1028
if __name__ == '__main__':
    import doctest

    # The docstring examples in this module wrap long expected values over
    # several lines, so whitespace has to be normalized for the expected and
    # actual output to compare equal when the module is run directly.
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
1032