Completed
Pull Request — master (#141)
by Chris
11:04
created

BeiderMorse._phonetic_number()   A

Complexity

Conditions 2

Size

Total Lines 14
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 4
nop 2
dl 0
loc 14
ccs 4
cts 4
cp 1
crap 2
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# This file is based on Alexander Beider and Stephen P. Morse's implementation
7
# of the Beider-Morse Phonetic Matching (BMPM) System, available at
8
# http://stevemorse.org/phonetics/bmpm.htm.
9
#
10
# Abydos is free software: you can redistribute it and/or modify
11
# it under the terms of the GNU General Public License as published by
12
# the Free Software Foundation, either version 3 of the License, or
13
# (at your option) any later version.
14
#
15
# Abydos is distributed in the hope that it will be useful,
16
# but WITHOUT ANY WARRANTY; without even the implied warranty of
17
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
# GNU General Public License for more details.
19
#
20
# You should have received a copy of the GNU General Public License
21
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
22
23 1
"""abydos.phonetic._beider_morse.
24
25
Beider-Morse Phonetic Matching (BMPM) algorithm
26
"""
27
28 1
from __future__ import (
29
    absolute_import,
30
    division,
31
    print_function,
32
    unicode_literals,
33
)
34
35 1
from re import search
36 1
from unicodedata import normalize
37
38 1
from six import PY3, text_type
39 1
from six.moves import range
40
41 1
from ._beider_morse_data import (
42
    BMDATA,
43
    L_ANY,
44
    L_ARABIC,
45
    L_CYRILLIC,
46
    L_CZECH,
47
    L_DUTCH,
48
    L_ENGLISH,
49
    L_FRENCH,
50
    L_GERMAN,
51
    L_GREEK,
52
    L_GREEKLATIN,
53
    L_HEBREW,
54
    L_HUNGARIAN,
55
    L_ITALIAN,
56
    L_LATVIAN,
57
    L_NONE,
58
    L_POLISH,
59
    L_PORTUGUESE,
60
    L_ROMANIAN,
61
    L_RUSSIAN,
62
    L_SPANISH,
63
    L_TURKISH,
64
)
65 1
from ._phonetic import Phonetic
66
67 1
__all__ = ['BeiderMorse', 'bmpm']
68
69
if PY3:
70
    long = int
0 ignored issues
show
Coding Style Naming introduced by
The name long does not conform to the class naming conventions ([A-Z_][a-zA-Z0-9]+$).

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
71
72 1
# Lookup table from a language name (as listed under BMDATA[...]['languages'])
# to its L_* language constant.
_LANG_DICT = dict(
    (
        ('any', L_ANY),
        ('arabic', L_ARABIC),
        ('cyrillic', L_CYRILLIC),
        ('czech', L_CZECH),
        ('dutch', L_DUTCH),
        ('english', L_ENGLISH),
        ('french', L_FRENCH),
        ('german', L_GERMAN),
        ('greek', L_GREEK),
        ('greeklatin', L_GREEKLATIN),
        ('hebrew', L_HEBREW),
        ('hungarian', L_HUNGARIAN),
        ('italian', L_ITALIAN),
        ('latvian', L_LATVIAN),
        ('polish', L_POLISH),
        ('portuguese', L_PORTUGUESE),
        ('romanian', L_ROMANIAN),
        ('russian', L_RUSSIAN),
        ('spanish', L_SPANISH),
        ('turkish', L_TURKISH),
    )
)
94
95 1
# Generic mode: prefixes stripped from the start of a name before encoding.
BMDATA['gen']['discards'] = set(
    (
        'da ',
        'dal ',
        'de ',
        'del ',
        'dela ',
        'de la ',
        'della ',
        'des ',
        'di ',
        'do ',
        'dos ',
        'du ',
        'van ',
        'von ',
        'd\'',
    )
)
# Sephardic mode: whole words dropped from a name before encoding.
BMDATA['sep']['discards'] = set(
    (
        'al',
        'el',
        'da',
        'dal',
        'de',
        'del',
        'dela',
        'de la',
        'della',
        'des',
        'di',
        'do',
        'dos',
        'du',
        'van',
        'von',
    )
)
# Ashkenazic mode: words dropped when they lead a multi-word name.
BMDATA['ash']['discards'] = set(('bar', 'ben', 'da', 'de', 'van', 'von'))
131
132
# Layout of a rule tuple: (pattern, left context, right context, phonetic).
_PATTERN_POS, _LCONTEXT_POS, _RCONTEXT_POS, _PHONETIC_POS = range(4)
137
138
139 1
class BeiderMorse(Phonetic):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
140
    """Beider-Morse Phonetic Matching.
141
142
    The Beider-Morse Phonetic Matching algorithm is described in
143
    :cite:`Beider:2008`.
144
    The reference implementation is licensed under GPLv3.
145
    """
146
147 1
    def _language(self, name, name_mode):
        """Guess which languages a name may belong to.

        Every language rule whose regexp matches the name narrows the set of
        candidate languages (a bitmask): accepting rules intersect with their
        languages, rejecting rules intersect with the complement.

        Args:
            name (str): The term to guess the language of
            name_mode (str): the name mode of the algorithm: 'gen' (default),
                    'ash' (Ashkenazi), or 'sep' (Sephardic)

        Returns:
            int: Language ID

        """
        normalized = name.strip().lower()
        mode_data = BMDATA[name_mode]
        language_rules = mode_data['language_rules']
        # bitmask with every language of this mode set (minus the 'any' bit)
        full_mask = (
            sum(_LANG_DICT[lang] for lang in mode_data['languages']) - 1
        )

        remaining = full_mask
        for letters, languages, accept in language_rules:
            if search(letters, normalized) is None:
                continue
            if accept:
                remaining &= languages
            else:
                # complement, clipped to the bits valid for this mode
                remaining &= (~languages) % (full_mask + 1)

        # an empty candidate set means "no idea": fall back to any language
        return L_ANY if remaining == L_NONE else remaining
175
176 1
    def _redo_language(
        self, term, name_mode, rules, final_rules1, final_rules2, concat
    ):
        """Re-guess the language of a term, then encode it.

        Used on the pieces of a split multi-word term.

        Args:
            term (str): The term to encode via Beider-Morse
            name_mode (str): The name mode of the algorithm: ``gen`` (default),
                ``ash`` (Ashkenazi), or ``sep`` (Sephardic)
            rules (tuple): The set of initial phonetic transform regexps
            final_rules1 (tuple): The common set of final phonetic transform
                regexps
            final_rules2 (tuple): The specific set of final phonetic transform
                regexps
            concat (bool): A flag to indicate concatenation

        Returns:
            str: A BMPM code

        """
        return self._phonetic(
            term,
            name_mode,
            rules,
            final_rules1,
            final_rules2,
            language_arg=self._language(term, name_mode),
            concat=concat,
        )
208
209 1
    def _phonetic(
0 ignored issues
show
best-practice introduced by
Too many arguments (8/5)
Loading history...
Comprehensibility introduced by
This function exceeds the maximum number of variables (29/15).
Loading history...
210
        self,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
211
        term,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
212
        name_mode,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
213
        rules,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
214
        final_rules1,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
215
        final_rules2,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
216
        language_arg=0,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
217
        concat=False,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
218
    ):
219
        """Return the Beider-Morse encoding(s) of a term.
220
221
        Args:
222
            term (str): The term to encode via Beider-Morse
223
            name_mode (str): The name mode of the algorithm: ``gen`` (default),
224
                ``ash`` (Ashkenazi), or ``sep`` (Sephardic)
225
            rules (tuple): The set of initial phonetic transform regexps
226
            final_rules1 (tuple): The common set of final phonetic transform
227
                regexps
228
            final_rules2 (tuple): The specific set of final phonetic transform
229
                regexps
230
            language_arg (int): The language of the term
231
            concat (bool): A flag to indicate concatenation
232
233
        Returns:
234
            str: A BMPM code
235
236
        """
237 1
        term = term.replace('-', ' ').strip()
238
239 1
        if name_mode == 'gen':  # generic case
240
            # discard and concatenate certain words if at the start of the name
241 1
            for pfx in BMDATA['gen']['discards']:
242 1
                if term.startswith(pfx):
243 1
                    remainder = term[len(pfx) :]
244 1
                    combined = pfx[:-1] + remainder
245 1
                    result = (
246
                        self._redo_language(
247
                            remainder,
248
                            name_mode,
249
                            rules,
250
                            final_rules1,
251
                            final_rules2,
252
                            concat,
253
                        )
254
                        + '-'
255
                        + self._redo_language(
256
                            combined,
257
                            name_mode,
258
                            rules,
259
                            final_rules1,
260
                            final_rules2,
261
                            concat,
262
                        )
263
                    )
264 1
                    return result
265
266 1
        words = (
267
            term.split()
268
        )  # create array of the individual words in the name
269 1
        words2 = []
270
271 1
        if name_mode == 'sep':  # Sephardic case
272
            # for each word in the name, delete portions of word preceding
273
            # apostrophe
274
            # ex: d'avila d'aguilar --> avila aguilar
275
            # also discard certain words in the name
276
277
            # note that we can never get a match on "de la" because we are
278
            # checking single words below
279
            # this is a bug, but I won't try to fix it now
280
281 1
            for word in words:
282 1
                word = word[word.rfind('\'') + 1 :]
283 1
                if word not in BMDATA['sep']['discards']:
284 1
                    words2.append(word)
285
286 1
        elif name_mode == 'ash':  # Ashkenazic case
287
            # discard certain words if at the start of the name
288 1
            if len(words) > 1 and words[0] in BMDATA['ash']['discards']:
289 1
                words2 = words[1:]
290
            else:
291 1
                words2 = list(words)
292
        else:
293 1
            words2 = list(words)
294
295 1
        if concat:
296
            # concatenate the separate words of a multi-word name
297
            # (normally used for exact matches)
298 1
            term = ' '.join(words2)
299 1
        elif len(words2) == 1:  # not a multi-word name
300 1
            term = words2[0]
301
        else:
302
            # encode each word in a multi-word name separately
303
            # (normally used for approx matches)
304 1
            result = '-'.join(
305
                [
306
                    self._redo_language(
307
                        w, name_mode, rules, final_rules1, final_rules2, concat
308
                    )
309
                    for w in words2
310
                ]
311
            )
312 1
            return result
313
314 1
        term_length = len(term)
315
316
        # apply language rules to map to phonetic alphabet
317 1
        phonetic = ''
318 1
        skip = 0
319 1
        for i in range(term_length):
320 1
            if skip:
321 1
                skip -= 1
322 1
                continue
323 1
            found = False
324 1
            for rule in rules:
325 1
                pattern = rule[_PATTERN_POS]
326 1
                pattern_length = len(pattern)
327 1
                lcontext = rule[_LCONTEXT_POS]
328 1
                rcontext = rule[_RCONTEXT_POS]
329
330
                # check to see if next sequence in input matches the string in
331
                # the rule
332 1
                if (pattern_length > term_length - i) or (
333
                    term[i : i + pattern_length] != pattern
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
334
                ):  # no match
335 1
                    continue
336
337 1
                right = '^' + rcontext
338 1
                left = lcontext + '$'
339
340
                # check that right context is satisfied
341 1
                if rcontext != '':
342 1
                    if not search(right, term[i + pattern_length :]):
343 1
                        continue
344
345
                # check that left context is satisfied
346 1
                if lcontext != '':
347 1
                    if not search(left, term[:i]):
348 1
                        continue
349
350
                # check for incompatible attributes
351 1
                candidate = self._apply_rule_if_compat(
352
                    phonetic, rule[_PHONETIC_POS], language_arg
353
                )
354
                # The below condition shouldn't ever be false
355 1
                if candidate is not None:  # pragma: no branch
356 1
                    phonetic = candidate
357 1
                    found = True
358 1
                    break
359
360 1
            if (
361
                not found
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
362
            ):  # character in name that is not in table -- e.g., space
363 1
                pattern_length = 1
364 1
            skip = pattern_length - 1
0 ignored issues
show
introduced by
The variable pattern_length does not seem to be defined for all execution paths.
Loading history...
365
366
        # apply final rules on phonetic-alphabet,
367
        # doing a substitution of certain characters
368 1
        phonetic = self._apply_final_rules(
369
            phonetic, final_rules1, language_arg, False
370
        )  # apply common rules
371
        # final_rules1 are the common approx rules,
372
        # final_rules2 are approx rules for specific language
373 1
        phonetic = self._apply_final_rules(
374
            phonetic, final_rules2, language_arg, True
375
        )  # apply lang specific rules
376
377 1
        return phonetic
378
379 1
    def _apply_final_rules(self, phonetic, final_rules, language_arg, strip):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (21/15).
Loading history...
380
        """Apply a set of final rules to the phonetic encoding.
381
382
        Args:
383
            phonetic (str): The term to which to apply the final rules
384
            final_rules (tuple): The set of final phonetic transform regexps
385
            language_arg (int): An integer representing the target language of
386
                the phonetic encoding
387
            strip (bool): Flag to indicate whether to normalize the language
388
                attributes
389
390
        Returns:
391
            str: A BMPM code
392
393
        """
394
        # optimization to save time
395 1
        if not final_rules:
396 1
            return phonetic
397
398
        # expand the result
399 1
        phonetic = self._expand_alternates(phonetic)
400 1
        phonetic_array = phonetic.split('|')
401
402 1
        for k in range(len(phonetic_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
403 1
            phonetic = phonetic_array[k]
404 1
            phonetic2 = ''
405 1
            phoneticx = self._normalize_lang_attrs(phonetic, True)
406
407 1
            i = 0
408 1
            while i < len(phonetic):
409 1
                found = False
410
411 1
                if phonetic[i] == '[':  # skip over language attribute
412 1
                    attrib_start = i
413 1
                    i += 1
414 1
                    while True:
415 1
                        if phonetic[i] == ']':
416 1
                            i += 1
417 1
                            phonetic2 += phonetic[attrib_start:i]
418 1
                            break
419 1
                        i += 1
420 1
                    continue
421
422 1
                for rule in final_rules:
423 1
                    pattern = rule[_PATTERN_POS]
424 1
                    pattern_length = len(pattern)
425 1
                    lcontext = rule[_LCONTEXT_POS]
426 1
                    rcontext = rule[_RCONTEXT_POS]
427
428 1
                    right = '^' + rcontext
429 1
                    left = lcontext + '$'
430
431
                    # check to see if next sequence in phonetic matches the
432
                    # string in the rule
433 1
                    if (pattern_length > len(phoneticx) - i) or phoneticx[
434
                        i : i + pattern_length
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
435
                    ] != pattern:
436 1
                        continue
437
438
                    # check that right context is satisfied
439 1
                    if rcontext != '':
440 1
                        if not search(right, phoneticx[i + pattern_length :]):
441 1
                            continue
442
443
                    # check that left context is satisfied
444 1
                    if lcontext != '':
445 1
                        if not search(left, phoneticx[:i]):
446 1
                            continue
447
448
                    # check for incompatible attributes
449 1
                    candidate = self._apply_rule_if_compat(
450
                        phonetic2, rule[_PHONETIC_POS], language_arg
451
                    )
452
                    # The below condition shouldn't ever be false
453 1
                    if candidate is not None:  # pragma: no branch
454 1
                        phonetic2 = candidate
455 1
                        found = True
456 1
                        break
457
458 1
                if not found:
459
                    # character in name for which there is no substitution in
460
                    # the table
461 1
                    phonetic2 += phonetic[i]
462 1
                    pattern_length = 1
463
464 1
                i += pattern_length
0 ignored issues
show
introduced by
The variable pattern_length does not seem to be defined for all execution paths.
Loading history...
465
466 1
            phonetic_array[k] = self._expand_alternates(phonetic2)
467
468 1
        phonetic = '|'.join(phonetic_array)
469 1
        if strip:
470 1
            phonetic = self._normalize_lang_attrs(phonetic, True)
471
472 1
        if '|' in phonetic:
473 1
            phonetic = '(' + self._remove_dupes(phonetic) + ')'
474
475 1
        return phonetic
476
477 1
    def _phonetic_number(self, phonetic):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
478
        """Remove bracketed text from the end of a string.
479
480
        Args:
481
            phonetic (str): A Beider-Morse phonetic encoding
482
483
        Returns:
484
            str: A BMPM code
485
486
        """
487 1
        if '[' in phonetic:
488 1
            return phonetic[: phonetic.find('[')]
489
490 1
        return phonetic  # experimental !!!!
491
492 1
    def _expand_alternates(self, phonetic):
493
        """Expand phonetic alternates separated by |s.
494
495
        Args:
496
            phonetic (str): A Beider-Morse phonetic encoding
497
498
        Returns:
499
            str: A BMPM code
500
501
        """
502 1
        alt_start = phonetic.find('(')
503 1
        if alt_start == -1:
504 1
            return self._normalize_lang_attrs(phonetic, False)
505
506 1
        prefix = phonetic[:alt_start]
507 1
        alt_start += 1  # get past the (
508 1
        alt_end = phonetic.find(')', alt_start)
509 1
        alt_string = phonetic[alt_start:alt_end]
510 1
        alt_end += 1  # get past the )
511 1
        suffix = phonetic[alt_end:]
512 1
        alt_array = alt_string.split('|')
513 1
        result = ''
514
515 1
        for i in range(len(alt_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
516 1
            alt = alt_array[i]
517 1
            alternate = self._expand_alternates(prefix + alt + suffix)
518 1
            if alternate != '' and alternate != '[0]':
519 1
                if result != '':
520 1
                    result += '|'
521 1
                result += alternate
522
523 1
        return result
524
525 1
    def _pnums_with_leading_space(self, phonetic):
526
        """Join prefixes & suffixes in cases of alternate phonetic values.
527
528
        Args:
529
            phonetic (str): A Beider-Morse phonetic encoding
530
531
        Returns:
532
            str: A BMPM code
533
534
        """
535 1
        alt_start = phonetic.find('(')
536 1
        if alt_start == -1:
537 1
            return ' ' + self._phonetic_number(phonetic)
538
539 1
        prefix = phonetic[:alt_start]
540 1
        alt_start += 1  # get past the (
541 1
        alt_end = phonetic.find(')', alt_start)
542 1
        alt_string = phonetic[alt_start:alt_end]
543 1
        alt_end += 1  # get past the )
544 1
        suffix = phonetic[alt_end:]
545 1
        alt_array = alt_string.split('|')
546 1
        result = ''
547 1
        for alt in alt_array:
548 1
            result += self._pnums_with_leading_space(prefix + alt + suffix)
549
550 1
        return result
551
552 1
    def _phonetic_numbers(self, phonetic):
553
        """Prepare & join phonetic numbers.
554
555
        Split phonetic value on '-', run through _pnums_with_leading_space,
556
        and join with ' '
557
558
        Args:
559
            phonetic (str): A Beider-Morse phonetic encoding
560
561
        Returns:
562
            str: A BMPM code
563
564
        """
565 1
        phonetic_array = phonetic.split('-')  # for names with spaces in them
566 1
        result = ' '.join(
567
            [self._pnums_with_leading_space(i)[1:] for i in phonetic_array]
568
        )
569 1
        return result
570
571 1
    def _remove_dupes(self, phonetic):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
572
        """Remove duplicates from a phonetic encoding list.
573
574
        Args:
575
            phonetic (str): A Beider-Morse phonetic encoding
576
577
        Returns:
578
            str: A BMPM code
579
580
        """
581 1
        alt_string = phonetic
582 1
        alt_array = alt_string.split('|')
583
584 1
        result = '|'
585 1
        for i in range(len(alt_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
586 1
            alt = alt_array[i]
587 1
            if alt and '|' + alt + '|' not in result:
588 1
                result += alt + '|'
589
590 1
        return result[1:-1]  # remove leading and trailing |
591
592 1
    def _normalize_lang_attrs(self, text, strip):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
593
        """Remove embedded bracketed attributes.
594
595
        This (potentially) bitwise-ands bracketed attributes together and adds
596
        to the end.
597
        This is applied to a single alternative at a time -- not to a
598
        parenthesized list.
599
        It removes all embedded bracketed attributes, logically-ands them
600
        together, and places them at the end.
601
        However if strip is true, this can indeed remove embedded bracketed
602
        attributes from a parenthesized list.
603
604
        Args:
605
            text (str): A Beider-Morse phonetic encoding (in progress)
606
            strip (bool): Remove the bracketed attributes (and throw away)
607
608
        Returns:
609
            str: A BMPM code
610
611
        Raises:
612
            ValueError: No closing square bracket
613
614
        """
615 1
        uninitialized = -1  # all 1's
616 1
        attrib = uninitialized
617 1
        while '[' in text:
618 1
            bracket_start = text.find('[')
619 1
            bracket_end = text.find(']', bracket_start)
620 1
            if bracket_end == -1:
621 1
                raise ValueError(
622
                    'No closing square bracket: text=('
623
                    + text
624
                    + ') strip=('
625
                    + text_type(strip)
626
                    + ')'
627
                )
628 1
            attrib &= int(text[bracket_start + 1 : bracket_end])
629 1
            text = text[:bracket_start] + text[bracket_end + 1 :]
630
631 1
        if attrib == uninitialized or strip:
632 1
            return text
633 1
        elif attrib == 0:
634
            # means that the attributes were incompatible and there is no
635
            # alternative here
636 1
            return '[0]'
637 1
        return text + '[' + str(attrib) + ']'
638
639 1
    def _apply_rule_if_compat(self, phonetic, target, language_arg):
640
        """Apply a phonetic regex if compatible.
641
642
        tests for compatible language rules
643
644
        to do so, apply the rule, expand the results, and detect alternatives
645
            with incompatible attributes
646
647
        then drop each alternative that has incompatible attributes and keep
648
            those that are compatible
649
650
        if there are no compatible alternatives left, return false
651
652
        otherwise return the compatible alternatives
653
654
        apply the rule
655
656
        Args:
657
            phonetic (str): The Beider-Morse phonetic encoding (so far)
658
            target (str): A proposed addition to the phonetic encoding
659
            language_arg (int): An integer representing the target language of
660
                the phonetic encoding
661
662
        Returns:
663
            str: A candidate encoding
664
665
        """
666 1
        candidate = phonetic + target
667 1
        if '[' not in candidate:  # no attributes so we need test no further
668 1
            return candidate
669
670
        # expand the result, converting incompatible attributes to [0]
671 1
        candidate = self._expand_alternates(candidate)
672 1
        candidate_array = candidate.split('|')
673
674
        # drop each alternative that has incompatible attributes
675 1
        candidate = ''
676 1
        found = False
677
678 1
        for i in range(len(candidate_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
679 1
            this_candidate = candidate_array[i]
680 1
            if language_arg != 1:
681 1
                this_candidate = self._normalize_lang_attrs(
682
                    this_candidate + '[' + str(language_arg) + ']', False
683
                )
684 1
            if this_candidate != '[0]':
685 1
                found = True
686 1
                if candidate:
687 1
                    candidate += '|'
688 1
                candidate += this_candidate
689
690
        # return false if no compatible alternatives remain
691 1
        if not found:
692 1
            return None
693
694
        # return the result of applying the rule
695 1
        if '|' in candidate:
696 1
            candidate = '(' + candidate + ')'
697 1
        return candidate
698
699 1
    def _language_index_from_code(self, code, name_mode):
        """Validate a language code for the given name mode.

        L_ANY is returned when the code is out of bounds or has more than
        one bit set (i.e. names more than one language); otherwise the code
        is returned unchanged.

        Args:
            code (int): The language code to interpret
            name_mode (str): The name mode of the algorithm: ``gen`` (default),
                    ``ash`` (Ashkenazi), or ``sep`` (Sephardic)

        Returns:
            int: Language code index

        """
        if code < 1:  # code out of range
            return L_ANY

        upper_bound = sum(
            _LANG_DICT[lang] for lang in BMDATA[name_mode]['languages']
        )
        if code > upper_bound:  # code out of range
            return L_ANY

        # a power of two has exactly one bit set, so code & (code - 1) is 0
        if code & (code - 1):
            # choice was more than one language; use any
            return L_ANY
        return code
723
724 1
    def encode(
        self,
        word,
        language_arg=0,
        name_mode='gen',
        match_mode='approx',
        concat=False,
        filter_langs=False,
    ):
        """Return the Beider-Morse Phonetic Matching encoding(s) of a term.

        Args:
            word (str): The word to transform
            language_arg (int or str): The language(s) of the term, given
                either as an integer language code (bit mask) or as a
                comma-separated string of language names (case-insensitive,
                surrounding whitespace ignored). ``0`` or ``''`` requests
                automatic language detection. Supported names include:
                    - ``any``
                    - ``arabic``
                    - ``cyrillic``
                    - ``czech``
                    - ``dutch``
                    - ``english``
                    - ``french``
                    - ``german``
                    - ``greek``
                    - ``greeklatin``
                    - ``hebrew``
                    - ``hungarian``
                    - ``italian``
                    - ``latvian``
                    - ``polish``
                    - ``portuguese``
                    - ``romanian``
                    - ``russian``
                    - ``spanish``
                    - ``turkish``
            name_mode (str): The name mode of the algorithm (only the first
                three characters are significant; unrecognized values fall
                back to ``gen``):
                - ``gen`` -- general (default)
                - ``ash`` -- Ashkenazi
                - ``sep`` -- Sephardic
            match_mode (str): Matching mode: ``approx`` or ``exact`` (any
                value other than ``exact`` is treated as ``approx``)
            concat (bool): Concatenation mode
            filter_langs (bool): Filter out incompatible languages instead of
                raising an error for them

        Returns:
            str: The BMPM value(s), in the form shown in the examples below

        Raises:
            ValueError: A named language is unknown, or is not available in
                the selected name mode, and ``filter_langs`` is False

        Examples:
            >>> pe = BeiderMorse()
            >>> pe.encode('Christopher')
            'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir
            xristofir xristYfir xristopi xritopir xritopi xristofi xritofir
            xritofi tzristopir tzristofir zristopir zristopi zritopir zritopi
            zristofir zristofi zritofir zritofi'
            >>> pe.encode('Niall')
            'nial niol'
            >>> pe.encode('Smith')
            'zmit'
            >>> pe.encode('Schmidt')
            'zmit stzmit'

            >>> pe.encode('Christopher', language_arg='German')
            'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir
            xristofir xristYfir'
            >>> pe.encode('Christopher', language_arg='English')
            'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir
            xristafir xrQstafir'
            >>> pe.encode('Christopher', language_arg='German',
            ... name_mode='ash')
            'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir
            xristofir xristYfir'

            >>> pe.encode('Christopher', language_arg='German',
            ... match_mode='exact')
            'xriStopher xriStofer xristopher xristofer'

        """
        word = normalize('NFC', text_type(word.strip().lower()))

        name_mode = name_mode.strip().lower()[:3]
        if name_mode not in {'ash', 'sep', 'gen'}:
            name_mode = 'gen'

        if match_mode != 'exact':
            match_mode = 'approx'

        # Translate the supplied language_arg value into an integer
        # representing a set of languages.
        # all_langs is the sum of the codes of this name mode's languages,
        # less one; it is used below as a mask to test availability.
        # NOTE(review): presumably the "- 1" removes the ``any`` bit --
        # confirm against _beider_morse_data.
        all_langs = (
            sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages']) - 1
        )
        lang_choices = 0
        # NOTE(review): ``long`` is a builtin only on Python 2; on Python 3
        # it is expected to be aliased at module level (not visible here).
        if isinstance(language_arg, (int, float, long)):
            lang_choices = int(language_arg)
        elif language_arg != '' and isinstance(language_arg, (text_type, str)):
            for lang in text_type(language_arg).lower().split(','):
                # Tolerate whitespace around names: 'german, english'
                lang = lang.strip()
                if lang in _LANG_DICT and (_LANG_DICT[lang] & all_langs):
                    # OR rather than add: adding would let a language named
                    # twice carry into a different language's bit.
                    lang_choices |= _LANG_DICT[lang]
                elif not filter_langs:
                    raise ValueError(
                        "Unknown '{}' language: '{}'".format(name_mode, lang)
                    )

        # Language choices are either all incompatible with the name mode or
        # no choices were given, so try to autodetect
        if lang_choices == 0:
            language_arg = self._language(word, name_mode)
        else:
            language_arg = lang_choices
        language_arg2 = self._language_index_from_code(language_arg, name_mode)

        rules = BMDATA[name_mode]['rules'][language_arg2]
        final_rules1 = BMDATA[name_mode][match_mode]['common']
        final_rules2 = BMDATA[name_mode][match_mode][language_arg2]

        result = self._phonetic(
            word,
            name_mode,
            rules,
            final_rules1,
            final_rules2,
            language_arg,
            concat,
        )
        return self._phonetic_numbers(result)
857
858
859 1
def bmpm(
    word,
    language_arg=0,
    name_mode='gen',
    match_mode='approx',
    concat=False,
    filter_langs=False,
):
    """Return the Beider-Morse Phonetic Matching encoding(s) of a term.

    This is a convenience wrapper: it builds a fresh :py:class:`BeiderMorse`
    instance and delegates to :py:meth:`BeiderMorse.encode`, forwarding every
    argument unchanged.

    Args:
        word (str): The word to transform
        language_arg (int or str): The language of the term; supported
            language names include:
                - ``any``
                - ``arabic``
                - ``cyrillic``
                - ``czech``
                - ``dutch``
                - ``english``
                - ``french``
                - ``german``
                - ``greek``
                - ``greeklatin``
                - ``hebrew``
                - ``hungarian``
                - ``italian``
                - ``latvian``
                - ``polish``
                - ``portuguese``
                - ``romanian``
                - ``russian``
                - ``spanish``
                - ``turkish``
        name_mode (str): The name mode of the algorithm:
            - ``gen`` -- general (default)
            - ``ash`` -- Ashkenazi
            - ``sep`` -- Sephardic
        match_mode (str): Matching mode: ``approx`` or ``exact``
        concat (bool): Concatenation mode
        filter_langs (bool): Filter out incompatible languages

    Returns:
        str: The BMPM value(s), in the form shown in the examples below

    Examples:
        >>> bmpm('Christopher')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
        xristYfir xristopi xritopir xritopi xristofi xritofir xritofi
        tzristopir tzristofir zristopir zristopi zritopir zritopi zristofir
        zristofi zritofir zritofi'
        >>> bmpm('Niall')
        'nial niol'
        >>> bmpm('Smith')
        'zmit'
        >>> bmpm('Schmidt')
        'zmit stzmit'

        >>> bmpm('Christopher', language_arg='German')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
        xristYfir'
        >>> bmpm('Christopher', language_arg='English')
        'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir
        xristafir xrQstafir'
        >>> bmpm('Christopher', language_arg='German', name_mode='ash')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
        xristYfir'

        >>> bmpm('Christopher', language_arg='German', match_mode='exact')
        'xriStopher xriStofer xristopher xristofer'

    """
    encoder = BeiderMorse()
    return encoder.encode(
        word,
        language_arg=language_arg,
        name_mode=name_mode,
        match_mode=match_mode,
        concat=concat,
        filter_langs=filter_langs,
    )
936
937
938
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()
942