Completed
Pull Request — master (#135)
by Chris
11:32
created

BeiderMorse._normalize_lang_attrs()   B

Complexity

Conditions 6

Size

Total Lines 38
Code Lines 20

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 6

Importance

Changes 0
Metric Value
eloc 20
dl 0
loc 38
ccs 15
cts 15
cp 1
rs 8.4666
c 0
b 0
f 0
cc 6
nop 3
crap 6
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# This file is based on Alexander Beider and Stephen P. Morse's implementation
7
# of the Beider-Morse Phonetic Matching (BMPM) System, available at
8
# http://stevemorse.org/phonetics/bmpm.htm.
9
#
10
# Abydos is free software: you can redistribute it and/or modify
11
# it under the terms of the GNU General Public License as published by
12
# the Free Software Foundation, either version 3 of the License, or
13
# (at your option) any later version.
14
#
15
# Abydos is distributed in the hope that it will be useful,
16
# but WITHOUT ANY WARRANTY; without even the implied warranty of
17
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
# GNU General Public License for more details.
19
#
20
# You should have received a copy of the GNU General Public License
21
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
22
23 1
"""abydos.phonetic._bmpm.
24
25
The phonetic._bmpm module implements the Beider-Morse Phonetic Matching (BMPM)
26
algorithm.
27
"""
28
29 1
from __future__ import unicode_literals
30
31 1
from re import search
32 1
from unicodedata import normalize
33
34 1
from six import PY3, text_type
35 1
from six.moves import range
36
37 1
from ._bmdata import (
38
    BMDATA,
39
    L_ANY,
40
    L_ARABIC,
41
    L_CYRILLIC,
42
    L_CZECH,
43
    L_DUTCH,
44
    L_ENGLISH,
45
    L_FRENCH,
46
    L_GERMAN,
47
    L_GREEK,
48
    L_GREEKLATIN,
49
    L_HEBREW,
50
    L_HUNGARIAN,
51
    L_ITALIAN,
52
    L_LATVIAN,
53
    L_NONE,
54
    L_POLISH,
55
    L_PORTUGUESE,
56
    L_ROMANIAN,
57
    L_RUSSIAN,
58
    L_SPANISH,
59
    L_TURKISH,
60
)
61 1
from ._phonetic import Phonetic
62
63 1
__all__ = ['BeiderMorse', 'bmpm']
64
65
if PY3:
66
    long = int
0 ignored issues
show
Coding Style Naming introduced by
The name long does not conform to the class naming conventions ([A-Z_][a-zA-Z0-9]+$).

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
67
68 1
# Lower-case language name -> language constant imported from ._bmdata.
# The methods below sum these values over a name mode's language list.
_LANG_DICT = {
    'any': L_ANY, 'arabic': L_ARABIC, 'cyrillic': L_CYRILLIC,
    'czech': L_CZECH, 'dutch': L_DUTCH, 'english': L_ENGLISH,
    'french': L_FRENCH, 'german': L_GERMAN, 'greek': L_GREEK,
    'greeklatin': L_GREEKLATIN, 'hebrew': L_HEBREW,
    'hungarian': L_HUNGARIAN, 'italian': L_ITALIAN, 'latvian': L_LATVIAN,
    'polish': L_POLISH, 'portuguese': L_PORTUGUESE,
    'romanian': L_ROMANIAN, 'russian': L_RUSSIAN, 'spanish': L_SPANISH,
    'turkish': L_TURKISH,
}

# 'gen' mode: prefixes tested with str.startswith() against the whole name,
# which is why each one carries its own trailing separator.
BMDATA['gen']['discards'] = {
    'da ', 'dal ', 'de ', 'del ', 'dela ', 'de la ', 'della ', 'des ',
    'di ', 'do ', 'dos ', 'du ', 'van ', 'von ', 'd\'',
}

# 'sep' mode: whole words dropped wherever they occur in the name.
BMDATA['sep']['discards'] = {
    'al', 'el', 'da', 'dal', 'de', 'del', 'dela', 'de la', 'della', 'des',
    'di', 'do', 'dos', 'du', 'van', 'von',
}

# 'ash' mode: a word dropped only when it is the first of several words.
BMDATA['ash']['discards'] = {'bar', 'ben', 'da', 'de', 'van', 'von'}
127
128
# Layout of a rule tuple: (pattern, left context, right context, phonetic).
# These are the positions used to index into each rule.
_PATTERN_POS, _LCONTEXT_POS, _RCONTEXT_POS, _PHONETIC_POS = 0, 1, 2, 3
133
134
135 1
class BeiderMorse(Phonetic):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
136
    """Beider-Morse Phonetic Matching.
137
138
    The Beider-Morse Phonetic Matching algorithm is described in
139
    :cite:`Beider:2008`.
140
    The reference implementation is licensed under GPLv3.
141
    """
142
143 1
    def _language(self, name, name_mode):
        """Guess which languages a name may belong to.

        Every language rule of the name mode whose regex matches the
        normalised name either narrows the candidate set to the rule's
        languages (accepting rule) or removes them (rejecting rule).

        :param str name: the term to guess the language of
        :param str name_mode: the name mode of the algorithm: 'gen' (default),
            'ash' (Ashkenazi), or 'sep' (Sephardic)
        :returns: the remaining language bits, or L_ANY if none remain
        :rtype: int
        """
        name = name.strip().lower()
        mode_data = BMDATA[name_mode]
        lang_mask = (
            sum(_LANG_DICT[lang] for lang in mode_data['languages']) - 1
        )

        remaining = lang_mask
        for letters, languages, accept in mode_data['language_rules']:
            if search(letters, name) is None:
                continue
            if accept:
                remaining &= languages
            else:
                # the modulo keeps the complement inside the mask's width
                remaining &= (~languages) % (lang_mask + 1)

        return L_ANY if remaining == L_NONE else remaining
166
167 1
    def _redo_language(
0 ignored issues
show
best-practice introduced by
Too many arguments (7/5)
Loading history...
168
        self, term, name_mode, rules, final_rules1, final_rules2, concat
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
169
    ):
170
        """Reassess the language of the terms and call the phonetic encoder.
171
172
        Uses a split multi-word term.
173
174
        :param str term: the term to encode via Beider-Morse
175
        :param str name_mode: the name mode of the algorithm: 'gen' (default),
176
            'ash' (Ashkenazi), or 'sep' (Sephardic)
177
        :param tuple rules: the set of initial phonetic transform regexps
178
        :param tuple final_rules1: the common set of final phonetic transform
179
            regexps
180
        :param tuple final_rules2: the specific set of final phonetic transform
181
            regexps
182
        :param bool concat: a flag to indicate concatenation
183
        """
184 1
        language_arg = self._language(term, name_mode)
185 1
        return self._phonetic(
186
            term,
187
            name_mode,
188
            rules,
189
            final_rules1,
190
            final_rules2,
191
            language_arg,
192
            concat,
193
        )
194
195 1
    def _phonetic(
        self,
        term,
        name_mode,
        rules,
        final_rules1,
        final_rules2,
        language_arg=0,
        concat=False,
    ):
        """Return the Beider-Morse encoding(s) of a term.

        Hyphens are treated as spaces. Depending on the name mode, leading
        or embedded particles are discarded, and a multi-word name is either
        concatenated or encoded word by word, with the word encodings joined
        by '-'.

        :param str term: the term to encode via Beider-Morse
        :param str name_mode: the name mode of the algorithm: 'gen' (default),
            'ash' (Ashkenazi), or 'sep' (Sephardic)
        :param tuple rules: the set of initial phonetic transform regexps
        :param tuple final_rules1: the common set of final phonetic transform
            regexps
        :param tuple final_rules2: the specific set of final phonetic transform
            regexps
        :param int language_arg: an integer representing the target language of
            the phonetic encoding
        :param bool concat: a flag to indicate concatenation
        :returns: the phonetic encoding of the term
        :rtype: str
        """
        term = term.replace('-', ' ').strip()

        if name_mode == 'gen':  # generic case
            # discard and concatenate certain words if at the start of the
            # name. The discards are kept in a set, whose iteration order is
            # arbitrary, and some of them overlap ('de ' / 'de la '). Walking
            # them in sorted order makes the result repeatable: of two
            # discards that both match, the shorter one sorts first.
            for pfx in sorted(BMDATA['gen']['discards']):
                if term.startswith(pfx):
                    remainder = term[len(pfx) :]
                    # the prefix minus its separator, glued to the rest
                    combined = pfx[:-1] + remainder
                    return (
                        self._redo_language(
                            remainder,
                            name_mode,
                            rules,
                            final_rules1,
                            final_rules2,
                            concat,
                        )
                        + '-'
                        + self._redo_language(
                            combined,
                            name_mode,
                            rules,
                            final_rules1,
                            final_rules2,
                            concat,
                        )
                    )

        # create array of the individual words in the name
        words = term.split()
        words2 = []

        if name_mode == 'sep':  # Sephardic case
            # for each word in the name, delete portions of word preceding
            # apostrophe
            # ex: d'avila d'aguilar --> avila aguilar
            # also discard certain words in the name

            # note that we can never get a match on "de la" because we are
            # checking single words below
            # this is a bug, but I won't try to fix it now

            for word in words:
                word = word[word.rfind('\'') + 1 :]
                if word not in BMDATA['sep']['discards']:
                    words2.append(word)

        elif name_mode == 'ash':  # Ashkenazic case
            # discard certain words if at the start of the name
            if len(words) > 1 and words[0] in BMDATA['ash']['discards']:
                words2 = words[1:]
            else:
                words2 = list(words)
        else:
            words2 = list(words)

        if concat:
            # concatenate the separate words of a multi-word name
            # (normally used for exact matches)
            term = ' '.join(words2)
        elif len(words2) == 1:  # not a multi-word name
            term = words2[0]
        else:
            # encode each word in a multi-word name separately
            # (normally used for approx matches)
            return '-'.join(
                [
                    self._redo_language(
                        w, name_mode, rules, final_rules1, final_rules2, concat
                    )
                    for w in words2
                ]
            )

        term_length = len(term)

        # apply language rules to map to phonetic alphabet
        phonetic = ''
        skip = 0  # characters still covered by the last matched pattern
        for i in range(term_length):
            if skip:
                skip -= 1
                continue
            found = False
            for rule in rules:
                pattern = rule[_PATTERN_POS]
                pattern_length = len(pattern)
                lcontext = rule[_LCONTEXT_POS]
                rcontext = rule[_RCONTEXT_POS]

                # check to see if next sequence in input matches the string in
                # the rule
                if (pattern_length > term_length - i) or (
                    term[i : i + pattern_length] != pattern
                ):  # no match
                    continue

                # check that right context is satisfied; it is anchored to
                # the text immediately following the pattern
                if rcontext != '':
                    if not search('^' + rcontext, term[i + pattern_length :]):
                        continue

                # check that left context is satisfied; it is anchored to
                # the text immediately preceding the pattern
                if lcontext != '':
                    if not search(lcontext + '$', term[:i]):
                        continue

                # check for incompatible attributes
                candidate = self._apply_rule_if_compat(
                    phonetic, rule[_PHONETIC_POS], language_arg
                )
                # The below condition shouldn't ever be false
                if candidate is not None:  # pragma: no branch
                    phonetic = candidate
                    found = True
                    break

            if (
                not found
            ):  # character in name that is not in table -- e.g., space
                pattern_length = 1
            skip = pattern_length - 1

        # apply final rules on phonetic-alphabet,
        # doing a substitution of certain characters
        # final_rules1 are the common approx rules,
        # final_rules2 are approx rules for specific language
        phonetic = self._apply_final_rules(
            phonetic, final_rules1, language_arg, False
        )  # apply common rules
        phonetic = self._apply_final_rules(
            phonetic, final_rules2, language_arg, True
        )  # apply lang specific rules

        return phonetic
360
361 1
    def _apply_final_rules(self, phonetic, final_rules, language_arg, strip):
        """Rewrite a phonetic encoding with a set of final rules.

        The encoding is expanded into its alternatives, each alternative is
        scanned left to right, and at every position the first rule whose
        pattern and contexts match supplies the replacement. Bracketed
        language attributes are copied through unchanged.

        :param str phonetic: the term to which to apply the final rules
        :param tuple final_rules: the set of final phonetic transform regexps
        :param int language_arg: an integer representing the target language of
            the phonetic encoding
        :param bool strip: flag to indicate whether to strip the bracketed
            language attributes from the result
        :returns: the rewritten encoding, parenthesised when it holds several
            alternatives
        :rtype: str
        """
        # optimization to save time
        if not final_rules:
            return phonetic

        alternatives = self._expand_alternates(phonetic).split('|')

        for idx, alt in enumerate(alternatives):
            rewritten = ''
            # patterns and contexts are matched against the attribute-free
            # text, while positions advance through the original alternative
            bare = self._normalize_lang_attrs(alt, True)

            pos = 0
            while pos < len(alt):
                if alt[pos] == '[':  # skip over language attribute
                    attr_end = alt.index(']', pos + 1) + 1
                    rewritten += alt[pos:attr_end]
                    pos = attr_end
                    continue

                for rule in final_rules:
                    pattern = rule[_PATTERN_POS]
                    width = len(pattern)
                    lcontext = rule[_LCONTEXT_POS]
                    rcontext = rule[_RCONTEXT_POS]

                    # does the rule's string occur at this position?
                    if width > len(bare) - pos:
                        continue
                    if bare[pos : pos + width] != pattern:
                        continue

                    # right context must match directly after the pattern
                    if rcontext != '' and not search(
                        '^' + rcontext, bare[pos + width :]
                    ):
                        continue

                    # left context must match directly before the pattern
                    if lcontext != '' and not search(
                        lcontext + '$', bare[:pos]
                    ):
                        continue

                    # check for incompatible attributes
                    candidate = self._apply_rule_if_compat(
                        rewritten, rule[_PHONETIC_POS], language_arg
                    )
                    # The below condition shouldn't ever be false
                    if candidate is not None:  # pragma: no branch
                        rewritten = candidate
                        pos += width
                        break
                else:
                    # no rule applied here: keep the character as it is
                    rewritten += alt[pos]
                    pos += 1

            alternatives[idx] = self._expand_alternates(rewritten)

        phonetic = '|'.join(alternatives)
        if strip:
            phonetic = self._normalize_lang_attrs(phonetic, True)

        if '|' in phonetic:
            phonetic = '(' + self._remove_dupes(phonetic) + ')'

        return phonetic
453
454 1
    def _phonetic_number(self, phonetic):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
455
        """Remove bracketed text from the end of a string.
456
457
        :param str phonetic: a Beider-Morse phonetic encoding
458
        """
459 1
        if '[' in phonetic:
460 1
            return phonetic[: phonetic.find('[')]
461
462 1
        return phonetic  # experimental !!!!
463
464 1
    def _expand_alternates(self, phonetic):
465
        """Expand phonetic alternates separated by |s.
466
467
        :param str phonetic: a Beider-Morse phonetic encoding
468
        """
469 1
        alt_start = phonetic.find('(')
470 1
        if alt_start == -1:
471 1
            return self._normalize_lang_attrs(phonetic, False)
472
473 1
        prefix = phonetic[:alt_start]
474 1
        alt_start += 1  # get past the (
475 1
        alt_end = phonetic.find(')', alt_start)
476 1
        alt_string = phonetic[alt_start:alt_end]
477 1
        alt_end += 1  # get past the )
478 1
        suffix = phonetic[alt_end:]
479 1
        alt_array = alt_string.split('|')
480 1
        result = ''
481
482 1
        for i in range(len(alt_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
483 1
            alt = alt_array[i]
484 1
            alternate = self._expand_alternates(prefix + alt + suffix)
485 1
            if alternate != '' and alternate != '[0]':
486 1
                if result != '':
487 1
                    result += '|'
488 1
                result += alternate
489
490 1
        return result
491
492 1
    def _pnums_with_leading_space(self, phonetic):
493
        """Join prefixes & suffixes in cases of alternate phonetic values.
494
495
        :param str phonetic: a Beider-Morse phonetic encoding
496
        """
497 1
        alt_start = phonetic.find('(')
498 1
        if alt_start == -1:
499 1
            return ' ' + self._phonetic_number(phonetic)
500
501 1
        prefix = phonetic[:alt_start]
502 1
        alt_start += 1  # get past the (
503 1
        alt_end = phonetic.find(')', alt_start)
504 1
        alt_string = phonetic[alt_start:alt_end]
505 1
        alt_end += 1  # get past the )
506 1
        suffix = phonetic[alt_end:]
507 1
        alt_array = alt_string.split('|')
508 1
        result = ''
509 1
        for alt in alt_array:
510 1
            result += self._pnums_with_leading_space(prefix + alt + suffix)
511
512 1
        return result
513
514 1
    def _phonetic_numbers(self, phonetic):
515
        """Prepare & join phonetic numbers.
516
517
        Split phonetic value on '-', run through _pnums_with_leading_space,
518
        and join with ' '
519
520
        :param str phonetic: a Beider-Morse phonetic encoding
521
        """
522 1
        phonetic_array = phonetic.split('-')  # for names with spaces in them
523 1
        result = ' '.join(
524
            [self._pnums_with_leading_space(i)[1:] for i in phonetic_array]
525
        )
526 1
        return result
527
528 1
    def _remove_dupes(self, phonetic):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
529
        """Remove duplicates from a phonetic encoding list.
530
531
        :param str phonetic: a Beider-Morse phonetic encoding
532
        """
533 1
        alt_string = phonetic
534 1
        alt_array = alt_string.split('|')
535
536 1
        result = '|'
537 1
        for i in range(len(alt_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
538 1
            alt = alt_array[i]
539 1
            if alt and '|' + alt + '|' not in result:
540 1
                result += alt + '|'
541
542 1
        return result[1:-1]  # remove leading and trailing |
543
544 1
    def _normalize_lang_attrs(self, text, strip):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
545
        """Remove embedded bracketed attributes.
546
547
        This (potentially) bitwise-ands bracketed attributes together and adds
548
        to the end.
549
        This is applied to a single alternative at a time -- not to a
550
        parenthisized list.
551
        It removes all embedded bracketed attributes, logically-ands them
552
        together, and places them at the end.
553
        However if strip is true, this can indeed remove embedded bracketed
554
        attributes from a parenthesized list.
555
556
        :param str text: a Beider-Morse phonetic encoding (in progress)
557
        :param bool strip: remove the bracketed attributes (and throw away)
558
        """
559 1
        uninitialized = -1  # all 1's
560 1
        attrib = uninitialized
561 1
        while '[' in text:
562 1
            bracket_start = text.find('[')
563 1
            bracket_end = text.find(']', bracket_start)
564 1
            if bracket_end == -1:
565 1
                raise ValueError(
566
                    'No closing square bracket: text=('
567
                    + text
568
                    + ') strip=('
569
                    + text_type(strip)
570
                    + ')'
571
                )
572 1
            attrib &= int(text[bracket_start + 1 : bracket_end])
573 1
            text = text[:bracket_start] + text[bracket_end + 1 :]
574
575 1
        if attrib == uninitialized or strip:
576 1
            return text
577 1
        elif attrib == 0:
578
            # means that the attributes were incompatible and there is no
579
            # alternative here
580 1
            return '[0]'
581 1
        return text + '[' + str(attrib) + ']'
582
583 1
    def _apply_rule_if_compat(self, phonetic, target, language_arg):
584
        """Apply a phonetic regex if compatible.
585
586
        tests for compatible language rules
587
588
        to do so, apply the rule, expand the results, and detect alternatives
589
            with incompatible attributes
590
591
        then drop each alternative that has incompatible attributes and keep
592
            those that are compatible
593
594
        if there are no compatible alternatives left, return false
595
596
        otherwise return the compatible alternatives
597
598
        apply the rule
599
600
        :param str phonetic: the Beider-Morse phonetic encoding (so far)
601
        :param str target: a proposed addition to the phonetic encoding
602
        :param int language_arg: an integer representing the target language of
603
            the phonetic encoding
604
        """
605 1
        candidate = phonetic + target
606 1
        if '[' not in candidate:  # no attributes so we need test no further
607 1
            return candidate
608
609
        # expand the result, converting incompatible attributes to [0]
610 1
        candidate = self._expand_alternates(candidate)
611 1
        candidate_array = candidate.split('|')
612
613
        # drop each alternative that has incompatible attributes
614 1
        candidate = ''
615 1
        found = False
616
617 1
        for i in range(len(candidate_array)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
618 1
            this_candidate = candidate_array[i]
619 1
            if language_arg != 1:
620 1
                this_candidate = self._normalize_lang_attrs(
621
                    this_candidate + '[' + str(language_arg) + ']', False
622
                )
623 1
            if this_candidate != '[0]':
624 1
                found = True
625 1
                if candidate:
626 1
                    candidate += '|'
627 1
                candidate += this_candidate
628
629
        # return false if no compatible alternatives remain
630 1
        if not found:
631 1
            return None
632
633
        # return the result of applying the rule
634 1
        if '|' in candidate:
635 1
            candidate = '(' + candidate + ')'
636 1
        return candidate
637
638 1
    def _language_index_from_code(self, code, name_mode):
        """Return the index value for a language code.

        This returns L_ANY if more than one code is specified or the code is
        out of bounds.

        :param int code: the language code to interpret
        :param str name_mode: the name mode of the algorithm: 'gen' (default),
            'ash' (Ashkenazi), or 'sep' (Sephardic)
        :returns: the code itself when it is a single in-range language,
            otherwise L_ANY
        """
        if code < 1:  # code out of range (too small)
            return L_ANY

        upper_bound = sum(
            _LANG_DICT[lang] for lang in BMDATA[name_mode]['languages']
        )
        if code > upper_bound:  # code out of range (too large)
            return L_ANY

        # more than one bit set: choice was more than one language; use any
        if code & (code - 1):
            return L_ANY
        return code
657
658 1
    def encode(
        self,
        word,
        language_arg=0,
        name_mode='gen',
        match_mode='approx',
        concat=False,
        filter_langs=False,
    ):
        """Return the Beider-Morse Phonetic Matching encoding(s) of a term.

        :param str word: the word to transform
        :param language_arg: the language(s) of the term. This may be a
            number, which is converted to int and used directly as the
            language code, or a string holding one language name or several
            names separated by commas (case-insensitive; whitespace around
            each name is ignored). If it is 0, empty, or selects no language
            compatible with the name mode, the language is auto-detected
            from the word. Supported names include:

                - 'any'
                - 'arabic'
                - 'cyrillic'
                - 'czech'
                - 'dutch'
                - 'english'
                - 'french'
                - 'german'
                - 'greek'
                - 'greeklatin'
                - 'hebrew'
                - 'hungarian'
                - 'italian'
                - 'latvian'
                - 'polish'
                - 'portuguese'
                - 'romanian'
                - 'russian'
                - 'spanish'
                - 'turkish'

        :type language_arg: int or str
        :param str name_mode: the name mode of the algorithm (only the first
            three characters are significant; unrecognized values fall back
            to 'gen'):

                - 'gen' -- general (default)
                - 'ash' -- Ashkenazi
                - 'sep' -- Sephardic

        :param str match_mode: matching mode: 'approx' or 'exact' (any value
            other than 'exact' is treated as 'approx')
        :param bool concat: concatenation mode
        :param bool filter_langs: if True, silently skip language names that
            are unknown or incompatible with the name mode instead of raising
        :returns: the BMPM value(s), as shown in the examples below
        :rtype: str
        :raises ValueError: if a language name is unknown or incompatible
            with the name mode and ``filter_langs`` is False

        >>> pe = BeiderMorse()
        >>> pe.encode('Christopher')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
        xristYfir xristopi xritopir xritopi xristofi xritofir xritofi
        tzristopir tzristofir zristopir zristopi zritopir zritopi zristofir
        zristofi zritofir zritofi'
        >>> pe.encode('Niall')
        'nial niol'
        >>> pe.encode('Smith')
        'zmit'
        >>> pe.encode('Schmidt')
        'zmit stzmit'

        >>> pe.encode('Christopher', language_arg='German')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
        xristYfir'
        >>> pe.encode('Christopher', language_arg='English')
        'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir
        xristafir xrQstafir'
        >>> pe.encode('Christopher', language_arg='German', name_mode='ash')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
        xristYfir'

        >>> pe.encode('Christopher', language_arg='German', match_mode='exact')
        'xriStopher xriStofer xristopher xristofer'
        """
        word = normalize('NFC', text_type(word.strip().lower()))

        name_mode = name_mode.strip().lower()[:3]
        if name_mode not in {'ash', 'sep', 'gen'}:
            name_mode = 'gen'

        if match_mode != 'exact':
            match_mode = 'approx'

        # Translate the supplied language_arg value into an integer
        # representing a set of languages
        all_langs = (
            sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages']) - 1
        )
        lang_choices = 0
        if isinstance(language_arg, (int, float, long)):
            lang_choices = int(language_arg)
        elif language_arg != '' and isinstance(language_arg, (text_type, str)):
            for lang in text_type(language_arg).lower().split(','):
                # Tolerate whitespace around names, e.g. 'german, english'
                lang = lang.strip()
                if lang in _LANG_DICT and (_LANG_DICT[lang] & all_langs):
                    # Language codes are combined as bit flags (they are
                    # tested with & above), so OR the flag in: adding would
                    # let a repeated name carry into another language's bit.
                    lang_choices |= _LANG_DICT[lang]
                elif not filter_langs:
                    raise ValueError(
                        "Unknown '{}' language: '{}'".format(name_mode, lang)
                    )

        # Language choices are either all incompatible with the name mode or
        # no choices were given, so try to autodetect
        if lang_choices == 0:
            language_arg = self._language(word, name_mode)
        else:
            language_arg = lang_choices

        # The rule tables are keyed by a single language index, while the
        # full (possibly multi-language) code is still passed to _phonetic.
        language_arg2 = self._language_index_from_code(language_arg, name_mode)

        rules = BMDATA[name_mode]['rules'][language_arg2]
        final_rules1 = BMDATA[name_mode][match_mode]['common']
        final_rules2 = BMDATA[name_mode][match_mode][language_arg2]

        result = self._phonetic(
            word,
            name_mode,
            rules,
            final_rules1,
            final_rules2,
            language_arg,
            concat,
        )
        return self._phonetic_numbers(result)
786
787
788 1
def bmpm(
    word,
    language_arg=0,
    name_mode='gen',
    match_mode='approx',
    concat=False,
    filter_langs=False,
):
    """Return the Beider-Morse Phonetic Matching encoding(s) of a term.

    This is a convenience wrapper: it creates a :py:class:`BeiderMorse`
    instance and forwards every argument to :py:meth:`BeiderMorse.encode`.

    :param str word: the word to transform
    :param language_arg: the language of the term (default 0); supported
        language names include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'latvian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s), as shown in the examples below
    :rtype: str

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    encoder = BeiderMorse()
    return encoder.encode(
        word,
        language_arg=language_arg,
        name_mode=name_mode,
        match_mode=match_mode,
        concat=concat,
        filter_langs=filter_langs,
    )
865
866
867
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()
871