Completed
Branch master (78a222)
by Chris
14:36
created

abydos.phonetic._bmpm._bm_language()   A

Complexity

Conditions 5

Size

Total Lines 21
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 14
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 14
dl 0
loc 21
ccs 14
cts 14
cp 1
rs 9.2333
c 0
b 0
f 0
cc 5
nop 2
crap 5
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# This file is based on Alexander Beider and Stephen P. Morse's implementation
7
# of the Beider-Morse Phonetic Matching (BMPM) System, available at
8
# http://stevemorse.org/phonetics/bmpm.htm.
9
#
10
# Abydos is free software: you can redistribute it and/or modify
11
# it under the terms of the GNU General Public License as published by
12
# the Free Software Foundation, either version 3 of the License, or
13
# (at your option) any later version.
14
#
15
# Abydos is distributed in the hope that it will be useful,
16
# but WITHOUT ANY WARRANTY; without even the implied warranty of
17
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
# GNU General Public License for more details.
19
#
20
# You should have received a copy of the GNU General Public License
21
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
22
23 1
"""abydos.phonetic._bmpm.
24
25
The phonetic._bmpm module implements the Beider-Morse Phonetic Matching (BMPM)
26
algorithm.
27
"""
28
29 1
from __future__ import unicode_literals
30
31 1
from re import search
32 1
from unicodedata import normalize
33
34 1
from six import PY3, text_type
35 1
from six.moves import range
36
37 1
from ._bmdata import (
38
    BMDATA,
39
    L_ANY,
40
    L_ARABIC,
41
    L_CYRILLIC,
42
    L_CZECH,
43
    L_DUTCH,
44
    L_ENGLISH,
45
    L_FRENCH,
46
    L_GERMAN,
47
    L_GREEK,
48
    L_GREEKLATIN,
49
    L_HEBREW,
50
    L_HUNGARIAN,
51
    L_ITALIAN,
52
    L_LATVIAN,
53
    L_NONE,
54
    L_POLISH,
55
    L_PORTUGUESE,
56
    L_ROMANIAN,
57
    L_RUSSIAN,
58
    L_SPANISH,
59
    L_TURKISH,
60
)
61
62 1
__all__ = ['bmpm']

if PY3:
    # Python 3 has no ``long`` type; alias it so the numeric isinstance
    # check in bmpm() can name it on either major version.
    long = int  # pylint: disable=invalid-name

# Lower-case language name -> L_* constant imported from ._bmdata.
_LANG_DICT = {
    'any': L_ANY,
    'arabic': L_ARABIC,
    'cyrillic': L_CYRILLIC,
    'czech': L_CZECH,
    'dutch': L_DUTCH,
    'english': L_ENGLISH,
    'french': L_FRENCH,
    'german': L_GERMAN,
    'greek': L_GREEK,
    'greeklatin': L_GREEKLATIN,
    'hebrew': L_HEBREW,
    'hungarian': L_HUNGARIAN,
    'italian': L_ITALIAN,
    'latvian': L_LATVIAN,
    'polish': L_POLISH,
    'portuguese': L_PORTUGUESE,
    'romanian': L_ROMANIAN,
    'russian': L_RUSSIAN,
    'spanish': L_SPANISH,
    'turkish': L_TURKISH,
}

# Generic mode: name prefixes (note the trailing space / apostrophe) that
# _bm_phonetic() splits off when they start the term.
BMDATA['gen']['discards'] = {
    'da ',
    'dal ',
    'de ',
    'del ',
    'dela ',
    'de la ',
    'della ',
    'des ',
    'di ',
    'do ',
    'dos ',
    'du ',
    'van ',
    'von ',
    'd\'',
}
# Sephardic mode: whole words that _bm_phonetic() drops from the name.
BMDATA['sep']['discards'] = {
    'al',
    'el',
    'da',
    'dal',
    'de',
    'del',
    'dela',
    'de la',
    'della',
    'des',
    'di',
    'do',
    'dos',
    'du',
    'van',
    'von',
}
# Ashkenazic mode: words that _bm_phonetic() drops when they lead the name.
BMDATA['ash']['discards'] = {'bar', 'ben', 'da', 'de', 'van', 'von'}

# format of rules array
_PATTERN_POS = 0
_LCONTEXT_POS = 1
_RCONTEXT_POS = 2
_PHONETIC_POS = 3
132
133
134 1
def _bm_language(name, name_mode):
    """Return the best guess language ID for the word and language choices.

    Every entry of ``BMDATA[name_mode]['language_rules']`` is unpacked as
    ``(letters, languages, accept)``: ``letters`` is a regular expression
    searched for in the name and ``languages`` an integer bitmask. When the
    expression matches, the remaining choices are narrowed to ``languages``
    if ``accept`` is true, otherwise to its complement.

    :param str name: the term to guess the language of
    :param str name_mode: the name mode of the algorithm: 'gen' (default),
        'ash' (Ashkenazi), or 'sep' (Sephardic)
    :returns: the integer mask of languages still possible, or L_ANY when
        the rules eliminated every choice
    :rtype: int
    """
    name = name.strip().lower()
    rules = BMDATA[name_mode]['language_rules']
    all_langs = sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages']) - 1
    choices_remaining = all_langs
    for letters, languages, accept in rules:
        if search(letters, name) is None:
            continue
        if accept:
            choices_remaining &= languages
        else:
            # ~languages is negative for a Python int; the modulo maps it
            # back into the range 0..all_langs before masking.
            choices_remaining &= (~languages) % (all_langs + 1)
    if choices_remaining == L_NONE:
        choices_remaining = L_ANY
    return choices_remaining
155
156
157 1
def _bm_redo_language(
    term, name_mode, rules, final_rules1, final_rules2, concat
):
    """Reassess the language of the terms and call the phonetic encoder.

    Uses a split multi-word term: the language is re-detected for ``term``
    alone with _bm_language() and the result is passed on to _bm_phonetic().

    :param str term: the term to encode via Beider-Morse
    :param str name_mode: the name mode of the algorithm: 'gen' (default),
        'ash' (Ashkenazi), or 'sep' (Sephardic)
    :param tuple rules: the set of initial phonetic transform regexps
    :param tuple final_rules1: the common set of final phonetic transform
        regexps
    :param tuple final_rules2: the specific set of final phonetic transform
        regexps
    :param bool concat: a flag to indicate concatenation
    :returns: whatever _bm_phonetic() returns for the term
    """
    language_arg = _bm_language(term, name_mode)
    return _bm_phonetic(
        term,
        name_mode,
        rules,
        final_rules1,
        final_rules2,
        language_arg,
        concat,
    )
184
185
186 1
def _bm_phonetic(
    term,
    name_mode,
    rules,
    final_rules1,
    final_rules2,
    language_arg=0,
    concat=False,
):
    """Return the Beider-Morse encoding(s) of a term.

    Multi-word names (and, in 'gen' mode, names starting with one of the
    discard prefixes) are split and each part is re-encoded through
    _bm_redo_language(); the partial results are joined with '-'.

    :param str term: the term to encode via Beider-Morse
    :param str name_mode: the name mode of the algorithm: 'gen' (default),
        'ash' (Ashkenazi), or 'sep' (Sephardic)
    :param tuple rules: the set of initial phonetic transform regexps
    :param tuple final_rules1: the common set of final phonetic transform
        regexps
    :param tuple final_rules2: the specific set of final phonetic transform
        regexps
    :param int language_arg: an integer representing the target language of the
        phonetic encoding
    :param bool concat: a flag to indicate concatenation
    :returns: the phonetic encoding of the term
    :rtype: str
    """
    term = term.replace('-', ' ').strip()

    if name_mode == 'gen':  # generic case
        # discard and concatenate certain words if at the start of the name
        #
        # The discards are stored in a set, and both 'de ' and 'de la ' can
        # match the same name; iterating in sorted order makes the choice
        # deterministic (the shorter prefix wins) instead of depending on
        # the set's iteration order.
        for pfx in sorted(BMDATA['gen']['discards']):
            if term.startswith(pfx):
                remainder = term[len(pfx):]
                # pfx[:-1] drops the prefix's final character (the
                # trailing space or apostrophe) before gluing it back on
                combined = pfx[:-1] + remainder
                return (
                    _bm_redo_language(
                        remainder,
                        name_mode,
                        rules,
                        final_rules1,
                        final_rules2,
                        concat,
                    )
                    + '-'
                    + _bm_redo_language(
                        combined,
                        name_mode,
                        rules,
                        final_rules1,
                        final_rules2,
                        concat,
                    )
                )

    words = term.split()  # create array of the individual words in the name
    words2 = []

    if name_mode == 'sep':  # Sephardic case
        # for each word in the name, delete portions of word preceding
        # apostrophe
        # ex: d'avila d'aguilar --> avila aguilar
        # also discard certain words in the name

        # note that we can never get a match on "de la" because we are checking
        # single words below
        # this is a bug, but I won't try to fix it now

        for word in words:
            word = word[word.rfind('\'') + 1:]
            if word not in BMDATA['sep']['discards']:
                words2.append(word)

    elif name_mode == 'ash':  # Ashkenazic case
        # discard certain words if at the start of the name
        if len(words) > 1 and words[0] in BMDATA['ash']['discards']:
            words2 = words[1:]
        else:
            words2 = list(words)
    else:
        words2 = list(words)

    if concat:
        # treat the separate words of a multi-word name as a single term
        # (normally used for exact matches)
        term = ' '.join(words2)
    elif len(words2) == 1:  # not a multi-word name
        term = words2[0]
    else:
        # encode each word in a multi-word name separately
        # (normally used for approx matches)
        return '-'.join(
            _bm_redo_language(
                w, name_mode, rules, final_rules1, final_rules2, concat
            )
            for w in words2
        )

    term_length = len(term)

    # apply language rules to map to phonetic alphabet
    phonetic = ''
    skip = 0  # characters still covered by the previously matched pattern
    for i in range(term_length):
        if skip:
            skip -= 1
            continue
        found = False
        for rule in rules:
            pattern = rule[_PATTERN_POS]
            pattern_length = len(pattern)
            lcontext = rule[_LCONTEXT_POS]
            rcontext = rule[_RCONTEXT_POS]

            # check to see if next sequence in input matches the string in the
            # rule
            if not term.startswith(pattern, i):  # no match
                continue

            # check that right context is satisfied
            if rcontext != '' and not search(
                '^' + rcontext, term[i + pattern_length:]
            ):
                continue

            # check that left context is satisfied
            if lcontext != '' and not search(lcontext + '$', term[:i]):
                continue

            # check for incompatible attributes
            candidate = _bm_apply_rule_if_compat(
                phonetic, rule[_PHONETIC_POS], language_arg
            )
            # The below condition shouldn't ever be false
            if candidate is not None:  # pragma: no branch
                phonetic = candidate
                found = True
                break

        if not found:  # character in name that is not in table -- e.g., space
            pattern_length = 1
        skip = pattern_length - 1

    # apply final rules on phonetic-alphabet,
    # doing a substitution of certain characters
    phonetic = _bm_apply_final_rules(
        phonetic, final_rules1, language_arg, False
    )  # apply common rules
    # final_rules1 are the common approx rules,
    # final_rules2 are approx rules for specific language
    phonetic = _bm_apply_final_rules(
        phonetic, final_rules2, language_arg, True
    )  # apply lang specific rules

    return phonetic
346
347
348 1
def _bm_apply_final_rules(phonetic, final_rules, language_arg, strip):
    """Apply a set of final rules to the phonetic encoding.

    The encoding is expanded into its '|'-separated alternatives, the rules
    are applied to each alternative in turn, and the alternatives are joined
    again. A result holding several alternatives is de-duplicated and
    wrapped in parentheses.

    :param str phonetic: the term to which to apply the final rules
    :param tuple final_rules: the set of final phonetic transform regexps
    :param int language_arg: an integer representing the target language of the
        phonetic encoding
    :param bool strip: flag to indicate whether to normalize the language
        attributes
    :returns: the transformed phonetic encoding
    :rtype: str
    """
    # optimization to save time
    if not final_rules:
        return phonetic

    # expand the result
    phonetic_array = _bm_expand_alternates(phonetic).split('|')

    for k, phonetic in enumerate(phonetic_array):
        phonetic2 = ''
        # same alternative with its bracketed attributes removed; the rule
        # patterns and contexts are matched against this copy
        phoneticx = _bm_normalize_lang_attrs(phonetic, True)

        i = 0
        while i < len(phonetic):
            if phonetic[i] == '[':  # skip over language attribute
                attrib_end = phonetic.index(']', i) + 1
                phonetic2 += phonetic[i:attrib_end]
                i = attrib_end
                continue

            found = False
            for rule in final_rules:
                pattern = rule[_PATTERN_POS]
                pattern_length = len(pattern)
                lcontext = rule[_LCONTEXT_POS]
                rcontext = rule[_RCONTEXT_POS]

                # check to see if next sequence in phonetic matches the string
                # in the rule
                if not phoneticx.startswith(pattern, i):
                    continue

                # check that right context is satisfied
                if rcontext != '' and not search(
                    '^' + rcontext, phoneticx[i + pattern_length:]
                ):
                    continue

                # check that left context is satisfied
                if lcontext != '' and not search(
                    lcontext + '$', phoneticx[:i]
                ):
                    continue

                # check for incompatible attributes
                candidate = _bm_apply_rule_if_compat(
                    phonetic2, rule[_PHONETIC_POS], language_arg
                )
                # The below condition shouldn't ever be false
                if candidate is not None:  # pragma: no branch
                    phonetic2 = candidate
                    found = True
                    break

            if not found:
                # character in name for which there is no substitution in the
                # table
                phonetic2 += phonetic[i]
                pattern_length = 1

            i += pattern_length

        phonetic_array[k] = _bm_expand_alternates(phonetic2)

    phonetic = '|'.join(phonetic_array)
    if strip:
        phonetic = _bm_normalize_lang_attrs(phonetic, True)

    if '|' in phonetic:
        phonetic = '(' + _bm_remove_dupes(phonetic) + ')'

    return phonetic
440
441
442 1
def _bm_phonetic_number(phonetic):
    """Remove bracketed text from the end of a string.

    Everything from the first '[' onwards is dropped; a string without a
    '[' is returned unchanged.

    :param str phonetic: a Beider-Morse phonetic encoding
    :returns: the part of the encoding before the first '['
    :rtype: str
    """
    # partition() yields the whole string as the head when '[' is absent
    return phonetic.partition('[')[0]  # experimental !!!!
451
452
453 1
def _bm_expand_alternates(phonetic):
    """Expand phonetic alternates separated by |s.

    The first parenthesized group is split on '|'; every alternative is
    substituted into the surrounding text and expanded recursively. Empty
    expansions and the incompatibility marker '[0]' are dropped, and the
    remaining expansions are joined with '|'. Text without parentheses is
    passed through _bm_normalize_lang_attrs().

    :param str phonetic: a Beider-Morse phonetic encoding
    :returns: the '|'-separated expansions
    :rtype: str
    :raises ValueError: if an opening parenthesis has no closing one
    """
    alt_start = phonetic.find('(')
    if alt_start == -1:
        return _bm_normalize_lang_attrs(phonetic, False)

    alt_end = phonetic.find(')', alt_start + 1)
    if alt_end == -1:
        # Without this guard the whole input would be reused as the suffix
        # and the recursion below would never terminate.
        raise ValueError(
            'No closing parenthesis: phonetic=(' + phonetic + ')'
        )

    prefix = phonetic[:alt_start]
    suffix = phonetic[alt_end + 1:]  # text past the )

    expansions = []
    for alt in phonetic[alt_start + 1:alt_end].split('|'):
        alternate = _bm_expand_alternates(prefix + alt + suffix)
        if alternate != '' and alternate != '[0]':
            expansions.append(alternate)

    return '|'.join(expansions)
480
481
482 1
def _bm_pnums_with_leading_space(phonetic):
    """Join prefixes & suffixes in cases of alternate phonetic values.

    Each alternative of the first parenthesized group is substituted into
    the surrounding text and processed recursively. Every fully expanded
    value is passed through _bm_phonetic_number() and prefixed with a
    single space, so the result is a run of ' value' items.

    :param str phonetic: a Beider-Morse phonetic encoding
    :returns: the expanded values, each preceded by a space
    :rtype: str
    :raises ValueError: if an opening parenthesis has no closing one
    """
    alt_start = phonetic.find('(')
    if alt_start == -1:
        return ' ' + _bm_phonetic_number(phonetic)

    alt_end = phonetic.find(')', alt_start + 1)
    if alt_end == -1:
        # Without this guard the whole input would be reused as the suffix
        # and the recursion below would never terminate.
        raise ValueError(
            'No closing parenthesis: phonetic=(' + phonetic + ')'
        )

    prefix = phonetic[:alt_start]
    suffix = phonetic[alt_end + 1:]  # text past the )

    return ''.join(
        _bm_pnums_with_leading_space(prefix + alt + suffix)
        for alt in phonetic[alt_start + 1:alt_end].split('|')
    )
503
504
505 1
def _bm_phonetic_numbers(phonetic):
    """Prepare & join phonetic numbers.

    Split phonetic value on '-', run through _bm_pnums_with_leading_space,
    and join with ' '

    :param str phonetic: a Beider-Morse phonetic encoding
    :returns: the space-separated phonetic values
    :rtype: str
    """
    # '-' separates the encodings of the individual words of a name with
    # spaces in it; [1:] removes the leading space the helper adds.
    return ' '.join(
        _bm_pnums_with_leading_space(part)[1:]
        for part in phonetic.split('-')
    )
518
519
520 1
def _bm_remove_dupes(phonetic):
    """Remove duplicates from a phonetic encoding list.

    The '|'-separated alternatives are filtered so that each distinct,
    non-empty alternative appears once, in order of first occurrence.

    :param str phonetic: a Beider-Morse phonetic encoding
    :returns: the de-duplicated, '|'-joined alternatives
    :rtype: str
    """
    seen = set()
    unique = []
    for alt in phonetic.split('|'):
        if alt and alt not in seen:
            seen.add(alt)
            unique.append(alt)
    return '|'.join(unique)
535
536
537 1
def _bm_normalize_lang_attrs(text, strip):
    """Remove embedded bracketed attributes.

    This (potentially) bitwise-ands bracketed attributes together and adds to
    the end.
    This is applied to a single alternative at a time -- not to a
    parenthesized list.
    It removes all embedded bracketed attributes, logically-ands them together,
    and places them at the end.
    However if strip is true, this can indeed remove embedded bracketed
    attributes from a parenthesized list.

    :param str text: a Beider-Morse phonetic encoding (in progress)
    :param bool strip: remove the bracketed attributes (and throw away)
    :returns: the text without embedded attributes; unless ``strip`` is true
        the combined attribute is appended as '[n]', and '[0]' alone is
        returned when the attributes and to zero
    :rtype: str
    :raises ValueError: if a '[' has no matching ']'
    """
    uninitialized = -1  # all 1's
    attrib = uninitialized

    bracket_start = text.find('[')
    while bracket_start != -1:
        bracket_end = text.find(']', bracket_start)
        if bracket_end == -1:
            raise ValueError(
                'No closing square bracket: text=('
                + text
                + ') strip=('
                + str(strip)
                + ')'
            )
        attrib &= int(text[bracket_start + 1:bracket_end])
        text = text[:bracket_start] + text[bracket_end + 1:]
        # nothing before bracket_start contains a '[', so resume from there
        bracket_start = text.find('[', bracket_start)

    if attrib == uninitialized or strip:
        return text
    if attrib == 0:
        # means that the attributes were incompatible and there is no
        # alternative here
        return '[0]'
    return text + '[' + str(attrib) + ']'
575
576
577 1
def _bm_apply_rule_if_compat(phonetic, target, language_arg):
    """Apply a phonetic regex if compatible.

    Tests for compatible language rules: the rule is applied by appending
    ``target`` to ``phonetic``, the result is expanded into alternatives,
    and every alternative whose language attributes are incompatible
    (i.e. normalize to '[0]') is dropped.

    If no compatible alternative is left, None is returned; otherwise the
    compatible alternatives are returned, wrapped in parentheses when there
    is more than one.

    :param str phonetic: the Beider-Morse phonetic encoding (so far)
    :param str target: a proposed addition to the phonetic encoding
    :param int language_arg: an integer representing the target language of
        the phonetic encoding
    :returns: the extended encoding, or None if the rule is incompatible
    :rtype: str or None
    """
    candidate = phonetic + target
    if '[' not in candidate:  # no attributes so we need test no further
        return candidate

    # expand the result, converting incompatible attributes to [0]
    alternatives = _bm_expand_alternates(candidate).split('|')

    # drop each alternative that has incompatible attributes
    candidate = ''
    found = False

    for this_candidate in alternatives:
        if language_arg != 1:
            # and the alternative's attributes with the requested language
            this_candidate = _bm_normalize_lang_attrs(
                this_candidate + '[' + str(language_arg) + ']', False
            )
        if this_candidate != '[0]':
            found = True
            if candidate:
                candidate += '|'
            candidate += this_candidate

    # return None if no compatible alternatives remain
    if not found:
        return None

    # return the result of applying the rule
    if '|' in candidate:
        candidate = '(' + candidate + ')'
    return candidate
631
632
633 1
def _bm_language_index_from_code(code, name_mode):
    """Return the index value for a language code.

    This returns L_ANY if more than one code is specified or the code is out
    of bounds.

    :param int code: the language code to interpret
    :param str name_mode: the name mode of the algorithm: 'gen' (default),
        'ash' (Ashkenazi), or 'sep' (Sephardic)
    :returns: ``code`` itself when it is a single in-range language bit,
        otherwise L_ANY
    :rtype: int
    """
    upper_bound = sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages'])
    if code < 1 or code > upper_bound:  # code out of range
        return L_ANY
    # code & (code - 1) is non-zero exactly when more than one bit is set
    if (code & (code - 1)) != 0:  # choice was more than one language; use any
        return L_ANY
    return code
650
651
652 1
def bmpm(
    word,
    language_arg=0,
    name_mode='gen',
    match_mode='approx',
    concat=False,
    filter_langs=False,
):
    """Return the Beider-Morse Phonetic Matching encoding(s) of a term.

    The Beider-Morse Phonetic Matching algorithm is described in
    :cite:`Beider:2008`.
    The reference implementation is licensed under GPLv3.

    :param str word: the word to transform
    :param language_arg: the language of the term, either as an integer
        language mask or as a string of comma-separated language names
        (case-insensitive; surrounding whitespace is ignored). A value of 0
        or '' triggers automatic language detection. Supported names
        include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'latvian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages; when false,
        a language name that is unknown or not available in the chosen name
        mode raises ValueError instead of being skipped
    :returns: the BMPM value(s), separated by spaces
    :rtype: str
    :raises ValueError: for an unknown or incompatible language name when
        ``filter_langs`` is false

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    word = normalize('NFC', text_type(word.strip().lower()))

    # only the first three letters of the name mode are significant;
    # anything unrecognized falls back to the generic mode
    name_mode = name_mode.strip().lower()[:3]
    if name_mode not in {'ash', 'sep', 'gen'}:
        name_mode = 'gen'

    if match_mode != 'exact':
        match_mode = 'approx'

    # Translate the supplied language_arg value into an integer representing
    # a set of languages
    all_langs = sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages']) - 1
    lang_choices = 0
    if isinstance(language_arg, (int, float, long)):
        lang_choices = int(language_arg)
    elif language_arg != '' and isinstance(language_arg, (text_type, str)):
        for lang in text_type(language_arg).lower().split(','):
            lang = lang.strip()
            if lang in _LANG_DICT and (_LANG_DICT[lang] & all_langs):
                # or the bit in rather than adding it, so that naming the
                # same language twice cannot carry into another bit
                lang_choices |= _LANG_DICT[lang]
            elif not filter_langs:
                raise ValueError(
                    'Unknown \'' + name_mode + '\' language: \'' + lang + '\''
                )

    # Language choices are either all incompatible with the name mode or
    # no choices were given, so try to autodetect
    if lang_choices == 0:
        language_arg = _bm_language(word, name_mode)
    else:
        language_arg = lang_choices
    language_arg2 = _bm_language_index_from_code(language_arg, name_mode)

    rules = BMDATA[name_mode]['rules'][language_arg2]
    final_rules1 = BMDATA[name_mode][match_mode]['common']
    final_rules2 = BMDATA[name_mode][match_mode][language_arg2]

    result = _bm_phonetic(
        word,
        name_mode,
        rules,
        final_rules1,
        final_rules2,
        language_arg,
        concat,
    )
    return _bm_phonetic_numbers(result)
776
777
778
if __name__ == '__main__':
    import doctest

    # Several expected outputs in the docstrings are wrapped over more than
    # one line, so whitespace has to be normalized for them to compare
    # equal to the single-line values actually returned.
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
782