Passed
Push — master ( c2a3b6...15a61d )
by Chris
01:00 queued 14s
created

abydos.phonetic._beider_morse.bmpm()   A

Complexity

Conditions 1

Size

Total Lines 98
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 15
dl 0
loc 98
ccs 5
cts 5
cp 1
rs 9.65
c 0
b 0
f 0
cc 1
nop 6
crap 1

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

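Commonly applied refactorings include Extract Method: move a coherent group of statements into a new, well-named method and call it from the original one.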

1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# This file is based on Alexander Beider and Stephen P. Morse's implementation
5
# of the Beider-Morse Phonetic Matching (BMPM) System, available at
6
# http://stevemorse.org/phonetics/bmpm.htm.
7
#
8
# Abydos is free software: you can redistribute it and/or modify
9
# it under the terms of the GNU General Public License as published by
10
# the Free Software Foundation, either version 3 of the License, or
11
# (at your option) any later version.
12
#
13
# Abydos is distributed in the hope that it will be useful,
14
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
# GNU General Public License for more details.
17
#
18
# You should have received a copy of the GNU General Public License
19
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
20
21
"""abydos.phonetic._beider_morse.
22
23 1
Beider-Morse Phonetic Matching (BMPM) algorithm
24
"""
25
26
from re import search
27
from unicodedata import normalize
28 1
29
from ._beider_morse_data import (
30
    BMDATA,
31
    L_ANY,
32
    L_ARABIC,
33
    L_CYRILLIC,
34
    L_CZECH,
35 1
    L_DUTCH,
36 1
    L_ENGLISH,
37
    L_FRENCH,
38 1
    L_GERMAN,
39
    L_GREEK,
40 1
    L_GREEKLATIN,
41 1
    L_HEBREW,
42
    L_HUNGARIAN,
43 1
    L_ITALIAN,
44
    L_LATVIAN,
45
    L_NONE,
46
    L_POLISH,
47
    L_PORTUGUESE,
48
    L_ROMANIAN,
49
    L_RUSSIAN,
50
    L_SPANISH,
51
    L_TURKISH,
52
)
53
from ._phonetic import _Phonetic
54
55
__all__ = ['BeiderMorse']
56
57
# Lookup from the language names accepted by ``BeiderMorse(language_arg=...)``
# to the corresponding language constants imported from _beider_morse_data.
_LANG_DICT = dict(
    any=L_ANY,
    arabic=L_ARABIC,
    cyrillic=L_CYRILLIC,
    czech=L_CZECH,
    dutch=L_DUTCH,
    english=L_ENGLISH,
    french=L_FRENCH,
    german=L_GERMAN,
    greek=L_GREEK,
    greeklatin=L_GREEKLATIN,
    hebrew=L_HEBREW,
    hungarian=L_HUNGARIAN,
    italian=L_ITALIAN,
    latvian=L_LATVIAN,
    polish=L_POLISH,
    portuguese=L_PORTUGUESE,
    romanian=L_ROMANIAN,
    russian=L_RUSSIAN,
    spanish=L_SPANISH,
    turkish=L_TURKISH,
)

# Generic mode: name prefixes (each ends with its separator character) that
# are split off when they occur at the start of a name.
BMDATA['gen']['discards'] = {
    'da ', 'dal ', 'de ', 'del ', 'dela ', 'de la ', 'della ', 'des ',
    'di ', 'do ', 'dos ', 'du ', 'van ', 'von ', "d'",
}  # fmt: skip

# Sephardic mode: whole words that are dropped from a name.
BMDATA['sep']['discards'] = {
    'al', 'el', 'da', 'dal', 'de', 'del', 'dela', 'de la', 'della', 'des',
    'di', 'do', 'dos', 'du', 'van', 'von',
}  # fmt: skip

# Ashkenazic mode: words dropped when they are the first word of a name.
BMDATA['ash']['discards'] = {'bar', 'ben', 'da', 'de', 'van', 'von'}

# Positions of the fields inside each rule tuple:
# (pattern, left context, right context, phonetic replacement)
_PATTERN_POS, _LCONTEXT_POS, _RCONTEXT_POS, _PHONETIC_POS = range(4)
122
123
124
class BeiderMorse(_Phonetic):
125
    """Beider-Morse Phonetic Matching.
126
127
    The Beider-Morse Phonetic Matching algorithm is described in
128
    :cite:`Beider:2008`.
129
    The reference implementation is licensed under GPLv3.
130
131
    .. versionadded:: 0.3.6
132
    """
133 1
134
    def _language(self, name, name_mode):
        """Guess which languages a name may belong to.

        Every language rule of the name mode whose regular expression is
        found in the (stripped, lower-cased) name either restricts the
        remaining languages to the rule's language mask or removes that mask
        from them.

        Parameters
        ----------
        name : str
            The term to guess the language of
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)

        Returns
        -------
        int
            Language ID (``L_ANY`` if every language has been ruled out)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        normalized = name.strip().lower()
        mode_data = BMDATA[name_mode]
        lang_rules = mode_data['language_rules']
        # Sum of the flags of every language known to this name mode; used
        # as the modulus that keeps a complemented mask non-negative.
        modulus = sum(_LANG_DICT[lang] for lang in mode_data['languages'])

        remaining = modulus - 1
        for regex, lang_mask, accept in lang_rules:
            if search(regex, normalized) is None:
                continue
            remaining &= lang_mask if accept else (~lang_mask) % modulus

        return L_ANY if remaining == L_NONE else remaining
172
173
    def _redo_language(
        self, term, name_mode, rules, final_rules1, final_rules2, concat
    ):
        """Re-detect the language of a term, then encode it.

        Used for the individual parts of a split multi-word term: the
        language is guessed for this part alone and the result is passed on
        to the phonetic encoder.

        Parameters
        ----------
        term : str
            The term to encode via Beider-Morse
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)
        rules : tuple
            The set of initial phonetic transform regexps
        final_rules1 : tuple
            The common set of final phonetic transform regexps
        final_rules2 : tuple
            The specific set of final phonetic transform regexps
        concat : bool
            A flag to indicate concatenation

        Returns
        -------
        str
            A Beider-Morse phonetic code


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return self._phonetic(
            term,
            name_mode,
            rules,
            final_rules1,
            final_rules2,
            language_arg=self._language(term, name_mode),
            concat=concat,
        )
217
218
    def _phonetic(
219
        self,
220
        term,
221
        name_mode,
222
        rules,
223
        final_rules1,
224
        final_rules2,
225 1
        language_arg=0,
226 1
        concat=False,
227
    ):
228
        """Return the Beider-Morse encoding(s) of a term.
229
230
        Parameters
231
        ----------
232
        term : str
233
            The term to encode via Beider-Morse
234
        name_mode : str
235
            The name mode of the algorithm: ``gen`` (default),
236 1
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)
237
        rules : tuple
238
            The set of initial phonetic transform regexps
239
        final_rules1 : tuple
240
            The common set of final phonetic transform regexps
241
        final_rules2 : tuple
242
            The specific set of final phonetic transform regexps
243
        language_arg : int
244
            The language of the term
245
        concat : bool
246
            A flag to indicate concatenation
247
248
        Returns
249
        -------
250
        str
251
            A Beider-Morse phonetic code
252
253
254
        .. versionadded:: 0.1.0
255
        .. versionchanged:: 0.3.6
256
            Encapsulated in class
257
258
        """
259
        term = term.replace('-', ' ').strip()
260
261
        if name_mode == 'gen':  # generic case
262
            # discard and concatenate certain words if at the start of the name
263
            for pfx in BMDATA['gen']['discards']:
264
                if term.startswith(pfx):
265
                    remainder = term[len(pfx) :]
266
                    combined = pfx[:-1] + remainder
267
                    result = (
268
                        self._redo_language(
269
                            remainder,
270
                            name_mode,
271
                            rules,
272
                            final_rules1,
273
                            final_rules2,
274
                            concat,
275
                        )
276
                        + '-'
277 1
                        + self._redo_language(
278
                            combined,
279 1
                            name_mode,
280
                            rules,
281 1
                            final_rules1,
282 1
                            final_rules2,
283 1
                            concat,
284 1
                        )
285 1
                    )
286
                    return result
287
288
        words = (
289
            term.split()
290
        )  # create array of the individual words in the name
291
        words2 = []
292
293
        if name_mode == 'sep':  # Sephardic case
294
            # for each word in the name, delete portions of word preceding
295
            # apostrophe
296
            # ex: d'avila d'aguilar --> avila aguilar
297
            # also discard certain words in the name
298
299
            # note that we can never get a match on "de la" because we are
300
            # checking single words below
301
            # this is a bug, but I won't try to fix it now
302
303
            for word in words:
304 1
                word = word[word.rfind("'") + 1 :]
305
                if word not in BMDATA['sep']['discards']:
306 1
                    words2.append(word)
307
308
        elif name_mode == 'ash':  # Ashkenazic case
309 1
            # discard certain words if at the start of the name
310
            if len(words) > 1 and words[0] in BMDATA['ash']['discards']:
311 1
                words2 = words[1:]
312
            else:
313
                words2 = list(words)
314
        else:
315
            words2 = list(words)
316
317
        if concat:
318
            # concatenate the separate words of a multi-word name
319
            # (normally used for exact matches)
320
            term = ' '.join(words2)
321 1
        elif len(words2) == 1:  # not a multi-word name
322 1
            term = words2[0]
323 1
        else:
324 1
            # encode each word in a multi-word name separately
325
            # (normally used for approx matches)
326 1
            result = '-'.join(
327
                [
328 1
                    self._redo_language(
329 1
                        w, name_mode, rules, final_rules1, final_rules2, concat
330
                    )
331 1
                    for w in words2
332
                ]
333 1
            )
334
            return result
335 1
336
        term_length = len(term)
337
338 1
        # apply language rules to map to phonetic alphabet
339 1
        phonetic = ''
340 1
        skip = 0
341
        for i in range(term_length):
342
            if skip:
343
                skip -= 1
344 1
                continue
345
            found = False
346
            for rule in rules:
347
                pattern = rule[_PATTERN_POS]
348
                pattern_length = len(pattern)
349
                lcontext = rule[_LCONTEXT_POS]
350
                rcontext = rule[_RCONTEXT_POS]
351
352 1
                # check to see if next sequence in input matches the string in
353
                # the rule
354 1
                if (pattern_length > term_length - i) or (
355
                    term[i : i + pattern_length] != pattern
356
                ):  # no match
357 1
                    continue
358 1
359 1
                right = '^' + rcontext
360 1
                left = lcontext + '$'
361 1
362 1
                # check that right context is satisfied
363 1
                if rcontext != '':
364 1
                    if not search(right, term[i + pattern_length :]):
365 1
                        continue
366 1
367 1
                # check that left context is satisfied
368 1
                if lcontext != '':
369
                    if not search(left, term[:i]):
370
                        continue
371
372 1
                # check for incompatible attributes
373
                candidate = self._apply_rule_if_compat(
374
                    phonetic, rule[_PHONETIC_POS], language_arg
375 1
                )
376
                # The below condition shouldn't ever be false
377 1
                if candidate is not None:  # pragma: no branch
378 1
                    phonetic = candidate
379
                    found = True
380
                    break
381 1
382 1
            if (
383 1
                not found
384
            ):  # character in name that is not in table -- e.g., space
385
                pattern_length = 1
386 1
            skip = pattern_length - 1
0 ignored issues
show
introduced by
The variable pattern_length does not seem to be defined for all execution paths.
Loading history...
387 1
388 1
        # apply final rules on phonetic-alphabet,
389
        # doing a substitution of certain characters
390
        phonetic = self._apply_final_rules(
391 1
            phonetic, final_rules1, language_arg, False
392
        )  # apply common rules
393
        # final_rules1 are the common approx rules,
394
        # final_rules2 are approx rules for specific language
395 1
        phonetic = self._apply_final_rules(
396 1
            phonetic, final_rules2, language_arg, True
397 1
        )  # apply lang specific rules
398 1
399
        return phonetic
400 1
401
    def _apply_final_rules(self, phonetic, final_rules, language_arg, strip):
402
        """Apply a set of final rules to the phonetic encoding.
403 1
404 1
        Parameters
405
        ----------
406
        phonetic : str
407
            The term to which to apply the final rules
408 1
        final_rules : tuple
409
            The set of final phonetic transform regexps
410
        language_arg : int
411
            An integer representing the target language of the phonetic
412
            encoding
413 1
        strip : bool
414
            Flag to indicate whether to normalize the language attributes
415
416
        Returns
417 1
        -------
418
        str
419 1
            A Beider-Morse phonetic code
420
421
422
        .. versionadded:: 0.1.0
423
        .. versionchanged:: 0.3.6
424
            Encapsulated in class
425
426
        """
427
        # optimization to save time
428
        if not final_rules:
429
            return phonetic
430
431
        # expand the result
432
        phonetic = self._expand_alternates(phonetic)
433
        phonetic_array = phonetic.split('|')
434
435
        for k in range(len(phonetic_array)):
436
            phonetic = phonetic_array[k]
437
            phonetic2 = ''
438
            phoneticx = self._normalize_lang_attrs(phonetic, True)
439
440
            i = 0
441
            while i < len(phonetic):
442
                found = False
443
444
                if phonetic[i] == '[':  # skip over language attribute
445
                    attrib_start = i
446 1
                    i += 1
447 1
                    while True:
448
                        if phonetic[i] == ']':
449
                            i += 1
450 1
                            phonetic2 += phonetic[attrib_start:i]
451 1
                            break
452
                        i += 1
453 1
                    continue
454 1
455 1
                for rule in final_rules:
456 1
                    pattern = rule[_PATTERN_POS]
457
                    pattern_length = len(pattern)
458 1
                    lcontext = rule[_LCONTEXT_POS]
459 1
                    rcontext = rule[_RCONTEXT_POS]
460 1
461
                    right = '^' + rcontext
462 1
                    left = lcontext + '$'
463 1
464 1
                    # check to see if next sequence in phonetic matches the
465 1
                    # string in the rule
466 1
                    if (pattern_length > len(phoneticx) - i) or phoneticx[
467 1
                        i : i + pattern_length
468 1
                    ] != pattern:
469 1
                        continue
470 1
471 1
                    # check that right context is satisfied
472
                    if rcontext != '':
473 1
                        if not search(right, phoneticx[i + pattern_length :]):
474 1
                            continue
475 1
476 1
                    # check that left context is satisfied
477 1
                    if lcontext != '':
478
                        if not search(left, phoneticx[:i]):
479 1
                            continue
480 1
481
                    # check for incompatible attributes
482
                    candidate = self._apply_rule_if_compat(
483
                        phonetic2, rule[_PHONETIC_POS], language_arg
484 1
                    )
485
                    # The below condition shouldn't ever be false
486
                    if candidate is not None:  # pragma: no branch
487 1
                        phonetic2 = candidate
488
                        found = True
489
                        break
490 1
491 1
                if not found:
492 1
                    # character in name for which there is no substitution in
493
                    # the table
494
                    phonetic2 += phonetic[i]
495 1
                    pattern_length = 1
496 1
497 1
                i += pattern_length
0 ignored issues
show
introduced by
The variable pattern_length does not seem to be defined for all execution paths.
Loading history...
498
499
            phonetic_array[k] = self._expand_alternates(phonetic2)
500 1
501
        phonetic = '|'.join(phonetic_array)
502
        if strip:
503
            phonetic = self._normalize_lang_attrs(phonetic, True)
504 1
505 1
        if '|' in phonetic:
506 1
            phonetic = '(' + self._remove_dupes(phonetic) + ')'
507 1
508
        return phonetic
509 1
510
    def _phonetic_number(self, phonetic):
511
        """Remove bracketed text from the end of a string.
512 1
513 1
        Parameters
514
        ----------
515 1
        phonetic : str
516
            A Beider-Morse phonetic encoding
517 1
518
        Returns
519 1
        -------
520 1
        str
521 1
            A Beider-Morse phonetic code
522
523 1
524 1
        .. versionadded:: 0.1.0
525
        .. versionchanged:: 0.3.6
526 1
            Encapsulated in class
527
528 1
        """
529
        if '[' in phonetic:
530
            return phonetic[: phonetic.find('[')]
531
532
        return phonetic  # experimental !!!!
533
534
    def _expand_alternates(self, phonetic):
535
        r"""Expand phonetic alternates separated by \|s.
536
537
        Parameters
538
        ----------
539
        phonetic : str
540
            A Beider-Morse phonetic encoding
541
542
        Returns
543
        -------
544
        str
545
            A Beider-Morse phonetic code
546
547 1
548 1
        .. versionadded:: 0.1.0
549
        .. versionchanged:: 0.3.6
550 1
            Encapsulated in class
551
552 1
        """
553
        alt_start = phonetic.find('(')
554
        if alt_start == -1:
555
            return self._normalize_lang_attrs(phonetic, False)
556
557
        prefix = phonetic[:alt_start]
558
        alt_start += 1  # get past the (
559
        alt_end = phonetic.find(')', alt_start)
560
        alt_string = phonetic[alt_start:alt_end]
561
        alt_end += 1  # get past the )
562
        suffix = phonetic[alt_end:]
563
        alt_array = alt_string.split('|')
564
        result = ''
565
566
        for i in range(len(alt_array)):
567
            alt = alt_array[i]
568
            alternate = self._expand_alternates(prefix + alt + suffix)
569
            if alternate != '' and alternate != '[0]':
570
                if result != '':
571 1
                    result += '|'
572 1
                result += alternate
573 1
574
        return result
575 1
576 1
    def _pnums_with_leading_space(self, phonetic):
577 1
        """Join prefixes & suffixes in cases of alternate phonetic values.
578 1
579 1
        Parameters
580 1
        ----------
581 1
        phonetic : str
582 1
            A Beider-Morse phonetic encoding
583
584 1
        Returns
585 1
        -------
586 1
        str
587 1
            A Beider-Morse phonetic code
588 1
589 1
590 1
        .. versionadded:: 0.1.0
591
        .. versionchanged:: 0.3.6
592 1
            Encapsulated in class
593
594 1
        """
595
        alt_start = phonetic.find('(')
596
        if alt_start == -1:
597
            return ' ' + self._phonetic_number(phonetic)
598
599
        prefix = phonetic[:alt_start]
600
        alt_start += 1  # get past the (
601
        alt_end = phonetic.find(')', alt_start)
602
        alt_string = phonetic[alt_start:alt_end]
603
        alt_end += 1  # get past the )
604
        suffix = phonetic[alt_end:]
605
        alt_array = alt_string.split('|')
606
        result = ''
607
        for alt in alt_array:
608
            result += self._pnums_with_leading_space(prefix + alt + suffix)
609
610
        return result
611
612
    def _phonetic_numbers(self, phonetic):
613 1
        """Prepare & join phonetic numbers.
614 1
615 1
        Split phonetic value on '-', run through _pnums_with_leading_space,
616
        and join with ' '
617 1
618 1
        Parameters
619 1
        ----------
620 1
        phonetic : str
621 1
            A Beider-Morse phonetic encoding
622 1
623 1
        Returns
624 1
        -------
625 1
        str
626 1
            A Beider-Morse phonetic code
627
628 1
629
        .. versionadded:: 0.1.0
630 1
        .. versionchanged:: 0.3.6
631
            Encapsulated in class
632
633
        """
634
        phonetic_array = phonetic.split('-')  # for names with spaces in them
635
        result = ' '.join(
636
            [self._pnums_with_leading_space(i)[1:] for i in phonetic_array]
637
        )
638
        return result
639
640
    def _remove_dupes(self, phonetic):
641
        """Remove duplicates from a phonetic encoding list.
642
643
        Parameters
644
        ----------
645
        phonetic : str
646
            A Beider-Morse phonetic encoding
647
648
        Returns
649
        -------
650
        str
651
            A Beider-Morse phonetic code
652 1
653 1
654
        .. versionadded:: 0.1.0
655
        .. versionchanged:: 0.3.6
656 1
            Encapsulated in class
657
658 1
        """
659
        alt_string = phonetic
660
        alt_array = alt_string.split('|')
661
662
        result = '|'
663
        for i in range(len(alt_array)):
664
            alt = alt_array[i]
665
            if alt and '|' + alt + '|' not in result:
666
                result += alt + '|'
667
668
        return result[1:-1]  # remove leading and trailing |
669
670
    def _normalize_lang_attrs(self, text, strip):
671
        """Remove embedded bracketed attributes.
672
673
        This (potentially) bitwise-ands bracketed attributes together and adds
674
        to the end.
675
        This is applied to a single alternative at a time -- not to a
676
        parenthesized list.
677 1
        It removes all embedded bracketed attributes, logically-ands them
678 1
        together, and places them at the end.
679
        However if strip is true, this can indeed remove embedded bracketed
680 1
        attributes from a parenthesized list.
681 1
682 1
        Parameters
683 1
        ----------
684 1
        text : str
685
            A Beider-Morse phonetic encoding (in progress)
686 1
        strip : bool
687
            Remove the bracketed attributes (and throw away)
688 1
689
        Returns
690
        -------
691
        str
692
            A Beider-Morse phonetic code
693
694
        Raises
695
        ------
696
        ValueError
697
            No closing square bracket
698
699
700
        .. versionadded:: 0.1.0
701
        .. versionchanged:: 0.3.6
702
            Encapsulated in class
703
704
        """
705
        uninitialized = -1  # all 1's
706
        attrib = uninitialized
707
        while '[' in text:
708
            bracket_start = text.find('[')
709
            bracket_end = text.find(']', bracket_start)
710
            if bracket_end == -1:
711
                raise ValueError(
712
                    'No closing square bracket: text=('
713
                    + text
714
                    + ') strip=('
715
                    + str(strip)
716
                    + ')'
717
                )
718
            attrib &= int(text[bracket_start + 1 : bracket_end])
719
            text = text[:bracket_start] + text[bracket_end + 1 :]
720
721
        if attrib == uninitialized or strip:
722
            return text
723 1
        elif attrib == 0:
724 1
            # means that the attributes were incompatible and there is no
725 1
            # alternative here
726 1
            return '[0]'
727 1
        return text + '[' + str(attrib) + ']'
728 1
729 1
    def _apply_rule_if_compat(self, phonetic, target, language_arg):
730
        """Apply a phonetic regex if compatible.
731
732
        tests for compatible language rules
733
734
        to do so, apply the rule, expand the results, and detect alternatives
735
            with incompatible attributes
736 1
737 1
        then drop each alternative that has incompatible attributes and keep
738
            those that are compatible
739 1
740 1
        if there are no compatible alternatives left, return false
741 1
742
        otherwise return the compatible alternatives
743
744 1
        apply the rule
745 1
746
        Parameters
747 1
        ----------
748
        phonetic : str
749
            The Beider-Morse phonetic encoding (so far)
750
        target : str
751
            A proposed addition to the phonetic encoding
752
        language_arg : int
753
            An integer representing the target language of the phonetic
754
            encoding
755
756
        Returns
757
        -------
758
        str
759
            A candidate encoding
760
761
762
        .. versionadded:: 0.1.0
763
        .. versionchanged:: 0.3.6
764
            Encapsulated in class
765
766
        """
767
        candidate = phonetic + target
768
        if '[' not in candidate:  # no attributes so we need test no further
769
            return candidate
770
771
        # expand the result, converting incompatible attributes to [0]
772
        candidate = self._expand_alternates(candidate)
773
        candidate_array = candidate.split('|')
774
775
        # drop each alternative that has incompatible attributes
776
        candidate = ''
777
        found = False
778
779
        for i in range(len(candidate_array)):
780
            this_candidate = candidate_array[i]
781
            if language_arg != 1:
782
                this_candidate = self._normalize_lang_attrs(
783
                    this_candidate + '[' + str(language_arg) + ']', False
784
                )
785 1
            if this_candidate != '[0]':
786 1
                found = True
787 1
                if candidate:
788
                    candidate += '|'
789
                candidate += this_candidate
790 1
791 1
        # return false if no compatible alternatives remain
792
        if not found:
793
            return None
794 1
795 1
        # return the result of applying the rule
796
        if '|' in candidate:
797 1
            candidate = '(' + candidate + ')'
798 1
        return candidate
799 1
800 1
    def _language_index_from_code(self, code, name_mode):
        """Return the index value for a language code.

        This returns l_any if more than one code is specified or the code is
        out of bounds.

        Parameters
        ----------
        code : int
            The language code to interpret
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)

        Returns
        -------
        int
            Language code index


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if code < 1:  # code out of range (too small)
            return L_ANY

        upper_bound = sum(
            _LANG_DICT[lang] for lang in BMDATA[name_mode]['languages']
        )
        if code > upper_bound:  # code out of range (too large)
            return L_ANY

        # a power of two has exactly one bit set, i.e. names one language;
        # anything else was a choice of several languages, so use any
        is_single_language = (code & (code - 1)) == 0
        return code if is_single_language else L_ANY
834
835
    def __init__(
        self,
        language_arg=0,
        name_mode='gen',
        match_mode='approx',
        concat=False,
        filter_langs=False,
    ):
        """Initialize BeiderMorse instance.

        Parameters
        ----------
        language_arg : str or int
            The language of the term, given either as a numeric language
            code or as a comma-separated string of language names;
            supported names include:

                - ``any``
                - ``arabic``
                - ``cyrillic``
                - ``czech``
                - ``dutch``
                - ``english``
                - ``french``
                - ``german``
                - ``greek``
                - ``greeklatin``
                - ``hebrew``
                - ``hungarian``
                - ``italian``
                - ``latvian``
                - ``polish``
                - ``portuguese``
                - ``romanian``
                - ``russian``
                - ``spanish``
                - ``turkish``

            A value of 0 or ``''`` leaves the language to be autodetected
            when encoding.

        name_mode : str
            The name mode of the algorithm (only the first three characters
            are significant; unknown values fall back to ``gen``):

                - ``gen`` -- general (default)
                - ``ash`` -- Ashkenazi
                - ``sep`` -- Sephardic

        match_mode : str
            Matching mode: ``approx`` or ``exact`` (anything other than
            ``exact`` is treated as ``approx``)
        concat : bool
            Concatenation mode
        filter_langs : bool
            Filter out incompatible languages instead of raising an error
            for them

        Raises
        ------
        ValueError
            Unknown language (or language not available in the name mode)
            while ``filter_langs`` is False


        .. versionadded:: 0.4.0

        """
        name_mode = name_mode.strip().lower()[:3]
        if name_mode not in {'ash', 'sep', 'gen'}:
            name_mode = 'gen'

        if match_mode != 'exact':
            match_mode = 'approx'

        # Translate the supplied language_arg value into an integer
        # representing a set of languages
        all_langs = (
            sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages']) - 1
        )
        lang_choices = 0
        if isinstance(language_arg, (int, float)):
            # Keep the numeric code in the local that is stored below;
            # assigning it to self._lang_choices here would be overwritten
            # by the final assignment and the choice silently lost.
            lang_choices = int(language_arg)
        elif language_arg != '' and isinstance(language_arg, str):
            for lang in language_arg.lower().split(','):
                if lang in _LANG_DICT and (_LANG_DICT[lang] & all_langs):
                    lang_choices += _LANG_DICT[lang]
                elif not filter_langs:
                    raise ValueError(
                        "Unknown '" + name_mode + "' language: '" + lang + "'"
                    )

        self._language_arg = language_arg
        self._name_mode = name_mode
        self._match_mode = match_mode
        self._concat = concat
        self._filter_langs = filter_langs
        self._lang_choices = lang_choices
919 1
920 1
    def encode(self, word):
        """Return the Beider-Morse Phonetic Matching encoding(s) of a term.

        The word is stripped, lower-cased and NFC-normalized. If no language
        choice was configured, the language is autodetected from the word.
        The alternative encodings are returned space-separated in a single
        string.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Beider-Morse phonetic value(s)

        Examples
        --------
        >>> pe = BeiderMorse()
        >>> pe.encode('Christopher')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir
        xristofir xristYfir xristopi xritopir xritopi xristofi xritofir
        xritofi tzristopir tzristofir zristopir zristopi zritopir zritopi
        zristofir zristofi zritofir zritofi'
        >>> pe.encode('Niall')
        'nial niol'
        >>> pe.encode('Smith')
        'zmit'
        >>> pe.encode('Schmidt')
        'zmit stzmit'

        >>> BeiderMorse(language_arg='German').encode('Christopher')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir
        xristofir xristYfir'
        >>> BeiderMorse(language_arg='English').encode('Christopher')
        'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir
        xristafir xrQstafir'
        >>> BeiderMorse(language_arg='German',
        ... name_mode='ash').encode('Christopher')
        'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir
        xristofir xristYfir'

        >>> BeiderMorse(language_arg='German',
        ... match_mode='exact').encode('Christopher')
        'xriStopher xriStofer xristopher xristofer'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        word = normalize('NFC', word.strip().lower())

        # Language choices are either all incompatible with the name mode or
        # no choices were given, so try to autodetect
        language_arg = (
            self._language(word, self._name_mode)
            if self._lang_choices == 0
            else self._lang_choices
        )
        rule_key = self._language_index_from_code(
            language_arg, self._name_mode
        )

        mode_data = BMDATA[self._name_mode]
        rules = mode_data['rules'][rule_key]
        match_rules = mode_data[self._match_mode]
        final_rules1 = match_rules['common']
        final_rules2 = match_rules[rule_key]

        encoded = self._phonetic(
            word,
            self._name_mode,
            rules,
            final_rules1,
            final_rules2,
            language_arg,
            self._concat,
        )
        return self._phonetic_numbers(encoded)
1002
1003
1004 1
if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings.
    from doctest import testmod

    testmod()
1008