BeiderMorse.__init__()   C
last analyzed

Complexity

Conditions 10

Size

Total Lines 84
Code Lines 30

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 21
CRAP Score 10

Importance

Changes 0
Metric Value
eloc 30
dl 0
loc 84
ccs 21
cts 21
cp 1
rs 5.9999
c 0
b 0
f 0
cc 10
nop 6
crap 10

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex code such as abydos.phonetic._beider_morse.BeiderMorse.__init__() often does a lot of different things, which usually means the enclosing class does too. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# This file is based on Alexander Beider and Stephen P. Morse's implementation
5
# of the Beider-Morse Phonetic Matching (BMPM) System, available at
6
# http://stevemorse.org/phonetics/bmpm.htm.
7
#
8
# Abydos is free software: you can redistribute it and/or modify
9
# it under the terms of the GNU General Public License as published by
10
# the Free Software Foundation, either version 3 of the License, or
11
# (at your option) any later version.
12
#
13
# Abydos is distributed in the hope that it will be useful,
14
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
# GNU General Public License for more details.
17
#
18
# You should have received a copy of the GNU General Public License
19
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
20
21
"""abydos.phonetic._beider_morse.
22
23 1
Beider-Morse Phonetic Matching (BMPM) algorithm
24
"""
25
26
from re import search
27
from typing import Any, Optional, Tuple, Union
28 1
from unicodedata import normalize
29
30
from ._beider_morse_data import (
31
    BMDATA,
32
    L_ANY,
33
    L_ARABIC,
34
    L_CYRILLIC,
35 1
    L_CZECH,
36 1
    L_DUTCH,
37
    L_ENGLISH,
38 1
    L_FRENCH,
39
    L_GERMAN,
40 1
    L_GREEK,
41 1
    L_GREEKLATIN,
42
    L_HEBREW,
43 1
    L_HUNGARIAN,
44
    L_ITALIAN,
45
    L_LATVIAN,
46
    L_NONE,
47
    L_POLISH,
48
    L_PORTUGUESE,
49
    L_ROMANIAN,
50
    L_RUSSIAN,
51
    L_SPANISH,
52
    L_TURKISH,
53
)
54
from ._phonetic import _Phonetic
55
56
__all__ = ['BeiderMorse']
57
58
# Maps the language names accepted by BeiderMorse (the comma-separated
# ``language_arg`` strings) to the numeric L_* codes imported from
# ._beider_morse_data. The codes are summed and bitwise-and'ed elsewhere in
# this module, so they are presumably distinct bit flags -- confirm against
# _beider_morse_data.
_LANG_DICT = {
    'any': L_ANY,
    'arabic': L_ARABIC,
    'cyrillic': L_CYRILLIC,
    'czech': L_CZECH,
    'dutch': L_DUTCH,
    'english': L_ENGLISH,
    'french': L_FRENCH,
    'german': L_GERMAN,
    'greek': L_GREEK,
    'greeklatin': L_GREEKLATIN,
    'hebrew': L_HEBREW,
    'hungarian': L_HUNGARIAN,
    'italian': L_ITALIAN,
    'latvian': L_LATVIAN,
    'polish': L_POLISH,
    'portuguese': L_PORTUGUESE,
    'romanian': L_ROMANIAN,
    'russian': L_RUSSIAN,
    'spanish': L_SPANISH,
    'turkish': L_TURKISH,
}
80
81
# Name particles handled specially by BeiderMorse._phonetic, per name mode.
#
# 'gen': matched as *prefixes* of the whole term (hence the trailing space,
# or the apostrophe in "d'"); a matching term is encoded both without the
# particle and with the particle fused onto the remainder.
BMDATA['gen']['discards'] = {
    'da ',
    'dal ',
    'de ',
    'del ',
    'dela ',
    'de la ',
    'della ',
    'des ',
    'di ',
    'do ',
    'dos ',
    'du ',
    'van ',
    'von ',
    "d'",
}
# 'sep': matched against individual whitespace-separated words, which are
# dropped wherever they occur in the name. Because only single words are
# tested, the two-word entry 'de la' can never match.
BMDATA['sep']['discards'] = {
    'al',
    'el',
    'da',
    'dal',
    'de',
    'del',
    'dela',
    'de la',
    'della',
    'des',
    'di',
    'do',
    'dos',
    'du',
    'van',
    'von',
}
# 'ash': only the first word of a multi-word name is tested and dropped.
BMDATA['ash']['discards'] = {'bar', 'ben', 'da', 'de', 'van', 'von'}
117
118
# format of rules array
119
_PATTERN_POS = 0
120
_LCONTEXT_POS = 1
121
_RCONTEXT_POS = 2
122
_PHONETIC_POS = 3
123
124
125
class BeiderMorse(_Phonetic):
126
    """Beider-Morse Phonetic Matching.
127
128
    The Beider-Morse Phonetic Matching algorithm is described in
129
    :cite:`Beider:2008`.
130
    The reference implementation is licensed under GPLv3.
131
132
    .. versionadded:: 0.3.6
133 1
    """
134
135
    def _language(self, name: str, name_mode: str) -> int:
        """Guess the set of languages a name may belong to.

        Starting from every language of the name mode, each language rule
        whose regex matches the name either restricts the candidates to the
        rule's languages (accepting rule) or removes the rule's languages
        (rejecting rule).

        Parameters
        ----------
        name : str
            The term to guess the language of
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)

        Returns
        -------
        int
            Language ID (``L_ANY`` if no candidate language remains)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        cleaned = name.strip().lower()
        mode_data = BMDATA[name_mode]
        language_rules = mode_data['language_rules']
        # sum of the codes of all languages of this mode, minus one
        lang_mask = (
            sum(_LANG_DICT[lang] for lang in mode_data['languages']) - 1
        )

        remaining = lang_mask
        for letters, languages, accept in language_rules:
            if search(letters, cleaned) is None:
                continue
            if accept:
                remaining &= languages
            else:
                # complement of the rule's languages, limited to the mask
                remaining &= (~languages) % (lang_mask + 1)

        return L_ANY if remaining == L_NONE else remaining
173
174 1
    def _redo_language(
        self,
        term: str,
        name_mode: str,
        rules: Tuple[Any, ...],
        final_rules1: Tuple[Any, ...],
        final_rules2: Tuple[Any, ...],
        concat: bool,
    ) -> str:
        """Re-detect the language of a term, then encode it.

        Used for the individual pieces of a split multi-word term: the
        language is guessed afresh for the piece and passed on to
        :meth:`_phonetic`.

        Parameters
        ----------
        term : str
            The term to encode via Beider-Morse
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)
        rules : tuple
            The set of initial phonetic transform regexps
        final_rules1 : tuple
            The common set of final phonetic transform regexps
        final_rules2 : tuple
            The specific set of final phonetic transform regexps
        concat : bool
            A flag to indicate concatenation

        Returns
        -------
        str
            A Beider-Morse phonetic code


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        detected = self._language(term, name_mode)
        return self._phonetic(
            term,
            name_mode,
            rules,
            final_rules1,
            final_rules2,
            language_arg=detected,
            concat=concat,
        )
224
225 1
    def _phonetic(
        self,
        term: str,
        name_mode: str,
        rules: Tuple[Any, ...],
        final_rules1: Tuple[Any, ...],
        final_rules2: Tuple[Any, ...],
        language_arg: int = 0,
        concat: bool = False,
    ) -> str:
        """Return the Beider-Morse encoding(s) of a term.

        Hyphens are treated as spaces. Depending on the name mode, leading
        particles or individual words are discarded first; multi-word names
        are then either encoded as one string (``concat``) or word by word,
        with the per-word codes joined by ``-``.

        Parameters
        ----------
        term : str
            The term to encode via Beider-Morse
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)
        rules : tuple
            The set of initial phonetic transform regexps
        final_rules1 : tuple
            The common set of final phonetic transform regexps
        final_rules2 : tuple
            The specific set of final phonetic transform regexps
        language_arg : int
            The language of the term
        concat : bool
            A flag to indicate concatenation

        Returns
        -------
        str
            A Beider-Morse phonetic code


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        term = term.replace('-', ' ').strip()

        if name_mode == 'gen':  # generic case
            # discard and concatenate certain words if at the start of the
            # name
            #
            # The discards live in a set, whose iteration order can differ
            # from one interpreter run to the next, and both 'de ' and
            # 'de la ' can match the same term. Trying the shorter prefixes
            # first makes the encoding reproducible.
            for pfx in sorted(BMDATA['gen']['discards'], key=len):
                if term.startswith(pfx):
                    remainder = term[len(pfx) :]
                    # particle (minus its trailing space or apostrophe)
                    # fused onto the rest of the name
                    combined = pfx[:-1] + remainder
                    result = (
                        self._redo_language(
                            remainder,
                            name_mode,
                            rules,
                            final_rules1,
                            final_rules2,
                            concat,
                        )
                        + '-'
                        + self._redo_language(
                            combined,
                            name_mode,
                            rules,
                            final_rules1,
                            final_rules2,
                            concat,
                        )
                    )
                    return result

        # create array of the individual words in the name
        words = term.split()
        words2 = []

        if name_mode == 'sep':  # Sephardic case
            # for each word in the name, delete portions of word preceding
            # apostrophe
            # ex: d'avila d'aguilar --> avila aguilar
            # also discard certain words in the name

            # note that we can never get a match on "de la" because we are
            # checking single words below
            # this is a bug, but I won't try to fix it now

            for word in words:
                word = word[word.rfind("'") + 1 :]
                if word not in BMDATA['sep']['discards']:
                    words2.append(word)

        elif name_mode == 'ash':  # Ashkenazic case
            # discard certain words if at the start of the name
            if len(words) > 1 and words[0] in BMDATA['ash']['discards']:
                words2 = words[1:]
            else:
                words2 = list(words)
        else:
            words2 = list(words)

        if concat:
            # concatenate the separate words of a multi-word name
            # (normally used for exact matches)
            term = ' '.join(words2)
        elif len(words2) == 1:  # not a multi-word name
            term = words2[0]
        else:
            # encode each word in a multi-word name separately
            # (normally used for approx matches)
            result = '-'.join(
                [
                    self._redo_language(
                        w, name_mode, rules, final_rules1, final_rules2, concat
                    )
                    for w in words2
                ]
            )
            return result

        term_length = len(term)

        # apply language rules to map to phonetic alphabet
        phonetic = ''
        skip = 0
        for i in range(term_length):
            if skip:
                # still inside the text consumed by the previous rule
                skip -= 1
                continue
            found = False
            for rule in rules:
                pattern = rule[_PATTERN_POS]
                pattern_length = len(pattern)
                lcontext = rule[_LCONTEXT_POS]
                rcontext = rule[_RCONTEXT_POS]

                # check to see if next sequence in input matches the string in
                # the rule
                if (pattern_length > term_length - i) or (
                    term[i : i + pattern_length] != pattern
                ):  # no match
                    continue

                right = '^' + rcontext
                left = lcontext + '$'

                # check that right context is satisfied
                if rcontext != '':
                    if not search(right, term[i + pattern_length :]):
                        continue

                # check that left context is satisfied
                if lcontext != '':
                    if not search(left, term[:i]):
                        continue

                # check for incompatible attributes
                candidate = self._apply_rule_if_compat(
                    phonetic, rule[_PHONETIC_POS], language_arg
                )
                # The below condition shouldn't ever be false
                if candidate is not None:  # pragma: no branch
                    phonetic = candidate
                    found = True
                    break

            if (
                not found
            ):  # character in name that is not in table -- e.g., space
                pattern_length = 1
            # pattern_length is always bound here: either by the rule that
            # matched (the loop broke right after it) or by the fallback above
            skip = pattern_length - 1

        # apply final rules on phonetic-alphabet,
        # doing a substitution of certain characters
        phonetic = self._apply_final_rules(
            phonetic, final_rules1, language_arg, False
        )  # apply common rules
        # final_rules1 are the common approx rules,
        # final_rules2 are approx rules for specific language
        phonetic = self._apply_final_rules(
            phonetic, final_rules2, language_arg, True
        )  # apply lang specific rules

        return phonetic
407
408 1
    def _apply_final_rules(
409
        self,
410
        phonetic: str,
411
        final_rules: Tuple[Any, ...],
412
        language_arg: int,
413 1
        strip: bool,
414
    ) -> str:
415
        """Apply a set of final rules to the phonetic encoding.
416
417 1
        Parameters
418
        ----------
419 1
        phonetic : str
420
            The term to which to apply the final rules
421
        final_rules : tuple
422
            The set of final phonetic transform regexps
423
        language_arg : int
424
            An integer representing the target language of the phonetic
425
            encoding
426
        strip : bool
427
            Flag to indicate whether to normalize the language attributes
428
429
        Returns
430
        -------
431
        str
432
            A Beider-Morse phonetic code
433
434
435
        .. versionadded:: 0.1.0
436
        .. versionchanged:: 0.3.6
437
            Encapsulated in class
438
439
        """
440
        # optimization to save time
441
        if not final_rules:
442
            return phonetic
443
444
        # expand the result
445
        phonetic = self._expand_alternates(phonetic)
446 1
        phonetic_array = phonetic.split('|')
447 1
448
        for k in range(len(phonetic_array)):
449
            phonetic = phonetic_array[k]
450 1
            phonetic2 = ''
451 1
            phoneticx = self._normalize_lang_attrs(phonetic, True)
452
453 1
            i = 0
454 1
            while i < len(phonetic):
455 1
                found = False
456 1
457
                if phonetic[i] == '[':  # skip over language attribute
458 1
                    attrib_start = i
459 1
                    i += 1
460 1
                    while True:
461
                        if phonetic[i] == ']':
462 1
                            i += 1
463 1
                            phonetic2 += phonetic[attrib_start:i]
464 1
                            break
465 1
                        i += 1
466 1
                    continue
467 1
468 1
                for rule in final_rules:
469 1
                    pattern = rule[_PATTERN_POS]
470 1
                    pattern_length = len(pattern)
471 1
                    lcontext = rule[_LCONTEXT_POS]
472
                    rcontext = rule[_RCONTEXT_POS]
473 1
474 1
                    right = '^' + rcontext
475 1
                    left = lcontext + '$'
476 1
477 1
                    # check to see if next sequence in phonetic matches the
478
                    # string in the rule
479 1
                    if (pattern_length > len(phoneticx) - i) or phoneticx[
480 1
                        i : i + pattern_length
481
                    ] != pattern:
482
                        continue
483
484 1
                    # check that right context is satisfied
485
                    if rcontext != '':
486
                        if not search(right, phoneticx[i + pattern_length :]):
487 1
                            continue
488
489
                    # check that left context is satisfied
490 1
                    if lcontext != '':
491 1
                        if not search(left, phoneticx[:i]):
492 1
                            continue
493
494
                    # check for incompatible attributes
495 1
                    candidate = self._apply_rule_if_compat(
496 1
                        phonetic2, rule[_PHONETIC_POS], language_arg
497 1
                    )
498
                    # The below condition shouldn't ever be false
499
                    if candidate is not None:  # pragma: no branch
500 1
                        phonetic2 = candidate
501
                        found = True
502
                        break
503
504 1
                if not found:
505 1
                    # character in name for which there is no substitution in
506 1
                    # the table
507 1
                    phonetic2 += phonetic[i]
508
                    pattern_length = 1
509 1
510
                i += pattern_length
0 ignored issues
show
introduced by
The variable pattern_length does not seem to be defined for all execution paths.
Loading history...
511
512 1
            phonetic_array[k] = self._expand_alternates(phonetic2)
513 1
514
        phonetic = '|'.join(phonetic_array)
515 1
        if strip:
516
            phonetic = self._normalize_lang_attrs(phonetic, True)
517 1
518
        if '|' in phonetic:
519 1
            phonetic = '(' + self._remove_dupes(phonetic) + ')'
520 1
521 1
        return phonetic
522
523 1
    def _phonetic_number(self, phonetic: str) -> str:
524 1
        """Remove bracketed text from the end of a string.
525
526 1
        Parameters
527
        ----------
528 1
        phonetic : str
529
            A Beider-Morse phonetic encoding
530
531
        Returns
532
        -------
533
        str
534
            A Beider-Morse phonetic code
535
536
537
        .. versionadded:: 0.1.0
538
        .. versionchanged:: 0.3.6
539
            Encapsulated in class
540
541
        """
542
        if '[' in phonetic:
543
            return phonetic[: phonetic.find('[')]
544
545
        return phonetic  # experimental !!!!
546
547 1
    def _expand_alternates(self, phonetic: str) -> str:
548 1
        r"""Expand phonetic alternates separated by \|s.
549
550 1
        Parameters
551
        ----------
552 1
        phonetic : str
553
            A Beider-Morse phonetic encoding
554
555
        Returns
556
        -------
557
        str
558
            A Beider-Morse phonetic code
559
560
561
        .. versionadded:: 0.1.0
562
        .. versionchanged:: 0.3.6
563
            Encapsulated in class
564
565
        """
566
        alt_start = phonetic.find('(')
567
        if alt_start == -1:
568
            return self._normalize_lang_attrs(phonetic, False)
569
570
        prefix = phonetic[:alt_start]
571 1
        alt_start += 1  # get past the (
572 1
        alt_end = phonetic.find(')', alt_start)
573 1
        alt_string = phonetic[alt_start:alt_end]
574
        alt_end += 1  # get past the )
575 1
        suffix = phonetic[alt_end:]
576 1
        alt_array = alt_string.split('|')
577 1
        result = ''
578 1
579 1
        for i in range(len(alt_array)):
580 1
            alt = alt_array[i]
581 1
            alternate = self._expand_alternates(prefix + alt + suffix)
582 1
            if alternate != '' and alternate != '[0]':
583
                if result != '':
584 1
                    result += '|'
585 1
                result += alternate
586 1
587 1
        return result
588 1
589 1
    def _pnums_with_leading_space(self, phonetic: str) -> str:
590 1
        """Join prefixes & suffixes in cases of alternate phonetic values.
591
592 1
        Parameters
593
        ----------
594 1
        phonetic : str
595
            A Beider-Morse phonetic encoding
596
597
        Returns
598
        -------
599
        str
600
            A Beider-Morse phonetic code
601
602
603
        .. versionadded:: 0.1.0
604
        .. versionchanged:: 0.3.6
605
            Encapsulated in class
606
607
        """
608
        alt_start = phonetic.find('(')
609
        if alt_start == -1:
610
            return ' ' + self._phonetic_number(phonetic)
611
612
        prefix = phonetic[:alt_start]
613 1
        alt_start += 1  # get past the (
614 1
        alt_end = phonetic.find(')', alt_start)
615 1
        alt_string = phonetic[alt_start:alt_end]
616
        alt_end += 1  # get past the )
617 1
        suffix = phonetic[alt_end:]
618 1
        alt_array = alt_string.split('|')
619 1
        result = ''
620 1
        for alt in alt_array:
621 1
            result += self._pnums_with_leading_space(prefix + alt + suffix)
622 1
623 1
        return result
624 1
625 1
    def _phonetic_numbers(self, phonetic: str) -> str:
626 1
        """Prepare & join phonetic numbers.
627
628 1
        Split phonetic value on '-', run through _pnums_with_leading_space,
629
        and join with ' '
630 1
631
        Parameters
632
        ----------
633
        phonetic : str
634
            A Beider-Morse phonetic encoding
635
636
        Returns
637
        -------
638
        str
639
            A Beider-Morse phonetic code
640
641
642
        .. versionadded:: 0.1.0
643
        .. versionchanged:: 0.3.6
644
            Encapsulated in class
645
646
        """
647
        phonetic_array = phonetic.split('-')  # for names with spaces in them
648
        result = ' '.join(
649
            [self._pnums_with_leading_space(i)[1:] for i in phonetic_array]
650
        )
651
        return result
652 1
653 1
    def _remove_dupes(self, phonetic: str) -> str:
654
        """Remove duplicates from a phonetic encoding list.
655
656 1
        Parameters
657
        ----------
658 1
        phonetic : str
659
            A Beider-Morse phonetic encoding
660
661
        Returns
662
        -------
663
        str
664
            A Beider-Morse phonetic code
665
666
667
        .. versionadded:: 0.1.0
668
        .. versionchanged:: 0.3.6
669
            Encapsulated in class
670
671
        """
672
        alt_string = phonetic
673
        alt_array = alt_string.split('|')
674
675
        result = '|'
676
        for i in range(len(alt_array)):
677 1
            alt = alt_array[i]
678 1
            if alt and '|' + alt + '|' not in result:
679
                result += alt + '|'
680 1
681 1
        return result[1:-1]  # remove leading and trailing |
682 1
683 1
    def _normalize_lang_attrs(self, text: str, strip: bool) -> str:
684 1
        """Remove embedded bracketed attributes.
685
686 1
        This (potentially) bitwise-ands bracketed attributes together and adds
687
        to the end.
688 1
        This is applied to a single alternative at a time -- not to a
689
        parenthesized list.
690
        It removes all embedded bracketed attributes, logically-ands them
691
        together, and places them at the end.
692
        However if strip is true, this can indeed remove embedded bracketed
693
        attributes from a parenthesized list.
694
695
        Parameters
696
        ----------
697
        text : str
698
            A Beider-Morse phonetic encoding (in progress)
699
        strip : bool
700
            Remove the bracketed attributes (and throw away)
701
702
        Returns
703
        -------
704
        str
705
            A Beider-Morse phonetic code
706
707
        Raises
708
        ------
709
        ValueError
710
            No closing square bracket
711
712
713
        .. versionadded:: 0.1.0
714
        .. versionchanged:: 0.3.6
715
            Encapsulated in class
716
717
        """
718
        uninitialized = -1  # all 1's
719
        attrib = uninitialized
720
        while '[' in text:
721
            bracket_start = text.find('[')
722
            bracket_end = text.find(']', bracket_start)
723 1
            if bracket_end == -1:
724 1
                raise ValueError(
725 1
                    'No closing square bracket: text=('
726 1
                    + text
727 1
                    + ') strip=('
728 1
                    + str(strip)
729 1
                    + ')'
730
                )
731
            attrib &= int(text[bracket_start + 1 : bracket_end])
732
            text = text[:bracket_start] + text[bracket_end + 1 :]
733
734
        if attrib == uninitialized or strip:
735
            return text
736 1
        elif attrib == 0:
737 1
            # means that the attributes were incompatible and there is no
738
            # alternative here
739 1
            return '[0]'
740 1
        return text + '[' + str(attrib) + ']'
741 1
742
    def _apply_rule_if_compat(
743
        self, phonetic: str, target: str, language_arg: int
744 1
    ) -> Optional[str]:
745 1
        """Apply a phonetic regex if compatible.
746
747 1
        tests for compatible language rules
748
749
        to do so, apply the rule, expand the results, and detect alternatives
750
            with incompatible attributes
751
752
        then drop each alternative that has incompatible attributes and keep
753
            those that are compatible
754
755
        if there are no compatible alternatives left, return false
756
757
        otherwise return the compatible alternatives
758
759
        apply the rule
760
761
        Parameters
762
        ----------
763
        phonetic : str
764
            The Beider-Morse phonetic encoding (so far)
765
        target : str
766
            A proposed addition to the phonetic encoding
767
        language_arg : int
768
            An integer representing the target language of the phonetic
769
            encoding
770
771
        Returns
772
        -------
773
        str
774
            A candidate encoding
775
776
777
        .. versionadded:: 0.1.0
778
        .. versionchanged:: 0.3.6
779
            Encapsulated in class
780
781
        """
782
        candidate = phonetic + target
783
        if '[' not in candidate:  # no attributes so we need test no further
784
            return candidate
785 1
786 1
        # expand the result, converting incompatible attributes to [0]
787 1
        candidate = self._expand_alternates(candidate)
788
        candidate_array = candidate.split('|')
789
790 1
        # drop each alternative that has incompatible attributes
791 1
        candidate = ''
792
        found = False
793
794 1
        for i in range(len(candidate_array)):
795 1
            this_candidate = candidate_array[i]
796
            if language_arg != 1:
797 1
                this_candidate = self._normalize_lang_attrs(
798 1
                    this_candidate + '[' + str(language_arg) + ']', False
799 1
                )
800 1
            if this_candidate != '[0]':
801
                found = True
802
                if candidate:
803 1
                    candidate += '|'
804 1
                candidate += this_candidate
805 1
806 1
        # return false if no compatible alternatives remain
807 1
        if not found:
808
            return None
809
810 1
        # return the result of applying the rule
811 1
        if '|' in candidate:
812
            candidate = '(' + candidate + ')'
813
        return candidate
814 1
815 1
    def _language_index_from_code(self, code: int, name_mode: str) -> int:
        """Return the index value for a language code.

        ``L_ANY`` is returned when the code is below 1, exceeds the sum of
        the language codes of the name mode, or has more than one bit set
        (i.e. names more than one language).

        Parameters
        ----------
        code : int
            The language code to interpret
        name_mode : str
            The name mode of the algorithm: ``gen`` (default),
            ``ash`` (Ashkenazi), or ``sep`` (Sephardic)

        Returns
        -------
        int
            Language code index


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if code < 1:  # code out of range
            return L_ANY

        highest = sum(
            _LANG_DICT[lang] for lang in BMDATA[name_mode]['languages']
        )
        if code > highest:  # code out of range
            return L_ANY

        # a value with exactly one bit set shares no bits with value - 1
        is_single_language = (code & (code - 1)) == 0
        # choice was more than one language; use any
        return code if is_single_language else L_ANY
849
850 1
    def __init__(
        self,
        language_arg: Union[str, int] = 0,
        name_mode: str = 'gen',
        match_mode: str = 'approx',
        concat: bool = False,
        filter_langs: bool = False,
    ) -> None:
        """Initialize BeiderMorse instance.

        Parameters
        ----------
        language_arg : str or int
            The language of the term, either as a numeric language code or
            as a comma-separated string of language names; supported names
            include:

                - ``any``
                - ``arabic``
                - ``cyrillic``
                - ``czech``
                - ``dutch``
                - ``english``
                - ``french``
                - ``german``
                - ``greek``
                - ``greeklatin``
                - ``hebrew``
                - ``hungarian``
                - ``italian``
                - ``latvian``
                - ``polish``
                - ``portuguese``
                - ``romanian``
                - ``russian``
                - ``spanish``
                - ``turkish``

        name_mode : str
            The name mode of the algorithm (only the first three characters
            are significant; unrecognized values fall back to ``gen``):

                - ``gen`` -- general (default)
                - ``ash`` -- Ashkenazi
                - ``sep`` -- Sephardic

        match_mode : str
            Matching mode: ``approx`` or ``exact`` (anything other than
            ``exact`` is treated as ``approx``)
        concat : bool
            Concatenation mode
        filter_langs : bool
            Filter out incompatible languages instead of raising an error
            for them

        Raises
        ------
        ValueError
            A language name is unknown, or not available in the chosen name
            mode, and ``filter_langs`` is False


        .. versionadded:: 0.4.0

        """
        name_mode = name_mode.strip().lower()[:3]
        if name_mode not in {'ash', 'sep', 'gen'}:
            name_mode = 'gen'

        if match_mode != 'exact':
            match_mode = 'approx'

        # Translate the supplied language_arg value into an integer
        # representing a set of languages
        all_langs = (
            sum(_LANG_DICT[_] for _ in BMDATA[name_mode]['languages']) - 1
        )
        lang_choices = 0
        if isinstance(language_arg, (int, float)):
            # A numeric code is used as given. It has to go into the local
            # variable: the attribute is only assigned from it at the end,
            # so writing to self._lang_choices here would be overwritten
            # with 0.
            lang_choices = int(language_arg)
        elif language_arg != '' and isinstance(language_arg, str):
            for lang in language_arg.lower().split(','):
                if lang in _LANG_DICT and (_LANG_DICT[lang] & all_langs):
                    lang_choices += _LANG_DICT[lang]
                elif not filter_langs:
                    raise ValueError(
                        "Unknown '" + name_mode + "' language: '" + lang + "'"
                    )

        self._language_arg = language_arg
        self._name_mode = name_mode
        self._match_mode = match_mode
        self._concat = concat
        self._filter_langs = filter_langs
        self._lang_choices = lang_choices
934 1
935 1
    def encode(self, word: str) -> str:
        """Return the Beider-Morse Phonetic Matching encoding(s) of a term.

        The word is stripped, lowercased and NFC-normalized before encoding.
        If no language choice was stored on the instance, the language is
        autodetected from the word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Beider-Morse phonetic value(s), joined by commas

        Raises
        ------
        ValueError
            Unknown language (not raised directly in this method; listed by
            the original documentation -- presumably originates in a helper)

        Examples
        --------
        >>> pe = BeiderMorse()
        >>> pe.encode('Christopher').split(',')
        ['xrQstopir', 'xrQstYpir', 'xristopir', 'xristYpir', 'xrQstofir',
        'xrQstYfir', 'xristofir', 'xristYfir', 'xristopi', 'xritopir',
        'xritopi', 'xristofi', 'xritofir', 'xritofi', 'tzristopir',
        'tzristofir', 'zristopir', 'zristopi', 'zritopir', 'zritopi',
        'zristofir', 'zristofi', 'zritofir', 'zritofi']
        >>> pe.encode('Niall')
        'nial,niol'
        >>> pe.encode('Smith')
        'zmit'
        >>> pe.encode('Schmidt')
        'zmit,stzmit'

        >>> BeiderMorse(language_arg='German').encode('Christopher').split(',')
        ['xrQstopir', 'xrQstYpir', 'xristopir', 'xristYpir', 'xrQstofir',
        'xrQstYfir', 'xristofir', 'xristYfir']
        >>> BeiderMorse(language_arg='English').encode(
        ... 'Christopher').split(',')
        ['tzristofir', 'tzrQstofir', 'tzristafir', 'tzrQstafir', 'xristofir',
        'xrQstofir', 'xristafir', 'xrQstafir']
        >>> BeiderMorse(language_arg='German',
        ... name_mode='ash').encode('Christopher').split(',')
        ['xrQstopir', 'xrQstYpir', 'xristopir', 'xristYpir', 'xrQstofir',
        'xrQstYfir', 'xristofir', 'xristYfir']

        >>> BeiderMorse(language_arg='German',
        ... match_mode='exact').encode('Christopher')
        'xriStopher,xriStofer,xristopher,xristofer'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        .. versionchanged:: 0.6.0
            Made comma-separated instead of space-separated output

        """
        term = normalize('NFC', word.strip().lower())
        mode = self._name_mode

        # A stored choice of 0 means either that nothing was requested or
        # that every requested language was incompatible with the name mode;
        # in both cases fall back to autodetection.
        lang_code = (
            self._language(term, mode)
            if self._lang_choices == 0
            else self._lang_choices
        )
        lang_index = self._language_index_from_code(lang_code, mode)

        # Select the rule tables for this name mode / match mode / language.
        mode_tables = BMDATA[mode]
        rules = mode_tables['rules'][lang_index]
        final_tables = mode_tables[self._match_mode]
        common_finals = final_tables['common']
        language_finals = final_tables[lang_index]

        phonetic = self._phonetic(
            term,
            mode,
            rules,
            common_finals,
            language_finals,
            lang_code,
            self._concat,
        )
        # Alternatives come back space-separated; emit them comma-separated.
        return self._phonetic_numbers(phonetic).replace(' ', ',')
1021
1022 1
1023
if __name__ == '__main__':
    # Run this module's doctests when it is executed as a script.
    from doctest import testmod

    testmod()
1027