Issues (140)

abydos/phonetic/_phonem.py (1 issue)

1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.phonetic._phonem.

Phonem
"""
21
22
from unicodedata import normalize as unicode_normalize
23
24 1
from ._phonetic import _Phonetic
25
26
__all__ = ['Phonem']
27
28
29
class Phonem(_Phonetic):
    """Phonem.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    .. versionadded:: 0.3.6
    """

    # Ordered (digraph, replacement) pairs. They are applied one after the
    # other with ``str.replace``, so earlier pairs see the word before later
    # ones do. '§' is an intermediate symbol: ``_trans`` below maps it to 'U'.
    _substitutions = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )

    # Single-character translation table for ``str.translate``: the n-th
    # character of the first string maps to the n-th character of the second.
    # ``str.maketrans`` raises ValueError if the two strings ever drift out of
    # step in length, instead of silently truncating as ``zip`` would.
    _trans = str.maketrans(
        'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
        'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
    )

    # The only characters allowed to appear in a finished code.
    _uc_set = set('ABCDLMNORSUVWXYÖ')

    def encode(self, word: str) -> str:
        """Return the Phonem code for a word.

        The word is upper-cased and NFC-normalized, the digraph
        substitutions are applied in order, single characters are
        translated, and the result is passed through the inherited
        ``_delete_consecutive_repeats`` helper (presumably collapsing runs
        of identical adjacent characters -- defined on the base class, not
        in this module). Finally, every character outside ``_uc_set`` is
        dropped.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Phonem value

        Examples
        --------
        >>> pe = Phonem()
        >>> pe.encode('Christopher')
        'CRYSDOVR'
        >>> pe.encode('Niall')
        'NYAL'
        >>> pe.encode('Smith')
        'SMYD'
        >>> pe.encode('Schmidt')
        'CMYD'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        word = unicode_normalize('NFC', word.upper())
        for digraph, replacement in self._substitutions:
            word = word.replace(digraph, replacement)
        word = word.translate(self._trans)

        # Filtering happens after repeats are removed, so characters that
        # only become adjacent once a disallowed character is dropped are
        # both kept.
        return ''.join(
            c
            for c in self._delete_consecutive_repeats(word)
            if c in self._uc_set
        )
114
115
116
if __name__ == '__main__':
    # When run as a script, execute the doctests embedded in this module's
    # docstrings.
    import doctest

    doctest.testmod()
120