Completed
Push — master ( 6ed6e1...91db7a )
by Chris
13:26
created

abydos.phonetic.statistics_canada   A

Complexity

Total Complexity 3

Size/Duplication

Total Lines 115
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 52
dl 0
loc 115
ccs 19
cts 19
cp 1
rs 10
c 0
b 0
f 0
wmc 3

1 Function

Rating   Name   Duplication   Size   Complexity  
B statistics_canada() 0 73 3
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic.statistics_canada.
20
21
The phonetic.statistics_canada module implements the Statistics Canada phonetic
22
encoding.
23
"""
24
25 1
from __future__ import unicode_literals
26
27 1
from unicodedata import normalize as unicode_normalize
28
29 1
from six import text_type
30
31 1
from . import _delete_consecutive_repeats
32
33 1
# Names exported by a star-import of this module.
__all__ = ['statistics_canada']
34
35
36 1
def statistics_canada(word, max_length=4):
    """Return the Statistics Canada name code for a word.

    No published description of the original Statistics Canada algorithm
    could be found; it may exist only in an unpublished technical report, and
    Statistics Canada no longer appears to use it. What is implemented here
    is instead the "Census modified Statistics Canada name coding procedure",
    as described in Appendix B of :cite:`Moore:1977`.

    The word is uppercased and NFKD-normalized, 'ß' is expanded to 'SS', and
    every character other than A-Z is discarded. All occurrences of A, E, I,
    O, U, and Y after the first letter are then removed, the result is passed
    through ``_delete_consecutive_repeats``, any spaces are stripped, and the
    code is cut to at most ``max_length`` characters. A word containing no
    A-Z letters yields the empty string.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vowels = frozenset('AEIOUY')

    # Uppercase, then decompose with NFKD so that an accented letter becomes
    # its base letter followed by combining marks; the A-Z filter below then
    # drops the marks along with everything else that is not a plain letter.
    decomposed = unicode_normalize('NFKD', text_type(word.upper()))
    decomposed = decomposed.replace('ß', 'SS')
    name = ''.join(ch for ch in decomposed if ch in letters)

    # Nothing codable remains (empty input or no A-Z characters at all).
    if not name:
        return ''

    # The first letter is always kept, even when it is a vowel; only the
    # remainder of the name has its vowels (including Y) removed.
    initial, tail = name[0], name[1:]
    consonants = ''.join(ch for ch in tail if ch not in vowels)

    collapsed = _delete_consecutive_repeats(initial + consonants)
    return collapsed.replace(' ', '')[:max_length]
109
110
111
if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod

    testmod()
115