abydos.stemmer._schinke - Code Metrics - Inspection of "78a222a9f7d8976f6744d263e3d6d01a2a991c27" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (78a222)

by Chris

created 2018-10-26 11:30 UTC

abydos.stemmer._schinke A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	237
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	14
eloc	161
dl	0
loc	237
ccs	46
cts	46
cp	1
rs	10
c	0
b	0
f	0

1 Function

Rating	Name	Duplication	Size	Complexity
F	schinke()	0	197	14

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._schinke.

The stemmer.schinke module defines the Schinke Latin stemmer.
"""

from __future__ import unicode_literals

from unicodedata import normalize

from six import text_type
from six.moves import range

__all__ = ['schinke']


def schinke(word):
    """Stem a Latin word with the Schinke stemmer.

    The algorithm follows :cite:`Schinke:1996`. The input is lowercased,
    NFKD-normalized and reduced to the letters a-z, with 'j' folded to 'i'
    and 'v' folded to 'u'. A trailing '-que' is removed unless the word is
    on the exception list (in which case the word is returned untouched).
    Noun and verb suffixes are then handled independently: only the longest
    matching suffix is considered, and it is applied only when at least two
    letters remain in front of it.

    :param str word: the word to stem
    :returns: the noun stem under key ``'n'`` and the verb stem under
        key ``'v'``
    :rtype: dict

    >>> schinke('atque')
    {'n': 'atque', 'v': 'atque'}
    >>> schinke('census')
    {'n': 'cens', 'v': 'censu'}
    >>> schinke('virum')
    {'n': 'uir', 'v': 'uiru'}
    >>> schinke('populusque')
    {'n': 'popul', 'v': 'populu'}
    >>> schinke('senatus')
    {'n': 'senat', 'v': 'senatu'}
    """
    latin_letters = frozenset('abcdefghijklmnopqrstuvwxyz')
    # Rule 2: 'j' and 'v' are spelling variants of 'i' and 'u'
    letter_fold = {'j': 'i', 'v': 'u'}

    # NFKD splits accented letters into base letter + combining mark, so the
    # a-z filter keeps the base letter and drops the diacritic.
    decomposed = normalize('NFKD', text_type(word.lower()))
    word = ''.join(
        letter_fold.get(ch, ch) for ch in decomposed if ch in latin_letters
    )

    # Rule 3: words whose final 'que' is part of the word, not the enclitic
    que_exceptions = frozenset((
        'at', 'quo', 'ne', 'ita', 'abs', 'aps', 'abus', 'adae', 'adus',
        'deni', 'de', 'sus', 'obli', 'perae', 'plenis', 'quando', 'quis',
        'quae', 'cuius', 'cui', 'quem', 'quam', 'qua', 'qui', 'quorum',
        'quarum', 'quibus', 'quos', 'quas', 'quotusquis', 'quous', 'ubi',
        'undi', 'us', 'uter', 'uti', 'utro', 'utribi', 'tor', 'co', 'conco',
        'contor', 'detor', 'deco', 'exco', 'extor', 'obtor', 'optor',
        'retor', 'reco', 'attor', 'inco', 'intor', 'praetor',
    ))
    if word.endswith('que'):
        # This diverges from the paper by also returning 'que' itself
        # unstemmed
        if word == 'que' or word[:-3] in que_exceptions:
            return {'n': word, 'v': word}
        word = word[:-3]

    # Rule 4: (suffix, replacement) pairs, longest suffixes first. An empty
    # replacement means the suffix is simply removed.
    noun_rules = tuple(
        (suffix, '')
        for suffix in (
            'ibus',
            'ius',
            'is', 'nt', 'ae', 'os', 'am', 'ud', 'as', 'um', 'em', 'us', 'es',
            'ia',
            'a', 'e', 'i', 'o', 'u',
        )
    )
    verb_rules = (
        ('iuntur', 'i'),
        ('beris', 'bi'), ('erunt', 'i'), ('untur', 'i'),
        ('iunt', 'i'), ('mini', ''), ('ntur', ''), ('stis', ''),
        ('bor', 'bi'), ('ero', 'eri'), ('unt', 'i'),
        ('mur', ''), ('mus', ''), ('ris', ''), ('sti', ''), ('tis', ''),
        ('tur', ''),
        ('bo', 'bi'), ('ns', ''), ('nt', ''), ('ri', ''),
        ('m', ''), ('r', ''), ('s', ''), ('t', ''),
    )

    def apply_longest(rules):
        # Only the first (i.e. longest) matching suffix is ever considered;
        # if it would leave fewer than two letters, the word stays as is.
        for suffix, replacement in rules:
            if word.endswith(suffix):
                stem = word[: -len(suffix)]
                # Technically this diverges from the paper by considering
                # the length of the stem without the new suffix
                if len(stem) >= 2:
                    return stem + replacement
                return word
        # Base case: no suffix matched, so the word is returned as is
        return word

    return {'n': apply_longest(noun_rules), 'v': apply_longest(verb_rules)}


if __name__ == '__main__':
    # Running this module as a script executes its docstring examples.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.stemmer._schinke.
20
21		The stemmer.schinke module defines the Schinke Latin stemmer.
22		"""
23
24	1	from __future__ import unicode_literals
25
26	1	from unicodedata import normalize
27
28	1	from six import text_type
29	1	from six.moves import range
30
31	1	__all__ = ['schinke']
32
33
34	1	def schinke(word):
35		"""Return the stem of a word according to the Schinke stemmer.
36
37		This is defined in :cite:`Schinke:1996`.
38
39		:param str word: the word to stem
40		:returns: a dict of the noun- and verb-stemmed word
41		:rtype: dict
42
43		>>> schinke('atque')
44		{'n': 'atque', 'v': 'atque'}
45		>>> schinke('census')
46		{'n': 'cens', 'v': 'censu'}
47		>>> schinke('virum')
48		{'n': 'uir', 'v': 'uiru'}
49		>>> schinke('populusque')
50		{'n': 'popul', 'v': 'populu'}
51		>>> schinke('senatus')
52		{'n': 'senat', 'v': 'senatu'}
53		"""
54	1	word = normalize('NFKD', text_type(word.lower()))
55	1	word = ''.join(
56		c
57		for c in word
58		if c
59		in {
60		'a',
61		'b',
62		'c',
63		'd',
64		'e',
65		'f',
66		'g',
67		'h',
68		'i',
69		'j',
70		'k',
71		'l',
72		'm',
73		'n',
74		'o',
75		'p',
76		'q',
77		'r',
78		's',
79		't',
80		'u',
81		'v',
82		'w',
83		'x',
84		'y',
85		'z',
86		}
87		)
88
89		# Rule 2
90	1	word = word.replace('j', 'i').replace('v', 'u')
91
92		# Rule 3
93	1	keep_que = {
94		'at',
95		'quo',
96		'ne',
97		'ita',
98		'abs',
99		'aps',
100		'abus',
101		'adae',
102		'adus',
103		'deni',
104		'de',
105		'sus',
106		'obli',
107		'perae',
108		'plenis',
109		'quando',
110		'quis',
111		'quae',
112		'cuius',
113		'cui',
114		'quem',
115		'quam',
116		'qua',
117		'qui',
118		'quorum',
119		'quarum',
120		'quibus',
121		'quos',
122		'quas',
123		'quotusquis',
124		'quous',
125		'ubi',
126		'undi',
127		'us',
128		'uter',
129		'uti',
130		'utro',
131		'utribi',
132		'tor',
133		'co',
134		'conco',
135		'contor',
136		'detor',
137		'deco',
138		'exco',
139		'extor',
140		'obtor',
141		'optor',
142		'retor',
143		'reco',
144		'attor',
145		'inco',
146		'intor',
147		'praetor',
148		}
149	1	if word[-3:] == 'que':
150		# This diverges from the paper by also returning 'que' itself unstemmed
151	1	if word[:-3] in keep_que or word == 'que':
152	1	return {'n': word, 'v': word}
153		else:
154	1	word = word[:-3]
155
156		# Base case will mean returning the words as is
157	1	noun = word
158	1	verb = word
159
160		# Rule 4
161	1	n_endings = {
162		4: {'ibus'},
163		3: {'ius'},
164		2: {
165		'is',
166		'nt',
167		'ae',
168		'os',
169		'am',
170		'ud',
171		'as',
172		'um',
173		'em',
174		'us',
175		'es',
176		'ia',
177		},
178		1: {'a', 'e', 'i', 'o', 'u'},
179		}
180	1	for endlen in range(4, 0, -1):
181	1	if word[-endlen:] in n_endings[endlen]:
182	1	if len(word) - 2 >= endlen:
183	1	noun = word[:-endlen]
184		else:
185	1	noun = word
186	1	break
187
188	1	v_endings_strip = {
189		6: {},
190		5: {},
191		4: {'mini', 'ntur', 'stis'},
192		3: {'mur', 'mus', 'ris', 'sti', 'tis', 'tur'},
193		2: {'ns', 'nt', 'ri'},
194		1: {'m', 'r', 's', 't'},
195		}
196	1	v_endings_alter = {
197		6: {'iuntur'},
198		5: {'beris', 'erunt', 'untur'},
199		4: {'iunt'},
200		3: {'bor', 'ero', 'unt'},
201		2: {'bo'},
202		1: {},
203		}
204	1	for endlen in range(6, 0, -1):
205	1	if word[-endlen:] in v_endings_strip[endlen]:
206	1	if len(word) - 2 >= endlen:
207	1	verb = word[:-endlen]
208		else:
209	1	verb = word
210	1	break
211	1	if word[-endlen:] in v_endings_alter[endlen]:
212	1	if word[-endlen:] in {'iuntur', 'erunt', 'untur', 'iunt', 'unt'}:
213	1	new_word = word[:-endlen] + 'i'
214	1	addlen = 1
215	1	elif word[-endlen:] in {'beris', 'bor', 'bo'}:
216	1	new_word = word[:-endlen] + 'bi'
217	1	addlen = 2
218		else:
219	1	new_word = word[:-endlen] + 'eri'
220	1	addlen = 3
221
222		# Technically this diverges from the paper by considering the
223		# length of the stem without the new suffix
224	1	if len(new_word) >= 2 + addlen:
225	1	verb = new_word
226		else:
227	1	verb = word
228	1	break
229
230	1	return {'n': noun, 'v': verb}
231
232
233		if __name__ == '__main__':
234		import doctest
235
236		doctest.testmod()
237

chrislit / abydos

Branch — master (78a222)

abydos.stemmer._schinke A

Complexity

Size/Duplication

Test Coverage

Importance

1 Function

Duplication Side-by-Side

Filter issues like