abydos.stemmer._schinke - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.stemmer._schinke A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	300
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	15
eloc	169
dl	0
loc	300
ccs	51
cts	51
cp	1
rs	10
c	0
b	0
f	0

2 Methods

Rating	Name	Duplication	Size	Complexity
F	Schinke.stem_dict()	0	124	14
A	Schinke.stem()	0	37	1

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._schinke.

Schinke Latin stemmer.
"""

from typing import Dict
from unicodedata import normalize

from ._stemmer import _Stemmer

# Names exported by a star-import of this module
__all__ = ['Schinke']


class Schinke(_Stemmer):
    """Schinke stemmer.

    This is defined in :cite:`Schinke:1996`.

    Each word yields two stems: one obtained by treating the word as a
    noun and one obtained by treating it as a verb.

    .. versionadded:: 0.3.6
    """

    # Letters retained after Unicode normalization; every other character
    # (digits, punctuation, combining marks, ...) is discarded.
    _alphabet = frozenset('abcdefghijklmnopqrstuvwxyz')

    # Prefixes for which a trailing 'que' is part of the word itself and
    # must not be stripped as an enclitic.
    _keep_que = {
        'at',
        'quo',
        'ne',
        'ita',
        'abs',
        'aps',
        'abus',
        'adae',
        'adus',
        'deni',
        'de',
        'sus',
        'obli',
        'perae',
        'plenis',
        'quando',
        'quis',
        'quae',
        'cuius',
        'cui',
        'quem',
        'quam',
        'qua',
        'qui',
        'quorum',
        'quarum',
        'quibus',
        'quos',
        'quas',
        'quotusquis',
        'quous',
        'ubi',
        'undi',
        'us',
        'uter',
        'uti',
        'utro',
        'utribi',
        'tor',
        'co',
        'conco',
        'contor',
        'detor',
        'deco',
        'exco',
        'extor',
        'obtor',
        'optor',
        'retor',
        'reco',
        'attor',
        'inco',
        'intor',
        'praetor',
    }

    # Noun endings, keyed by ending length
    _n_endings = {
        4: {'ibus'},
        3: {'ius'},
        2: {
            'is',
            'nt',
            'ae',
            'os',
            'am',
            'ud',
            'as',
            'um',
            'em',
            'us',
            'es',
            'ia',
        },
        1: {'a', 'e', 'i', 'o', 'u'},
    }

    # Verb endings that are simply removed, keyed by ending length.
    # Empty buckets are real sets (``set()``), not ``{}`` (an empty dict).
    _v_endings_strip = {
        6: set(),
        5: set(),
        4: {'mini', 'ntur', 'stis'},
        3: {'mur', 'mus', 'ris', 'sti', 'tis', 'tur'},
        2: {'ns', 'nt', 'ri'},
        1: {'m', 'r', 's', 't'},
    }

    # Verb endings that are replaced by another suffix, keyed by length
    _v_endings_alter = {
        6: {'iuntur'},
        5: {'beris', 'erunt', 'untur'},
        4: {'iunt'},
        3: {'bor', 'ero', 'unt'},
        2: {'bo'},
        1: set(),
    }

    # Suffix substituted for each altered verb ending. Any altered ending
    # not listed here (i.e. 'ero') is replaced by 'eri'.
    _v_alter_suffix = {
        'iuntur': 'i',
        'erunt': 'i',
        'untur': 'i',
        'iunt': 'i',
        'unt': 'i',
        'beris': 'bi',
        'bor': 'bi',
        'bo': 'bi',
    }

    def stem(self, word: str) -> str:
        """Return the stem of a word according to the Schinke stemmer.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            The noun stem and the verb stem, joined by a comma

        Examples
        --------
        >>> stmr = Schinke()
        >>> stmr.stem('atque')
        'atque,atque'
        >>> stmr.stem('census')
        'cens,censu'
        >>> stmr.stem('virum')
        'uir,uiru'
        >>> stmr.stem('populusque')
        'popul,populu'
        >>> stmr.stem('senatus')
        'senat,senatu'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        .. versionchanged:: 0.6.0
            Made return a str with the noun then verb stem, comma-separated

        """
        stems = self.stem_dict(word)
        return '{0},{1}'.format(stems['n'], stems['v'])

    def stem_dict(self, word: str) -> Dict[str, str]:
        """Return the stem of a word according to the Schinke stemmer.

        The word is lowercased, NFKD-normalized and reduced to the letters
        a-z; 'j' and 'v' are then rewritten as 'i' and 'u'. A trailing
        'que' is removed unless the remainder is a protected prefix, and
        finally the noun and verb endings are processed independently.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        dict
            Word stems in a dictionary: key ``'n'`` holds the noun stem
            and key ``'v'`` holds the verb stem

        Examples
        --------
        >>> stmr = Schinke()
        >>> stmr.stem_dict('atque')
        {'n': 'atque', 'v': 'atque'}
        >>> stmr.stem_dict('census')
        {'n': 'cens', 'v': 'censu'}
        >>> stmr.stem_dict('virum')
        {'n': 'uir', 'v': 'uiru'}
        >>> stmr.stem_dict('populusque')
        {'n': 'popul', 'v': 'populu'}
        >>> stmr.stem_dict('senatus')
        {'n': 'senat', 'v': 'senatu'}


        .. versionadded:: 0.6.0

        """
        word = normalize('NFKD', word.lower())
        word = ''.join(c for c in word if c in self._alphabet)

        # Rule 2
        word = word.replace('j', 'i').replace('v', 'u')

        # Rule 3
        if word[-3:] == 'que':
            # This diverges from the paper by also returning 'que' itself
            #  unstemmed
            if word[:-3] in self._keep_que or word == 'que':
                return {'n': word, 'v': word}
            word = word[:-3]

        # Rule 4
        return {'n': self._noun_stem(word), 'v': self._verb_stem(word)}

    def _noun_stem(self, word: str) -> str:
        """Return the noun stem of an already-normalized word.

        The longest matching noun ending is removed, provided at least two
        letters remain; otherwise the word is returned unchanged.

        Parameters
        ----------
        word : str
            The normalized word, with any enclitic 'que' already removed

        Returns
        -------
        str
            The noun stem

        """
        for endlen in range(4, 0, -1):
            if word[-endlen:] in self._n_endings[endlen]:
                # Only the longest matching ending is ever considered
                if len(word) - endlen >= 2:
                    return word[:-endlen]
                return word
        return word

    def _verb_stem(self, word: str) -> str:
        """Return the verb stem of an already-normalized word.

        The longest matching verb ending is either removed or replaced by
        its substitute suffix, provided at least two letters precede the
        ending; otherwise the word is returned unchanged.

        Parameters
        ----------
        word : str
            The normalized word, with any enclitic 'que' already removed

        Returns
        -------
        str
            The verb stem

        """
        for endlen in range(6, 0, -1):
            ending = word[-endlen:]
            long_enough = len(word) - endlen >= 2

            if ending in self._v_endings_strip[endlen]:
                return word[:-endlen] if long_enough else word

            if ending in self._v_endings_alter[endlen]:
                # Technically this diverges from the paper by considering the
                # length of the stem without the new suffix
                if long_enough:
                    suffix = self._v_alter_suffix.get(ending, 'eri')
                    return word[:-endlen] + suffix
                return word
        return word


if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()


1		# Copyright 2014-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.stemmer._schinke.
18
19	1	Schinke Latin stemmer.
20		"""
21
22		from typing import Dict
23		from unicodedata import normalize
24	1
25		from ._stemmer import _Stemmer
26
27		__all__ = ['Schinke']
28
29
30		class Schinke(_Stemmer):
31	1	"""Schinke stemmer.
32
33	1	This is defined in :cite:`Schinke:1996`.
34
35	1	.. versionadded:: 0.3.6
36	1	"""
37
38	1	_keep_que = {
39	1	'at',
40		'quo',
41	1	'ne',
42		'ita',
43		'abs',
44	1	'aps',
45		'abus',
46		'adae',
47		'adus',
48		'deni',
49		'de',
50		'sus',
51		'obli',
52	1	'perae',
53		'plenis',
54		'quando',
55		'quis',
56		'quae',
57		'cuius',
58		'cui',
59		'quem',
60		'quam',
61		'qua',
62		'qui',
63		'quorum',
64		'quarum',
65		'quibus',
66		'quos',
67		'quas',
68		'quotusquis',
69		'quous',
70		'ubi',
71		'undi',
72		'us',
73		'uter',
74		'uti',
75		'utro',
76		'utribi',
77		'tor',
78		'co',
79		'conco',
80		'contor',
81		'detor',
82		'deco',
83		'exco',
84		'extor',
85		'obtor',
86		'optor',
87		'retor',
88		'reco',
89		'attor',
90		'inco',
91		'intor',
92		'praetor',
93		}
94
95		_n_endings = {
96		4: {'ibus'},
97		3: {'ius'},
98		2: {
99		'is',
100		'nt',
101		'ae',
102		'os',
103		'am',
104		'ud',
105		'as',
106		'um',
107		'em',
108		'us',
109	1	'es',
110		'ia',
111		},
112		1: {'a', 'e', 'i', 'o', 'u'},
113		}
114
115		_v_endings_strip = {
116		6: {},
117		5: {},
118		4: {'mini', 'ntur', 'stis'},
119		3: {'mur', 'mus', 'ris', 'sti', 'tis', 'tur'},
120		2: {'ns', 'nt', 'ri'},
121		1: {'m', 'r', 's', 't'},
122		}
123		_v_endings_alter = {
124		6: {'iuntur'},
125		5: {'beris', 'erunt', 'untur'},
126		4: {'iunt'},
127		3: {'bor', 'ero', 'unt'},
128		2: {'bo'},
129	1	1: {},
130		}
131
132		def stem(self, word: str) -> str:
133		"""Return the stem of a word according to the Schinke stemmer.
134
135		Parameters
136		----------
137	1	word : str
138		The word to stem
139
140		Returns
141		-------
142		str
143		Word stem
144
145		Examples
146	1	--------
147		>>> stmr = Schinke()
148		>>> stmr.stem('atque')
149		'atque,atque'
150		>>> stmr.stem('census')
151		'cens,censu'
152		>>> stmr.stem('virum')
153		'uir,uiru'
154		>>> stmr.stem('populusque')
155		'popul,populu'
156		>>> stmr.stem('senatus')
157		'senat,senatu'
158
159
160		.. versionadded:: 0.3.0
161		.. versionchanged:: 0.3.6
162		Encapsulated in class
163		.. versionchanged:: 0.6.0
164		Made return a str with the noun then verb stem, comma-separated
165
166		"""
167		nv = self.stem_dict(word)
168		return '{0},{1}'.format(nv['n'], nv['v'])
169
170		def stem_dict(self, word: str) -> Dict[str, str]:
171		"""Return the stem of a word according to the Schinke stemmer.
172
173		Parameters
174		----------
175		word : str
176		The word to stem
177
178		Returns
179	1	-------
180	1	dict
181		Word stems in a dictionary
182
183		Examples
184		--------
185		>>> stmr = Schinke()
186		>>> stmr.stem_dict('atque')
187		{'n': 'atque', 'v': 'atque'}
188		>>> stmr.stem_dict('census')
189		{'n': 'cens', 'v': 'censu'}
190		>>> stmr.stem_dict('virum')
191		{'n': 'uir', 'v': 'uiru'}
192		>>> stmr.stem_dict('populusque')
193		{'n': 'popul', 'v': 'populu'}
194		>>> stmr.stem_dict('senatus')
195		{'n': 'senat', 'v': 'senatu'}
196
197
198		.. versionadded:: 0.6.0
199
200		"""
201		word = normalize('NFKD', word.lower())
202		word = ''.join(
203		c
204		for c in word
205		if c
206		in {
207		'a',
208		'b',
209		'c',
210		'd',
211		'e',
212		'f',
213		'g',
214		'h',
215	1	'i',
216		'j',
217		'k',
218	1	'l',
219		'm',
220		'n',
221	1	'o',
222	1	'p',
223		'q',
224	1	'r',
225		's',
226		't',
227	1	'u',
228	1	'v',
229		'w',
230		'x',
231	1	'y',
232	1	'z',
233	1	}
234	1	)
235
236	1	# Rule 2
237	1	word = word.replace('j', 'i').replace('v', 'u')
238
239	1	# Rule 3
240	1	if word[-3:] == 'que':
241	1	# This diverges from the paper by also returning 'que' itself
242	1	# unstemmed
243		if word[:-3] in self._keep_que or word == 'que':
244	1	return {'n': word, 'v': word}
245	1	else:
246	1	word = word[:-3]
247	1
248		# Base case will mean returning the words as is
249		noun = word
250		verb = word
251
252		# Rule 4
253		for endlen in range(4, 0, -1):
254	1	if word[-endlen:] in self._n_endings[endlen]:
255	1	if len(word) - 2 >= endlen:
256	1	noun = word[:-endlen]
257	1	else:
258	1	noun = word
259		break
260	1
261	1	for endlen in range(6, 0, -1):
262		if word[-endlen:] in self._v_endings_strip[endlen]:
263		if len(word) - 2 >= endlen:
264		verb = word[:-endlen]
265	1	else:
266	1	verb = word
267		break
268	1	if word[-endlen:] in self._v_endings_alter[endlen]:
269	1	if word[-endlen:] in {
270		'iuntur',
271	1	'erunt',
272		'untur',
273		'iunt',
274	1	'unt',
275		}:
276		new_word = word[:-endlen] + 'i'
277		addlen = 1
278		elif word[-endlen:] in {'beris', 'bor', 'bo'}:
279		new_word = word[:-endlen] + 'bi'
280		addlen = 2
281		else:
282		new_word = word[:-endlen] + 'eri'
283		addlen = 3
284
285		# Technically this diverges from the paper by considering the
286		# length of the stem without the new suffix
287		if len(new_word) >= 2 + addlen:
288		verb = new_word
289		else:
290		verb = word
291		break
292
293		return {'n': noun, 'v': verb}
294
295
296		if __name__ == '__main__':
297		import doctest
298
299		doctest.testmod()
300

chrislit / abydos

abydos.stemmer._schinke A last analyzed 2020-12-31 20:10 UTC

Complexity

Size/Duplication

Test Coverage

Importance

2 Methods

Duplication Side-by-Side

Filter issues like

abydos.stemmer._schinke A
last analyzed 2020-12-31 20:10 UTC