# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
#
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tokenizer._tokenize.

_Tokenizer base class
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from collections import Counter
from math import exp, log, log1p

__all__ = ['_Tokenizer']


class _Tokenizer(object):
    """Abstract _Tokenizer class.

    .. versionadded:: 0.4.0
    """

    def __init__(self, scaler=None, *args, **kwargs):
        """Initialize Tokenizer.

        Parameters
        ----------
        scaler : None, str, or function
            A scaling function for the Counter:

                - None : no scaling
                - 'set' : All non-zero values are set to 1.
                - 'length' : Each token has weight equal to its length.
                - 'length-log' : Each token has weight equal to the natural
                  log of its length + 1 (i.e. log1p of its length).
                - 'length-exp' : Each token has weight equal to e raised to
                  its length.
                - 'entropy' : Weights are scaled to the (log_2) information
                  entropy of each key's frequency.
                - a callable function : The function is applied to each
                  value in the Counter. Some useful functions include
                  math.exp, math.log1p, math.sqrt, and indexes into
                  interesting integer sequences such as the Fibonacci
                  sequence.
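
        For example, with the 'length' scaler each token's weight becomes
        its character length (a minimal illustration using this base class
        directly, which stores the whole input string as a single token):

        >>> _Tokenizer(scaler='length').tokenize('term').get_counter()
        Counter({'term': 4})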


        .. versionadded:: 0.4.0

        """
        super(_Tokenizer, self).__init__()

        self._scaler = scaler
        self._tokens = Counter()
        self._string = ''
        self._ordered_tokens = []
        self._ordered_weights = []

    def tokenize(self, string=None):
        """Tokenize the term and store it.

        The tokenized term is stored as an ordered list and as a Counter
        object.

        Parameters
        ----------
        string : str or None
            The string to tokenize

        Returns
        -------
        _Tokenizer
            Returns the tokenizer itself, to support method chaining
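
        Examples
        --------
        A minimal doctest; this base class stores the whole string as a
        single token, while subclasses are expected to split it into
        multiple tokens:

        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_list()
        ['term']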


        .. versionadded:: 0.4.0
        .. versionchanged:: 0.4.1
            Added 'length', 'entropy', and related scalers

        """
        if string is not None:
            self._string = string
            self._ordered_tokens = [self._string]
            self._ordered_weights = [1]

        if self._scaler in {'SSK', 'length', 'length-log', 'length-exp'}:
            self._tokens = Counter()
            if self._scaler[:6] == 'length':
                self._ordered_weights = [
                    len(_) for _ in self._ordered_tokens
                ]
                if self._scaler == 'length-log':
                    self._ordered_weights = [
                        log1p(_) for _ in self._ordered_weights
                    ]
                elif self._scaler == 'length-exp':
                    self._ordered_weights = [
                        exp(_) for _ in self._ordered_weights
                    ]
            for token, weight in zip(
                self._ordered_tokens, self._ordered_weights
            ):
                self._tokens[token] += weight
        elif self._scaler == 'entropy':
            counts = Counter(self._ordered_tokens)
            n = len(self._ordered_tokens)
            # Wrap the result in a Counter (rather than a plain dict) so
            # that the Counter arithmetic and set operations used by
            # get_counter, __and__, __add__, and __sub__ keep working.
            self._tokens = Counter(
                {
                    key: -(val / n) * log(val / n, 2)
                    for key, val in counts.items()
                }
            )
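            # Worked example (hypothetical input): for ordered tokens
            # ['a', 'a', 'b'], n = 3, so 'a' receives -(2/3)*log_2(2/3),
            # about 0.390, and 'b' receives -(1/3)*log_2(1/3), about
            # 0.528 -- each key's contribution to the Shannon entropy of
            # the token-frequency distribution.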
            self._ordered_weights = [
                self._tokens[tok] / counts[tok]
                for tok in self._ordered_tokens
            ]
        else:
            self._tokens = Counter(self._ordered_tokens)

        return self

    def count(self):
        """Return token count.

        Returns
        -------
        int
            The total count of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count()
        1


        .. versionadded:: 0.4.0

        """
        return sum(self.get_counter().values())

    def count_unique(self):
        """Return the number of unique elements.

        Returns
        -------
        int
            The number of unique tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count_unique()
        1


        .. versionadded:: 0.4.0

        """
        # len() of the Counter itself is the number of distinct keys.
        return len(self._tokens)

    def get_counter(self):
        """Return the tokens as a Counter object.

        Returns
        -------
        Counter
            The Counter of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_counter()
        Counter({'term': 1})
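
        With a callable scaler (an illustrative lambda, not one of the
        built-in scaler names), the function is applied to each count:

        >>> tok = _Tokenizer(scaler=lambda x: x * 10).tokenize('term')
        >>> tok.get_counter()
        Counter({'term': 10})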


        .. versionadded:: 0.4.0

        """
        if self._scaler == 'set':
            return Counter({key: 1 for key in self._tokens.keys()})
        elif callable(self._scaler):
            return Counter(
                {key: self._scaler(val) for key, val in self._tokens.items()}
            )
        else:
            return self._tokens

    def get_set(self):
        """Return the unique tokens as a set.

        Returns
        -------
        set
            The set of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_set()
        {'term'}


        .. versionadded:: 0.4.0

        """
        return set(self._tokens.keys())

    def get_list(self):
        """Return the tokens as an ordered list.

        Returns
        -------
        list
            The list of tokens in the order they were added.

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_list()
        ['term']


        .. versionadded:: 0.4.0

        """
        return self._ordered_tokens

    def __repr__(self):
        """Return representation of tokens object.

        .. versionadded:: 0.4.0

        """
        # Replace the 'Counter' prefix (the first seven characters) of the
        # Counter's repr with the name of this class.
        return self.__class__.__name__ + str(self._tokens)[7:]

    def __and__(self, other):
        """Return intersection with other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() & other.get_counter()

    def __add__(self, other):
        """Return the sum with other tokens (Counter addition).

        .. versionadded:: 0.4.0

        """
        return self.get_counter() + other.get_counter()

    def __sub__(self, other):
        """Return difference from other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() - other.get_counter()


if __name__ == '__main__':
    import doctest

    doctest.testmod()