# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tokenizer._tokenize.

_Tokenizer base class
"""

from collections import Counter, defaultdict
from math import exp, log1p, log2
from typing import (
    Any,
    Callable,
    Counter as TCounter,
    DefaultDict,
    List,
    Optional,
    Set,
    Union,
    cast,
)

__all__ = ['_Tokenizer']


class _Tokenizer:
    """Abstract _Tokenizer class.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        scaler: Optional[Union[str, Callable[[float], float]]] = None,
        *args: Any,
        **kwargs: Any
    ) -> None:
        """Initialize Tokenizer.

        Parameters
        ----------
        scaler : None, str, or function
            A scaling function for the Counter:

            - None : no scaling
            - 'set' : All non-zero values are set to 1.
            - 'length' : Each token has weight equal to its length.
            - 'length-log' : Each token has weight equal to the log of its
              length + 1.
            - 'length-exp' : Each token has weight equal to e raised to its
              length.
            - 'entropy' : Weights are scaled to the (log_2) information
              entropy of each key's frequency.
            - a callable function : The function is applied to each value
              in the Counter. Some useful functions include math.exp,
              math.log1p, math.sqrt, and indexes into interesting integer
              sequences such as the Fibonacci sequence.
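
        Examples
        --------
        A rough illustration of the scalers on this base class, which keeps
        the whole input as a single token:

        >>> _Tokenizer(scaler='length').tokenize('term').get_counter()
        Counter({'term': 4.0})
        >>> from math import exp
        >>> _Tokenizer(scaler=exp).tokenize('term').get_counter()
        Counter({'term': 2.718281828459045})

        Callable scalers (and 'set') are applied lazily, when get_counter
        is called.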

        .. versionadded:: 0.4.0

        """
        super(_Tokenizer, self).__init__()

        self._scaler = scaler
        self._tokens = defaultdict(int)  # type: DefaultDict[str, float]
        self._string = ''
        self._ordered_tokens = []  # type: List[str]
        self._ordered_weights = []  # type: List[float]

    def tokenize(self, string: str) -> '_Tokenizer':
        """Tokenize the term and store it.

        The tokenized term is stored as an ordered list and as a defaultdict
        object.

        Parameters
        ----------
        string : str
            The string to tokenize

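        Returns
        -------
        _Tokenizer
            Returns the tokenizer itself, to allow chaining

        Examples
        --------
        Because the tokenizer itself is returned, accessor calls can be
        chained directly onto tokenize:

        >>> _Tokenizer().tokenize('term').get_list()
        ['term']

        Each call to tokenize replaces the previously stored tokens.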

        .. versionadded:: 0.4.0
        .. versionchanged:: 0.4.1
            Added 'length', 'entropy', and related scalers
        .. versionchanged:: 0.6.0
            Moved scaling & counterizing to separate function

        """
        self._string = string
        self._ordered_tokens = [self._string]
        self._ordered_weights = [1]

        self._scale_and_counterize()
        return self

    def _scale_and_counterize(self) -> None:
        """Scale the tokens and store them in a defaultdict.

        .. versionadded:: 0.6.0

        """
        if self._scaler in {'SSK', 'length', 'length-log', 'length-exp'}:
            self._tokens = defaultdict(float)
            if cast(str, self._scaler)[:6] == 'length':
                self._ordered_weights = [len(_) for _ in self._ordered_tokens]
                if self._scaler == 'length-log':
                    self._ordered_weights = [
                        log1p(_) for _ in self._ordered_weights
                    ]
                elif self._scaler == 'length-exp':
                    self._ordered_weights = [
                        exp(_) for _ in self._ordered_weights
                    ]
            for token, weight in zip(
                self._ordered_tokens, self._ordered_weights
            ):
                self._tokens[token] += weight
        elif self._scaler == 'entropy':
            counts = Counter(self._ordered_tokens)
            n = len(self._ordered_tokens)
            self._tokens = defaultdict(float)
            self._tokens.update(
                {
                    key: -(val / n) * log2(val / n)
                    for key, val in counts.items()
                }
            )
            self._ordered_weights = [
                self._tokens[tok] / counts[tok] for tok in self._ordered_tokens
            ]
        else:
            self._tokens = defaultdict(int)
            self._tokens.update(Counter(self._ordered_tokens))

    def count(self) -> int:
        """Return token count.

        Returns
        -------
        int
            The total count of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count()
        1


        .. versionadded:: 0.4.0

        """
        return sum(self.get_counter().values())

    def count_unique(self) -> int:
        """Return the number of unique elements.

        Returns
        -------
        int
            The number of unique tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count_unique()
        1


        .. versionadded:: 0.4.0

        """
        return len(self._tokens.values())

    def get_counter(self) -> TCounter[str]:
        """Return the tokens as a Counter object.

        Returns
        -------
        Counter
            The Counter of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_counter()
        Counter({'term': 1})


        .. versionadded:: 0.4.0

        """
        if self._scaler == 'set':
            return Counter({key: 1 for key in self._tokens.keys()})
        elif callable(self._scaler):
            return Counter(
                {key: self._scaler(val) for key, val in self._tokens.items()}
            )
        else:
            return Counter(self._tokens)

    def get_set(self) -> Set[str]:
        """Return the unique tokens as a set.

        Returns
        -------
        set
            The set of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_set()
        {'term'}


        .. versionadded:: 0.4.0

        """
        return set(self._tokens.keys())

    def get_list(self) -> List[str]:
        """Return the tokens as an ordered list.

        Returns
        -------
        list
            The list of tokens in the order they were added.

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_list()
        ['term']


        .. versionadded:: 0.4.0

        """
        return self._ordered_tokens

    def __repr__(self) -> str:
        """Return representation of tokens object.

        .. versionadded:: 0.4.0

        """
        return self.__class__.__name__ + '({}'.format(str(self._tokens)[27:])

    def __and__(self, other: '_Tokenizer') -> TCounter[str]:
        """Return intersection with other tokens.

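        Examples
        --------
        A minimal check with two identical single-token tokenizers:

        >>> _Tokenizer().tokenize('term') & _Tokenizer().tokenize('term')
        Counter({'term': 1})

        The intersection takes the minimum of the two Counters' counts for
        each token.
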
        .. versionadded:: 0.4.0

        """
        return self.get_counter() & other.get_counter()

    def __add__(self, other: '_Tokenizer') -> TCounter[str]:
        """Return union with other tokens.

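        Examples
        --------
        A minimal check with two identical single-token tokenizers:

        >>> _Tokenizer().tokenize('term') + _Tokenizer().tokenize('term')
        Counter({'term': 2})

        Addition sums the counts from the two tokenizers' Counters.
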
        .. versionadded:: 0.4.0

        """
        return self.get_counter() + other.get_counter()

    def __sub__(self, other: '_Tokenizer') -> TCounter[str]:
        """Return difference from other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() - other.get_counter()


if __name__ == '__main__':
    import doctest

    doctest.testmod()