Passed: Push to master (416c2f...9ec382) by Chris, 01:03 (queued 13s)

abydos.tokenizer._tokenizer._Tokenizer.tokenize()  (rating: A)

Complexity

Conditions 1

Size

Total Lines 25
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 1

Importance

Changes 0

Metric Value
eloc 6
dl 0
loc 25
ccs 12
cts 12
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 2
crap 1
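
For context: assuming the report uses the standard C.R.A.P. definition with coverage expressed as a fraction, crap = cc^2 * (1 - cp)^3 + cc, the values above are self-consistent: with cyclomatic complexity cc = 1 and full coverage cp = 1, crap = 1^2 * (1 - 1)^3 + 1 = 1.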
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tokenizer._tokenizer.

_Tokenizer base class
"""

from collections import Counter, defaultdict
from math import exp, log1p, log2
from typing import (
    Any,
    Callable,
    Counter as TCounter,
    DefaultDict,
    List,
    Optional,
    Set,
    Union,
    cast,
)

__all__ = ['_Tokenizer']


class _Tokenizer:
    """Abstract _Tokenizer class.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        scaler: Optional[Union[str, Callable[[float], float]]] = None,
        *args: Any,
        **kwargs: Any
    ) -> None:
        """Initialize Tokenizer.

        Parameters
        ----------
        scaler : None, str, or function
            A scaling function for the Counter:

                - None : no scaling
                - 'set' : All non-zero values are set to 1.
                - 'length' : Each token has weight equal to its length.
                - 'length-log' : Each token has weight equal to the log of its
                   length + 1.
                - 'length-exp' : Each token has weight equal to e raised to its
                   length.
                - 'entropy' : Weights are scaled to the (log_2) information
                  entropy of each key's frequency.
                - a callable function : The function is applied to each value
                  in the Counter. Some useful functions include math.exp,
                  math.log1p, math.sqrt, and indexes into interesting integer
                  sequences such as the Fibonacci sequence.


        .. versionadded:: 0.4.0

        """
        super(_Tokenizer, self).__init__()

        self._scaler = scaler
        self._tokens = defaultdict(int)  # type: DefaultDict[str, float]
        self._string = ''
        self._ordered_tokens = []  # type: List[str]
        self._ordered_weights = []  # type: List[float]

    def tokenize(self, string: str) -> '_Tokenizer':
        """Tokenize the term and store it.

        The tokenized term is stored as an ordered list and as a defaultdict
        object.

        Parameters
        ----------
        string : str
            The string to tokenize


        .. versionadded:: 0.4.0
        .. versionchanged:: 0.4.1
            Added 'length', 'entropy', and related scalers
        .. versionchanged:: 0.6.0
            Moved scaling & counterizing to separate function

        """
        self._string = string
        self._ordered_tokens = [self._string]
        self._ordered_weights = [1]

        self._scale_and_counterize()
        return self

    def _scale_and_counterize(self) -> None:
        """Scale the tokens and store them in a defaultdict.

        .. versionadded:: 0.6.0

        """
        if self._scaler in {'SSK', 'length', 'length-log', 'length-exp'}:
            self._tokens = defaultdict(float)
            if cast(str, self._scaler)[:6] == 'length':
                self._ordered_weights = [len(_) for _ in self._ordered_tokens]
                if self._scaler == 'length-log':
                    self._ordered_weights = [
                        log1p(_) for _ in self._ordered_weights
                    ]
                elif self._scaler == 'length-exp':
                    self._ordered_weights = [
                        exp(_) for _ in self._ordered_weights
                    ]
            for token, weight in zip(
                self._ordered_tokens, self._ordered_weights
            ):
                self._tokens[token] += weight
        elif self._scaler == 'entropy':
            counts = Counter(self._ordered_tokens)
            n = len(self._ordered_tokens)
            self._tokens = defaultdict(float)
            self._tokens.update(
                {
                    key: -(val / n) * log2(val / n)
                    for key, val in counts.items()
                }
            )
            self._ordered_weights = [
                self._tokens[tok] / counts[tok] for tok in self._ordered_tokens
            ]
        else:
            self._tokens = defaultdict(int)
            self._tokens.update(Counter(self._ordered_tokens))

    def count(self) -> int:
        """Return token count.

        Returns
        -------
        int
            The total count of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count()
        1


        .. versionadded:: 0.4.0

        """
        return sum(self.get_counter().values())

    def count_unique(self) -> int:
        """Return the number of unique elements.

        Returns
        -------
        int
            The number of unique tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count_unique()
        1


        .. versionadded:: 0.4.0

        """
        return len(self._tokens.values())

    def get_counter(self) -> TCounter[str]:
        """Return the tokens as a Counter object.

        Returns
        -------
        Counter
            The Counter of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_counter()
        Counter({'term': 1})


        .. versionadded:: 0.4.0

        """
        if self._scaler == 'set':
            return Counter({key: 1 for key in self._tokens.keys()})
        elif callable(self._scaler):
            return Counter(
                {key: self._scaler(val) for key, val in self._tokens.items()}
            )
        else:
            return Counter(self._tokens)

    def get_set(self) -> Set[str]:
        """Return the unique tokens as a set.

        Returns
        -------
        set
            The set of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_set()
        {'term'}


        .. versionadded:: 0.4.0

        """
        return set(self._tokens.keys())

    def get_list(self) -> List[str]:
        """Return the tokens as an ordered list.

        Returns
        -------
        list
            The list of q-grams in the order they were added.

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_list()
        ['term']


        .. versionadded:: 0.4.0

        """
        return self._ordered_tokens

    def __repr__(self) -> str:
        """Return representation of tokens object.

        .. versionadded:: 0.4.0

        """
        return self.__class__.__name__ + '({}'.format(str(self._tokens)[27:])

    def __and__(self, other: '_Tokenizer') -> TCounter[str]:
        """Return intersection with other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() & other.get_counter()

    def __add__(self, other: '_Tokenizer') -> TCounter[str]:
        """Return union with other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() + other.get_counter()

    def __sub__(self, other: '_Tokenizer') -> TCounter[str]:
        """Return difference from other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() - other.get_counter()


if __name__ == '__main__':
    import doctest

    doctest.testmod()
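
For readers of the report, a minimal usage sketch follows. It is not part of the library source; it assumes abydos is installed and imports the private module named in the report header directly. It shows how the scaler option documented in __init__ changes the weights returned by get_counter(), and how the Counter-based operators behave.

# Minimal sketch, assuming direct import of the private base class is
# acceptable for illustration. _Tokenizer stores the whole input as a
# single token; concrete tokenizers are expected to override tokenize().
from math import sqrt

from abydos.tokenizer._tokenizer import _Tokenizer

plain = _Tokenizer().tokenize('term')
print(plain.get_counter())      # Counter({'term': 1})
print(plain.count())            # 1

# 'length' scaler: each token is weighted by its character length.
by_length = _Tokenizer(scaler='length').tokenize('term')
print(by_length.get_counter())  # Counter({'term': 4.0})

# A callable scaler is applied to each Counter value in get_counter().
by_sqrt = _Tokenizer(scaler=sqrt).tokenize('term')
print(by_sqrt.get_counter())    # Counter({'term': 1.0})

# The dunder operators delegate to collections.Counter set arithmetic.
other = _Tokenizer().tokenize('term')
print(plain + other)            # Counter({'term': 2})
print(plain & other)            # Counter({'term': 1})
print(plain - other)            # Counter()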