Completed: Push — master ( 643512...2b6b3e ) by Chris
20:40 (queued 10:36)

_Tokenizer.get_counter()    Rating: A

Complexity
    Conditions      3

Size
    Total Lines     26
    Code Lines      7

Duplication
    Lines           0
    Ratio           0 %

Code Coverage
    Tests           6
    CRAP Score      3

Importance
    Changes         0

Metric    Value
eloc      7
dl        0
loc       26
ccs       6
cts       6
cp        1
rs        10
c         0
b         0
f         0
cc        3
nop       1
crap      3
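
For reference, the CRAP (Change Risk Anti-Patterns) score reported above combines cyclomatic complexity with test coverage. A minimal sketch, assuming the standard formula crap = comp^2 * (1 - cov)^3 + comp with coverage expressed as a fraction, reproduces the reported value from cc = 3 and cp = 1:

def crap_score(complexity, coverage):
    # Standard C.R.A.P. formula: comp^2 * (1 - cov)^3 + comp,
    # where coverage is the covered fraction of the method (0.0 to 1.0).
    return complexity ** 2 * (1.0 - coverage) ** 3 + complexity

# With the values reported above (cc = 3, cp = 1.0):
print(crap_score(3, 1.0))  # 3.0, matching the reported CRAP Score of 3
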
# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tokenizer._tokenize.

_Tokenizer base class
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from collections import Counter
from math import exp, log, log1p

__all__ = ['_Tokenizer']


class _Tokenizer(object):
    """Abstract _Tokenizer class.

    .. versionadded:: 0.4.0
    """

    def __init__(self, scaler=None, *args, **kwargs):
        """Initialize Tokenizer.

        Parameters
        ----------
        scaler : None, str, or function
            A scaling function for the Counter:

                - None : no scaling
                - 'set' : All non-zero values are set to 1.
                - 'length' : Each token has weight equal to its length.
                - 'length-log' : Each token has weight equal to the log of its
                  length + 1 (i.e. log1p of its length).
                - 'length-exp' : Each token has weight equal to e raised to
                  its length.
                - 'entropy' : Weights are scaled to the (log_2) information
                  entropy of each key's frequency.
                - a callable function : The function is applied to each value
                  in the Counter. Some useful functions include math.exp,
                  math.log1p, math.sqrt, and indexes into interesting integer
                  sequences such as the Fibonacci sequence.


        .. versionadded:: 0.4.0

        """
        super(_Tokenizer, self).__init__()

        self._scaler = scaler
        self._tokens = Counter()
        self._string = ''
        self._ordered_tokens = []
        self._ordered_weights = []

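    # Illustrative sketch, not part of the original file: how the scaler
    # argument changes the weights produced by tokenize() and get_counter()
    # below. The values follow directly from the code in this class.
    #
    #   >>> _Tokenizer().tokenize('term').get_counter()
    #   Counter({'term': 1})
    #   >>> _Tokenizer(scaler='length').tokenize('term').get_counter()
    #   Counter({'term': 4})
    #   >>> from math import exp
    #   >>> _Tokenizer(scaler=exp).tokenize('term').get_counter()
    #   Counter({'term': 2.718281828459045})
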
    def tokenize(self, string=None):
        """Tokenize the term and store it.

        The tokenized term is stored as an ordered list and as a Counter
        object.

        Parameters
        ----------
        string : str or None
            The string to tokenize


        .. versionadded:: 0.4.0
        .. versionchanged:: 0.4.1
            Added 'length', 'entropy', and related scalers

        """
        if string is not None:
            self._string = string
            self._ordered_tokens = [self._string]
            self._ordered_weights = [1]

        if self._scaler in {'SSK', 'length', 'length-log', 'length-exp'}:
            self._tokens = Counter()
            if self._scaler[:6] == 'length':
                self._ordered_weights = [len(_) for _ in self._ordered_tokens]
                if self._scaler == 'length-log':
                    self._ordered_weights = [
                        log1p(_) for _ in self._ordered_weights
                    ]
                elif self._scaler == 'length-exp':
                    self._ordered_weights = [
                        exp(_) for _ in self._ordered_weights
                    ]
            for token, weight in zip(
                self._ordered_tokens, self._ordered_weights
            ):
                self._tokens[token] += weight
        elif self._scaler == 'entropy':
            counts = Counter(self._ordered_tokens)
            n = len(self._ordered_tokens)
            self._tokens = {
                key: -(val / n) * log(val / n, 2)
                for key, val in counts.items()
            }
            self._ordered_weights = [
                self._tokens[tok] / counts[tok] for tok in self._ordered_tokens
            ]
        else:
            self._tokens = Counter(self._ordered_tokens)

        return self

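    # Illustrative sketch, not part of the original file: what the 'entropy'
    # scaler above computes. Each distinct token receives the term
    # -(c/n) * log2(c/n), where c is its count and n is the total number of
    # tokens, so the per-key weights sum to the Shannon entropy of the token
    # distribution. This base class only ever stores the whole input string
    # as a single token (entropy 0); the scaler is meaningful for subclasses
    # that emit several tokens. For example, for tokens ['a', 'a', 'b', 'c']
    # (n = 4):
    #
    #   'a': -(2/4) * log2(2/4) = 0.5
    #   'b': -(1/4) * log2(1/4) = 0.5
    #   'c': -(1/4) * log2(1/4) = 0.5
    #
    # and 0.5 + 0.5 + 0.5 = 1.5 bits, the entropy of that distribution.
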
    def count(self):
        """Return token count.

        Returns
        -------
        int
            The total count of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count()
        1


        .. versionadded:: 0.4.0

        """
        return sum(self.get_counter().values())

    def count_unique(self):
        """Return the number of unique elements.

        Returns
        -------
        int
            The number of unique tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count_unique()
        1


        .. versionadded:: 0.4.0

        """
        return len(self._tokens.values())

    def get_counter(self):
        """Return the tokens as a Counter object.

        Returns
        -------
        Counter
            The Counter of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_counter()
        Counter({'term': 1})


        .. versionadded:: 0.4.0

        """
        if self._scaler == 'set':
            return Counter({key: 1 for key in self._tokens.keys()})
        elif callable(self._scaler):
            return Counter(
                {key: self._scaler(val) for key, val in self._tokens.items()}
            )
        else:
            return self._tokens

    def get_set(self):
        """Return the unique tokens as a set.

        Returns
        -------
        set
            The set of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_set()
        {'term'}


        .. versionadded:: 0.4.0

        """
        return set(self._tokens.keys())

    def get_list(self):
        """Return the tokens as an ordered list.

        Returns
        -------
        list
            The list of tokens in the order they were added.

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_list()
        ['term']


        .. versionadded:: 0.4.0

        """
        return self._ordered_tokens

    def __repr__(self):
        """Return representation of tokens object.

        .. versionadded:: 0.4.0

        """
        return self.__class__.__name__ + '{}'.format(str(self._tokens)[7:])

    def __and__(self, other):
        """Return intersection with other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() & other.get_counter()

    def __add__(self, other):
        """Return union with other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() + other.get_counter()

    def __sub__(self, other):
        """Return difference from other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() - other.get_counter()


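# Illustrative sketch, not part of the original file: the operator overloads
# above delegate to collections.Counter arithmetic on the (scaled) token
# counts returned by get_counter().
#
#   >>> _Tokenizer().tokenize('term') & _Tokenizer().tokenize('term')
#   Counter({'term': 1})
#   >>> _Tokenizer().tokenize('term') + _Tokenizer().tokenize('term')
#   Counter({'term': 2})
#   >>> _Tokenizer().tokenize('term') - _Tokenizer().tokenize('other')
#   Counter({'term': 1})
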
if __name__ == '__main__':
    import doctest

    doctest.testmod()