Completed
Push — master ( 3ac297...afe14d )
by Chris
16:40 queued 07:25
created

abydos.tokenizer._qgrams   A

Complexity

Total Complexity 11

Size/Duplication

Total Lines 152
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 152
ccs 28
cts 28
cp 1
rs 10
c 0
b 0
f 0
wmc 11

2 Methods

Rating   Name   Duplication   Size   Complexity  
C QGrams.__init__() 0 74 10
A QGrams.count() 0 24 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.tokenizer._qgrams.
20
21
QGrams multi-set class
22
"""
23
24 1
from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from collections import Counter

try:  # Python 3.3+; the collections.Iterable alias was removed in 3.10
    from collections.abc import Iterable
except ImportError:  # Python 2
    from collections import Iterable

from six.moves import range
34
35 1
__all__ = ['QGrams']
36
37
38 1
class QGrams(Counter):
    """A q-gram class, which functions like a bag/multiset.

    A q-gram is here defined as all sequences of q characters. Q-grams are also
    known as k-grams and n-grams, but the term n-gram more typically refers to
    sequences of whitespace-delimited words in a string, where q-gram refers
    to sequences of characters in a word or string.
    """

    def __init__(self, term, qval=2, start_stop='$#', skip=0):
        """Initialize QGrams.

        Parameters
        ----------
        term : str or dict
            A string to extract q-grams from.
            A dict (including a Counter or another QGrams) that maps q-grams
            to counts is also accepted and copied as-is; qval, start_stop and
            skip are ignored in that case. This is the call made by the
            inherited ``Counter.copy`` and ``Counter.__reduce__``, so copying
            and unpickling a QGrams work.
        qval : int or Iterable
            The q-gram length (defaults to 2), can be an integer, range object,
            or list. Values below 1 or greater than the length of term are
            silently skipped.
        start_stop : str
            A string of length >= 0 indicating start & stop symbols.
            If the string is '', q-grams will be calculated without start &
            stop symbols appended to each end.
            Otherwise, the first character of start_stop will pad the
            beginning of the string and the last character of start_stop
            will pad the end of the string before q-grams are calculated.
            (In the case that start_stop is only 1 character long, the same
            symbol will be used for both.)
        skip : int or Iterable
            The number of characters to skip, can be an integer, range object,
            or list. With skip > 0 and qval > 1, the windows closest to the
            end of the (padded) string are clipped by slicing, so the last
            tokens are shorter than qval.

        Examples
        --------
        >>> qg = QGrams('AATTATAT')
        >>> qg
        QGrams({'AT': 3, 'TA': 2, '$A': 1, 'AA': 1, 'TT': 1, 'T#': 1})

        >>> qg = QGrams('AATTATAT', qval=1, start_stop='')
        >>> qg
        QGrams({'A': 4, 'T': 4})

        >>> qg = QGrams('AATTATAT', qval=3, start_stop='')
        >>> qg
        QGrams({'TAT': 2, 'AAT': 1, 'ATT': 1, 'TTA': 1, 'ATA': 1})

        >>> qg.copy() == qg
        True

        """
        if isinstance(term, dict):
            # Re-construction from existing counts (Counter.copy, copy.copy,
            # pickle): carry over whatever bookkeeping the source object has
            # instead of trying to tokenize a mapping as if it were a string.
            self._term = getattr(term, '_term', '')
            self._term_ss = getattr(term, '_term_ss', '')
            super(QGrams, self).__init__(term)
            ordered = getattr(term, '_ordered_list', None)
            if ordered is None:
                # A plain mapping carries no token order; fall back to the
                # multiset expansion of its counts.
                ordered = self.elements()
            self._ordered_list = list(ordered)
            return

        # Save the term itself
        self._term = term
        # The term with start & stop symbols, for the longest valid qval
        self._term_ss = term
        # Every extracted q-gram, in extraction order
        self._ordered_list = []

        # Materialize both so that a one-shot iterator passed as skip is not
        # exhausted after the first qval.
        qvals = tuple(qval) if isinstance(qval, Iterable) else (qval,)
        skips = tuple(skip) if isinstance(skip, Iterable) else (skip,)

        for qval_i in qvals:
            if not skips or qval_i < 1 or len(term) < qval_i:
                continue

            # Padding depends only on qval_i, so build it once per qval_i
            if start_stop and qval_i > 1:
                padded = (
                    start_stop[0] * (qval_i - 1)
                    + term
                    + start_stop[-1] * (qval_i - 1)
                )
            else:
                padded = term

            # Having appended start & stop symbols (or not), save the
            # result, but only for the longest valid qval_i
            if len(padded) > len(self._term_ss):
                self._term_ss = padded

            num_starts = len(padded) - (qval_i - 1)
            for skip_i in skips:
                # Skipping n characters means taking every (n+1)-th character
                step = skip_i + 1
                self._ordered_list.extend(
                    padded[i : i + (qval_i * step) : step]
                    for i in range(num_starts)
                )

        super(QGrams, self).__init__(self._ordered_list)

    def count(self):
        """Return q-grams count.

        Returns
        -------
        int
            The total count of q-grams in a QGrams object

        Examples
        --------
        >>> qg = QGrams('AATTATAT')
        >>> qg.count()
        9

        >>> qg = QGrams('AATTATAT', qval=1, start_stop='')
        >>> qg.count()
        8

        >>> qg = QGrams('AATTATAT', qval=3, start_stop='')
        >>> qg.count()
        6

        """
        return sum(self.values())
146
147
148
if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings when the file is
    # executed directly as a script.
    from doctest import testmod

    testmod()
152