Completed
Push — master ( 482d7f...39cbea )
by Chris
14:30
created

abydos.qgram   A

Complexity

Total Complexity 11

Size/Duplication

Total Lines 127
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 32
dl 0
loc 127
rs 10
c 0
b 0
f 0
wmc 11

2 Methods

Rating   Name   Duplication   Size   Complexity  
A QGrams.count() 0 19 1
C QGrams.__init__() 0 57 10
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.qgram.
20
21
The qgram module defines the QGrams multi-set class.
22
"""
23
24
from __future__ import division, unicode_literals
25
26
from collections import Counter, Iterable
27
28
from six.moves import range
29
30
31
# pylint: disable=abstract-method
32
class QGrams(Counter):
    """A q-gram class, which functions like a bag/multiset.

    A q-gram is here defined as all sequences of q characters. Q-grams are also
    known as k-grams and n-grams, but the term n-gram more typically refers to
    sequences of whitespace-delimited words in a string, where q-gram refers
    to sequences of characters in a word or string.
    """

    # Class-level defaults; each instance overrides them in __init__.
    # The original, unpadded input string.
    term = ''
    # The longest start/stop-padded form of the term that was tokenized
    # (stays '' if no qval was valid for the term).
    term_ss = ''
    # All extracted q-grams, in extraction order (duplicates included).
    ordered_list = []

    def __init__(self, term, qval=2, start_stop='$#', skip=0):
        """Initialize QGrams.

        :param str term: a string to extract q-grams from
        :param int or iterable qval: the q-gram length (defaults to 2), can be
            an integer, range object, or list
        :param str start_stop: a string of length >= 0 indicating start & stop
            symbols.
            If the string is '', q-grams will be calculated without start &
            stop symbols appended to each end.
            Otherwise, the first character of start_stop will pad the beginning
            of the string and the last character of start_stop will pad the end
            of the string before q-grams are calculated. (In the case that
            start_stop is only 1 character long, the same symbol will be used
            for both.)
        :param int or iterable skip: the number of characters to skip, can be
            an integer, range object, or list

        Each (qval, skip) combination is tokenized independently from the
        original term; any qval that is less than 1 or longer than the term
        is silently ignored.

        >>> qg = QGrams('AATTATAT')
        >>> qg
        QGrams({'AT': 3, 'TA': 2, '$A': 1, 'AA': 1, 'TT': 1, 'T#': 1})

        >>> qg = QGrams('AATTATAT', qval=1, start_stop='')
        >>> qg
        QGrams({'A': 4, 'T': 4})

        >>> qg = QGrams('AATTATAT', qval=3, start_stop='')
        >>> qg
        QGrams({'TAT': 2, 'AAT': 1, 'ATT': 1, 'TTA': 1, 'ATA': 1})
        """
        # Save the term itself
        self.term = term
        self.ordered_list = []

        # Normalize scalar arguments to one-element tuples so that both
        # parameters can be iterated uniformly below.
        if not isinstance(qval, Iterable):
            qval = (qval,)
        if not isinstance(skip, Iterable):
            skip = (skip,)

        for qval_i in qval:
            for skip_i in skip:
                # Validate against the *original* term, not a padded copy.
                if len(self.term) < qval_i or qval_i < 1:
                    continue

                # Always pad starting from the original term. Re-padding an
                # already padded string would accumulate start/stop symbols
                # across qval/skip combinations.
                if start_stop and qval_i > 1:
                    pad_len = qval_i - 1
                    padded = (start_stop[0] * pad_len + self.term +
                              start_stop[-1] * pad_len)
                else:
                    padded = self.term

                # Having appended start & stop symbols (or not), save the
                # result but only for the longest valid qval_i
                if len(padded) > len(self.term_ss):
                    self.term_ss = padded

                # A skip of n means taking every (n+1)-th character.
                # NOTE: the number of windows depends on qval_i only, so with
                # skip > 0 the final windows run past the end of the string
                # and yield q-grams shorter than qval_i.
                step = skip_i + 1
                window_count = len(padded) - (qval_i - 1)
                self.ordered_list += [
                    padded[start:start + (qval_i * step):step]
                    for start in range(window_count)
                ]

        super(QGrams, self).__init__(self.ordered_list)

    def count(self):
        """Return q-grams count.

        :returns: the total count of q-grams in a QGrams object
        :rtype: int

        >>> qg = QGrams('AATTATAT')
        >>> qg.count()
        9

        >>> qg = QGrams('AATTATAT', qval=1, start_stop='')
        >>> qg.count()
        8

        >>> qg = QGrams('AATTATAT', qval=3, start_stop='')
        >>> qg.count()
        6
        """
        # Sum the multiplicities of all distinct q-grams in the multiset.
        return sum(self.values())
122
123
124
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module's
    # docstrings.
    from doctest import testmod
    testmod()
127