1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
1 |
|
"""abydos.tokenizer._q_grams. |
20
|
|
|
|
21
|
|
|
QGrams multi-set class |
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from collections import Counter

try:
    # The collection ABCs live in collections.abc; the aliases in
    # ``collections`` were removed in Python 3.10.
    from collections.abc import Iterable
except ImportError:  # pragma: no cover (Python 2 fallback)
    from collections import Iterable

from six.moves import range
34
|
|
|
|
35
|
1 |
|
# Public API: the only name exported by ``from <module> import *``.
__all__ = ['QGrams']
36
|
|
|
|
37
|
|
|
|
38
|
1 |
|
class QGrams(Counter):
    """A q-gram class, which functions like a bag/multiset.

    A q-gram is here defined as all sequences of q characters. Q-grams are also
    known as k-grams and n-grams, but the term n-gram more typically refers to
    sequences of whitespace-delimited words in a string, where q-gram refers
    to sequences of characters in a word or string.

    Besides the counts stored in the underlying ``Counter``, each instance
    keeps three private attributes:

    - ``_term``: the original term, unmodified
    - ``_term_ss``: the longest start/stop-padded form of the term that was
      actually tokenized (the bare term if no padding was applied)
    - ``_ordered_list``: every extracted q-gram, in extraction order
    """

    def __init__(self, term, qval=2, start_stop='$#', skip=0):
        """Initialize QGrams.

        Parameters
        ----------
        term : str
            A string to extract q-grams from
        qval : int or Iterable
            The q-gram length (defaults to 2), can be an integer, range object,
            or list. Values smaller than 1 or larger than the length of
            ``term`` are silently ignored.
        start_stop : str
            A string of length >= 0 indicating start & stop symbols.
            If the string is '', q-grams will be calculated without start &
            stop symbols appended to each end.
            Otherwise, the first character of start_stop will pad the
            beginning of the string and the last character of start_stop
            will pad the end of the string before q-grams are calculated.
            (In the case that start_stop is only 1 character long, the same
            symbol will be used for both.)
        skip : int or Iterable
            The number of characters to skip, can be an integer, range object,
            or list

        Examples
        --------
        >>> qg = QGrams('AATTATAT')
        >>> qg
        QGrams({'AT': 3, 'TA': 2, '$A': 1, 'AA': 1, 'TT': 1, 'T#': 1})

        >>> qg = QGrams('AATTATAT', qval=1, start_stop='')
        >>> qg
        QGrams({'A': 4, 'T': 4})

        >>> qg = QGrams('AATTATAT', qval=3, start_stop='')
        >>> qg
        QGrams({'TAT': 2, 'AAT': 1, 'ATT': 1, 'TTA': 1, 'ATA': 1})

        """
        # Save the term itself
        self._term = term
        self._term_ss = term
        self._ordered_list = []

        # Normalize scalars to 1-tuples. Iterables are materialized so that
        # one-shot iterators (e.g. generators) are not exhausted by the first
        # pass of the nested loops below.
        qval = tuple(qval) if isinstance(qval, Iterable) else (qval,)
        skip = tuple(skip) if isinstance(skip, Iterable) else (skip,)

        for qval_i in qval:
            # q-gram lengths that are non-positive or longer than the bare
            # term contribute nothing.
            if qval_i < 1 or len(self._term) < qval_i:
                continue

            # Padding depends only on qval_i, so build it once per q value.
            if start_stop and qval_i > 1:
                pad = qval_i - 1
                padded = start_stop[0] * pad + self._term + start_stop[-1] * pad
            else:
                padded = self._term

            n_windows = len(padded) - (qval_i - 1)

            for skip_i in skip:
                # Having appended start & stop symbols (or not), save the
                # result, but only for the longest valid qval_i
                if len(padded) > len(self._term_ss):
                    self._term_ss = padded

                # A skip of k means every (k+1)-th character is taken.
                step = skip_i + 1
                # NOTE: the number of windows does not depend on ``step``, so
                # with skip > 0 the windows nearest the end of the string can
                # hold fewer than qval_i characters.
                self._ordered_list.extend(
                    padded[i : i + (qval_i * step) : step]
                    for i in range(n_windows)
                )

        super(QGrams, self).__init__(self._ordered_list)

    def count(self):
        """Return q-grams count.

        Returns
        -------
        int
            The total count of q-grams in a QGrams object

        Examples
        --------
        >>> qg = QGrams('AATTATAT')
        >>> qg.count()
        9

        >>> qg = QGrams('AATTATAT', qval=1, start_stop='')
        >>> qg.count()
        8

        >>> qg = QGrams('AATTATAT', qval=3, start_stop='')
        >>> qg.count()
        6

        """
        return sum(self.values())
146
|
|
|
|
147
|
|
|
|
148
|
|
|
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()
152
|
|
|
|