Completed
Branch master (78a222)
by Chris
14:36
created

abydos.fingerprint._basic.qgram_fingerprint()   A

Complexity

Conditions 1

Size

Total Lines 28
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 28
ccs 6
cts 6
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 4
crap 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.fingerprint._basic.
20
21
The fingerprint._basic module implements string fingerprints described at
22
:cite:`OpenRefine:2012`:
23
24
    - string fingerprint
25
    - q-gram fingerprint
26
    - phonetic fingerprint
27
"""
28
29 1
from __future__ import unicode_literals
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
from ..phonetic import double_metaphone
36 1
from ..tokenizer import QGrams
37
38 1
__all__ = ['phonetic_fingerprint', 'qgram_fingerprint', 'str_fingerprint']
39
40
41 1
def str_fingerprint(phrase, joiner=' '):
    """Return the string fingerprint of a phrase.

    The phrase is stripped, lowercased and NFKD-normalized; every character
    that is neither alphanumeric nor whitespace is discarded; the remaining
    text is split on whitespace, and the distinct words are sorted and joined
    with ``joiner``. This fingerprint is described at
    :cite:`OpenRefine:2012`.

    :param str phrase: the string from which to calculate the fingerprint
    :param str joiner: the string that will be placed between each word
    :returns: the fingerprint of the phrase
    :rtype: str

    >>> str_fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'
    """
    normalized = unicode_normalize('NFKD', text_type(phrase.strip().lower()))
    # Keep whitespace at this stage so that word boundaries survive until the
    # split below.
    kept = [char for char in normalized if char.isalnum() or char.isspace()]
    unique_words = set(''.join(kept).split())
    return joiner.join(sorted(unique_words))
60
61
62 1
def qgram_fingerprint(phrase, qval=2, start_stop='', joiner=''):
    """Return the q-gram fingerprint of a phrase.

    The phrase is stripped, lowercased and NFKD-normalized, and every
    non-alphanumeric character (whitespace included) is removed. The result
    is tokenized with ``QGrams``, and the q-grams obtained by iterating over
    that object are sorted and joined with ``joiner``. This fingerprint is
    described at :cite:`OpenRefine:2012`.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s), passed through to
        ``QGrams`` unchanged (by default none)
    :param str joiner: the string that will be placed between each q-gram
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
    """
    normalized = unicode_normalize('NFKD', text_type(phrase.strip().lower()))
    alnum_only = ''.join([char for char in normalized if char.isalnum()])
    # NOTE(review): de-duplication of q-grams is presumably provided by
    # iterating over the QGrams object -- confirm against QGrams.
    qgrams = QGrams(alnum_only, qval, start_stop)
    return joiner.join(sorted(qgrams))
90
91
92 1
def phonetic_fingerprint(
    phrase, phonetic_algorithm=double_metaphone, joiner=' ', *args
):
    """Return the phonetic fingerprint of a phrase.

    A phonetic fingerprint is identical to a standard string fingerprint, as
    implemented in str_fingerprint(), but performs the fingerprinting
    function after converting each whitespace-separated word of the phrase to
    its phonetic form, as determined by some phonetic algorithm. This
    fingerprint is described at :cite:`OpenRefine:2012`.

    :param str phrase: the string from which to calculate the phonetic
        fingerprint
    :param function phonetic_algorithm: a phonetic algorithm that takes a
        string and returns a string (presumably a phonetic representation of
        the original string), or an indexable collection of strings, of
        which only the first is used. By default, this function uses
        abydos.phonetic.double_metaphone()
    :param str joiner: the string that will be placed between each word of
        the returned fingerprint
    :param args: additional arguments to pass to the phonetic algorithm,
        along with the phrase itself
    :returns: the phonetic fingerprint of the phrase
    :rtype: str

    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.')
    '0 afr fks jmpt kk ls prn tk'
    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.',
    ... joiner='-')
    '0-afr-fks-jmpt-kk-ls-prn-tk'
    >>> from abydos.phonetic import soundex
    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.',
    ... phonetic_algorithm=soundex)
    'b650 d200 f200 j513 l200 o160 q200 t000'
    """
    codes = []
    for word in phrase.split():
        code = phonetic_algorithm(word, *args)
        # Some algorithms return several encodings at once (a non-string
        # iterable); only the first one takes part in the fingerprint.
        if not isinstance(code, text_type) and hasattr(code, '__iter__'):
            code = code[0]
        codes.append(code)
    # The codes are always joined with a space here, because str_fingerprint
    # splits on whitespace and strips punctuation; joining them with an
    # arbitrary joiner would fuse the codes together (and an empty joiner
    # used to wipe the result via a [:-0] slice). The caller's joiner is
    # applied by str_fingerprint when it assembles the final string.
    return str_fingerprint(' '.join(codes), joiner)
130
131
132
if __name__ == '__main__':
    # When the module is executed directly, run the doctest examples embedded
    # in the docstrings above.
    from doctest import testmod

    testmod()
136