Completed
Push — master (76e221...14a933), created by Chris at 08:59

abydos.clustering.occurrence_fingerprint()   Grade: A

Complexity
    Conditions: 5

Size
    Total lines: 31
    Code lines: 14

Duplication
    Lines: 0
    Ratio: 0%

Importance
    Changes: 0

Metric  Value
cc      5
eloc    14
nop     3
dl      0
loc     31
rs      9.2333
c       0
b       0
f       0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.clustering.

The clustering module implements clustering algorithms such as:
    - string fingerprint
    - q-gram fingerprint
    - phonetic fingerprint
    - skeleton key
    - omission key
    - occurrence & occurrence halved fingerprints
    - count & position fingerprints
    - mean pairwise similarity
"""

from __future__ import division, unicode_literals

import unicodedata
from collections import Counter

from six import text_type
from six.moves import range

from .distance import sim
from .phonetic import double_metaphone
from .qgram import QGrams
from .stats import hmean


def fingerprint(phrase):
    """Return string fingerprint.

    The fingerprint of a string is a string consisting of all of the unique
    words in the string, alphabetized & concatenated with intervening spaces.

    :param str phrase: the string from which to calculate the fingerprint
    :returns: the fingerprint of the phrase
    :rtype: str

    >>> fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'
    """
    phrase = unicodedata.normalize('NFKD', text_type(phrase.strip().lower()))
    phrase = ''.join(c for c in phrase if c.isalnum() or c.isspace())
    phrase = ' '.join(sorted(set(phrase.split())))
    return phrase
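
# Usage sketch (illustrative; not part of the original module). fingerprint()
# case-folds, strips punctuation, deduplicates, & alphabetizes, so the
# expected output below follows directly from the steps above:
#
#     >>> fingerprint('To be, or not to be: that is the question.')
#     'be is not or question that the to'
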
def qgram_fingerprint(phrase, qval=2, start_stop=''):
    """Return q-gram fingerprint.

    A q-gram fingerprint is a string consisting of all of the unique q-grams
    in a string, alphabetized & concatenated.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s) to concatenate on either
        end of the phrase, as defined in abydos.qgram.QGrams
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
    """
    phrase = unicodedata.normalize('NFKD', text_type(phrase.strip().lower()))
    phrase = ''.join(c for c in phrase if c.isalnum())
    phrase = QGrams(phrase, qval, start_stop)
    phrase = ''.join(sorted(phrase))
    return phrase
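
# Usage sketch (illustrative; not part of the original module, and assuming
# QGrams yields single letters for qval=1 with no start/stop padding). The
# q-grams of 'niall' then reduce to its unique letters in alphabetical order:
#
#     >>> qgram_fingerprint('Niall', qval=1)
#     'ailn'
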
def phonetic_fingerprint(phrase, phonetic_algorithm=double_metaphone, *args):
    """Return the phonetic fingerprint of a phrase.

    A phonetic fingerprint is identical to a standard string fingerprint, as
    implemented in abydos.clustering.fingerprint(), but performs the
    fingerprinting function after converting the string to its phonetic form,
    as determined by some phonetic algorithm.

    :param str phrase: the string from which to calculate the phonetic
        fingerprint
    :param function phonetic_algorithm: a phonetic algorithm that takes a
        string and returns a string (presumably a phonetic representation of
        the original string). By default, this function uses
        abydos.phonetic.double_metaphone()
    :param args: additional arguments to pass to the phonetic algorithm,
        along with each word of the phrase
    :returns: the phonetic fingerprint of the phrase
    :rtype: str

    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.')
    '0 afr fks jmpt kk ls prn tk'
    >>> from abydos.phonetic import soundex
    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.',
    ... phonetic_algorithm=soundex)
    'b650 d200 f200 j513 l200 o160 q200 t000'
    """
    phonetic = ''
    for word in phrase.split():
        word = phonetic_algorithm(word, *args)
        if not isinstance(word, text_type) and hasattr(word, '__iter__'):
            # some algorithms (e.g. double_metaphone) return a tuple of
            # codes; keep only the primary code
            word = word[0]
        phonetic += word + ' '
    phonetic = phonetic[:-1]
    return fingerprint(phonetic)
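
# Usage sketch (illustrative; not part of the original module, and assuming
# double_metaphone('Niall') returns ('NL', ...) so that the primary code
# 'NL' is kept and then case-folded by fingerprint()):
#
#     >>> phonetic_fingerprint('Niall')
#     'nl'
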
def skeleton_key(word):
    """Return the skeleton key.

    The skeleton key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling
    Correction in Scientific and Scholarly Text." Communications of the ACM,
    27(4). 358--368. <http://dl.acm.org/citation.cfm?id=358048>

    :param str word: the word to transform into its skeleton key
    :returns: the skeleton key
    :rtype: str

    >>> skeleton_key('The quick brown fox jumped over the lazy dog.')
    'THQCKBRWNFXJMPDVLZYGEUIOA'
    >>> skeleton_key('Christopher')
    'CHRSTPIOE'
    >>> skeleton_key('Niall')
    'NLIA'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    start = word[0:1]
    consonant_part = ''
    vowel_part = ''

    # add consonants & vowels to separate strings
    # (omitting the first char & duplicates)
    for char in word[1:]:
        if char != start:
            if char in _vowels:
                if char not in vowel_part:
                    vowel_part += char
            elif char not in consonant_part:
                consonant_part += char
    # return the first char followed by consonants followed by vowels
    return start + consonant_part + vowel_part
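
# Worked example (illustrative; not part of the original module, traced from
# the logic above). For 'CHEMOGENIC' the first letter 'C' is kept, the
# remaining unique consonants are H, M, G, N, and the unique vowels are
# E, O, I, so the key is 'C' + 'HMGN' + 'EOI':
#
#     >>> skeleton_key('CHEMOGENIC')
#     'CHMGNEOI'
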
def omission_key(word):
    """Return the omission key.

    The omission key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling
    Correction in Scientific and Scholarly Text." Communications of the ACM,
    27(4). 358--368. <http://dl.acm.org/citation.cfm?id=358048>

    :param str word: the word to transform into its omission key
    :returns: the omission key
    :rtype: str

    >>> omission_key('The quick brown fox jumped over the lazy dog.')
    'JKQXZVWYBFMGPDHCLNTREUIOA'
    >>> omission_key('Christopher')
    'PHCTSRIOE'
    >>> omission_key('Niall')
    'LNIA'
    """
    _consonants = ('J', 'K', 'Q', 'X', 'Z', 'V', 'W', 'Y', 'B', 'F', 'M', 'G',
                   'P', 'D', 'H', 'C', 'L', 'N', 'T', 'S', 'R')

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    key = ''

    # add consonants in order supplied by _consonants (no duplicates)
    for char in _consonants:
        if char in word:
            key += char

    # add vowels in order they appeared in the word (no duplicates)
    for char in word:
        if char not in _consonants and char not in key:
            key += char

    return key
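
# Worked example (illustrative; not part of the original module, traced from
# the logic above). For 'MICROELECTRONICS' the consonants present, taken in
# _consonants order, are M, C, L, N, T, S, R, and the vowels in order of
# first appearance are I, O, E:
#
#     >>> omission_key('MICROELECTRONICS')
#     'MCLNTSRIOE'
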
# TODO: Dump all these to a data file.

# most common letters, as defined in Cisłak & Grabowski
MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd',
                          'l', 'c', 'u', 'm', 'w', 'f')

# most common letters (case-folded to lowercase), as shown in Google Books
# English n-grams, among letters a-z & digits 0-9
MOST_COMMON_LETTERS_EN_LC = ('e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'h', 'l',
                             'd', 'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b',
                             'v', 'k', 'x', 'j', 'q', 'z', '1', '2', '0', '9',
                             '3', '4', '8', '5', '6', '7')

# most common letters, as shown in Google Books English n-grams, among letters
# A-Z, a-z & digits 0-9
MOST_COMMON_LETTERS = ('e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd',
                       'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', 'v', 'k',
                       'T', 'I', 'A', 'S', 'C', 'x', 'M', 'P', 'E', 'B', 'H',
                       'R', 'N', 'D', 'L', 'F', 'W', 'O', 'q', 'G', 'z', 'j',
                       'J', 'U', 'V', 'K', 'Y', '1', '2', '0', 'X', '9', 'Q',
                       '3', 'Z', '4', '8', '5', '6', '7')

# most common letters (case-folded to lowercase), as shown in Google Books
# German n-grams, among letters (a-z, umlauted vowels & eszett) & digits 0-9
MOST_COMMON_LETTERS_DE = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                          'l', 'g', 'c', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                          'v', 'p', 'ü', 'ä', 'ß', 'ö', 'j', 'y', 'x', 'q',
                          '1', '2', '3', '4', '0', '5', '6', '9', '8', '7')

# most common letters, as shown in Google Books German n-grams, among letters
# (A-Z, a-z, umlauted vowels & eszett) & digits 0-9
MOST_COMMON_LETTERS_DE_LC = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                             'l', 'c', 'g', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                             'v', 'p', 'ü', 'ä', 'S', 'A', 'D', 'B', 'E', 'G',
                             'M', 'ß', 'V', 'K', 'ö', 'W', 'F', 'P', 'R', 'I',
                             'H', 'L', 'T', 'N', 'Z', 'y', 'U', 'j', 'J', 'O',
                             'C', 'x', 'q', 'Ü', 'Q', 'X', 'Ä', 'Ö', '1', '2',
                             'Y', '3', '4', '0', '5', '6', '9', '8', '7')


def occurrence_fingerprint(word, n_bits=16,
                           most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence fingerprint.

    Based on the occurrence fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence fingerprint
    :rtype: int
    """
    word = set(word)
    fingerprint = 0

    # set one bit per common letter: 1 if the letter occurs in the word
    for letter in most_common:
        if letter in word:
            fingerprint += 1
        n_bits -= 1
        if n_bits:
            fingerprint <<= 1
        else:
            break

    # pad the low-order bits if most_common was shorter than n_bits
    if n_bits:
        fingerprint <<= n_bits

    return fingerprint
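
# Worked example (illustrative; not part of the original module, traced from
# the logic above). For 'hat' with the default 16-bit CG alphabet
# (e t a o i n s h r d l c u m w f), only the t, a, & h bits are set,
# giving 0b0110000100000000:
#
#     >>> occurrence_fingerprint('hat')
#     24832
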
def occurrence_halved_fingerprint(word, n_bits=16,
                                  most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence halved fingerprint.

    Based on the occurrence halved fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence halved fingerprint
    :rtype: int
    """
    if n_bits % 2:
        n_bits += 1

    # split the word into two halves and check occurrence in each separately
    w_len = len(word)//2
    w_1 = set(word[:w_len])
    w_2 = set(word[w_len:])
    fingerprint = 0

    # set two bits per common letter: one for each half of the word
    for letter in most_common:
        if letter in w_1:
            fingerprint += 1
        fingerprint <<= 1
        if letter in w_2:
            fingerprint += 1
        n_bits -= 2
        if n_bits:
            fingerprint <<= 1
        else:
            break

    if n_bits:
        fingerprint <<= n_bits

    return fingerprint
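
# Worked example (illustrative; not part of the original module, traced from
# the logic above). 'hat' splits into 'h' & 'at', so the two-bit pairs for
# (e t a o i n s h) are 00 01 01 00 00 00 00 10, i.e. 0b0001010000000010:
#
#     >>> occurrence_halved_fingerprint('hat')
#     5122
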
def count_fingerprint(word, n_bits=16,
                      most_common=MOST_COMMON_LETTERS_CG):
    """Return the count fingerprint.

    Based on the count fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the count fingerprint
    :rtype: int
    """
    if n_bits % 2:
        n_bits += 1

    word = Counter(word)
    fingerprint = 0

    # store two bits per common letter: the low two bits of its count
    for letter in most_common:
        fingerprint += (word[letter] & 3)
        n_bits -= 2
        if n_bits:
            fingerprint <<= 2
        else:
            break

    if n_bits:
        fingerprint <<= n_bits

    return fingerprint
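
# Worked example (illustrative; not part of the original module, traced from
# the logic above). Each of t, a, & h occurs once in 'hat', so the two-bit
# fields for (e t a o i n s h) are 00 01 01 00 00 00 00 01, i.e.
# 0b0001010000000001:
#
#     >>> count_fingerprint('hat')
#     5121
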
def position_fingerprint(word, n_bits=16,
                         most_common=MOST_COMMON_LETTERS_CG,
                         bits_per_letter=3):
    """Return the position fingerprint.

    Based on the position fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :param bits_per_letter: the number of bits to assign per letter position
    :return: the position fingerprint
    :rtype: int
    """
    # record each common letter's first position in the word, saturated at
    # the largest value that bits_per_letter can hold
    position = {}
    for pos, letter in enumerate(word):
        if letter not in position and letter in most_common:
            position[letter] = min(pos, 2**bits_per_letter-1)

    fingerprint = 0
    for letter in most_common:
        if letter in position:
            fingerprint += min(position[letter], 2**n_bits-1)
        n_bits -= bits_per_letter
        if n_bits > 0:
            fingerprint <<= min(bits_per_letter, n_bits)
        else:
            break

    if n_bits > 0:
        fingerprint <<= n_bits

    return fingerprint
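
# Worked example (illustrative; not part of the original module, traced from
# the logic above). In 'hat' the first positions are h=0, a=1, t=2, so the
# 16-bit result packs three-bit fields for e, t, a, o, i (the last field is
# truncated to fit), giving 0b0000100010000000:
#
#     >>> position_fingerprint('hat')
#     2176
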
def mean_pairwise_similarity(collection, metric=sim,
                             meanfunc=hmean, symmetric=False):
    """Calculate the mean pairwise similarity of a collection of strings.

    Takes the mean of the pairwise similarity between each member of a
    collection, optionally in both directions (for asymmetric similarity
    metrics).

    :param list collection: a collection of terms or a string that can be
        split
    :param function metric: a similarity metric function
    :param function meanfunc: a mean function that takes a list of values
        and returns a float
    :param bool symmetric: set to True if all pairwise similarities should be
        calculated in both directions
    :returns: the mean pairwise similarity of a collection of strings
    :rtype: float

    >>> mean_pairwise_similarity(['Christopher', 'Kristof', 'Christobal'])
    0.51980198019801982
    >>> mean_pairwise_similarity(['Niall', 'Neal', 'Neil'])
    0.54545454545454541
    """
    if hasattr(collection, 'split'):
        collection = collection.split()
    if not hasattr(collection, '__iter__'):
        raise ValueError('collection is neither a string nor iterable type')
    elif len(collection) < 2:
        raise ValueError('collection has fewer than two members')

    collection = list(collection)

    pairwise_values = []

    for i in range(len(collection)):
        for j in range(i+1, len(collection)):
            pairwise_values.append(metric(collection[i], collection[j]))
            if symmetric:
                pairwise_values.append(metric(collection[j], collection[i]))

    if not callable(meanfunc):
        raise ValueError('meanfunc must be a function')
    return meanfunc(pairwise_values)
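
# Usage sketch (illustrative; not part of the original module). A plain
# string is split on whitespace before the pairwise comparisons, so this
# call is equivalent to the ['Niall', 'Neal', 'Neil'] doctest above:
#
#     >>> mean_pairwise_similarity('Niall Neal Neil')
#     0.54545454545454541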