Completed
Pull Request — master (#1242)
by Lasse
02:08
created

bears.c_languages.codeclone_detection.get_difference()   F

Complexity

Conditions 9

Size

Total Lines 46

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 9
dl 0
loc 46
rs 3.1578
1
import os
2
import math
3
import copy
4
from munkres import Munkres
5
# Instantiate globally since this class is holding stateless public methods.
6
munkres = Munkres()
7
8
from coalib.collecting.Collectors import collect_dirs
9
10
11
def exclude_function(count_matrix):
12
    """
13
    Determines heuristically whether or not it makes sense for clone
14
    detection to take this function into account.
15
16
    Applied heuristics:
17
     * Functions with only count vectors with a sum of all unweighted elements
18
       of lower then 10 are very likely only declarations or empty and to be
19
       ignored. (Constants are not taken into account.)
20
21
    :param count_matrix: A dictionary with count vectors representing all
22
                         variables for a function.
23
    :return:             True if the function is useless for evaluation.
24
    """
25
    var_count = [cv.name.startswith("#")
26
                 for cv in count_matrix.values()].count(False)
27
    variable_sum = sum(0 if cv.name.startswith("#") else sum(cv.unweighted)
28
                       for cv in count_matrix.values())
29
    return (all((cv.name.startswith("#") or sum(cv.unweighted) < 10)
30
                for cv in count_matrix.values()) or
31
            variable_sum < 11 or
32
            var_count < 2)
33
34
35
def get_count_matrices(count_vector_creator,
36
                       filenames,
37
                       progress_callback,
38
                       base_path,
39
                       extra_include_paths):
40
    """
41
    Retrieves matrices holding count vectors for all variables for all
42
    functions in the given file.
43
44
    :param count_vector_creator: A object with a get_vectors_for_file method
45
                                 taking a filename as argument.
46
    :param filenames:            The files to create count vectors for.
47
    :param progress_callback:    A function with one float argument which is
48
                                 called after processing each file with the
49
                                 progress percentage (float) as an argument.
50
    :param extra_include_paths:  A list containing additional include paths.
51
    :return:                     A dict holding a tuple of (file, line,
52
                                 function) as key and as value a dict with
53
                                 variable names as key and count vector
54
                                 objects as value.
55
    """
56
    result = {}
57
    maxlen = len(filenames)
58
    include_paths = collect_dirs([os.path.dirname(base_path) + "/**"])
59
    include_paths += extra_include_paths
60
61
    for i, filename in enumerate(filenames):
62
        progress_callback(100*(i/maxlen))
63
        count_dict = count_vector_creator.get_vectors_for_file(filename,
64
                                                               include_paths)
65
        for function in count_dict:
66
            if not exclude_function(count_dict[function]):
67
                result[(filename,
68
                        function[0],
69
                        function[1])] = count_dict[function]
70
71
    return result
72
73
74
def pad_count_vectors(cm1, cm2):
75
    """
76
    Pads the smaller count matrix with zeroed count vectors.
77
78
    :param cm1: First cm. Will not be modified.
79
    :param cm2: Second cm. Will not be modified.
80
    :return:    A tuple holding two cms.
81
    """
82
    cm1len = len(cm1)
83
    cm2len = len(cm2)
84
    if cm1len != cm2len:
85
        # Copy the smaller matrix as it will be altered
86
        if cm1len > cm2len:
87
            cm2 = copy.copy(cm2)
88
        else:  # make cm1 the larger (or equal) one
89
            tmp = cm2
90
            cm2 = copy.copy(cm1)
91
            cm1 = tmp
92
93
        any_count_vector = list(cm1.values())[0]
94
        # Fill up smaller count matrix with zero vectors. This way no
95
        # padding is needed later and if count vectors are zero on both
96
        # side, the difference is zero too which wouldn't be taken into
97
        # account with simple padding of ones.
98
        for i in range(len(cm1) - len(cm2)):
99
            cm2[i] = any_count_vector.create_null_vector(i)
100
101
    return cm1, cm2
102
103
104
def relative_difference(difference, maxabs):
105
    if maxabs == 0:
106
        return 1
107
    return difference/maxabs
108
109
110
def average(lst):
111
    return sum(lst)/len(lst)
112
113
114
def get_difference(matching_iterator,
115
                   average_calculation,
116
                   poly_postprocessing,
117
                   exp_postprocessing):
118
    """
119
    Retrieves the difference value for the matched function represented by the
120
    given matches.
121
122
    Postprocessing may be done because small functions are less likely to be
123
    clones at the same difference value than big functions which may provide a
124
    better refactoring opportunity for the user.
125
126
    :return:                    A difference value between 0 and 1.
127
128
    :param matching_iterator:   A list holding tuples of an absolute difference
129
                                value and a value to normalize the difference
130
                                into a range of [0, 1].
131
    :param average_calculation: If set to true this function will take the
132
                                average of all variable differences as the
133
                                difference, else it will normalize the
134
                                function as a whole and thus weighting in
135
                                variables dependent on their size.
136
    :param poly_postprocessing: If set to true, the difference value of big
137
                                function pairs will be reduced using a
138
                                polynomial approach.
139
    :param exp_postprocessing:  If set to true, the difference value of big
140
                                function pairs will be reduced using an
141
                                exponential approach.
142
    """
143
    norm_sum = sum(norm for diff, norm in matching_iterator)
144
145
    if average_calculation:
146
        difference = average([relative_difference(diff, norm)
147
                              for diff, norm in matching_iterator])
148
    else:
149
        difference = relative_difference(
150
            sum(diff for diff, norm in matching_iterator),
151
            norm_sum)
152
153
    if poly_postprocessing and norm_sum != 0:
154
        # This function starts at 1 and converges to .75 for norm_sum -> inf
155
        difference *= (3*norm_sum+1)/(4*norm_sum)
156
    if exp_postprocessing and norm_sum != 0:
157
        difference *= math.exp(1-norm_sum)/4 + 0.75
158
159
    return difference
160
161
162
def compare_functions(cm1,
163
                      cm2,
164
                      average_calculation=False,
165
                      poly_postprocessing=True,
166
                      exp_postprocessing=False):
167
    """
168
    Compares the functions represented by the given count matrices.
169
170
    Postprocessing may be done because small functions are less likely to be
171
    clones at the same difference value than big functions which may provide a
172
    better refactoring opportunity for the user.
173
174
    :param cm1:                 Count vector dict for the first function.
175
    :param cm2:                 Count vector dict for the second function.
176
    :param average_calculation: If set to true the difference calculation
177
                                function will take the average of all variable
178
                                differences as the difference, else it will
179
                                normalize the function as a whole and thus
180
                                weighting in variables dependent on their size.
181
    :param poly_postprocessing: If set to true, the difference value of big
182
                                function pairs will be reduced using a
183
                                polynomial approach.
184
    :param exp_postprocessing:  If set to true, the difference value of big
185
                                function pairs will be reduced using an
186
                                exponential approach.
187
    :return:                    The difference between these functions, 0 is
188
                                identical and 1 is not similar at all.
189
    """
190
    assert 0 not in (len(cm1), len(cm2))
191
192
    cm1, cm2 = pad_count_vectors(cm1, cm2)
193
194
    diff_table = [(cv1,
195
                   [(cv2, cv1.difference(cv2), cv1.maxabs(cv2))
196
                    for cv2 in cm2.values()])
197
                  for cv1 in cm1.values()]
198
199
    # The cost matrix holds the difference between the two variables i and
200
    # j in the i/j field. This is a representation of a bipartite weighted
201
    # graph with nodes representing the first function on the one side
202
    # (rows) and the nodes representing the second function on the other
203
    #  side (columns). The fields in the matrix are the weighted nodes
204
    # connecting each element from one side to the other.
205
    cost_matrix = [[relative_difference(difference, maxabs)
206
                    for cv2, difference, maxabs in lst]
207
                   for cv1, lst in diff_table]
208
209
    # The munkres algorithm will calculate a matching such that the sum of
210
    # the taken fields is minimal. It thus will associate each variable
211
    # from one function to one on the other function.
212
    matching = munkres.compute(cost_matrix)
213
214
    return get_difference([(diff_table[x][1][y][1], diff_table[x][1][y][2])
215
                           for x, y in matching],
216
                          average_calculation,
217
                          poly_postprocessing,
218
                          exp_postprocessing)
219