1
|
|
|
import os |
2
|
|
|
import math |
3
|
|
|
import copy |
4
|
|
|
from munkres import Munkres |
5
|
|
|
# Instantiate globally since this class is holding stateless public methods. |
6
|
|
|
munkres = Munkres() |
7
|
|
|
|
8
|
|
|
from coalib.collecting.Collectors import collect_dirs |
9
|
|
|
|
10
|
|
|
|
11
|
|
|
def exclude_function(count_matrix): |
12
|
|
|
""" |
13
|
|
|
Determines heuristically whether or not it makes sense for clone |
14
|
|
|
detection to take this function into account. |
15
|
|
|
|
16
|
|
|
Applied heuristics: |
17
|
|
|
* Functions with only count vectors with a sum of all unweighted elements |
18
|
|
|
of lower then 10 are very likely only declarations or empty and to be |
19
|
|
|
ignored. (Constants are not taken into account.) |
20
|
|
|
|
21
|
|
|
:param count_matrix: A dictionary with count vectors representing all |
22
|
|
|
variables for a function. |
23
|
|
|
:return: True if the function is useless for evaluation. |
24
|
|
|
""" |
25
|
|
|
var_count = [cv.name.startswith("#") |
26
|
|
|
for cv in count_matrix.values()].count(False) |
27
|
|
|
variable_sum = sum(0 if cv.name.startswith("#") else sum(cv.unweighted) |
28
|
|
|
for cv in count_matrix.values()) |
29
|
|
|
return (all((cv.name.startswith("#") or sum(cv.unweighted) < 10) |
30
|
|
|
for cv in count_matrix.values()) or |
31
|
|
|
variable_sum < 11 or |
32
|
|
|
var_count < 2) |
33
|
|
|
|
34
|
|
|
|
35
|
|
|
def get_count_matrices(count_vector_creator, |
36
|
|
|
filenames, |
37
|
|
|
progress_callback, |
38
|
|
|
base_path, |
39
|
|
|
extra_include_paths): |
40
|
|
|
""" |
41
|
|
|
Retrieves matrices holding count vectors for all variables for all |
42
|
|
|
functions in the given file. |
43
|
|
|
|
44
|
|
|
:param count_vector_creator: A object with a get_vectors_for_file method |
45
|
|
|
taking a filename as argument. |
46
|
|
|
:param filenames: The files to create count vectors for. |
47
|
|
|
:param progress_callback: A function with one float argument which is |
48
|
|
|
called after processing each file with the |
49
|
|
|
progress percentage (float) as an argument. |
50
|
|
|
:param extra_include_paths: A list containing additional include paths. |
51
|
|
|
:return: A dict holding a tuple of (file, line, |
52
|
|
|
function) as key and as value a dict with |
53
|
|
|
variable names as key and count vector |
54
|
|
|
objects as value. |
55
|
|
|
""" |
56
|
|
|
result = {} |
57
|
|
|
maxlen = len(filenames) |
58
|
|
|
include_paths = collect_dirs([os.path.dirname(base_path) + "/**"]) |
59
|
|
|
include_paths += extra_include_paths |
60
|
|
|
|
61
|
|
|
for i, filename in enumerate(filenames): |
62
|
|
|
progress_callback(100*(i/maxlen)) |
63
|
|
|
count_dict = count_vector_creator.get_vectors_for_file(filename, |
64
|
|
|
include_paths) |
65
|
|
|
for function in count_dict: |
66
|
|
|
if not exclude_function(count_dict[function]): |
67
|
|
|
result[(filename, |
68
|
|
|
function[0], |
69
|
|
|
function[1])] = count_dict[function] |
70
|
|
|
|
71
|
|
|
return result |
72
|
|
|
|
73
|
|
|
|
74
|
|
|
def pad_count_vectors(cm1, cm2): |
75
|
|
|
""" |
76
|
|
|
Pads the smaller count matrix with zeroed count vectors. |
77
|
|
|
|
78
|
|
|
:param cm1: First cm. Will not be modified. |
79
|
|
|
:param cm2: Second cm. Will not be modified. |
80
|
|
|
:return: A tuple holding two cms. |
81
|
|
|
""" |
82
|
|
|
cm1len = len(cm1) |
83
|
|
|
cm2len = len(cm2) |
84
|
|
|
if cm1len != cm2len: |
85
|
|
|
# Copy the smaller matrix as it will be altered |
86
|
|
|
if cm1len > cm2len: |
87
|
|
|
cm2 = copy.copy(cm2) |
88
|
|
|
else: # make cm1 the larger (or equal) one |
89
|
|
|
tmp = cm2 |
90
|
|
|
cm2 = copy.copy(cm1) |
91
|
|
|
cm1 = tmp |
92
|
|
|
|
93
|
|
|
any_count_vector = list(cm1.values())[0] |
94
|
|
|
# Fill up smaller count matrix with zero vectors. This way no |
95
|
|
|
# padding is needed later and if count vectors are zero on both |
96
|
|
|
# side, the difference is zero too which wouldn't be taken into |
97
|
|
|
# account with simple padding of ones. |
98
|
|
|
for i in range(len(cm1) - len(cm2)): |
99
|
|
|
cm2[i] = any_count_vector.create_null_vector(i) |
100
|
|
|
|
101
|
|
|
return cm1, cm2 |
102
|
|
|
|
103
|
|
|
|
104
|
|
|
def relative_difference(difference, maxabs): |
105
|
|
|
if maxabs == 0: |
106
|
|
|
return 1 |
107
|
|
|
return difference/maxabs |
108
|
|
|
|
109
|
|
|
|
110
|
|
|
def average(lst): |
111
|
|
|
return sum(lst)/len(lst) |
112
|
|
|
|
113
|
|
|
|
114
|
|
|
def get_difference(matching_iterator, |
115
|
|
|
average_calculation, |
116
|
|
|
poly_postprocessing, |
117
|
|
|
exp_postprocessing): |
118
|
|
|
""" |
119
|
|
|
Retrieves the difference value for the matched function represented by the |
120
|
|
|
given matches. |
121
|
|
|
|
122
|
|
|
Postprocessing may be done because small functions are less likely to be |
123
|
|
|
clones at the same difference value than big functions which may provide a |
124
|
|
|
better refactoring opportunity for the user. |
125
|
|
|
|
126
|
|
|
:return: A difference value between 0 and 1. |
127
|
|
|
|
128
|
|
|
:param matching_iterator: A list holding tuples of an absolute difference |
129
|
|
|
value and a value to normalize the difference |
130
|
|
|
into a range of [0, 1]. |
131
|
|
|
:param average_calculation: If set to true this function will take the |
132
|
|
|
average of all variable differences as the |
133
|
|
|
difference, else it will normalize the |
134
|
|
|
function as a whole and thus weighting in |
135
|
|
|
variables dependent on their size. |
136
|
|
|
:param poly_postprocessing: If set to true, the difference value of big |
137
|
|
|
function pairs will be reduced using a |
138
|
|
|
polynomial approach. |
139
|
|
|
:param exp_postprocessing: If set to true, the difference value of big |
140
|
|
|
function pairs will be reduced using an |
141
|
|
|
exponential approach. |
142
|
|
|
""" |
143
|
|
|
norm_sum = sum(norm for diff, norm in matching_iterator) |
144
|
|
|
|
145
|
|
|
if average_calculation: |
146
|
|
|
difference = average([relative_difference(diff, norm) |
147
|
|
|
for diff, norm in matching_iterator]) |
148
|
|
|
else: |
149
|
|
|
difference = relative_difference( |
150
|
|
|
sum(diff for diff, norm in matching_iterator), |
151
|
|
|
norm_sum) |
152
|
|
|
|
153
|
|
|
if poly_postprocessing and norm_sum != 0: |
154
|
|
|
# This function starts at 1 and converges to .75 for norm_sum -> inf |
155
|
|
|
difference *= (3*norm_sum+1)/(4*norm_sum) |
156
|
|
|
if exp_postprocessing and norm_sum != 0: |
157
|
|
|
difference *= math.exp(1-norm_sum)/4 + 0.75 |
158
|
|
|
|
159
|
|
|
return difference |
160
|
|
|
|
161
|
|
|
|
162
|
|
|
def compare_functions(cm1, |
163
|
|
|
cm2, |
164
|
|
|
average_calculation=False, |
165
|
|
|
poly_postprocessing=True, |
166
|
|
|
exp_postprocessing=False): |
167
|
|
|
""" |
168
|
|
|
Compares the functions represented by the given count matrices. |
169
|
|
|
|
170
|
|
|
Postprocessing may be done because small functions are less likely to be |
171
|
|
|
clones at the same difference value than big functions which may provide a |
172
|
|
|
better refactoring opportunity for the user. |
173
|
|
|
|
174
|
|
|
:param cm1: Count vector dict for the first function. |
175
|
|
|
:param cm2: Count vector dict for the second function. |
176
|
|
|
:param average_calculation: If set to true the difference calculation |
177
|
|
|
function will take the average of all variable |
178
|
|
|
differences as the difference, else it will |
179
|
|
|
normalize the function as a whole and thus |
180
|
|
|
weighting in variables dependent on their size. |
181
|
|
|
:param poly_postprocessing: If set to true, the difference value of big |
182
|
|
|
function pairs will be reduced using a |
183
|
|
|
polynomial approach. |
184
|
|
|
:param exp_postprocessing: If set to true, the difference value of big |
185
|
|
|
function pairs will be reduced using an |
186
|
|
|
exponential approach. |
187
|
|
|
:return: The difference between these functions, 0 is |
188
|
|
|
identical and 1 is not similar at all. |
189
|
|
|
""" |
190
|
|
|
assert 0 not in (len(cm1), len(cm2)) |
191
|
|
|
|
192
|
|
|
cm1, cm2 = pad_count_vectors(cm1, cm2) |
193
|
|
|
|
194
|
|
|
diff_table = [(cv1, |
195
|
|
|
[(cv2, cv1.difference(cv2), cv1.maxabs(cv2)) |
196
|
|
|
for cv2 in cm2.values()]) |
197
|
|
|
for cv1 in cm1.values()] |
198
|
|
|
|
199
|
|
|
# The cost matrix holds the difference between the two variables i and |
200
|
|
|
# j in the i/j field. This is a representation of a bipartite weighted |
201
|
|
|
# graph with nodes representing the first function on the one side |
202
|
|
|
# (rows) and the nodes representing the second function on the other |
203
|
|
|
# side (columns). The fields in the matrix are the weighted nodes |
204
|
|
|
# connecting each element from one side to the other. |
205
|
|
|
cost_matrix = [[relative_difference(difference, maxabs) |
206
|
|
|
for cv2, difference, maxabs in lst] |
207
|
|
|
for cv1, lst in diff_table] |
208
|
|
|
|
209
|
|
|
# The munkres algorithm will calculate a matching such that the sum of |
210
|
|
|
# the taken fields is minimal. It thus will associate each variable |
211
|
|
|
# from one function to one on the other function. |
212
|
|
|
matching = munkres.compute(cost_matrix) |
213
|
|
|
|
214
|
|
|
return get_difference([(diff_table[x][1][y][1], diff_table[x][1][y][2]) |
215
|
|
|
for x, y in matching], |
216
|
|
|
average_calculation, |
217
|
|
|
poly_postprocessing, |
218
|
|
|
exp_postprocessing) |
219
|
|
|
|