import copy
import math
import os

from munkres import Munkres

from coalib.collecting.Collectors import collect_dirs

# Instantiate globally since this class is holding stateless public methods;
# the single instance is shared by every comparison done in this module.
munkres = Munkres()


def exclude_function(count_matrix):
    """
    Determines heuristically whether or not it makes sense for clone
    detection to take this function into account.

    Count vectors whose name starts with "#" (constants) are not taken into
    account. The function is excluded if at least one of the following
    holds for the remaining count vectors:

     * Every count vector has a sum of all unweighted elements lower than
       10. Such functions are very likely only declarations or empty.
     * The unweighted elements of all count vectors together sum up to less
       than 11.
     * There are fewer than two count vectors.

    :param count_matrix: A dictionary with count vectors representing all
                         variables for a function.
    :return:             True if the function is useless for evaluation.
    """
    # One unweighted sum per non-constant count vector. Constants are
    # skipped before their `unweighted` attribute is touched.
    variable_sums = [sum(cv.unweighted)
                     for cv in count_matrix.values()
                     if not cv.name.startswith("#")]

    return (all(var_sum < 10 for var_sum in variable_sums) or
            sum(variable_sums) < 11 or
            len(variable_sums) < 2)


def get_count_matrices(count_vector_creator,
                       filenames,
                       progress_callback,
                       base_path,
                       extra_include_paths):
    """
    Retrieves matrices holding count vectors for all variables for all
    functions in the given files.

    Functions for which exclude_function returns True are left out of the
    result.

    :param count_vector_creator: An object with a get_vectors_for_file method
                                 taking a filename and the include paths as
                                 arguments.
    :param filenames:            The files to create count vectors for.
    :param progress_callback:    A function with one float argument which is
                                 called before processing each file with the
                                 progress percentage (float) as an argument.
                                 The reported value starts at 0 and stays
                                 below 100.
    :param base_path:            The glob "<dirname of base_path>/**" is
                                 passed to collect_dirs to obtain the include
                                 paths.
    :param extra_include_paths:  A list containing additional include paths.
    :return:                     A dict holding a tuple of (file, line,
                                 function) as key and as value a dict with
                                 variable names as key and count vector
                                 objects as value.
    """
    include_paths = collect_dirs([os.path.dirname(base_path) + "/**"])
    include_paths += extra_include_paths

    result = {}
    file_count = len(filenames)

    for index, filename in enumerate(filenames):
        progress_callback(100*(index/file_count))
        count_dict = count_vector_creator.get_vectors_for_file(filename,
                                                               include_paths)
        for function in count_dict:
            count_matrix = count_dict[function]
            if exclude_function(count_matrix):
                continue

            result[(filename, function[0], function[1])] = count_matrix

    return result


def pad_count_vectors(cm1, cm2):
    """
    Pads the smaller count matrix with zeroed count vectors.

    If both count matrices have the same size (including both being empty)
    they are returned as they are. Otherwise a shallow copy of the smaller
    one is filled up with null vectors, created by calling
    create_null_vector on a count vector of the bigger one, until both have
    the same size. The null vectors are stored under the smallest
    non-negative integers that are not used as keys yet.

    :param cm1: First cm. Will not be modified.
    :param cm2: Second cm. Will not be modified.
    :return:    A tuple holding two cms. If the sizes differ, the bigger cm
                comes first and the padded copy of the smaller one second.
    """
    if len(cm1) == len(cm2):
        # Nothing to pad. Returning early also avoids looking for a
        # template count vector in an empty matrix.
        return cm1, cm2

    if len(cm1) < len(cm2):  # make cm1 the larger one
        cm1, cm2 = cm2, cm1

    # Copy the smaller matrix as it will be altered
    cm2 = copy.copy(cm2)
    any_count_vector = next(iter(cm1.values()))

    # Fill up smaller count matrix with zero vectors. This way no
    # padding is needed later and if count vectors are zero on both
    # sides, the difference is zero too which wouldn't be taken into
    # account with simple padding of ones.
    key = 0
    while len(cm2) < len(cm1):
        # Skip keys that are taken already so no existing count vector
        # gets overwritten.
        if key not in cm2:
            cm2[key] = any_count_vector.create_null_vector(key)
        key += 1

    return cm1, cm2


def relative_difference(difference, maxabs):
    """
    Normalizes the given difference with the given maximum value.

    :param difference: The absolute difference value.
    :param maxabs:     The value to divide the difference by.
    :return:           The difference divided by maxabs. If maxabs is 0 no
                       division is possible and 1 is returned instead.
    """
    if maxabs == 0:
        return 1

    return difference/maxabs


def average(lst):
    """
    Calculates the arithmetic mean of the given values.

    :param lst: A non-empty list of numbers.
    :return:    The sum of all elements divided by the number of elements.
    :raises ZeroDivisionError: If lst is empty.
    """
    return sum(lst)/len(lst)


def get_difference(matching_iterator,
                   average_calculation,
                   poly_postprocessing,
                   exp_postprocessing):
    """
    Retrieves the difference value for the matched function represented by the
    given matches.

    Postprocessing may be done because small functions are less likely to be
    clones at the same difference value than big functions which may provide a
    better refactoring opportunity for the user. Both kinds of postprocessing
    are skipped if the normalization values sum up to 0.

    :param matching_iterator:   An iterable holding tuples of an absolute
                                difference value and a value to normalize the
                                difference into a range of [0, 1]. It is
                                consumed only once, so a generator is fine
                                too. It has to yield at least one tuple if
                                average_calculation is set.
    :param average_calculation: If set to true this function will take the
                                average of all variable differences as the
                                difference, else it will normalize the
                                function as a whole and thus weighting in
                                variables dependent on their size.
    :param poly_postprocessing: If set to true, the difference value of big
                                function pairs will be reduced using a
                                polynomial approach.
    :param exp_postprocessing:  If set to true, the difference value of big
                                function pairs will be reduced using an
                                exponential approach.
    :return:                    A difference value between 0 and 1.
    """
    # The matches are traversed more than once below. Materializing them
    # first keeps a one-shot iterator from being exhausted after the first
    # pass.
    matches = list(matching_iterator)
    norm_sum = sum(norm for diff, norm in matches)

    if average_calculation:
        difference = average([relative_difference(diff, norm)
                              for diff, norm in matches])
    else:
        difference = relative_difference(
            sum(diff for diff, norm in matches),
            norm_sum)

    if poly_postprocessing and norm_sum != 0:
        # This factor is 1 for norm_sum = 1 and converges to .75 for
        # norm_sum -> inf
        difference *= (3*norm_sum+1)/(4*norm_sum)
    if exp_postprocessing and norm_sum != 0:
        # This factor is 1 for norm_sum = 1 as well and also converges to
        # .75 for norm_sum -> inf
        difference *= math.exp(1-norm_sum)/4 + 0.75

    return difference


def compare_functions(cm1,
                      cm2,
                      average_calculation=False,
                      poly_postprocessing=True,
                      exp_postprocessing=False):
    """
    Compares the functions represented by the given count matrices.

    Postprocessing may be done because small functions are less likely to be
    clones at the same difference value than big functions which may provide a
    better refactoring opportunity for the user.

    :param cm1:                 Count vector dict for the first function.
                                Must not be empty.
    :param cm2:                 Count vector dict for the second function.
                                Must not be empty.
    :param average_calculation: If set to true the difference calculation
                                function will take the average of all variable
                                differences as the difference, else it will
                                normalize the function as a whole and thus
                                weighting in variables dependent on their size.
    :param poly_postprocessing: If set to true, the difference value of big
                                function pairs will be reduced using a
                                polynomial approach.
    :param exp_postprocessing:  If set to true, the difference value of big
                                function pairs will be reduced using an
                                exponential approach.
    :return:                    The difference between these functions, 0 is
                                identical and 1 is not similar at all.
    :raises AssertionError:     If one of the count matrices is empty (only
                                while assertions are enabled).
    """
    assert 0 not in (len(cm1), len(cm2))

    cm1, cm2 = pad_count_vectors(cm1, cm2)

    # diff_table[x][y] holds the tuple (absolute difference, normalization
    # value) for count vector x of the first and count vector y of the
    # second function.
    diff_table = [[(cv1.difference(cv2), cv1.maxabs(cv2))
                   for cv2 in cm2.values()]
                  for cv1 in cm1.values()]

    # The cost matrix holds the difference between the two variables i and
    # j in the i/j field. This is a representation of a bipartite weighted
    # graph with nodes representing the first function on the one side
    # (rows) and the nodes representing the second function on the other
    # side (columns). The fields in the matrix are the weighted edges
    # connecting each element from one side to the other.
    cost_matrix = [[relative_difference(difference, maxabs)
                    for difference, maxabs in row]
                   for row in diff_table]

    # The munkres algorithm will calculate a matching such that the sum of
    # the taken fields is minimal. It thus will associate each variable
    # from one function to one on the other function.
    matching = munkres.compute(cost_matrix)

    return get_difference([diff_table[x][y] for x, y in matching],
                          average_calculation,
                          poly_postprocessing,
                          exp_postprocessing)