|
1
|
|
|
import copy |
|
2
|
|
|
from difflib import SequenceMatcher |
|
3
|
|
|
|
|
4
|
|
|
from coalib.results.Diff import ConflictError, Diff |
|
5
|
|
|
from coalib.results.SourceRange import SourceRange |
|
6
|
|
|
|
|
7
|
|
|
|
|
8
|
|
|
def filter_results(original_file_dict,
                   modified_file_dict,
                   original_results,
                   modified_results):
    """
    Picks those modified results that have no counterpart among the original
    results once the changes between the files are taken into account.

    Both file dicts are passed through ``ensure_files_present`` first, which
    may add missing keys to them in place.

    :param original_file_dict: Dict of lists of file contents before changes
    :param modified_file_dict: Dict of lists of file contents after changes
    :param original_results:   List of results of the old files
    :param modified_results:   List of results of the new files
    :return:                   List of results from new files that are unique
                               from all those that existed in the old changes.
                               It is filled while walking ``modified_results``
                               from back to front.
    """
    renamed_files = ensure_files_present(original_file_dict,
                                         modified_file_dict)

    # diffs_dict[name] is the diff between an original file and its
    # (possibly renamed) counterpart among the modified files.
    diffs_dict = {
        name: Diff.from_string_arrays(
            original_file_dict[name],
            modified_file_dict[renamed_files[name]
                               if name in renamed_files else name])
        for name in original_file_dict}

    # For every result: per-file diffs describing the removal of the code
    # that result points at.
    original_removals = remove_result_ranges_diffs(original_results,
                                                   original_file_dict)
    modified_removals = remove_result_ranges_diffs(modified_results,
                                                   modified_file_dict)

    unique_results = []
    for new_result in reversed(modified_results):
        # A single original result that matches completely is enough to
        # consider the modified result as already known.
        already_known = any(
            basics_match(old_result, new_result) and
            source_ranges_match(original_file_dict,
                                diffs_dict,
                                original_removals[old_result],
                                modified_removals[new_result],
                                renamed_files)
            for old_result in original_results)
        if not already_known:
            unique_results.append(new_result)

    return unique_results
|
64
|
|
|
|
|
65
|
|
|
|
|
66
|
|
|
def basics_match(original_result, modified_result):
    """
    Compares the basic properties of two results. These are:
        * origin
        * message
        * severity
        * debug_msg

    :param original_result: A result of the old files
    :param modified_result: A result of the new files
    :return:                ``True`` if all of the properties are equal,
                            ``False`` as soon as one of them differs
    """
    for member in ('origin', 'message', 'severity', 'debug_msg'):
        old_value = getattr(original_result, member)
        new_value = getattr(modified_result, member)
        if not (old_value == new_value):
            return False

    return True
|
82
|
|
|
|
|
83
|
|
|
|
|
84
|
|
|
def source_ranges_match(original_file_dict,
                        diff_dict,
                        original_result_diff_dict,
                        modified_result_diff_dict,
                        renamed_files):
    """
    Decides whether two results point at the same code, i.e. whether their
    SourceRanges match across the file changes.

    For every original file, the file diff combined with the removal diff of
    the original result has to yield the same content as the removal diff of
    the modified result.

    :param original_file_dict:        Dict of lists of file contents before
                                      changes
    :param diff_dict:                 Dict of diffs describing the changes per
                                      file
    :param original_result_diff_dict: diff for each file for the original
                                      result
    :param modified_result_diff_dict: diff for each file for the modified
                                      result
    :param renamed_files:             A dictionary containing file renamings
                                      across runs
    :return:                          Boolean value whether the SourceRanges
                                      match
    """
    for file_name in original_file_dict:
        # Look the modified side up under the new name if the file was renamed.
        mod_file_name = (renamed_files[file_name]
                         if file_name in renamed_files
                         else file_name)

        try:
            # Adding the diffs fails if the affected range of the result
            # itself got modified.
            original_total_diff = (diff_dict[file_name] +
                                   original_result_diff_dict[file_name])
        except ConflictError:
            return False

        # Left:  original file with the file diff and the removal applied.
        # Right: modified file with its removal applied.
        if (original_total_diff.modified !=
                modified_result_diff_dict[mod_file_name].modified):
            return False

    return True
|
117
|
|
|
|
|
118
|
|
|
|
|
119
|
|
|
def remove_range(file_contents, source_range):
    """
    Removes the chars covered by the source range from the file.

    The given list is left untouched; the result is a new list. A line that
    is left as an empty string at the start or the end of the removed range
    is dropped from the result altogether.

    :param file_contents: list of lines in the file
    :param source_range:  Source Range; it is expanded against
                          ``file_contents`` before use
    :return:              list of file contents with the specified chars
                          removed
    """
    if not file_contents:
        return []

    newfile = list(file_contents)
    source_range = source_range.expand(file_contents)
    start, end = source_range.start, source_range.end

    # attention: line numbers in the SourceRange are human-readable,
    # list indices start with 0
    first = start.line - 1
    last = end.line - 1

    if first == last:
        # all in one line: keep what is before and what is after the range
        line = newfile[first]
        newfile[first] = line[:start.column - 1] + line[end.column:]
        if newfile[first] == "":
            del newfile[first]
        return newfile

    # cut away after start
    newfile[first] = newfile[first][:start.column - 1]
    # cut away before end
    newfile[last] = newfile[last][end.column:]

    # drop every line strictly between the first and the last one at once
    del newfile[first + 1:last]

    # remove leftover empty lines; the former last line now directly follows
    # the first one and has to be checked before the first line is removed
    if newfile[first + 1] == "":
        del newfile[first + 1]
    if newfile[first] == "":
        del newfile[first]

    return newfile
|
167
|
|
|
|
|
168
|
|
|
|
|
169
|
|
|
def remove_result_ranges_diffs(result_list, file_dict):
    """
    Builds, for every result, the diffs that describe what happens to each
    file in file_dict when the code affected by that result is removed.

    :param result_list: list of results
    :param file_dict:   dict of file contents
    :return:            returnvalue[result][file] is a diff of the changes the
                        removal of this result's affected code would cause for
                        the file.
    """
    result_diff_dict_dict = {}

    for result in result_list:
        # Work on a private copy so file_dict keeps the unmodified contents.
        stripped_files = copy.deepcopy(file_dict)

        # Walk the ranges from back to front and fuse overlapping ones, so
        # that a removal never shifts the positions of ranges that are still
        # to be removed.
        merged_ranges = []
        pending = None
        for current in sorted(result.affected_code, reverse=True):
            if pending is None:
                pending = current
            elif current.overlaps(pending):
                pending = SourceRange.join(pending, current)
            else:
                merged_ranges.append(pending)
                pending = current
        # The range handled last is still waiting to be stored.
        if pending:
            merged_ranges.append(pending)

        for merged in merged_ranges:
            target = merged.file
            stripped_files[target] = remove_range(stripped_files[target],
                                                  merged)

        result_diff_dict_dict[result] = {
            file_name: Diff.from_string_arrays(file_dict[file_name],
                                               stripped_files[file_name])
            for file_name in file_dict}

    return result_diff_dict_dict
|
222
|
|
|
|
|
223
|
|
|
|
|
224
|
|
|
def ensure_files_present(original_file_dict, modified_file_dict):
    """
    Ensures that all files are available as keys in both dicts. Return a
    dictionary of renamed files.

    Both dicts are changed in place: a file that only exists in the original
    dict is added to the modified dict with empty contents. A file that only
    exists in the modified dict is either recognised as the new name of a
    vanished original file (contents more than 50% similar) or added to the
    original dict with empty contents.

    Every original file is used for at most one renaming, so a new file can
    never be left without an entry in either the returned dictionary or
    ``original_file_dict``.

    :param original_file_dict: Dict of lists of file contents before changes
    :param modified_file_dict: Dict of lists of file contents after changes
    :return:                   Dict mapping the original name of each renamed
                               file to its new name
    """
    # Take snapshots of the differing keys first, both dicts get changed below.
    removed_files = [name for name in original_file_dict
                     if name not in modified_file_dict]
    added_files = [name for name in modified_file_dict
                   if name not in original_file_dict]

    renamed_files_dict = {}
    for added in added_files:
        added_text = ''.join(modified_file_dict[added])
        for candidate in removed_files:
            if candidate in renamed_files_dict:
                # already known under another new name
                continue
            matcher = SequenceMatcher(
                None,
                added_text,
                ''.join(original_file_dict[candidate]))
            # real_quick_ratio is a cheap upper bound, so the expensive ratio
            # is only computed for plausible candidates
            if matcher.real_quick_ratio() >= 0.5 and matcher.ratio() > 0.5:
                renamed_files_dict[candidate] = added
                break
        else:
            # no original file resembles this one: it is a new file
            original_file_dict[added] = []

    for removed in removed_files:
        modified_file_dict[removed] = []

    return renamed_files_dict
|
254
|
|
|
|