Completed
Pull Request — master (#1109)
by Abdeali
01:53
created

bears.codeclone_detection.get_difference()   B

Complexity

Conditions 1

Size

Total Lines 33

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 1
dl 0
loc 33
rs 8.8571
1
import functools
2
from itertools import combinations
3
4
from coalib.misc.StringConverter import StringConverter
5
from coalib.results.HiddenResult import HiddenResult
6
from coalib.settings.Setting import typed_ordered_dict, path_list
7
from coalib.collecting.Collectors import collect_dirs
8
from coalib.bears.GlobalBear import GlobalBear
9
from bears.codeclone_detection.ClangCountVectorCreator import (
10
    ClangCountVectorCreator)
11
from bears.codeclone_detection.ClangCountingConditions import condition_dict
12
from bears.codeclone_detection.CloneDetectionRoutines import (
13
    compare_functions,
14
    get_count_matrices)
15
16
17
# counting_condition_dict is a converter function produced by
# typed_ordered_dict. Given a setting it builds a dictionary out of it: every
# key is replaced by the counting condition function object found in
# condition_dict under the key's lower-cased name, every value is converted to
# a float, and a key without a value gets the default weighting of 1.
counting_condition_dict = typed_ordered_dict(
    lambda key: condition_dict[str(key).lower()],
    float,
    1)
25
26
# Weightings used by ClangFunctionDifferenceBear.run when no counting
# conditions are passed. Conditions listed without a value (e.g.
# second_level_loop_content) fall back to the default weighting of
# counting_condition_dict.
default_cc_dict = counting_condition_dict(
    StringConverter(
        """
used: 0,
returned: 1.4,
is_condition: 0,
in_condition: 1.4,
in_second_level_condition: 1.4,
in_third_level_condition: 1.0,
is_assignee: 0,
is_assigner: 0.6,
loop_content: 0,
second_level_loop_content,
third_level_loop_content,
is_param: 2,
is_called: 1.4,
is_call_param: 0.0,
in_sum: 2.0,
in_product: 0,
in_binary_operation,
member_accessed"""))
46
47
48
def get_difference(function_pair,
                   count_matrices,
                   average_calculation,
                   poly_postprocessing,
                   exp_postprocessing):
    """
    Computes the difference between the two functions of a pair by handing
    their count matrices to compare_functions (munkres algorithm).

    :param function_pair:       A tuple holding the two keys under which the
                                functions are stored in count_matrices.
    :param count_matrices:      A dictionary holding CMs.
    :param average_calculation: If set to true the difference is the average
                                of all variable differences, else the
                                function is normalized as a whole so that
                                variables are weighted dependent on their
                                size.
    :param poly_postprocessing: If set to true, the difference value of big
                                function pairs will be reduced using a
                                polynomial approach.
    :param exp_postprocessing:  If set to true, the difference value of big
                                function pairs will be reduced using an
                                exponential approach.
    :return:                    A tuple of the form
                                (first function id, second function id,
                                difference).
    """
    first_id, second_id = function_pair
    difference = compare_functions(count_matrices[first_id],
                                   count_matrices[second_id],
                                   average_calculation,
                                   poly_postprocessing,
                                   exp_postprocessing)

    return first_id, second_id, difference
81
82
83
class ClangFunctionDifferenceBear(GlobalBear):
    """
    Global bear that builds count matrices for the analyzed files and
    calculates a difference value for every pair of entries in them. The
    outcome is only yielded as hidden results.
    """

    def run(self,
            counting_conditions: counting_condition_dict=default_cc_dict,
            average_calculation: bool=False,
            poly_postprocessing: bool=True,
            exp_postprocessing: bool=False,
            extra_include_paths: path_list=()):
        """
        Retrieves similarities for code clone detection. Those can be reused in
        another bear to produce results.

        Postprocessing may be done because small functions are less likely to
        be clones at the same difference value than big functions which may
        provide a better refactoring opportunity for the user.

        :param counting_conditions: A comma separated list of counting
                                    conditions. Possible values are: used,
                                    returned, is_condition, in_condition,
                                    in_second_level_condition,
                                    in_third_level_condition, is_assignee,
                                    is_assigner, loop_content,
                                    second_level_loop_content,
                                    third_level_loop_content, is_param,
                                    is_called, is_call_param, in_sum,
                                    in_product, in_binary_operation,
                                    member_accessed.
                                    Weightings can be assigned to each
                                    condition by providing a dict value,
                                    i.e. having used weighted in half as
                                    much as other conditions would simply
                                    be: "used: 0.5, is_assignee".
                                    Weightings default to 1 if unset.
        :param average_calculation: If set to true the difference calculation
                                    function will take the average of all
                                    variable differences as the difference,
                                    else it will normalize the function as a
                                    whole and thus weighting in variables
                                    dependent on their size.
        :param poly_postprocessing: If set to true, the difference value of big
                                    function pairs will be reduced using a
                                    polynomial approach.
        :param exp_postprocessing:  If set to true, the difference value of big
                                    function pairs will be reduced using an
                                    exponential approach.
        :param extra_include_paths: A list containing additional include paths.
        """
        self.debug("Using the following counting conditions:")
        for key, val in counting_conditions.items():
            self.debug(" *", key.__name__, "(weighting: {})".format(val))

        self.debug("Creating count matrices...")
        count_matrices = get_count_matrices(
            ClangCountVectorCreator(list(counting_conditions.keys()),
                                    list(counting_conditions.values())),
            list(self.file_dict.keys()),
            lambda prog: self.debug("{:2.4f}%...".format(prog)),
            self.section["files"].origin,
            collect_dirs(extra_include_paths))

        self.debug("Calculating differences...")

        differences = []
        function_count = len(count_matrices)
        # Number of unordered pairs (n over 2), written out directly to keep
        # the calculation simple. It is only used for the progress output.
        combination_length = function_count * (function_count-1) / 2
        partial_get_difference = functools.partial(
            get_difference,
            count_matrices=count_matrices,
            average_calculation=average_calculation,
            poly_postprocessing=poly_postprocessing,
            exp_postprocessing=exp_postprocessing)

        # combinations() already yields 2-tuples lazily, so the pairs can be
        # fed to the comparison directly without building a list first.
        function_pairs = combinations(count_matrices, 2)
        for i, elem in enumerate(map(partial_get_difference, function_pairs)):
            # Report progress every 50 comparisons only.
            if i % 50 == 0:
                self.debug("{:2.4f}%...".format(100*i/combination_length))
            differences.append(elem)

        yield HiddenResult(self, differences)
        yield HiddenResult(self, count_matrices)
163