Completed
Pull Request — master (#1215)
by Lasse
02:06
created

bears.c_languages.codeclone_detection.get_difference()   B

Complexity

Conditions 1

Size

Total Lines 33

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 1
dl 0
loc 33
rs 8.8571
1
import functools
2
from itertools import combinations
3
4
from bears.c_languages.ClangBear import clang_available
5
from bears.c_languages.codeclone_detection.ClangCountVectorCreator import (
6
    ClangCountVectorCreator)
7
from bears.c_languages.codeclone_detection.ClangCountingConditions import (
8
    condition_dict)
9
from bears.c_languages.codeclone_detection.CloneDetectionRoutines import (
10
    compare_functions,
11
    get_count_matrices)
12
from coalib.bears.GlobalBear import GlobalBear
13
from coalib.collecting.Collectors import collect_dirs
14
from coalib.misc.StringConverter import StringConverter
15
from coalib.results.HiddenResult import HiddenResult
16
from coalib.settings.Setting import typed_ordered_dict, path_list
17
18
def _to_counting_condition(setting):
    # Resolve a setting to its counting condition function object by looking
    # up the lowercased string form of the setting in condition_dict.
    return condition_dict[str(setting).lower()]


# counting_condition_dict is the converter built by typed_ordered_dict. It is
# applied to a setting to obtain an ordered dictionary whose keys are counting
# condition function objects (resolved through condition_dict) and whose
# values are floats; entries without an explicit value default to 1.
counting_condition_dict = typed_ordered_dict(_to_counting_condition, float, 1)
26
27
# Default counting conditions in the "name: weighting" setting syntax.
# Entries listed without a weighting rely on the converter's default value.
_DEFAULT_COUNTING_CONDITIONS = """
used: 0,
returned: 1.4,
is_condition: 0,
in_condition: 1.4,
in_second_level_condition: 1.4,
in_third_level_condition: 1.0,
is_assignee: 0,
is_assigner: 0.6,
loop_content: 0,
second_level_loop_content,
third_level_loop_content,
is_param: 2,
is_called: 1.4,
is_call_param: 0.0,
in_sum: 2.0,
in_product: 0,
in_binary_operation,
member_accessed"""

default_cc_dict = counting_condition_dict(
    StringConverter(_DEFAULT_COUNTING_CONDITIONS))
47
48
49
def get_difference(function_pair,
                   count_matrices,
                   average_calculation,
                   poly_postprocessing,
                   exp_postprocessing):
    """
    Computes the difference between two functions by handing their count
    matrices to compare_functions.

    :param function_pair:       A tuple holding the two keys that identify
                                the functions in the count_matrices
                                dictionary.
    :param count_matrices:      A dictionary holding CMs.
    :param average_calculation: If set to true the difference calculation
                                function will take the average of all variable
                                differences as the difference, else it will
                                normalize the function as a whole and thus
                                weighting in variables dependent on their size.
    :param poly_postprocessing: If set to true, the difference value of big
                                function pairs will be reduced using a
                                polynomial approach.
    :param exp_postprocessing:  If set to true, the difference value of big
                                function pairs will be reduced using an
                                exponential approach.
    :return:                    A tuple of the form
                                (first id, second id, difference).
    """
    first_id, second_id = function_pair
    difference = compare_functions(count_matrices[first_id],
                                   count_matrices[second_id],
                                   average_calculation,
                                   poly_postprocessing,
                                   exp_postprocessing)
    return first_id, second_id, difference
82
83
84
class ClangFunctionDifferenceBear(GlobalBear):
    # The bear can only run when clang_available reports success.
    check_prerequisites = classmethod(clang_available)

    def run(self,
            counting_conditions: counting_condition_dict=default_cc_dict,
            average_calculation: bool=False,
            poly_postprocessing: bool=True,
            exp_postprocessing: bool=False,
            extra_include_paths: path_list=()):
        '''
        Retrieves similarities for code clone detection. Those can be reused in
        another bear to produce results.

        Postprocessing may be done because small functions are less likely to
        be clones at the same difference value than big functions which may
        provide a better refactoring opportunity for the user.

        Two hidden results are yielded: first the list of
        (function, function, difference) tuples for every unordered pair of
        functions, then the count matrices themselves.

        :param counting_conditions: A comma separated list of counting
                                    conditions. Possible values are: used,
                                    returned, is_condition, in_condition,
                                    in_second_level_condition,
                                    in_third_level_condition, is_assignee,
                                    is_assigner, loop_content,
                                    second_level_loop_content,
                                    third_level_loop_content, is_param,
                                    is_called, is_call_param, in_sum,
                                    in_product, in_binary_operation,
                                    member_accessed.
                                    Weightings can be assigned to each
                                    condition by providing a dict value,
                                    e.g. weighting used half as much as the
                                    other conditions would simply be:
                                    "used: 0.5, is_assignee".
                                    Weightings default to 1 if unset.
        :param average_calculation: If set to true the difference calculation
                                    function will take the average of all
                                    variable differences as the difference,
                                    else it will normalize the function as a
                                    whole and thus weighting in variables
                                    dependent on their size.
        :param poly_postprocessing: If set to true, the difference value of big
                                    function pairs will be reduced using a
                                    polynomial approach.
        :param exp_postprocessing:  If set to true, the difference value of big
                                    function pairs will be reduced using an
                                    exponential approach.
        :param extra_include_paths: A list containing additional include paths.
        '''
        self.debug("Using the following counting conditions:")
        for key, val in counting_conditions.items():
            self.debug(" *", key.__name__, "(weighting: {})".format(val))

        self.debug("Creating count matrices...")
        count_matrices = get_count_matrices(
            ClangCountVectorCreator(list(counting_conditions.keys()),
                                    list(counting_conditions.values())),
            list(self.file_dict.keys()),
            lambda prog: self.debug("{:2.4f}%...".format(prog)),
            self.section["files"].origin,
            collect_dirs(extra_include_paths))

        self.debug("Calculating differences...")

        differences = []
        function_count = len(count_matrices)
        # Number of unordered function pairs ("n choose 2"), written out to
        # keep the calculation simple. It is only used as the denominator of
        # the progress output; with fewer than two functions there are no
        # pairs, so the loop below never divides by it.
        combination_length = function_count * (function_count-1) / 2
        partial_get_difference = functools.partial(
            get_difference,
            count_matrices=count_matrices,
            average_calculation=average_calculation,
            poly_postprocessing=poly_postprocessing,
            exp_postprocessing=exp_postprocessing)

        # combinations() already yields the (f1, f2) tuples one at a time, so
        # they are consumed directly instead of first being copied into a list
        # that holds every pair.
        for i, elem in enumerate(
                map(partial_get_difference, combinations(count_matrices, 2))):
            # Report progress only every 50 pairs to keep the debug log short.
            if i % 50 == 0:
                self.debug("{:2.4f}%...".format(100*i/combination_length))
            differences.append(elem)

        yield HiddenResult(self, differences)
        yield HiddenResult(self, count_matrices)
166