bears.codeclone_detection.get_difference() - Code Metrics - Inspection of "Linters" - coala-analyzer/coala - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#1109)

by Abdeali

created 2015-12-12 00:56 UTC

bears.codeclone_detection.get_difference() F

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	9
dl	0
loc	46
rs	3.1579

import os
import math
import copy
from munkres import Munkres
# Instantiate globally since this class is holding stateless public methods.
munkres = Munkres()

from coalib.collecting.Collectors import collect_dirs


def exclude_function(count_matrix):
    """
    Determines heuristically whether or not it makes sense for clone
    detection to take this function into account.

    Applied heuristics:
     * Functions with only count vectors with a sum of all unweighted elements
       of lower then 10 are very likely only declarations or empty and to be
       ignored. (Constants are not taken into account.)

    :param count_matrix: A dictionary with count vectors representing all
                         variables for a function.
    :return:             True if the function is useless for evaluation.
    """
    var_count = [cv.name.startswith("#")
                 for cv in count_matrix.values()].count(False)
    variable_sum = sum(0 if cv.name.startswith("#") else sum(cv.unweighted)
                       for cv in count_matrix.values())
    return (all((cv.name.startswith("#") or sum(cv.unweighted) < 10)
                for cv in count_matrix.values()) or
            variable_sum < 11 or
            var_count < 2)


def get_count_matrices(count_vector_creator,
                       filenames,
                       progress_callback,
                       base_path,
                       extra_include_paths):
    """
    Retrieves matrices holding count vectors for all variables for all
    functions in the given file.

    :param count_vector_creator: A object with a get_vectors_for_file method
                                 taking a filename as argument.
    :param filenames:            The files to create count vectors for.
    :param progress_callback:    A function with one float argument which is
                                 called after processing each file with the
                                 progress percentage (float) as an argument.
    :param extra_include_paths:  A list containing additional include paths.
    :return:                     A dict holding a tuple of (file, line,
                                 function) as key and as value a dict with
                                 variable names as key and count vector
                                 objects as value.
    """
    result = {}
    maxlen = len(filenames)
    include_paths = collect_dirs([os.path.dirname(base_path) + "/**"])
    include_paths += extra_include_paths

    for i, filename in enumerate(filenames):
        progress_callback(100*(i/maxlen))
        count_dict = count_vector_creator.get_vectors_for_file(filename,
                                                               include_paths)
        for function in count_dict:
            if not exclude_function(count_dict[function]):
                result[(filename,
                        function[0],
                        function[1])] = count_dict[function]

    return result


def pad_count_vectors(cm1, cm2):
    """
    Pads the smaller count matrix with zeroed count vectors.

    :param cm1: First cm. Will not be modified.
    :param cm2: Second cm. Will not be modified.
    :return:    A tuple holding two cms.
    """
    cm1len = len(cm1)
    cm2len = len(cm2)
    if cm1len != cm2len:
        # Copy the smaller matrix as it will be altered
        if cm1len > cm2len:
            cm2 = copy.copy(cm2)
        else:  # make cm1 the larger (or equal) one
            tmp = cm2
            cm2 = copy.copy(cm1)
            cm1 = tmp

        any_count_vector = list(cm1.values())[0]
        # Fill up smaller count matrix with zero vectors. This way no
        # padding is needed later and if count vectors are zero on both
        # side, the difference is zero too which wouldn't be taken into
        # account with simple padding of ones.
        for i in range(len(cm1) - len(cm2)):
            cm2[i] = any_count_vector.create_null_vector(i)

    return cm1, cm2


def relative_difference(difference, maxabs):
    if maxabs == 0:
        return 1
    return difference/maxabs


def average(lst):
    return sum(lst)/len(lst)


def get_difference(matching_iterator,
                   average_calculation,
                   poly_postprocessing,
                   exp_postprocessing):
    """
    Retrieves the difference value for the matched function represented by the
    given matches.

    Postprocessing may be done because small functions are less likely to be
    clones at the same difference value than big functions which may provide a
    better refactoring opportunity for the user.

    :return:                    A difference value between 0 and 1.

    :param matching_iterator:   A list holding tuples of an absolute difference
                                value and a value to normalize the difference
                                into a range of [0, 1].
    :param average_calculation: If set to true this function will take the
                                average of all variable differences as the
                                difference, else it will normalize the
                                function as a whole and thus weighting in
                                variables dependent on their size.
    :param poly_postprocessing: If set to true, the difference value of big
                                function pairs will be reduced using a
                                polynomial approach.
    :param exp_postprocessing:  If set to true, the difference value of big
                                function pairs will be reduced using an
                                exponential approach.
    """
    norm_sum = sum(norm for diff, norm in matching_iterator)

    if average_calculation:
        difference = average([relative_difference(diff, norm)
                              for diff, norm in matching_iterator])
    else:
        difference = relative_difference(
            sum(diff for diff, norm in matching_iterator),
            norm_sum)

    if poly_postprocessing and norm_sum != 0:
        # This function starts at 1 and converges to .75 for norm_sum -> inf
        difference *= (3*norm_sum+1)/(4*norm_sum)
    if exp_postprocessing and norm_sum != 0:
        difference *= math.exp(1-norm_sum)/4 + 0.75

    return difference


def compare_functions(cm1,
                      cm2,
                      average_calculation=False,
                      poly_postprocessing=True,
                      exp_postprocessing=False):
    """
    Compares the functions represented by the given count matrices.

    Postprocessing may be done because small functions are less likely to be
    clones at the same difference value than big functions which may provide a
    better refactoring opportunity for the user.

    :param cm1:                 Count vector dict for the first function.
    :param cm2:                 Count vector dict for the second function.
    :param average_calculation: If set to true the difference calculation
                                function will take the average of all variable
                                differences as the difference, else it will
                                normalize the function as a whole and thus
                                weighting in variables dependent on their size.
    :param poly_postprocessing: If set to true, the difference value of big
                                function pairs will be reduced using a
                                polynomial approach.
    :param exp_postprocessing:  If set to true, the difference value of big
                                function pairs will be reduced using an
                                exponential approach.
    :return:                    The difference between these functions, 0 is
                                identical and 1 is not similar at all.
    """
    assert 0 not in (len(cm1), len(cm2))

    cm1, cm2 = pad_count_vectors(cm1, cm2)

    diff_table = [(cv1,
                   [(cv2, cv1.difference(cv2), cv1.maxabs(cv2))
                    for cv2 in cm2.values()])
                  for cv1 in cm1.values()]

    # The cost matrix holds the difference between the two variables i and
    # j in the i/j field. This is a representation of a bipartite weighted
    # graph with nodes representing the first function on the one side
    # (rows) and the nodes representing the second function on the other
    #  side (columns). The fields in the matrix are the weighted nodes
    # connecting each element from one side to the other.
    cost_matrix = [[relative_difference(difference, maxabs)
                    for cv2, difference, maxabs in lst]
                   for cv1, lst in diff_table]

    # The munkres algorithm will calculate a matching such that the sum of
    # the taken fields is minimal. It thus will associate each variable
    # from one function to one on the other function.
    matching = munkres.compute(cost_matrix)

    return get_difference([(diff_table[x][1][y][1], diff_table[x][1][y][2])
                           for x, y in matching],
                          average_calculation,
                          poly_postprocessing,
                          exp_postprocessing)


1			import os
2			import math
3			import copy
4			from munkres import Munkres
5			# Instantiate globally since this class is holding stateless public methods.
6			munkres = Munkres()
7
8			from coalib.collecting.Collectors import collect_dirs
9
10
11			def exclude_function(count_matrix):
12			"""
13			Determines heuristically whether or not it makes sense for clone
14			detection to take this function into account.
15
16			Applied heuristics:
17			* Functions with only count vectors with a sum of all unweighted elements
18			of lower then 10 are very likely only declarations or empty and to be
19			ignored. (Constants are not taken into account.)
20
21			:param count_matrix: A dictionary with count vectors representing all
22			variables for a function.
23			:return: True if the function is useless for evaluation.
24			"""
25			var_count = [cv.name.startswith("#")
26			for cv in count_matrix.values()].count(False)
27			variable_sum = sum(0 if cv.name.startswith("#") else sum(cv.unweighted)
28			for cv in count_matrix.values())
29			return (all((cv.name.startswith("#") or sum(cv.unweighted) < 10)
30			for cv in count_matrix.values()) or
31			variable_sum < 11 or
32			var_count < 2)
33
34
35			def get_count_matrices(count_vector_creator,
36			filenames,
37			progress_callback,
38			base_path,
39			extra_include_paths):
40			"""
41			Retrieves matrices holding count vectors for all variables for all
42			functions in the given file.
43
44			:param count_vector_creator: A object with a get_vectors_for_file method
45			taking a filename as argument.
46			:param filenames: The files to create count vectors for.
47			:param progress_callback: A function with one float argument which is
48			called after processing each file with the
49			progress percentage (float) as an argument.
50			:param extra_include_paths: A list containing additional include paths.
51			:return: A dict holding a tuple of (file, line,
52			function) as key and as value a dict with
53			variable names as key and count vector
54			objects as value.
55			"""
56			result = {}
57			maxlen = len(filenames)
58			include_paths = collect_dirs([os.path.dirname(base_path) + "/**"])
59			include_paths += extra_include_paths
60
61			for i, filename in enumerate(filenames):
62			progress_callback(100*(i/maxlen))
63			count_dict = count_vector_creator.get_vectors_for_file(filename,
64			include_paths)
65			for function in count_dict:
66			if not exclude_function(count_dict[function]):
67			result[(filename,
68			function[0],
69			function[1])] = count_dict[function]
70
71			return result
72
73
74			def pad_count_vectors(cm1, cm2):
75			"""
76			Pads the smaller count matrix with zeroed count vectors.
77
78			:param cm1: First cm. Will not be modified.
79			:param cm2: Second cm. Will not be modified.
80			:return: A tuple holding two cms.
81			"""
82			cm1len = len(cm1)
83			cm2len = len(cm2)
84			if cm1len != cm2len:
85			# Copy the smaller matrix as it will be altered
86			if cm1len > cm2len:
87			cm2 = copy.copy(cm2)
88			else: # make cm1 the larger (or equal) one
89			tmp = cm2
90			cm2 = copy.copy(cm1)
91			cm1 = tmp
92
93			any_count_vector = list(cm1.values())[0]
94			# Fill up smaller count matrix with zero vectors. This way no
95			# padding is needed later and if count vectors are zero on both
96			# side, the difference is zero too which wouldn't be taken into
97			# account with simple padding of ones.
98			for i in range(len(cm1) - len(cm2)):
99			cm2[i] = any_count_vector.create_null_vector(i)
100
101			return cm1, cm2
102
103
104			def relative_difference(difference, maxabs):
105			if maxabs == 0:
106			return 1
107			return difference/maxabs
108
109
110			def average(lst):
111			return sum(lst)/len(lst)
112
113
114			def get_difference(matching_iterator,
115			average_calculation,
116			poly_postprocessing,
117			exp_postprocessing):
118			"""
119			Retrieves the difference value for the matched function represented by the
120			given matches.
121
122			Postprocessing may be done because small functions are less likely to be
123			clones at the same difference value than big functions which may provide a
124			better refactoring opportunity for the user.
125
126			:return: A difference value between 0 and 1.
127
128			:param matching_iterator: A list holding tuples of an absolute difference
129			value and a value to normalize the difference
130			into a range of [0, 1].
131			:param average_calculation: If set to true this function will take the
132			average of all variable differences as the
133			difference, else it will normalize the
134			function as a whole and thus weighting in
135			variables dependent on their size.
136			:param poly_postprocessing: If set to true, the difference value of big
137			function pairs will be reduced using a
138			polynomial approach.
139			:param exp_postprocessing: If set to true, the difference value of big
140			function pairs will be reduced using an
141			exponential approach.
142			"""
143			norm_sum = sum(norm for diff, norm in matching_iterator)
144
145			if average_calculation:
146			difference = average([relative_difference(diff, norm)
147			for diff, norm in matching_iterator])
148			else:
149			difference = relative_difference(
150			sum(diff for diff, norm in matching_iterator),
151			norm_sum)
152
153			if poly_postprocessing and norm_sum != 0:
154			# This function starts at 1 and converges to .75 for norm_sum -> inf
155			difference = (3norm_sum+1)/(4*norm_sum)
156			if exp_postprocessing and norm_sum != 0:
157			difference *= math.exp(1-norm_sum)/4 + 0.75
158
159			return difference
160
161
162			def compare_functions(cm1,
163			cm2,
164			average_calculation=False,
165			poly_postprocessing=True,
166			exp_postprocessing=False):
167			"""
168			Compares the functions represented by the given count matrices.
169
170			Postprocessing may be done because small functions are less likely to be
171			clones at the same difference value than big functions which may provide a
172			better refactoring opportunity for the user.
173
174			:param cm1: Count vector dict for the first function.
175			:param cm2: Count vector dict for the second function.
176			:param average_calculation: If set to true the difference calculation
177			function will take the average of all variable
178			differences as the difference, else it will
179			normalize the function as a whole and thus
180			weighting in variables dependent on their size.
181			:param poly_postprocessing: If set to true, the difference value of big
182			function pairs will be reduced using a
183			polynomial approach.
184			:param exp_postprocessing: If set to true, the difference value of big
185			function pairs will be reduced using an
186			exponential approach.
187			:return: The difference between these functions, 0 is
188			identical and 1 is not similar at all.
189			"""
190			assert 0 not in (len(cm1), len(cm2))
191
192			cm1, cm2 = pad_count_vectors(cm1, cm2)
193
194			diff_table = [(cv1,
195			[(cv2, cv1.difference(cv2), cv1.maxabs(cv2))
196			for cv2 in cm2.values()])
197			for cv1 in cm1.values()]
198
199			# The cost matrix holds the difference between the two variables i and
200			# j in the i/j field. This is a representation of a bipartite weighted
201			# graph with nodes representing the first function on the one side
202			# (rows) and the nodes representing the second function on the other
203			# side (columns). The fields in the matrix are the weighted nodes
204			# connecting each element from one side to the other.
205			cost_matrix = [[relative_difference(difference, maxabs)
206			for cv2, difference, maxabs in lst]
207			for cv1, lst in diff_table]
208
209			# The munkres algorithm will calculate a matching such that the sum of
210			# the taken fields is minimal. It thus will associate each variable
211			# from one function to one on the other function.
212			matching = munkres.compute(cost_matrix)
213
214			return get_difference([(diff_table[x][1][y][1], diff_table[x][1][y][2])
215			for x, y in matching],
216			average_calculation,
217			poly_postprocessing,
218			exp_postprocessing)
219

coala-analyzer / coala

Pull Request — master (#1109)

bears.codeclone_detection.get_difference() F

Complexity

Size

Duplication

Duplication Side-by-Side

Filter issues like