#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Compute semantic similarities between GO terms. Borrowed from a book chapter by
Christophe Dessimoz (thanks). For details, please check out:
notebooks/semantic_similarity.ipynb
"""

import math
from collections import Counter


class TermCounts:
    '''
    TermCounts counts the term counts for each GO term in a set of annotations.
    '''
    def __init__(self, go, annots):
        '''
        Initialise the counts and fill them from the annotations.
        '''
        # Backup
        self._go = go

        # Initialise the counters
        self._counts = Counter()
        self._aspect_counts = Counter()

        # Fill the counters...
        self._count_terms(go, annots)

    def _count_terms(self, go, annots):
        '''
        Fills in the counts and overall aspect counts.
        '''
        for gene, terms in annots.items():
            # Make a union of all the terms for a gene, so that if term parents
            # are propagated they won't get double-counted for the gene
            allterms = set(terms)
            for go_id in terms:
                allterms |= go[go_id].get_all_parents()
            for p in allterms:
                self._counts[p] += 1

        for go_id, c in self._counts.items():
            # Group by namespace
            namespace = go[go_id].namespace
            self._aspect_counts[namespace] += c

    def get_count(self, go_id):
        '''
        Returns the count of that GO term observed in the annotations.
        '''
        return self._counts[go_id]

    def get_total_count(self, aspect):
        '''
        Gets the total count that's been precomputed.
        '''
        return self._aspect_counts[aspect]

    def get_term_freq(self, go_id):
        '''
        Returns the frequency at which a particular GO term has
        been observed in the annotations.
        '''
        try:
            namespace = self._go[go_id].namespace
            freq = float(self.get_count(go_id)) / float(self.get_total_count(namespace))
        except ZeroDivisionError:
            freq = 0

        return freq
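

# Note: term frequencies are relative to a term's namespace (aspect): p(t) is the
# term's count divided by the summed counts of all terms in the same namespace.
# Because each gene also counts towards every ancestor of its terms, counts (and
# hence p) never decrease when moving from a term towards the root.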


def ic(go_id, termcounts):
    '''
    Calculates the information content of a GO term.
    '''
    # Get the observed frequency of the GO term
    freq = termcounts.get_term_freq(go_id)

    # Calculate the information content (i.e., -log("freq of GO term"))
    return -1.0 * math.log(freq) if freq else 0
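

# In other words, IC(t) = -ln(p(t)) using the natural logarithm (math.log); terms
# that never appear in the annotations (p(t) == 0) are assigned an IC of 0.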


def resnik_sim(go_id1, go_id2, go, termcounts):
    '''
    Computes Resnik's similarity measure.
    '''
    msca = deepest_common_ancestor([go_id1, go_id2], go)
    return ic(msca, termcounts)
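

# Resnik's similarity is the information content of the most specific common
# ancestor of the two terms; this implementation uses the deepest common ancestor
# (by depth in the DAG) as that ancestor.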


def lin_sim(go_id1, go_id2, go, termcounts):
    '''
    Computes Lin's similarity measure.
    '''
    sim_r = resnik_sim(go_id1, go_id2, go, termcounts)

    return (2 * sim_r) / (ic(go_id1, termcounts) + ic(go_id2, termcounts))
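

# Lin's measure is sim_Lin(t1, t2) = 2 * IC(MSCA) / (IC(t1) + IC(t2)), which lies
# in [0, 1] whenever the denominator is non-zero; it raises ZeroDivisionError if
# neither term appears in the annotations (both ICs are 0).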


def common_parent_go_ids(terms, go):
    '''
    This function finds the common ancestors in the GO
    tree of the list of terms in the input.
    '''
    # Find candidates from first
    rec = go[terms[0]]
    candidates = rec.get_all_parents()
    candidates.update({terms[0]})

    # Find intersection with second to nth term
    for term in terms[1:]:
        rec = go[term]
        parents = rec.get_all_parents()
        parents.update({term})

        # Find the intersection with the candidates, and update.
        candidates.intersection_update(parents)

    return candidates


def deepest_common_ancestor(terms, go):
    '''
    This function gets the nearest common ancestor
    using the above function.
    Only returns a single most specific ancestor - assumes a unique one exists.
    '''
    # Take the element at maximum depth.
    return max(common_parent_go_ids(terms, go), key=lambda t: go[t].depth)


def min_branch_length(go_id1, go_id2, go):
    '''
    Finds the minimum branch length between two terms in the GO DAG.
    '''
    # First get the deepest common ancestor
    dca = deepest_common_ancestor([go_id1, go_id2], go)

    # Then get the distance from the DCA to each term
    dca_depth = go[dca].depth
    d1 = go[go_id1].depth - dca_depth
    d2 = go[go_id2].depth - dca_depth

    # Return the total distance - i.e., to the deepest common ancestor and back.
    return d1 + d2


def semantic_distance(go_id1, go_id2, go):
    '''
    Finds the semantic distance (minimum number of connecting branches)
    between two GO terms.
    '''
    return min_branch_length(go_id1, go_id2, go)


def semantic_similarity(go_id1, go_id2, go):
    '''
    Finds the semantic similarity (inverse of the semantic distance)
    between two GO terms.
    '''
    return 1.0 / float(semantic_distance(go_id1, go_id2, go))
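

# A minimal usage sketch: it assumes the goatools package is installed, that a
# copy of the Gene Ontology (e.g. "go-basic.obo") is available in the working
# directory, and that the gene names, annotation sets and GO IDs below are
# placeholders to replace with real data.
if __name__ == '__main__':
    from goatools.obo_parser import GODag

    godag = GODag('go-basic.obo')

    # Placeholder annotations: gene name -> set of GO term IDs.
    annots = {
        'geneA': {'GO:0008152'},   # example biological_process term
        'geneB': {'GO:0006915'},   # another biological_process term
    }
    termcounts = TermCounts(godag, annots)

    go_id1, go_id2 = 'GO:0008152', 'GO:0006915'
    print('IC(%s): %.3f' % (go_id1, ic(go_id1, termcounts)))
    print('Resnik similarity: %.3f' % resnik_sim(go_id1, go_id2, godag, termcounts))
    print('Lin similarity: %.3f' % lin_sim(go_id1, go_id2, godag, termcounts))
    print('Inverse-distance similarity: %.3f' % semantic_similarity(go_id1, go_id2, godag))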