Completed
Push — master ( ceaeb4...21ef80 )
by Haibao
58s
created

goatools.ic()   A

Complexity

Conditions 2

Size

Total Lines 9

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 2
dl 0
loc 9
rs 9.6666
1
#!/usr/bin/env python
2
# -*- coding: UTF-8 -*-
3
4
"""
5
Compute semantic similarities between GO terms. Borrowed from book chapter from
6
Christophe Dessimoz (thanks). For details, please check out:
7
notebooks/semantic_similarity.ipynb
8
"""
9
10
import math
11
from collections import Counter
12
13
14
class TermCounts:
    """Tally how often each GO term is observed across a set of annotations.

    Counts are accumulated per term (including all ancestor terms) and
    aggregated per namespace (aspect), so that a term's frequency can be
    computed relative to its own branch of the ontology.
    """

    def __init__(self, go, annots):
        """Initialise the counters and fill them from the annotations.

        go     -- GO DAG: mapping of GO id -> term record
        annots -- mapping of gene -> iterable of GO ids
        """
        # Keep the DAG around for namespace lookups in get_term_freq().
        self._go = go

        # Per-term and per-namespace tallies.
        self._counts = Counter()
        self._aspect_counts = Counter()

        self._count_terms(go, annots)

    def _count_terms(self, go, annots):
        """Populate the per-term counts and the per-namespace totals."""
        for terms in annots.values():
            # Union of the gene's terms with all of their ancestors, so
            # each term is counted at most once per gene even when parent
            # terms are propagated.
            propagated = set(terms)
            for go_id in terms:
                propagated |= go[go_id].get_all_parents()
            for term in propagated:
                self._counts[term] += 1

        # Roll the per-term counts up into per-namespace totals.
        for go_id, count in self._counts.items():
            self._aspect_counts[go[go_id].namespace] += count

    def get_count(self, go_id):
        """Return the count of that GO term observed in the annotations."""
        return self._counts[go_id]

    def get_total_count(self, aspect):
        """Return the precomputed total count for a namespace (aspect)."""
        return self._aspect_counts[aspect]

    def get_term_freq(self, go_id):
        """Return the frequency of a GO term within its namespace.

        Returns 0 when the term's namespace has no observations at all
        (avoids a division by zero).
        """
        namespace = self._go[go_id].namespace
        total = self.get_total_count(namespace)
        if not total:
            return 0
        return float(self.get_count(go_id)) / float(total)
75
76
77
def ic(go_id, termcounts):
    """Return the information content of a GO term: -log(frequency).

    A term that was never observed (frequency 0) gets an information
    content of 0 rather than raising a math domain error.
    """
    freq = termcounts.get_term_freq(go_id)
    # Guard the log: an unobserved term carries no information here.
    if not freq:
        return 0
    return -1.0 * math.log(freq)
86
87
88
def resnik_sim(go_id1, go_id2, go, termcounts):
    """Return Resnik's similarity: the information content of the most
    specific (deepest) common ancestor of the two terms."""
    common_ancestor = deepest_common_ancestor([go_id1, go_id2], go)
    return ic(common_ancestor, termcounts)
94
95
96
def lin_sim(go_id1, go_id2, go, termcounts):
    """Compute Lin's similarity measure between two GO terms.

    Lin (1998): sim = 2 * IC(msca) / (IC(term1) + IC(term2)), where msca
    is the most specific common ancestor.  Because ic() returns the
    non-negative quantity -log(freq), the result lies in [0, 1] for terms
    with nonzero information content.

    Raises ZeroDivisionError when both terms have zero information
    content (e.g. both are unobserved root terms).
    """
    sim_r = resnik_sim(go_id1, go_id2, go, termcounts)

    # BUGFIX: the numerator must be +2 * IC(msca), not -2 * IC(msca).
    # ic() already returns the positive -log(freq), so negating it here
    # produced negative "similarities" outside Lin's [0, 1] range.
    return (2 * sim_r) / (ic(go_id1, termcounts) + ic(go_id2, termcounts))
103
104
105
def common_parent_go_ids(terms, go):
    """Return the set of GO ids that are ancestors of every term in *terms*.

    Each term counts as an ancestor of itself, so a term that is a parent
    of all the others appears in the result.
    """
    first, *rest = terms
    # Seed the candidate set with the first term and all of its ancestors.
    shared = go[first].get_all_parents() | {first}

    # Narrow the candidates to the ancestors common to every other term.
    for term in rest:
        shared &= go[term].get_all_parents() | {term}

    return shared
125
126
127
def deepest_common_ancestor(terms, go):
    """Return the deepest (most specific) common ancestor of *terms*.

    Assumes a unique deepest common ancestor exists; if several ancestors
    tie at the maximum depth, one of them is returned.
    """
    ancestors = common_parent_go_ids(terms, go)
    return max(ancestors, key=lambda go_id: go[go_id].depth)
135
136
137
def min_branch_length(go_id1, go_id2, go):
    """Return the minimum branch length between two terms in the GO DAG:
    the number of edges from each term up to their deepest common
    ancestor, summed."""
    # Depth of the deepest common ancestor of the two terms.
    ancestor_depth = go[deepest_common_ancestor([go_id1, go_id2], go)].depth

    # Distance from each term up to the common ancestor and back down.
    return (go[go_id1].depth - ancestor_depth) + (go[go_id2].depth - ancestor_depth)
151
152
153
def semantic_distance(go_id1, go_id2, go):
    """Return the semantic distance between two GO terms: the minimum
    number of connecting branches through a common ancestor."""
    return min_branch_length(go_id1, go_id2, go)
159
160
161
def semantic_similarity(go_id1, go_id2, go):
    """Return the semantic similarity between two GO terms, defined as
    the inverse of their semantic distance.

    NOTE(review): raises ZeroDivisionError when both ids resolve to the
    same term (distance 0) — confirm callers never compare a term with
    itself.
    """
    distance = semantic_distance(go_id1, go_id2, go)
    return 1.0 / float(distance)
167