1
|
|
|
"""Semantic Similarity test for Issue #86. |
2
|
|
|
|
3
|
|
|
https://github.com/tanghaibao/goatools/issues/86 |
4
|
|
|
|
5
|
|
|
semantic_similarity & resnik_sim works for few entities but it's giving an error: |
6
|
|
|
return max(common_parent_go_ids(terms, go), key=lambda t: go[t].depth) |
7
|
|
|
ValueError: max() arg is an empty sequence |
8
|
|
|
|
9
|
|
|
It issues this error when there is no common parent in both provided |
10
|
|
|
entities/genes. Here is one example producing this error |
11
|
|
|
semantic_similarity(GO:0003676, GO:0007516, godag) |
12
|
|
|
|
13
|
|
|
""" |
14
|
|
|
|
15
|
|
|
import os |
16
|
|
|
import sys |
17
|
|
|
from goatools.base import get_godag |
18
|
|
|
from goatools.associations import dnld_assc |
19
|
|
|
from goatools.semantic import semantic_distance, semantic_similarity, TermCounts |
20
|
|
|
from goatools.semantic import resnik_sim, lin_sim |
21
|
|
|
|
22
|
|
|
|
23
|
|
|
def test_top_parent(prt=sys.stdout):
    """Run the Issue #86 semantic distance/similarity checks.

    The GO DAG is read from ``data/i86.obo``, located relative to the
    parent of the directory holding this file, and is then handed to the
    same-path, parallel-branch, and BP-versus-MF checks in that order.

    Args:
        prt: Stream passed to the helper checks for their per-comparison
            output; the helpers skip that output when it is None.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    obo_path = os.path.join(here, "..", "data/i86.obo")
    godag = get_godag(obo_path)

    # Distance added when two GO IDs share no common parent (BP vs. MF check).
    dist_between_branches = 5

    _test_path_same(godag, prt)
    _test_path_parallel(godag, prt)
    _test_path_bp_mf(dist_between_branches, godag, prt)

    # The pass message always goes to sys.stdout, whatever prt is.
    sys.stdout.write("TESTS PASSed: similarity_top_parent\n")
36
|
|
|
|
37
|
|
|
def _test_path_bp_mf(branch_dist, godag, prt):
    """Check scores for a GO pair drawn from two branches (MF and BP).

    Without a branch distance, the semantic distance, semantic similarity,
    Resnik similarity, and Lin similarity of the pair are each asserted to
    be None. With ``branch_dist`` supplied, the semantic distance is
    asserted to equal the sum of both terms' depths plus ``branch_dist``.

    Args:
        branch_dist: Distance passed to ``semantic_distance`` as its
            fourth argument for the second distance check.
        godag: GO DAG; indexed by GO ID to read each term's ``depth``.
        prt: Stream for the score lines, or None to write nothing.
    """
    mf_id = 'GO:0003676'  # level-03 depth-03 nucleic acid binding [molecular_function]
    bp_id = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    pattern = '({GO1}, {GO2}) {TYPE:6} score = {VAL}\n'

    def _report(label, value):
        """Write one score line for the MF/BP pair unless printing is off."""
        if prt is not None:
            prt.write(pattern.format(TYPE=label, GO1=mf_id, GO2=bp_id, VAL=value))

    distance = semantic_distance(mf_id, bp_id, godag)
    similarity = semantic_similarity(mf_id, bp_id, godag)
    # Term counts, built from the TAIR associations, feed the Resnik and Lin scores.
    associations = dnld_assc("gene_association.tair", godag)
    counts = TermCounts(godag, associations)
    resnik = resnik_sim(mf_id, bp_id, godag, counts)
    lin = lin_sim(mf_id, bp_id, godag, counts)

    scores = (
        ('semantic distance', distance),
        ('semantic similarity', similarity),
        ('Resnik similarity', resnik),
        ('Lin similarity', lin),
    )
    # Report every score first, then verify, so all values are visible
    # even when one of the assertions fails.
    for label, value in scores:
        _report(label, value)
    for _, value in scores:
        assert value is None

    # Supplying a branch distance yields a numeric distance for the pair.
    branched = semantic_distance(mf_id, bp_id, godag, branch_dist)
    _report('semantic distance', branched)
    assert branched == godag[mf_id].depth + godag[bp_id].depth + branch_dist
61
|
|
|
|
62
|
|
|
def _test_path_parallel(godag, prt):
    """Check semantic distances from one BP term to terms on a parallel branch.

    The distance from the starting term to the first listed GO ID is
    asserted to be 3, and to grow by one for each following GO ID.

    Args:
        godag: GO DAG passed through to ``semantic_distance``.
        prt: Stream for one line per comparison, or None to write nothing.
    """
    start_id = 'GO:0007516'  # BP level-04 depth-05 hemocyte development
    branch_ids = (
        'GO:0044763',  # BP level-02 depth-02 single-organism cellular process
        'GO:0008219',  # BP level-03 depth-03 cell death
        'GO:0070997',  # BP level-04 depth-04 neuron death
        'GO:0036475',  # BP level-05 depth-05 neuron death in response to oxidative stress
        'GO:0036476',  # BP level-06 depth-06 neuron death in response to hydrogen peroxide
    )
    pattern = '{DST} semantic_distance between {GO1} and {GO2} on parallel branches\n'
    first_expected = 3
    for idx, other_id in enumerate(branch_ids):
        expected = first_expected + idx
        actual = semantic_distance(start_id, other_id, godag)
        if prt is not None:
            prt.write(pattern.format(DST=actual, GO1=start_id, GO2=other_id))
        assert actual == expected
78
|
|
|
|
79
|
|
|
|
80
|
|
|
def _test_path_same(godag, prt):
    """Check semantic distances between GO IDs lying on the same path.

    Starting from the bottom term, the distance to the first listed GO ID
    is asserted to be 1, and to grow by one for each following GO ID up
    to the ``biological_process`` root.

    Args:
        godag: GO DAG passed through to ``semantic_distance``.
        prt: Stream for one line per comparison, or None to write nothing.
    """
    start_id = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    # Listed from the deepest term up to the root, i.e. in order of
    # increasing expected distance from start_id.
    path_ids = (
        'GO:0048468',  # level-03 depth-04 cell development [biological_process]
        'GO:0048869',  # level-03 depth-03 cellular developmental process [biological_process]
        'GO:0044763',  # level-02 depth-02 single-organism cellular process [biological_process]
        'GO:0009987',  # level-01 depth-01 cellular process [biological_process]
        'GO:0008150',  # level-00 depth-00 biological_process [biological_process]
    )
    pattern = '{DST} semantic_distance for {GO1} and {GO2} on the same branch\n'
    for expected, other_id in enumerate(path_ids, start=1):
        actual = semantic_distance(start_id, other_id, godag)
        if prt is not None:
            prt.write(pattern.format(DST=actual, GO1=start_id, GO2=other_id))
        assert actual == expected
96
|
|
|
|
97
|
|
|
if __name__ == '__main__':
    # Per-comparison output is printed only when the script is run with no
    # command-line arguments; any extra argument turns it off.
    PRT = sys.stdout if len(sys.argv) == 1 else None
    test_top_parent(PRT)
100
|
|
|
|