|
1
|
|
|
"""Semantic Similarity test for Issue #86. |
|
2
|
|
|
|
|
3
|
|
|
https://github.com/tanghaibao/goatools/issues/86 |
|
4
|
|
|
|
|
5
|
|
|
semantic_similarity & resnik_sim work for a few entities but give an error for others: |
|
6
|
|
|
return max(common_parent_go_ids(terms, go), key=lambda t: go[t].depth) |
|
7
|
|
|
ValueError: max() arg is an empty sequence |
|
8
|
|
|
|
|
9
|
|
|
It issues this error when there is no common parent in both provided |
|
10
|
|
|
entities/genes. Here is one example producing this error |
|
11
|
|
|
semantic_similarity(GO:0003676, GO:0007516, godag) |
|
12
|
|
|
|
|
13
|
|
|
""" |
|
14
|
|
|
|
|
15
|
|
|
import os |
|
16
|
|
|
import sys |
|
17
|
|
|
from goatools.base import get_godag |
|
18
|
|
|
from goatools.associations import dnld_assc |
|
19
|
|
|
from goatools.semantic import semantic_distance, semantic_similarity, TermCounts |
|
20
|
|
|
from goatools.semantic import resnik_sim, lin_sim |
|
21
|
|
|
|
|
22
|
|
|
|
|
23
|
|
|
def test_top_parent(prt=sys.stdout):
    """Run the Issue #86 semantic distance/similarity checks.

    Loads the GO DAG from ``data/i86.obo`` (resolved relative to the parent
    directory of this file) and runs the three path tests against it.

    prt: stream handed to the path tests for their report lines; the path
         tests skip printing when it is None.
    """
    # Distance used for GO terms that live in different branches (BP vs. MF).
    branch_dist = 5
    here = os.path.dirname(os.path.abspath(__file__))
    obo_path = os.path.join(os.path.join(here, ".."), "data/i86.obo")
    godag = get_godag(obo_path)

    # Calculate the semantic distance and semantic similarity:
    _test_path_same(godag, prt)
    _test_path_parallel(godag, prt)
    _test_path_bp_mf(branch_dist, godag, prt)

    # The pass message always goes to stdout, regardless of prt.
    sys.stdout.write("TESTS PASSed: similarity_top_parent\n")
|
36
|
|
|
|
|
37
|
|
|
def _test_path_bp_mf(branch_dist, godag, prt):
    """Check scores for a GO pair taken from two branches (MF and BP).

    Without a branch distance, semantic distance, semantic similarity,
    Resnik similarity and Lin similarity are all asserted to be None.
    With ``branch_dist`` supplied, the semantic distance is asserted to equal
    the sum of both terms' depths plus ``branch_dist``.

    branch_dist: value passed to semantic_distance as its fourth argument.
    godag: GO DAG, indexable by GO ID to objects with a ``depth`` attribute.
    prt: stream for report lines, or None to suppress printing.
    """
    mf_id = 'GO:0003676'  # level-03 depth-03 nucleic acid binding [molecular_function]
    bp_id = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    line_fmt = '({GO1}, {GO2}) {TYPE:6} score = {VAL}\n'

    # Scores computed with no branch distance given.
    dist_plain = semantic_distance(mf_id, bp_id, godag)
    sim_plain = semantic_similarity(mf_id, bp_id, godag)

    # Term counts built from the TAIR association file feed the
    # information-content based measures (Resnik, Lin).
    associations = dnld_assc("gene_association.tair", godag)
    counts = TermCounts(godag, associations)
    sim_resnik = resnik_sim(mf_id, bp_id, godag, counts)
    sim_lin = lin_sim(mf_id, bp_id, godag, counts)

    if prt is not None:
        report = (
            ('semantic distance', dist_plain),
            ('semantic similarity', sim_plain),
            ('Resnik similarity', sim_resnik),
            ('Lin similarity', sim_lin),
        )
        for label, score in report:
            prt.write(line_fmt.format(TYPE=label, GO1=mf_id, GO2=bp_id, VAL=score))

    # The Issue #86 pair must yield None rather than raising.
    assert dist_plain is None
    assert sim_plain is None
    assert sim_resnik is None
    assert sim_lin is None

    # With a branch distance, a numeric distance is expected.
    dist_branch = semantic_distance(mf_id, bp_id, godag, branch_dist)
    if prt is not None:
        prt.write(line_fmt.format(
            TYPE='semantic distance', GO1=mf_id, GO2=bp_id, VAL=dist_branch))
    expected = godag[mf_id].depth + godag[bp_id].depth + branch_dist
    assert dist_branch == expected
|
61
|
|
|
|
|
62
|
|
|
def _test_path_parallel(godag, prt):
    """Check semantic_distance from one BP term to terms on a parallel branch.

    The expected distance starts at 3 for the first listed term and grows by
    one for each subsequent term.

    godag: GO DAG passed through to semantic_distance.
    prt: stream for report lines, or None to suppress printing.
    """
    start_id = 'GO:0007516'  # BP level-04 depth-05 hemocyte development
    # Terms walking down a parallel branch, shallowest first.
    branch_ids = (
        'GO:0044763',  # BP level-02 depth-02 single-organism cellular process
        'GO:0008219',  # BP level-03 depth-03 cell death
        'GO:0070997',  # BP level-04 depth-04 neuron death
        'GO:0036475',  # BP level-05 depth-05 neuron death in response to oxidative stress
        'GO:0036476',  # BP level-06 depth-06 neuron death in response to hydrogen peroxide
    )
    line_fmt = '{DST} semantic_distance between {GO1} and {GO2} on parallel branches\n'
    for idx, other_id in enumerate(branch_ids):
        expected = idx + 3
        actual = semantic_distance(start_id, other_id, godag)
        if prt is not None:
            prt.write(line_fmt.format(DST=actual, GO1=start_id, GO2=other_id))
        assert actual == expected
|
78
|
|
|
|
|
79
|
|
|
|
|
80
|
|
|
def _test_path_same(godag, prt):
    """Check semantic_distance between GO IDs on the same path.

    The listed terms are visited from the last entry back to the first, and
    the expected distance from the bottom term is 1, 2, 3, ... in that order.

    godag: GO DAG passed through to semantic_distance.
    prt: stream for report lines, or None to suppress printing.
    """
    start_id = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    # Terms up the same branch, listed from the root downwards.
    path_ids = [
        'GO:0008150',  # level-00 depth-00 biological_process [biological_process]
        'GO:0009987',  # level-01 depth-01 cellular process [biological_process]
        'GO:0044763',  # level-02 depth-02 single-organism cellular process [biological_process]
        'GO:0048869',  # level-03 depth-03 cellular developmental process [biological_process]
        'GO:0048468',  # level-03 depth-04 cell development [biological_process]
    ]
    line_fmt = '{DST} semantic_distance for {GO1} and {GO2} on the same branch\n'
    # Walk upwards from the term nearest the bottom term.
    for expected, other_id in enumerate(path_ids[::-1], 1):
        actual = semantic_distance(start_id, other_id, godag)
        if prt is not None:
            prt.write(line_fmt.format(DST=actual, GO1=start_id, GO2=other_id))
        assert actual == expected
|
96
|
|
|
|
|
97
|
|
|
if __name__ == '__main__':
    # Report lines are printed only when the script is run with no extra
    # command-line arguments; any argument silences them.
    if len(sys.argv) == 1:
        PRT = sys.stdout
    else:
        PRT = None
    test_top_parent(PRT)
|
100
|
|
|
|