1
|
|
|
"""Print a GO term's lower-level hierarchy.""" |
2
|
|
|
|
3
|
|
|
__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang. All rights reserved." |
4
|
|
|
__author__ = "DV Klopfenstein" |
5
|
|
|
|
6
|
|
|
import sys |
7
|
|
|
import collections as cx |
8
|
|
|
from goatools.godag.consts import Consts |
9
|
|
|
from goatools.gosubdag.go_paths import GoPaths |
10
|
|
|
|
11
|
|
|
|
12
|
|
|
class WrHierGO(object):
    """Write GO term hierarchies as indented text reports.

    Keyword arguments given to the constructor are stored on the instance and
    are read by the private printer class, _WrHierPrt, each time a report is
    written.  The options the printer reads are:

        max_indent    stop descending once this depth is exceeded
        dash_len      base width of the dash/GO ID column (default 6)
        include_only  set of GO IDs; when non-empty, other GO IDs are skipped
        go_marks      set of GO IDs to flag with '>' in the report
        no_indent     truthy: do not prefix GO IDs with hierarchy dashes
        concise       truthy: print the sub-hierarchy of a GO term only once
    """

    kws_dct = {'max_indent'}
    kws_set = {'no_indent', 'concise'}
    consts = Consts()

    def __init__(self, gosubdag, **kws):
        """Store the GO sub-DAG and the user's printing options.

        Args:
            gosubdag: GoSubDag arg, children=True, must be used.
            **kws: printing options (see the class docstring).
        """
        self.gosubdag = gosubdag
        # Every keyword is kept, not only those named in kws_dct/kws_set:
        # the printer also reads 'dash_len', 'include_only' and 'go_marks'.
        self.usrdct = dict(kws)
        # Names of all options which were given a truthy value
        self.usrset = {key for key, val in kws.items() if val}
        # ' {NS} {dcnt:6,} L{level:02} D{depth:02} {D1:5} {GO_name}'

    def prt_hier_all(self, prt=sys.stdout):
        """Write hierarchy for all GO Terms in obo file.

        Returns the set of GO IDs which were printed.
        """
        # Print: [biological_process, molecular_function, and cellular_component]
        gos_printed = set()
        for goid in ['GO:0008150', 'GO:0003674', 'GO:0005575']:
            gos_printed.update(self.prt_hier_down(goid, prt))
        return gos_printed

    def prt_hier_down(self, goid, prt=sys.stdout):
        """Write hierarchy for all GO IDs below GO ID in arg, goid.

        Returns the set of GO IDs which were printed.
        """
        obj = _WrHierPrt(self, prt)
        obj.prt_hier_rec(goid)
        return obj.gos_printed

    def prt_hier_up(self, goids, prt=sys.stdout):
        """Write hierarchy from each namespace root down to the GO IDs in arg, goids.

        For every namespace seen in goids, the report starts at that
        namespace's root term and is restricted to GO IDs found on paths
        between the user-specified GO IDs and the root.  The user-specified
        GO IDs are marked in the report.

        Args:
            goids: iterable of GO IDs; each must be a key of gosubdag.go2obj.
            prt: stream with a write method which receives the report.

        Returns the set of GO IDs which were printed.

        NOTE: 'include_only' and 'go_marks' are accumulated in self.usrdct and
        are not reset afterwards, so they also affect later reports written
        by this object.
        """
        go2goterm_all = {go: self.gosubdag.go2obj[go] for go in goids}
        objp = GoPaths()
        gos_printed = set()
        for namespace, go2term_ns in self._get_namespace2go2term(go2goterm_all).items():
            go_root = self.consts.NAMESPACE2GO[namespace]
            # GO IDs from user-specified GO to root
            goids_all = set(go2term_ns)
            for goterm in go2term_ns.values():
                # dn0_up1=True presumably selects the upward (toward root) direction
                paths = objp.get_paths_from_to(goterm, goid_end=None, dn0_up1=True)
                goids_all.update(o.id for p in paths for o in p)
            # Only include GO IDs from user-specified GO to the root
            self.usrdct.setdefault('include_only', set()).update(goids_all)
            # Mark the user-specified GO term
            self.usrdct.setdefault('go_marks', set()).update(go2term_ns)
            obj = _WrHierPrt(self, prt)
            # Print first: obj.gos_printed is empty until the hierarchy is written
            obj.prt_hier_rec(go_root)
            gos_printed.update(obj.gos_printed)
        return gos_printed

    @staticmethod
    def _get_namespace2go2term(go2terms):
        """Group GO IDs by namespace.

        Returns a defaultdict: namespace -> {GO ID: GO term}.
        """
        namespace2go2term = cx.defaultdict(dict)
        for goid, goterm in go2terms.items():
            namespace2go2term[goterm.namespace][goid] = goterm
        return namespace2go2term
71
|
|
|
|
72
|
|
|
|
73
|
|
|
# pylint: disable=too-many-instance-attributes,too-few-public-methods |
74
|
|
|
class _WrHierPrt(object):
    """Write one GO hierarchy report, one line per visited GO term, to a stream."""

    def __init__(self, obj, prt=sys.stdout):
        """Copy the GO sub-DAG and printing options from obj; remember the stream.

        Args:
            obj: object providing gosubdag, usrdct (option name -> value) and
                usrset (names of the enabled flags).
            prt: stream with a write method which receives the report.
        """
        options = obj.usrdct
        flags = obj.usrset
        self.gosubdag = obj.gosubdag
        # Output stream and the GO IDs written to it so far
        self.prt = prt
        self.gos_printed = set()
        # Options holding a value
        self.max_indent = options.get('max_indent')
        self.include_only = options.get('include_only')
        self.go_marks = options.get('go_marks', set())
        self.dash_len = options.get('dash_len', 6) + 12
        # Options which are on/off flags
        self.concise_prt = 'concise' in flags
        self.indent = 'no_indent' not in flags
        self.prtfmt = self._init_prtfmt()

    def prt_hier_rec(self, goid, depth=1):
        """Write the line for a GO term, then recurse into its children.

        Args:
            goid: GO ID to print; must be a key of gosubdag.go2nt and go2obj.
            depth: hierarchy depth of goid in this report; sets the dash count.
        """
        dag = self.gosubdag
        nt_term = dag.go2nt[goid]
        term = dag.go2obj[goid]
        # Shorten the report: if a non-empty include_only set was given,
        # GO terms outside of it (and everything below them) are skipped.
        if self.include_only and goid not in self.include_only:
            return
        out = self.prt
        # True if concise mode is on and this term was already printed
        seen_before = self.concise_prt and goid in self.gos_printed
        if self.go_marks:
            mark = '>' if goid in self.go_marks else ' '
            out.write('{} '.format(mark))

        # '-' is default character indicating hierarchy level
        # '=' is used to indicate a hierarchical path printed in detail previously.
        use_dash = not (seen_before and term.children)
        label = self._str_dashgoid(nt_term, depth, use_dash)
        out.write('{DASHGO:{N}}'.format(DASHGO=label, N=self.dash_len))

        out.write("{GO_INFO}\n".format(GO_INFO=self.prtfmt.format(**nt_term._asdict())))
        self.gos_printed.add(goid)
        # Do not print hierarchy below this term if it has already been printed
        if seen_before:
            return
        child_depth = depth + 1
        if self.max_indent is not None and child_depth > self.max_indent:
            return
        for child in term.children:
            self.prt_hier_rec(child.id, child_depth)

    @staticmethod
    def _str_dash(depth, single_or_double):
        """Return a string of depth hierarchy characters: '-' or '='."""
        # '-' is default character indicating hierarchy level
        # '=' is used to indicate a hierarchical path printed in detail previously.
        if single_or_double:
            return '-' * depth
        return '=' * depth

    def _str_dashgoid(self, ntgo, depth, single_or_double):
        """Return a string containing dashes (optional) and GO ID."""
        dashes = ""
        if self.indent:
            dashes = self._str_dash(depth, single_or_double)
        return "{DASHES} {GO}{alt:1}".format(DASHES=dashes, GO=ntgo.GO, alt=ntgo.alt)

    def _init_prtfmt(self):
        """Return the GO sub-DAG's print format without the GO ID and D1 fields."""
        fmt = self.gosubdag.prt_attr['fmt']
        for field in ('{GO} # ', '{D1:5} '):
            fmt = fmt.replace(field, '')
        return fmt
137
|
|
|
|
138
|
|
|
#### Examples: |
139
|
|
|
#### |
140
|
|
|
#### Print the hierarchy below Term, GO:0030663 |
141
|
|
|
#### >>> python {SCR} GO:0030663 |
142
|
|
|
#### |
143
|
|
|
#### - GO:0030663 level-05 depth-07 COPI-coated vesicle membrane [cellular_component] |
144
|
|
|
#### -- GO:0012508 level-05 depth-08 Golgi to ER transport vesicle membrane [cellular_component] |
145
|
|
|
#### -- GO:0012509 level-05 depth-08 inter-Golgi transport vesicle membrane [cellular_component] |
146
|
|
|
#### |
147
|
|
|
#### |
148
|
|
|
#### Write the hierarchy below Term, GO:0030663 into a file |
149
|
|
|
#### >>> python {SCR} GO:0030663 --o=hier_GO_0030663.rpt |
150
|
|
|
#### |
151
|
|
|
#### WROTE: hier_GO_0030663.rpt |
152
|
|
|
#### |
153
|
|
|
#### Print the hierarchy for biological process, molecular_function, and cellular_component: |
154
|
|
|
#### >>> python {SCR} --o=hier_BP_MF_CC.rpt |
155
|
|
|
#### |
156
|
|
|
#### Print hierarchy for BP, MF, CC only printing the first 2 levels. |
157
|
|
|
#### >>> python {SCR} --max_indent=2 |
158
|
|
|
#### >>> python {SCR} --max_indent=2 --dash_len=2 |
159
|
|
|
#### |
160
|
|
|
#### |
161
|
|
|
#### Print a concise version of the hierarchy for BP, MF, and CC.
162
|
|
|
#### This will only print a path to a leaf GO Term once. |
163
|
|
|
#### If the path appears a second time, the term is printed again, but its path is not. |
164
|
|
|
#### The presence of a compressed (unprinted) path is marked by using '=' instead of '-'.
165
|
|
|
#### |
166
|
|
|
#### $ wc -l hier_BP_MF_CC*.rpt |
167
|
|
|
#### |
168
|
|
|
#### 789583 hier_BP_MF_CC.rpt |
169
|
|
|
#### 70152 hier_BP_MF_CC_concise.rpt |
170
|
|
|
#### |
171
|
|
|
#### >>> python {SCR} --o=hier_BP_MF_CC_concise.rpt --concise |
172
|
|
|
#### |
173
|
|
|
#### Print hierarchy |
174
|
|
|
#### - 26894 GO:0008150 level-00 depth-00 biological_process [biological_process] |
175
|
|
|
#### -- 30 GO:0001906 level-01 depth-01 cell killing [biological_process] |
176
|
|
|
#### -- 555 GO:0002376 level-01 depth-01 immune system process [biological_process] |
177
|
|
|
#### -- 11208 GO:0065007 level-01 depth-01 biological regulation [biological_process] |
178
|
|
|
#### |
179
|
|
|
#### >>> python {SCR} |
180
|
|
|
#### |
181
|
|
|
#### This program prints the hierarchy for all GO terms, if no argument is provided. |
182
|
|
|
#### If a GO term is provided as an argument, then the hierarchy of all children |
183
|
|
|
#### for that term is printed. |
184
|
|
|
#### |
185
|
|
|
#### """.format(SCR='write_hierarchy') |
186
|
|
|
|
187
|
|
|
|
188
|
|
|
# Copyright (C) 2016-2018, DV Klopfenstein, H Tang. All rights reserved. |
189
|
|
|
|