1
|
|
|
"""Given user GO ids and parent terms, group user GO ids under one parent term. |
2
|
|
|
|
3
|
|
|
Given a group of GO ids with one or more higher-level grouping terms, group |
4
|
|
|
each user GO id under the most descriptive parent GO term. |
5
|
|
|
|
6
|
|
|
Each GO id may have more than one parent. One of the parent(s) is chosen |
7
|
|
|
to best represent the user GO id's function. The choice of parent is made by |
8
|
|
|
regarding how close the parent GO id is to the bottom of its hierarchy. |
9
|
|
|
|
10
|
|
|
How close a GO term is to "the bottom" of its GO hierarchy
is estimated using the total number of GO term descendants below
that term.
13
|
|
|
""" |
14
|
|
|
|
15
|
|
|
import sys |
16
|
|
|
import collections as cx |
17
|
|
|
from goatools.godag.consts import Consts |
18
|
|
|
from goatools.grouper.grprobj_init import GrouperInit |
19
|
|
|
|
20
|
|
|
__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved." |
21
|
|
|
__author__ = "DV Klopfenstein" |
22
|
|
|
|
23
|
|
|
|
24
|
|
|
class Grouper(object):
    """Groups the user GO ids under other GO IDs acting as headers for the GO groups.

    Data members:
        grpname         Name of this grouping run (used in filenames and the summary).
        hdrobj          Header object; holds all possible header GOs, not just ones used.
        gosubdag        GO sub-DAG object; its ``rcntobj`` must not be None.
        usrgos          User GO IDs, as produced by GrouperInit.
        hdrgo2usrgos    dict: header GO ID -> user GO IDs grouped under it (not sorted).
        hdrgo_is_usrgo  set of GO IDs which are both header GO IDs and user GO IDs.
        go2nt           dict: GO ID -> namedtuple, as produced by GrouperInit.get_go2nt.
    """

    fmtsum = ("{GO_DESC} GOs({GOs:6,} in {SECs:2} sections, "
              "{UNGRP:>3} {undesc}) {ACTION} {FILE}\n")

    def __init__(self, grpname, goids, hdrobj, gosubdag, **kws):
        """Group the user GO IDs under header GO IDs.

        Args:
            grpname: Name of this grouping run.
            goids: User GO IDs; forwarded to GrouperInit.
            hdrobj: Header object containing all possible header GOs and sections.
            gosubdag: GO sub-DAG; ``gosubdag.rcntobj`` must not be None.
            **kws: Optional keyword args:
                fnc_most_specific: forwarded to GrouperInit (default 'dcnt').
                go2nt: forwarded to GrouperInit.get_go2nt (default None).
        """
        # Data members read
        self.grpname = grpname
        self.hdrobj = hdrobj  # Contains all possible hdrgos, not just ones used
        self.gosubdag = gosubdag
        assert self.gosubdag.rcntobj is not None, \
            "Grouper requires a gosubdag whose rcntobj is not None"
        # GrouperInit reads the data members set above through 'self'
        _ini = GrouperInit(goids, self, kws.get('fnc_most_specific', 'dcnt'))
        self.usrgos = _ini.usrgos
        # Initialize: hdrgo2usrgos hdrgo_is_usrgo
        # * hdrgo2usrgos: User GO IDs, grouped under high GO IDs (grouped, but not sorted)
        self.hdrgo2usrgos = _ini.hdrgo2usrgos
        self.hdrgo_is_usrgo = _ini.hdrgo_is_usrgo  # set of GO IDs -> both headers/user GO IDs
        # User GO IDs and their corresponding high GO IDs (not grouped or sorted)
        self.go2nt = _ini.get_go2nt(kws.get('go2nt', None))

    def get_usrgos_w_parents(self, hdrgos, usrgos_all=None):
        """Get usrgos w/parents in hdrgos, even if usrgos did not get grouped under hdrgos.

        Args:
            hdrgos: Collection of header GO IDs to look for among each user GO's parents.
            usrgos_all: User GO IDs to examine; defaults to ``self.usrgos``.

        Returns:
            set of user GO IDs having at least one parent in ``hdrgos``. A user GO ID
            with no entry in ``go2parents`` is treated as having no parents.
        """
        go2parents = self.gosubdag.rcntobj.go2parents
        if usrgos_all is None:
            usrgos_all = self.usrgos
        usrgos = set()
        for usrgo in usrgos_all:
            parents = go2parents.get(usrgo)
            # 'parents' is None when the GO ID is unknown to go2parents: skip it
            if parents and parents.intersection(hdrgos):
                usrgos.add(usrgo)
        return usrgos

    def get_sections_2d(self):
        """Get 2-D list of sections and hdrgos sets actually used in grouping.

        Returns:
            list of (section_name, hdrgos) tuples. For named sections, hdrgos is a
            list in the section's original order, without duplicates. Used header
            GOs found in no section are appended as a set under the default section
            name. If there are no sections, all used header GOs are returned as a
            set under the default section name.
        """
        hdrgos_act_all = self.get_hdrgos()
        if not self.hdrobj.sections:
            return [(self.hdrobj.secdflt, hdrgos_act_all)]
        sections_hdrgos_act = []
        hdrgos_act_secs = set()
        for section_name, hdrgos_all_lst in self.hdrobj.sections:
            hdrgos_act_set = set(hdrgos_all_lst).intersection(hdrgos_act_all)
            if not hdrgos_act_set:
                continue
            hdrgos_act_secs |= hdrgos_act_set
            # Use original order of header GOs found in sections; keep 1st occurrence
            hdrgos_act_lst = []
            hdrgos_seen = set()
            for hdrgo_p in hdrgos_all_lst:  # Header GO that may or may not be used.
                if hdrgo_p in hdrgos_act_set and hdrgo_p not in hdrgos_seen:
                    hdrgos_seen.add(hdrgo_p)
                    hdrgos_act_lst.append(hdrgo_p)
            sections_hdrgos_act.append((section_name, hdrgos_act_lst))
        # Used header GOs which were not placed in any section
        hdrgos_act_rem = hdrgos_act_all.difference(hdrgos_act_secs)
        if hdrgos_act_rem:
            sections_hdrgos_act.append((self.hdrobj.secdflt, hdrgos_act_rem))
        return sections_hdrgos_act

    def get_usrgos_g_section(self, section=None):
        """Get usrgos in a requested section.

        Args:
            section: Section name. None selects the default section name;
                True returns all user GO IDs (``self.usrgos``).

        Returns:
            set of user GO IDs in the section; an empty set if the section is not used.
        """
        if section is None:
            section = self.hdrobj.secdflt
        if section is True:
            return self.usrgos
        # Get dict of sections and hdrgos actually used in grouping
        section2hdrgos = cx.OrderedDict(self.get_sections_2d())
        hdrgos_lst = section2hdrgos.get(section, None)
        if hdrgos_lst is None:
            return set()
        hdrgos_set = set(hdrgos_lst)
        # Header GOs that are themselves user GOs, plus all user GOs under the headers
        usrgos = hdrgos_set.intersection(self.hdrgo_is_usrgo)
        for hdrgo in hdrgos_set.intersection(self.hdrgo2usrgos):
            usrgos.update(self.hdrgo2usrgos[hdrgo])
        return usrgos

    def get_section2usrnts(self):
        """Get dict section2usrnts.

        Returns:
            OrderedDict: section name -> list of ``self.go2nt.get(usrgo)`` values for
            the user GOs in that section (None for a user GO missing from go2nt).
        """
        sec_nts = []
        for section_name, _ in self.get_sections_2d():
            usrgos = self.get_usrgos_g_section(section_name)
            sec_nts.append((section_name, [self.go2nt.get(u) for u in usrgos]))
        return cx.OrderedDict(sec_nts)

    def get_section2items(self, itemkey):
        """Collect all items into a single set per section.

        Args:
            itemkey: Name of the namedtuple attribute holding an iterable of items.
                Namedtuples lacking the attribute contribute nothing.

        Returns:
            OrderedDict: section name -> set of items.
        """
        sec_items = []
        for section, usrnts in self.get_section2usrnts().items():
            items = {e for nt in usrnts for e in getattr(nt, itemkey, set())}
            sec_items.append((section, items))
        return cx.OrderedDict(sec_items)

    def get_hdrgos_g_usrgos(self, usrgos):
        """Return hdrgos which contain the usrgos.

        A GO ID which is itself a used header GO is returned as its own header.
        GO IDs which are neither a header nor grouped under one are ignored.
        """
        hdrgos_for_usrgos = set()
        hdrgos_all = self.get_hdrgos()
        usrgo2hdrgo = self.get_usrgo2hdrgo()
        for usrgo in usrgos:
            if usrgo in hdrgos_all:
                hdrgos_for_usrgos.add(usrgo)
                continue
            hdrgo_cur = usrgo2hdrgo.get(usrgo, None)
            if hdrgo_cur is not None:
                hdrgos_for_usrgos.add(hdrgo_cur)
        return hdrgos_for_usrgos

    def get_section_hdrgos_nts(self, sortby=None):
        """Get a flat list of sections and hdrgos actually used in grouping.

        Args:
            sortby: Key function used to sort the namedtuples within each section;
                defaults to sorting by descending ``dcnt``.

        Returns:
            list of "NtGoSec" namedtuples: the gosubdag fields plus 'Section'.
        """
        nts_all = []
        flds_all = ['Section'] + self.gosubdag.prt_attr['flds']
        ntobj = cx.namedtuple("NtGoSec", " ".join(flds_all))
        flds_go = None
        if sortby is None:
            sortby = lambda nt: -1*nt.dcnt
        for section_name, hdrgos_actual in self.get_sections_2d():
            nts_sec = []
            for hdrgo_nt in self.gosubdag.get_go2nt(hdrgos_actual).values():
                # Field names are read once, from the first namedtuple seen
                if flds_go is None:
                    flds_go = hdrgo_nt._fields
                key2val = dict(zip(flds_go, hdrgo_nt))
                key2val['Section'] = section_name
                nts_sec.append(ntobj(**key2val))
            nts_all.extend(sorted(nts_sec, key=sortby))
        return nts_all

    def get_sections_2d_nts(self, sortby=None):
        """Get high GO IDs that are actually used to group current set of GO IDs.

        Returns:
            list of (section_name, namedtuples) where the namedtuples come from
            ``self.gosubdag.get_nts(hdrgos, sortby=sortby)``.
        """
        return [(section_name, self.gosubdag.get_nts(hdrgos_actual, sortby=sortby))
                for section_name, hdrgos_actual in self.get_sections_2d()]

    def get_hdrgos(self):
        """Return high GO IDs that are actually used to group current set of GO IDs."""
        return set(self.hdrgo2usrgos).union(self.hdrgo_is_usrgo)

    def get_usrgos_g_hdrgos(self, hdrgos):
        """Return usrgos under provided hdrgos.

        Args:
            hdrgos: One header GO ID (str) or an iterable of header GO IDs.

        Returns:
            set of the user GO IDs grouped under the headers, plus each header
            which is itself a user GO ID.
        """
        usrgos_all = set()
        if isinstance(hdrgos, str):
            hdrgos = [hdrgos]
        for hdrgo in hdrgos:
            usrgos_cur = self.hdrgo2usrgos.get(hdrgo, None)
            if usrgos_cur is not None:
                usrgos_all |= usrgos_cur
            if hdrgo in self.hdrgo_is_usrgo:
                usrgos_all.add(hdrgo)
        return usrgos_all

    def get_hdrgos_unplaced(self):
        """Get hdrgos which are not headers in sections."""
        return self.get_hdrgos().difference(self.hdrobj.get_section_hdrgos())

    def get_hdrgos_u0(self):
        """Return header GO IDs which ARE NOT user GO IDs."""
        return set(self.hdrgo2usrgos).difference(self.usrgos)

    def get_hdrgos_u1(self):
        """Return header GO IDs which ARE user GO IDs (the stored set, not a copy)."""
        return self.hdrgo_is_usrgo

    def get_hdrgo2usrgos(self, hdrgos):
        """Return a subset of hdrgo2usrgos.

        Only header GOs actually used in grouping are kept. A header GO which is
        used solely because it is a user GO itself maps to None.
        """
        get_usrgos = self.hdrgo2usrgos.get
        hdrgos_actual = self.get_hdrgos().intersection(hdrgos)
        return {h: get_usrgos(h) for h in hdrgos_actual}

    def get_usrgo2hdrgo(self):
        """Return a dict with all user GO IDs as keys and their respective header GOs as values."""
        usrgo2hdrgo = {}
        for hdrgo, usrgos in self.hdrgo2usrgos.items():
            for usrgo in usrgos:
                # Each user GO is expected to be grouped under exactly one header GO
                assert usrgo not in usrgo2hdrgo
                usrgo2hdrgo[usrgo] = hdrgo
        # Add usrgos which are also a hdrgo and the GO group contains no other GO IDs
        for goid in self.hdrgo_is_usrgo:
            usrgo2hdrgo[goid] = goid
        assert len(self.usrgos) <= len(usrgo2hdrgo), \
            "USRGOS({U}) != USRGO2HDRGO({H}): {GOs}".format(
                U=len(self.usrgos),
                H=len(usrgo2hdrgo),
                GOs=self.usrgos.symmetric_difference(set(usrgo2hdrgo.keys())))
        return usrgo2hdrgo

    def get_go2sectiontxt(self):
        """Return a dict with actual header and user GO IDs as keys and their sections as values.

        The value is the space-joined section names of the GO ID's header GO.
        """
        go2txt = {}
        _get_secs = self.hdrobj.get_sections
        hdrgo2sectxt = {h: " ".join(_get_secs(h)) for h in self.get_hdrgos()}
        usrgo2hdrgo = self.get_usrgo2hdrgo()
        for goid, ntgo in self.go2nt.items():
            hdrgo = ntgo.GO if ntgo.is_hdrgo else usrgo2hdrgo[ntgo.GO]
            go2txt[goid] = hdrgo2sectxt[hdrgo]
        return go2txt

    def get_usrgo2sections(self):
        """Return a dict with all user GO IDs as keys and their sections as values.

        Returns:
            defaultdict(set): user GO ID -> set of section names of its header GO.
        """
        usrgo2sections = cx.defaultdict(set)
        get_sections = self.hdrobj.get_sections
        for usrgo, hdrgo in self.get_usrgo2hdrgo().items():
            usrgo2sections[usrgo] |= set(get_sections(hdrgo))
        assert len(usrgo2sections) >= len(self.usrgos), \
            "uGOS({U}) != uGO2sections({H}): {GOs}".format(
                U=len(self.usrgos),
                H=len(usrgo2sections),
                GOs=self.usrgos.symmetric_difference(set(usrgo2sections.keys())))
        return usrgo2sections

    def get_fout_base(self, goid, name=None, pre="gogrp"):
        """Get filename for a group of GO IDs under a single header GO ID.

        Args:
            goid: Header GO ID; must be a key of ``self.gosubdag.go2obj``.
            name: Name to embed in the filename; defaults to the group name.
            pre: Filename prefix.

        Returns:
            str: filename base (no extension) built from the prefix, namespace,
            name, section names, depth string, D1 field, and GO ID without ':'.
        """
        goobj = self.gosubdag.go2obj[goid]
        if name is None:
            name = self.grpname.replace(" ", "_")
        sections = "_".join(self.hdrobj.get_sections(goid))
        return "{PRE}_{BP}_{NAME}_{SEC}_{DSTR}_{D1s}_{GO}".format(
            PRE=pre,
            BP=Consts.NAMESPACE2NS[goobj.namespace],
            NAME=self._str_replace(name),
            # _str_replace is idempotent, so a single application suffices
            SEC=self._str_replace(sections),
            GO=goid.replace(":", ""),
            DSTR=self._get_depthsr(goobj),
            D1s=self.gosubdag.go2nt[goobj.id].D1)

    def _get_depthsr(self, goobj):
        """Return DNN or RNN depending on if relationships are loaded."""
        if 'reldepth' in self.gosubdag.prt_attr['flds']:
            return "R{R:02}".format(R=goobj.reldepth)
        return "D{D:02}".format(D=goobj.depth)

    @staticmethod
    def _str_replace(txt):
        """Makes a small text amenable to being used in a filename.

        Removes the characters ',', ':', '.', '/' and turns spaces into underscores.
        """
        for char in ",:./":
            txt = txt.replace(char, "")
        return txt.replace(" ", "_")

    def prt_summary(self, prt=sys.stdout):
        """Print summary of grouping/sorting run.

        Args:
            prt: Object with a ``write`` method; defaults to sys.stdout.
        """
        # Grouping summary
        fmtstr = "Grouped: {U:3,} User GOs, using {h:2,} of {H:,} Grouping GOs, for run: {NAME}\n"
        prt.write(fmtstr.format(
            NAME=self.grpname,
            U=len(self.usrgos),
            h=len(self.hdrobj.hdrgos.intersection(self.hdrgo2usrgos.keys())),
            H=self.hdrobj.num_hdrgos()))
282
|
|
|
|
283
|
|
|
# Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved. |
284
|
|
|
|