1
|
|
|
"""Creates and manages edges from one GO term to another GO term.""" |
2
|
|
|
|
3
|
|
|
__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved." |
4
|
|
|
__author__ = "DV Klopfenstein" |
5
|
|
|
|
6
|
|
|
from collections import defaultdict |
7
|
|
|
|
8
|
|
|
|
9
|
|
|
def get_edgesobj(gosubdag, **kws):
    """Build the edges object for a GoSubDag, then drop edges touching unwanted GO IDs.

    The edges object itself is created by _get_edgesobj, which receives
    ``gosubdag`` and every keyword argument unchanged.

    Keyword args (kws):
        dst_srcs_list: Used for edges pruned such that only GO terms
            between the sets of dst & srcs are retained.
        traverse_parent, traverse_child: Used to generate edges for all
            parent terms and/or all child terms, without pruning any paths.
        rm_gos: Optional collection of GO IDs. When present (not None),
            it is passed to the ``rm_gos`` method of the new edges object.

    Returns:
        The edges object returned by _get_edgesobj.
    """
    edges = _get_edgesobj(gosubdag, **kws)
    unwanted_goids = kws.get('rm_gos')
    # Nothing to remove: hand the edges object back untouched
    if unwanted_goids is None:
        return edges
    edges.rm_gos(unwanted_goids)
    return edges
29
|
|
|
|
30
|
|
|
def _get_edgesobj(gosubdag, **kws):
    """Construct the edges class selected by the keyword arguments.

    Keyword args (kws):
        dst_srcs_list: When present (not None), an EdgesPath is built from
            it, so that only GO terms between the sets of dst & srcs are
            retained. The traverse_* arguments are ignored in that case.
        traverse_parent: Passed to EdgesRelatives; defaults to True.
        traverse_child: Passed to EdgesRelatives; defaults to False.

    Returns:
        An EdgesPath when dst_srcs_list is given, otherwise an EdgesRelatives.
    """
    pruned_paths = kws.get('dst_srcs_list')
    if pruned_paths is None:
        # No pruning requested: use all relatives above and/or below the sources
        go_up = kws.get('traverse_parent', True)
        go_down = kws.get('traverse_child', False)
        return EdgesRelatives(gosubdag, go_up, go_down)
    return EdgesPath(gosubdag, pruned_paths)
51
|
|
|
|
52
|
|
|
# -- Base Class ---------------------------------------------------------------- |
53
|
|
|
class EdgesBase(object):
    """Shared storage and helper methods for the GO-to-GO edge classes."""

    def __init__(self, gosubdag):
        """Keep references to the GoSubDag and start with empty edge containers."""
        self.gosubdag = gosubdag
        self.go2obj = gosubdag.go2obj
        self.relationships = gosubdag.relationships
        # Both containers are filled in by the derived edge classes
        self.edges = []  # (goid_child, goid_parent) pairs
        self.edges_rel = {}  # relationship name -> list of edge pairs

    def rm_gos(self, rm_goids):
        """Drop every stored edge that touches one of the GO IDs in rm_goids."""
        self.edges = self._rm_gos_edges(rm_goids, self.edges)
        self.edges_rel = self._rm_gos_edges_rel(rm_goids, self.edges_rel)

    def _rm_gos_edges_rel(self, rm_goids, edges_rel):
        """Filter each relationship's edges; omit relationships left with no edges."""
        filtered = (
            (relname, self._rm_gos_edges(rm_goids, rel_edges))
            for relname, rel_edges in edges_rel.items())
        return {relname: kept for relname, kept in filtered if kept}

    @staticmethod
    def _rm_gos_edges(rm_goids, edges_all):
        """Return edges, ordered by their second element, touching no GO ID in rm_goids."""
        ordered = sorted(edges_all, key=lambda edge: edge[1])
        return [
            (child, parent) for child, parent in ordered
            if not (child in rm_goids or parent in rm_goids)]

    def get_all_edge_nodes(self):
        """Return the set of all GO IDs found on any edge, including relationship edges."""
        nodes = set()
        for edge in self.edges:
            nodes.update(edge)
        for rel_edges in self.edges_rel.values():
            for edge in rel_edges:
                nodes.update(edge)
        return nodes

    def chk_edges(self):
        """Assert that every GO ID used by an edge is a key of go2obj."""
        known_goids = set(self.go2obj)
        self.chk_edges_nodes(self.edges, known_goids, "is_a")
        for relname, rel_edges in self.edges_rel.items():
            self.chk_edges_nodes(rel_edges, known_goids, relname)

    @staticmethod
    def chk_edges_nodes(edges, nodes, name):
        """Assert that all GO IDs on the given edges are present in nodes.

        ``name`` labels the edge type in the assertion message.
        """
        used = set()
        for edge in edges:
            used.update(edge)
        absent = used.difference(nodes)
        assert not absent, "MISSING: {GOs}\n{NM} EDGES MISSING {N} NODES (OF {T})".format(
            GOs=absent, NM=name, N=len(absent), T=len(used))

    def get_c2ps(self):
        """Return a child -> set-of-parents mapping built from self.edges."""
        child2parents = defaultdict(set)
        for edge in self.edges:
            child, parent = edge
            child2parents[child].add(parent)
        return child2parents

    def _getobjs_higher(self, goobj):
        """Return the parents of goobj plus the targets of its selected relationships.

        Only relationship types present in self.relationships contribute.
        """
        higher = set(goobj.parents)
        wanted = self.relationships
        for reltype, rel_terms in goobj.relationship.items():
            if reltype not in wanted:
                continue
            higher.update(rel_terms)
        return higher
124
|
|
|
|
125
|
|
|
|
126
|
|
|
|
127
|
|
|
# -- Initialization by considering all child and/or parent relatives ----------- |
128
|
|
|
class EdgesRelatives(EdgesBase):
    """Inits GO-to-GO edges using all relatives above and/or below source GOs."""

    def __init__(self, gosubdag, traverse_parent, traverse_child):
        """Collect is_a and relationship edges by walking up and/or down the DAG.

        Args:
            gosubdag: Object providing go2obj, relationships and go_sources.
            traverse_parent: When true, walk up from every GO ID in go2obj.
            traverse_child: When true, walk down from the GO IDs in go_sources.

        Raises:
            AssertionError: If neither direction is requested, or if the set of
                go2obj keys differs after the edges have been built.
        """
        super(EdgesRelatives, self).__init__(gosubdag)
        # Snapshot of the GO IDs in the subset, re-checked once the edges are built
        _gos = set(gosubdag.go2obj)
        assert traverse_child or traverse_parent, "NO EDGES IN GRAPH"
        # parent -> children, found by walking up from every GO ID in the subset
        p2cs = self._init_p2cs(_gos, traverse_parent)
        # child -> parents, found by walking down from the source GO IDs
        c2ps = self._init_c2ps(gosubdag.go_sources, traverse_child)
        # relationship name -> {relationship target -> GO IDs pointing at it}
        rel2src2dsts = self._init_rel2src2dsts(_gos, traverse_parent)
        rel2dst2srcs = self._init_rel2dst2srcs(_gos, traverse_child)
        self.edges = self._init_edges(p2cs, c2ps)
        self.edges_rel = self._init_edges_relationships(rel2src2dsts, rel2dst2srcs)
        assert _gos == set(self.go2obj)
        # chk_edges() is not called here: checking GO IDs while building was too slow

    @staticmethod
    def _init_edges(p2cs, c2ps):
        """Get the directed (child, parent) edges from GO term to GO term.

        Args:
            p2cs: Mapping of parent GO ID -> set of child GO IDs.
            c2ps: Mapping of child GO ID -> set of parent GO IDs.

        Returns:
            A list of (goid_child, goid_parent) tuples. An edge found through
            both mappings appears twice.
        """
        edge_from_to = []
        for parent, children in p2cs.items():
            edge_from_to.extend((child, parent) for child in children)
        # c2ps is keyed by the child, so each key is the first element of its edges.
        # (Reading it as parent -> children would emit reversed (parent, child) edges.)
        for child, parents in c2ps.items():
            edge_from_to.extend((child, parent) for parent in parents)
        return edge_from_to

    @staticmethod
    def _init_edges_relationships(rel2src2dsts, rel2dst2srcs):
        """Get the directed edges from GO term to GO term using relationships.

        Both arguments map relationship name -> {GO ID a relationship points
        to -> set of GO IDs that point at it}.

        Returns:
            A dict of relationship name -> list of (from, to) tuples, with the
            edges of both arguments concatenated.
        """
        edge_rel2fromto = {}
        for reltype in set(rel2src2dsts).union(rel2dst2srcs):
            edge_from_to = []
            for rel2hi2los in (rel2src2dsts, rel2dst2srcs):
                if reltype in rel2hi2los:
                    for parent, children in rel2hi2los[reltype].items():
                        edge_from_to.extend((child, parent) for child in children)
            edge_rel2fromto[reltype] = edge_from_to
        return edge_rel2fromto

    @staticmethod
    def _mark_seen(goobj, goids_seen):
        """Add the GO ID and alt IDs of goobj to goids_seen; return the GO ID."""
        goids_seen.add(goobj.id)
        goids_seen.update(goobj.alt_ids)
        return goobj.id

    # -------------------------------------------------------------------
    def _init_rel2src2dsts(self, go_sources, traverse_parent):
        """Traverse up the selected relationships, starting at each source GO ID.

        Returns an empty dict when traverse_parent is false or no
        relationships are selected.
        """
        if not traverse_parent or not self.relationships:
            return {}
        rel2src2dsts = {r: defaultdict(set) for r in self.relationships}
        goids_seen = set()
        go2obj = self.go2obj
        for goid_src in go_sources:
            goobj_src = go2obj[goid_src]
            if goobj_src.relationship and goid_src not in goids_seen:
                self._traverse_relationship_objs(rel2src2dsts, goobj_src, goids_seen)
        return rel2src2dsts

    def _traverse_relationship_objs(self, rel2src2dsts, goobj_child, goids_seen):
        """Traverse from source GO up relationships, recording each link found."""
        child_id = self._mark_seen(goobj_child, goids_seen)
        # Loop through relationships of child object
        for reltype, recs in goobj_child.relationship.items():
            if reltype in self.relationships:
                for relationship_obj in recs:
                    relationship_id = relationship_obj.id
                    rel2src2dsts[reltype][relationship_id].add(child_id)
                    # If relationship has not been seen, traverse
                    if relationship_id not in goids_seen:
                        self._traverse_relationship_objs(
                            rel2src2dsts, relationship_obj, goids_seen)

    # -------------------------------------------------------------------
    def _init_rel2dst2srcs(self, go_sources, traverse_child):
        """Traverse the selected relationships for the child direction.

        Returns an empty dict when traverse_child is false or no
        relationships are selected.
        """
        if not traverse_child or not self.relationships:
            return {}
        rel2dst2srcs = {r: defaultdict(set) for r in self.relationships}
        goids_seen = set()
        go2obj = self.go2obj
        for goid_src in go_sources:
            goobj_src = go2obj[goid_src]
            if goid_src not in goids_seen:
                self._traverse_relationship_rev_objs(rel2dst2srcs, goobj_src, goids_seen)
        return rel2dst2srcs

    def _traverse_relationship_rev_objs(self, rel2dst2srcs, goobj_parent, goids_seen):
        """Traverse from source GO through its relationships, recording each link.

        NOTE(review): this follows ``goobj.relationship``, the same attribute
        used by the upward traversal; confirm whether a reverse-relationship
        attribute was intended for the child direction.
        """
        parent_id = self._mark_seen(goobj_parent, goids_seen)
        for reltype, recs in goobj_parent.relationship.items():
            if reltype in self.relationships:
                for relrev_obj in recs:
                    relrev_id = relrev_obj.id
                    # rel2dst2srcs is keyed by relationship name first; indexing it
                    # directly with a GO ID raised KeyError
                    rel2dst2srcs[reltype][relrev_id].add(parent_id)
                    # If the related term has not been seen, traverse
                    if relrev_id not in goids_seen:
                        self._traverse_relationship_rev_objs(
                            rel2dst2srcs, relrev_obj, goids_seen)

    # -------------------------------------------------------------------
    def _init_p2cs(self, go_sources, traverse_parent):
        """Traverse up parents; return parent GO ID -> set of child GO IDs.

        Returns an empty dict when traverse_parent is false.
        """
        if not traverse_parent:
            return {}
        p2cs = defaultdict(set)
        goids_seen = set()
        go2obj = self.go2obj
        for goid_src in go_sources:
            goobj_src = go2obj[goid_src]
            if goid_src not in goids_seen:
                self._traverse_parent_objs(p2cs, goobj_src, goids_seen)
        return p2cs

    def _traverse_parent_objs(self, p2cs, goobj_child, goids_seen):
        """Traverse from source GO up parents, updating p2cs and goids_seen."""
        child_id = self._mark_seen(goobj_child, goids_seen)
        # Loop through parents of child object
        for parent_obj in goobj_child.parents:
            parent_id = parent_obj.id
            p2cs[parent_id].add(child_id)
            # If parent has not been seen, traverse
            if parent_id not in goids_seen:
                self._traverse_parent_objs(p2cs, parent_obj, goids_seen)

    # -------------------------------------------------------------------
    def _init_c2ps(self, go_sources, traverse_child):
        """Traverse down children; return child GO ID -> set of parent GO IDs.

        Returns an empty dict when traverse_child is false.
        """
        if not traverse_child:
            return {}
        c2ps = defaultdict(set)
        goids_seen = set()
        go2obj = self.go2obj
        for goid_src in go_sources:
            goobj_src = go2obj[goid_src]
            if goid_src not in goids_seen:
                self._traverse_child_objs(c2ps, goobj_src, goids_seen)
        return c2ps

    def _traverse_child_objs(self, c2ps, goobj_parent, goids_seen):
        """Traverse from source GO down children, updating c2ps and goids_seen."""
        parent_id = self._mark_seen(goobj_parent, goids_seen)
        # Loop through children
        for child_obj in goobj_parent.children:
            child_id = child_obj.id
            c2ps[child_id].add(parent_id)
            # If child has not been seen, traverse
            if child_id not in goids_seen:
                self._traverse_child_objs(c2ps, child_obj, goids_seen)
324
|
|
|
|
325
|
|
|
|
326
|
|
|
# -- Initialization with relatives on specific src-dst paths -------------------
327
|
|
|
class EdgesPath(EdgesBase):
    """Inits GO-to-GO edges using a list of (parent destination, child sources)"""

    def __init__(self, gosubdag, dst_srcs_list):
        """Build the edges for every (dst, srcs) entry of dst_srcs_list."""
        super(EdgesPath, self).__init__(gosubdag)
        # Placeholders; both attributes are assigned by _init_edges
        self.edges = None
        self.goid_all = None
        self._init_edges(dst_srcs_list)

    def get_edges(self):
        """Return the list of GO ID pairs stored in self.edges."""
        return self.edges

    def _init_edges(self, dst_srcs_list):
        """Set self.edges and self.goid_all from a list of (dst, srcs) pairs.

        For each pair, the GO term objects of ``srcs`` are looked up in go2obj
        and handed to get_paths_goobjs together with ``dst`` as ``go_top``.
        The resulting paths are turned into edges by paths2edges. Edges and
        GO IDs are accumulated as sets over all pairs.
        """
        from goatools.gosubdag.go_paths import get_paths_goobjs, paths2edges
        id2term = self.go2obj
        edge_objs = set()
        goids_found = set()
        for dst, srcs in dst_srcs_list:
            src_terms = {goid: id2term[goid] for goid in srcs}
            paths, path_goids = get_paths_goobjs(
                src_terms.values(), go_top=dst, go2obj=id2term)
            edge_objs |= paths2edges(paths)
            goids_found |= path_goids
        # Edges arrive as pairs of objects; store only their GO IDs
        self.edges = [(first.id, second.id) for first, second in edge_objs]
        self.goid_all = goids_found
359
|
|
|
|
360
|
|
|
# Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved. |
361
|
|
|
|