1
|
|
|
"""Tasks for go2obj dicts.""" |
2
|
|
|
|
3
|
|
|
__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved." |
4
|
|
|
__author__ = "DV Klopfenstein" |
5
|
|
|
|
6
|
|
|
import sys |
7
|
|
|
import collections as cx |
8
|
|
|
from goatools.godag.go_tasks import get_go2parents |
9
|
|
|
from goatools.godag.go_tasks import get_go2children |
10
|
|
|
|
11
|
|
|
|
12
|
|
|
# ------------------------------------------------------------------------------------ |
13
|
|
|
def get_sorted_relationship(goterms):
    """Return the given GO Terms, plus the terms above them, in topological order.

    The work is delegated to TopologicalSortRelationships; the resulting list
    places each GO Term after the terms returned by its get_goterms_upper().
    """
    sorter = TopologicalSortRelationships(goterms)
    return sorter.goterms_sorted
16
|
|
|
|
17
|
|
|
class TopologicalSortRelationships(object):
    """Order GO Terms so that each term follows the terms above it.

    Attributes set by __init__:
        goterms_sorted -- list of GO Term objects, upper terms first
        goids_seen     -- set of GO IDs that have already been visited
    """

    # pylint: disable=too-few-public-methods
    def __init__(self, goterms):
        self.goterms_sorted = []
        self.goids_seen = set()
        self._init_sorted_relationship(goterms)

    def _init_sorted_relationship(self, goterms):
        """Visit every provided GO Term, filling goterms_sorted as a side effect."""
        # NOTE: GODag must be loaded with 'relationship' to use this function
        for term in goterms:
            self._get_sorted_relationships(term)

    def _get_sorted_relationships(self, goterm):
        """Visit all GO Terms above goterm first, then append goterm to the sorted list."""
        goid = goterm.id
        # Each GO ID is handled only once, no matter how many paths reach it
        if goid not in self.goids_seen:
            self.goids_seen.add(goid)
            for upper in goterm.get_goterms_upper():
                self._get_sorted_relationships(upper)
            # All upper terms have been emitted by now, so goterm may follow them
            self.goterms_sorted.append(goterm)
40
|
|
|
|
41
|
|
|
|
42
|
|
|
# ------------------------------------------------------------------------------------ |
43
|
|
|
def update_association(assc_gene2gos, go2obj):
    """Add the GO parents of a gene's associated GO IDs to the gene's association.

    Args:
        assc_gene2gos: dict whose values are sets of GO IDs (one set per gene).
            Each set is updated in place with the parent GO IDs.
        go2obj: dict mapping GO IDs to GO Term objects.

    GO IDs that appear in the association but are not keys of go2obj are left
    untouched in the association and are reported on stderr.
    """
    # Replaces update_association in GODag
    goid_sets = list(assc_gene2gos.values())
    # An empty association has nothing to update; the union below would
    # otherwise be computed over zero sets.
    if not goid_sets:
        return
    goids_avail = set(go2obj)
    # Get all assc GO IDs that are current
    goids_assoc_all = set().union(*goid_sets)
    goids_assoc_cur = goids_assoc_all.intersection(goids_avail)
    # Get the subset of GO objects in the association
    go2obj_assc = {go: go2obj[go] for go in goids_assoc_cur}
    go2parents = get_go2parents_go2obj(go2obj_assc)
    # Update the association: update the GO set for each gene
    for goids_cur in goid_sets:
        parents = set()
        for goid in goids_cur.intersection(goids_avail):
            parents.update(go2parents[goid])
        goids_cur.update(parents)
    goids_bad = goids_assoc_all.difference(goids_avail)
    if goids_bad:
        # Sorted so the report is reproducible from run to run
        sys.stderr.write("{N} GO IDs NOT FOUND IN ASSOCIATION: {GOs}\n".format(
            N=len(goids_bad), GOs=" ".join(sorted(goids_bad))))
64
|
|
|
|
65
|
|
|
# ------------------------------------------------------------------------------------ |
66
|
|
|
def get_go2obj_unique(go2obj):
    """Return one GO ID key for each distinct GO Term found in go2obj.

    Several keys of go2obj may refer to GO Terms having the same ``id``
    (e.g. a main GO ID and alternate GO IDs). For each such group, the main
    GO ID is chosen if it is a key of go2obj; otherwise the smallest of the
    group's keys is chosen so the result is the same on every run.

    Args:
        go2obj: dict mapping GO IDs to objects having an ``id`` attribute.

    Returns:
        A set of GO IDs, all of which are keys of go2obj.
    """
    # Group the go2obj keys by the main GO ID of the GO Term they refer to
    goid2gokeys = cx.defaultdict(set)
    for goid, goobj in go2obj.items():
        goid2gokeys[goobj.id].add(goid)
    go_unique = set()
    for goid_main, gokeys in goid2gokeys.items():
        # Return main GO ID, if it is present in the go2obj keys.
        # Otherwise return an alternate GO ID; min() rather than an arbitrary
        # set element keeps the choice independent of set iteration order.
        go_unique.add(goid_main if goid_main in gokeys else min(gokeys))
    return go_unique
81
|
|
|
|
82
|
|
|
# ------------------------------------------------------------------------------------ |
83
|
|
|
def get_go2parents_go2obj(go2obj):
    """Return go2parents (set of parent GO IDs) for all GO ID keys in go2obj.

    The GO Term objects are first separated from the alternate-ID keys, the
    parents are computed by get_go2parents on the GO Term objects, and the
    result is then extended with entries for alternate GO IDs.
    """
    goobjs, altgo2goobj = get_goobjs_altgo2goobj(go2obj)
    # add_alt_goids updates the dict it is given and returns that same dict
    return add_alt_goids(get_go2parents(goobjs), altgo2goobj)
89
|
|
|
|
90
|
|
|
# ------------------------------------------------------------------------------------ |
91
|
|
|
def get_go2children_go2obj(go2obj):
    """Return go2children (set of child GO IDs) for all GO ID keys in go2obj.

    The GO Term objects are first separated from the alternate-ID keys, the
    children are computed by get_go2children on the GO Term objects, and the
    result is then extended with entries for alternate GO IDs.
    """
    goobjs, altgo2goobj = get_goobjs_altgo2goobj(go2obj)
    # add_alt_goids updates the dict it is given and returns that same dict
    return add_alt_goids(get_go2children(goobjs), altgo2goobj)
97
|
|
|
|
98
|
|
|
# ------------------------------------------------------------------------------------ |
99
|
|
|
def get_goobjs_altgo2goobj(go2obj):
    """Split go2obj into its distinct GO Term objects and its alternate-ID entries.

    Returns:
        (goobjs, altgo2goobj) where goobjs is the set of all values of go2obj
        and altgo2goobj holds only the items whose key differs from the
        object's own ``id``.
    """
    goobjs = set(go2obj.values())
    altgo2goobj = {goid: goobj for goid, goobj in go2obj.items() if goid != goobj.id}
    return goobjs, altgo2goobj
108
|
|
|
|
109
|
|
|
def add_alt_goids(go2values, altgo2goobj):
    """Give every alternate GO ID the same value as its main GO ID.

    For each GO Term object in altgo2goobj, the value stored in go2values
    under the object's ``id`` is also stored under each of the object's
    ``alt_ids``. The value object is shared, not copied.

    Returns:
        go2values, the same dict that was passed in (updated in place).
    """
    for goobj in altgo2goobj.values():
        shared = go2values[goobj.id]
        go2values.update((goid_alt, shared) for goid_alt in goobj.alt_ids)
    return go2values
116
|
|
|
|
117
|
|
|
# ------------------------------------------------------------------------------------ |
118
|
|
|
def fill_main_goids(go2obj, goids):
    """Ensure main GO IDs are included in go2obj.

    Every GO ID in goids must already be a key of go2obj. If the GO Term it
    refers to has a main GO ID (``id``) that is not yet a key, that main GO ID
    is added, pointing to the same GO Term object.
    """
    # User GO IDs (goids) may be either main GO IDs or alternate GO IDs.
    for goid in goids:
        goobj = go2obj[goid]
        # When goid is itself the main GO ID it is already a key, so only a
        # missing main GO ID (reached through an alternate ID) gets inserted.
        go2obj.setdefault(goobj.id, goobj)
127
|
|
|
|
128
|
|
|
def fill_altgoids(go2obj):
    """Add an entry to go2obj for every alternate GO ID of the GO Terms it holds.

    Each ID in a GO Term's ``alt_ids`` becomes a key pointing to that GO Term.
    go2obj is updated in place; nothing is returned.
    """
    # Collect first, then update: go2obj must not grow while it is iterated
    alt2obj = {}
    for goobj in go2obj.values():
        for altgo in goobj.alt_ids:
            alt2obj[altgo] = goobj
    go2obj.update(alt2obj)
133
|
|
|
|
134
|
|
|
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
135
|
|
|
def fill_relationshipobjs(go2obj, relationships):
    """Add GO IDs to go2obj that are involved in relationships.

    Args:
        go2obj: dict mapping GO IDs to GO Term objects having ``relationship``
            and ``relationship_rev`` attributes. Updated in place.
        relationships: container of relationship types to follow; types not
            in this container are ignored.
    """
    # Get all GO Term record objects that have relationships
    obj = RelationshipFill(go2obj, relationships)
    # Iterate over a snapshot: the fill methods insert new keys into go2obj,
    # and a dict may not change size while its live view is being iterated.
    for goobj in list(go2obj.values()):
        if goobj.relationship:
            obj.fill_relationshipgo2obj(goobj)
        if goobj.relationship_rev:
            obj.fill_relationshiprevgo2obj(goobj)
144
|
|
|
|
145
|
|
|
class RelationshipFill(object):
    """Fill go2obj with GO IDs in relationships."""

    def __init__(self, go2obj, relationships):
        # This dict shall be augmented with higher parent/relationship GO IDs
        self.go2obj = go2obj
        # A set of relationships we would like to keep
        self.relationships = relationships

    def fill_relationshipgo2obj(self, goobj):
        """Add to go2obj every GO Term reachable from goobj through ``relationship``."""
        self._fill(goobj.relationship, self.fill_relationshipgo2obj)

    def fill_relationshiprevgo2obj(self, goobj):
        """Add to go2obj every GO Term reachable from goobj through ``relationship_rev``."""
        self._fill(goobj.relationship_rev, self.fill_relationshiprevgo2obj)

    def _fill(self, reltyp2goobjs, descend):
        """Store unseen GO Terms of the wanted relationship types, then descend into each."""
        for reltyp, relgoobjs in reltyp2goobjs.items():
            if reltyp not in self.relationships:
                continue
            for relgoobj in relgoobjs:
                # A GO ID already in go2obj is neither overwritten nor revisited
                if relgoobj.id in self.go2obj:
                    continue
                self.go2obj[relgoobj.id] = relgoobj
                descend(relgoobj)
171
|
|
|
|
172
|
|
|
# ------------------------------------------------------------------------------------ |
173
|
|
|
def get_child_objs(parent_obj):
    """Return a new dict of all child key and alt GO IDs under parent_obj and their objects.

    The dict is first filled by fill_childgoid2obj starting from parent_obj
    and is then extended by fill_altgoids.
    """
    goid2obj = {}
    fill_childgoid2obj(goid2obj, parent_obj)
    fill_altgoids(goid2obj)
    return goid2obj
179
|
|
|
|
180
|
|
|
def fill_childgoid2obj(childgoid2obj, parent_obj):
    """Add every GO Term found below parent_obj to childgoid2obj, keyed by its ``id``.

    The ``children`` of each GO Term are followed recursively. childgoid2obj
    is updated in place; parent_obj itself is not added.
    """
    for child in parent_obj.children:
        goid = child.id
        # A GO ID already recorded has had its descendants handled as well
        if goid in childgoid2obj:
            continue
        childgoid2obj[goid] = child
        fill_childgoid2obj(childgoid2obj, child)
186
|
|
|
|
187
|
|
|
# ------------------------------------------------------------------------------------ |
188
|
|
|
def get_leaf_children(gos_user, go2obj_arg):
    """Find all the GO descendants under all user GO IDs. Return leaf-level GO IDs.

    Args:
        gos_user: iterable of GO IDs; each must be a key of go2obj_arg.
        go2obj_arg: dict mapping GO IDs to GO Term objects.

    Returns:
        The set of descendant GO IDs whose GO Term has no ``children``.
    """
    descendants = {}
    for goid_usr in gos_user:
        fill_childgoid2obj(descendants, go2obj_arg[goid_usr])
    return {goid for goid, goobj in descendants.items() if not goobj.children}
195
|
|
|
|
196
|
|
|
# ------------------------------------------------------------------------------------ |
197
|
|
|
def goid_is_valid(goid):
    """Check format of a user-provided GO ID.

    Returns:
        True if goid is "GO:" followed by exactly seven ASCII digits
        (ten characters in total), otherwise False.
    """
    digits = goid[3:]
    # str.isdigit() also accepts non-ASCII digit characters (e.g. superscript
    # two), so membership in the ASCII digits is tested explicitly.
    return (goid[:3] == "GO:" and len(goid) == 10
            and all(char in "0123456789" for char in digits))
200
|
|
|
|
201
|
|
|
def goids_valid(goids):
    """Return True if every user-provided GO ID passes goid_is_valid, else False.

    An empty collection of GO IDs yields True.
    """
    return all(goid_is_valid(goid) for goid in goids)
207
|
|
|
|
208
|
|
|
def chk_goids(goids, msg=None, raise_except=True):
    """Check that all GO IDs have the proper format.

    Args:
        goids: iterable of GO IDs to check with goid_is_valid.
        msg: text included in the exception message.
        raise_except: if true, raise on the first badly formatted GO ID;
            otherwise return that GO ID.

    Returns:
        The first badly formatted GO ID when raise_except is false;
        None when every GO ID is properly formatted.

    Raises:
        RuntimeError: on the first badly formatted GO ID when raise_except is true.
    """
    for goid in goids:
        if goid_is_valid(goid):
            continue
        if not raise_except:
            return goid
        raise RuntimeError("BAD GO({GO}): {MSG}".format(GO=goid, MSG=msg))
    return None
216
|
|
|
|
217
|
|
|
# Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved. |
218
|
|
|
|