|
1
|
|
|
"""Used to operate on a sub-graph of a larger GO DAG.""" |
|
2
|
|
|
|
|
3
|
|
|
__copyright__ = "Copyright (C) 2016-2017, DV Klopfenstein, H Tang, All rights reserved." |
|
4
|
|
|
__author__ = "DV Klopfenstein" |
|
5
|
|
|
|
|
6
|
|
|
import sys |
|
7
|
|
|
import re |
|
8
|
|
|
import collections as cx |
|
9
|
|
|
from goatools.gosubdag.gosubdag import GoSubDag |
|
10
|
|
|
from goatools.wr_tbl import wr_xlsx as wr_xlsx_tbl |
|
11
|
|
|
from goatools.wr_tbl import wr_xlsx_sections as wr_xlsx_sections_tbl |
|
12
|
|
|
from goatools.wr_tbl import get_lines |
|
13
|
|
|
from goatools.wr_tbl import prt_txt |
|
14
|
|
|
|
|
15
|
|
|
class GoSubDagWr(object):
    """Write tables of GO terms taken from a sub-graph of a larger GO DAG.

    The sub-graph is built on demand with GoSubDag from the GO IDs given
    to each method and the ``go2obj`` mapping supplied at construction.
    """

    # Default column widths handed to the xlsx writers, keyed by field name.
    fld2col_widths = {
        'NS': 3,
        'dcnt': 6,
        'level': 4,
        'depth': 4,
        'GO': 12,
        'D1': 6,
        'GO_name': 45,
    }

    def __init__(self, go2obj):
        """Store the GO ID -> GO object mapping used to build each GoSubDag."""
        self.go2obj = go2obj

    def wr_xlsx(self, fout_xlsx, goids, sortby=None, **kws_usr):
        """Write the namedtuples for ``goids`` into one xlsx table.

        A user-supplied 'fld2col_widths' keyword is passed through as-is;
        the class-level defaults are used only when it is absent.
        Remaining keywords are forwarded unchanged to the table writer.
        """
        gosubdag = GoSubDag(goids, self.go2obj)
        nts = gosubdag.get_nts(goids, sortby)
        kws_wr = dict(kws_usr)
        kws_wr.setdefault('fld2col_widths', self.fld2col_widths)
        wr_xlsx_tbl(fout_xlsx, nts, **kws_wr)

    def wr_xlsx_sections(self, fout_xlsx, sections, sortby=None, **kws_usr):
        """Write sections of GO IDs into an xlsx file.

        ``sections`` is an iterable of (section name, GO IDs) pairs.
        Unlike wr_xlsx, a user-supplied 'fld2col_widths' is merged on top
        of the class-level defaults rather than replacing them.
        """
        nts = self.get_nts_sections(sections, sortby)
        kws_wr = dict(kws_usr)
        if 'fld2col_widths' in kws_usr:
            # Start from the defaults and let the user's widths win.
            widths = self.fld2col_widths.copy()
            widths.update(kws_usr['fld2col_widths'].items())
        else:
            widths = self.fld2col_widths
        kws_wr['fld2col_widths'] = widths
        wr_xlsx_sections_tbl(fout_xlsx, nts, **kws_wr)

    def get_nts_sections(self, sections, sortby=None):
        """Turn (section, GO IDs) pairs into (section, GO namedtuples) pairs.

        One GoSubDag is built from the union of all GO IDs and reused to
        look up the namedtuples of every section.
        """
        gosubdag = GoSubDag(self.get_goids_sections(sections), self.go2obj)
        sections_nts = []
        for section_name, section_goids in sections:
            sections_nts.append((section_name, gosubdag.get_nts(section_goids, sortby)))
        return sections_nts

    @staticmethod
    def get_goids_sections(sections):
        """Return the set of every GO ID found in a list of (section, GO IDs) pairs."""
        return set().union(*(goids_sec for _, goids_sec in sections))
|
64
|
|
|
|
|
65
|
|
|
|
|
66
|
|
|
def read_d1_letter(fin_txt):
    """Read GO ID -> letter alias pairs from a text file.

    The file is expected to be one created by GoDepth1LettersWr. A line
    contributes an entry when it contains a GO ID (``GO:`` followed by
    seven digits) and does not begin with a space; the alias is the
    line's first character. Returns a dict mapping GO ID to alias.
    """
    goid_pattern = re.compile(r"(GO:\d{7})")
    letters = {}
    with open(fin_txt) as ifstrm:
        for row in ifstrm:
            alias = row[:1]
            # Rows starting with a blank carry no alias
            if alias == ' ':
                continue
            found = goid_pattern.search(row)
            if found is None:
                continue
            letters[found.group(1)] = alias
    return letters
|
77
|
|
|
|
|
78
|
|
|
class GoDepth1LettersWr(object):
    """Writes reports for a GoDepth1Letters object.

    The object passed to the constructor must provide ``go2dcnt``,
    ``depth2goobjs`` and ``goone2ntletter``; reports can be written as
    plain text, xlsx, or a LaTeX table.
    """

    # Long namespace name -> two-letter abbreviation used in the reports
    str2ns = {'biological_process': 'BP', 'molecular_function': 'MF', 'cellular_component': 'CC'}
    # Default column headers for the xlsx report
    hdrs = ['D1', 'NS', 'descendants', 'depth', 'GO', 'GO description']

    # Field names of the row namedtuples built by get_d1nts
    _nt_flds = ("D1", "NS", "dcnt", "depth", "GO", "name")

    # Caption of the LaTeX table. Every backslash is doubled: a bare "\t"
    # in "\textit" would be read by Python as a TAB character.
    _tex_caption = (
        "The descendant counts of GO terms at depth-01 are highly skewed. The "
        "root term, \\textit{biological\\_process} has over twenty GO children at "
        "depth-01 shown in the table sorted by their number of descendants "
        "(dcnt) with \\textit{cellular process} at the top having 18k+ "
        "descendants and \\textit{cell killing} near the bottom having only "
        "about 100 descendants. The first column (D1) contains a letter used as "
        "an alias for each depth-01 GO term. The second column represents the "
        "number of descendants from the specified GO term from down to the total "
        "of its descendant leaf-level GO terms, which have no child GO terms.")

    def __init__(self, rcntobj):
        """Keep the depth-00 namedtuples and the depth-01 letter namedtuples."""
        self.ns2nt = self._init_ns2nt(rcntobj)
        self.goone2ntletter = rcntobj.goone2ntletter

    def prt_txt(self, prt=sys.stdout):
        """Print letters, descendant count, and GO information, one row per line."""
        for ntdata in self.get_d1nts():
            prt.write("{L:1} {d:6,} D{D:02} {GO} {NAME}\n".format(
                L=ntdata.D1,
                d=ntdata.dcnt,
                D=ntdata.depth,
                GO=ntdata.GO,
                NAME=ntdata.name))

    def wr_xlsx(self, fout_xlsx="gos_depth01.xlsx", **kws):
        """Write xlsx table of depth-01 GO terms and their letter representation.

        'fld2col_widths' and 'hdrs' get defaults when the caller does not
        supply them; all keywords are forwarded to the table writer.
        """
        data_nts = self.get_d1nts()
        kws.setdefault('fld2col_widths', {'D1': 6, 'NS':3, 'depth': 5, 'GO': 12, 'name': 40})
        kws.setdefault('hdrs', self.hdrs)
        wr_xlsx_tbl(fout_xlsx, data_nts, **kws)

    def wr_txt(self, fout_txt="gos_depth01.txt", title=None):
        """Write text table of depth-01 GO terms and their letter representation.

        An optional ``title`` is written first, followed by a legend, the
        column header, and the rows from get_d1nts.
        """
        data_nts = self.get_d1nts()
        with open(fout_txt, 'w') as prt:
            if title is not None:
                prt.write("{TITLE}\n\n".format(TITLE=title))
            prt.write(" D1 : Letter representing the depth-01 GO term\n")
            prt.write(" dcnt: Total number of all descendants\n")
            prt.write(" dep: Depth; The maximum length path to ")
            prt.write("leaf-level (childless) GO descendant(s)\n\n")
            prt.write("D1 NS dcnt dep GO ID Description\n")
            prt.write("- -- ------ --- ---------- ------------------------------\n")
            # Module-level table printer, not the prt_txt method of this class
            prt_txt(prt, data_nts)
        sys.stdout.write(" {N:>5} items WROTE: {TXT}\n".format(
            N=len(data_nts), TXT=fout_txt))

    def wr_tex(self, fout_tex="gos_depth01.tex"):
        """Write a LaTeX table of depth-01 GO terms and their letter representation.

        Rows come from get_d1nts and are formatted by get_lines with LaTeX
        column (" & ") and row (" \\\\") separators. Odd-numbered rows are
        shaded, and underscores in each row are escaped so LaTeX accepts them.
        """
        data_nts = self.get_d1nts()
        joinchr = " & "
        eol = " \\\\\n"
        # Take header names from the rows; fall back to the known field
        # names so that an empty table does not raise.
        hdr_flds = data_nts[0]._fields if data_nts else self._nt_flds
        with open(fout_tex, 'w') as prt:
            prt.write("\\begin{table}[!ht]\n")
            prt.write("\\begin{tabular}{|p{.5cm} | p{.5cm} | >{\\raggedleft}p{.9cm} ")
            prt.write("|p{.7cm} |p{1.8cm} |p{9cm}|}\n")
            prt.write("\\multicolumn{6}{c}{} \\\\\n")
            prt.write("\\hline\n")
            prt.write("\\rowcolor{gray!10}\n")
            prt.write("{HDRS}{EOL}".format(HDRS=joinchr.join(hdr_flds), EOL=eol))
            prt.write("\\hline\n")
            for idx, line in enumerate(get_lines(data_nts, joinchr=joinchr, eol=eol)):
                if idx%2 == 1:
                    prt.write("\\rowcolor{gray!7}\n")
                # str.replace returns a new string, so the escaped text must be
                # the one written
                prt.write(self._escape_tex(line))
            prt.write("\\hline\n")
            prt.write("\\end{tabular}\n")
            prt.write("\\caption{{{TEXT}}}\n\n".format(TEXT=self._tex_caption))
            prt.write("\\label{table:supptbl_d1}\n")
            prt.write("\\end{table}\n")
        sys.stdout.write(" {N:>5} items WROTE: {TXT}\n".format(
            N=len(data_nts), TXT=fout_tex))

    def get_d1nts(self):
        """Get letters for depth-01 GO terms, descendants count, and GO information.

        Returns a list of namedtuples with fields D1, NS, dcnt, depth, GO
        and name. Rows are sorted by namespace, then by descending
        descendant count, then by letter. The first time a namespace is
        seen, a row for its depth-00 term (with a blank letter) is
        inserted before the depth-01 rows.
        """
        data = []
        ntdata = cx.namedtuple("NtPrt", self._nt_flds)
        namespace = None
        for ntlet in sorted(self.goone2ntletter.values(),
                            key=lambda nt: [nt.goobj.namespace, -1 * nt.dcnt, nt.D1]):
            goobj = ntlet.goobj
            # Each depth-01 term is expected to have exactly one parent
            assert len(goobj.parents) == 1, \
                "expected exactly one parent for {GO}".format(GO=goobj.id)
            if namespace != goobj.namespace:
                # New namespace: emit its depth-00 term first
                namespace = goobj.namespace
                ntns = self.ns2nt[namespace]
                pobj = ntns.goobj
                data.append(ntdata._make(
                    [" ", self.str2ns[namespace], ntns.dcnt, pobj.depth, pobj.id, pobj.name]))
            data.append(ntdata._make(
                [ntlet.D1, self.str2ns[namespace], ntlet.dcnt, goobj.depth, goobj.id, goobj.name]))
        return data

    @staticmethod
    def _escape_tex(text):
        """Return ``text`` with underscores escaped for use in LaTeX."""
        return text.replace('_', '\\_')

    @staticmethod
    def _init_ns2nt(rcntobj):
        """Save depth-00 GO terms with their descendants cnt, keyed by namespace.

        Entries keep the order of ``rcntobj.depth2goobjs[0]``.
        """
        go2dcnt = rcntobj.go2dcnt
        ntobj = cx.namedtuple("NtD1", "D1 dcnt goobj")
        d0s = rcntobj.depth2goobjs[0]
        ns_nt = [(o.namespace, ntobj(D1="", dcnt=go2dcnt[o.id], goobj=o)) for o in d0s]
        return cx.OrderedDict(ns_nt)
|
189
|
|
|
|
|
190
|
|
|
# Copyright (C) 2016-2017, DV Klopfenstein, H Tang, All rights reserved. |
|
191
|
|
|
|