|
1
|
|
|
#!/usr/bin/env python |
|
2
|
|
|
# -*- coding: UTF-8 -*- |
|
3
|
|
|
|
|
4
|
|
|
""" |
|
5
|
|
|
python %prog study.file population.file gene-association.file |
|
6
|
|
|
|
|
7
|
|
|
This program returns P-values for functional enrichment in a cluster of |
|
8
|
|
|
study genes using Fisher's exact test, and corrected for multiple testing |
|
9
|
|
|
(including Bonferroni, Holm, Sidak, and false discovery rate) |
|
10
|
|
|
""" |
|
11
|
|
|
|
|
12
|
|
|
from __future__ import print_function |
|
13
|
|
|
from __future__ import absolute_import |
|
14
|
|
|
|
|
15
|
|
|
|
|
16
|
|
|
__copyright__ = "Copyright (C) 2010-2018, H Tang et al., All rights reserved." |
|
17
|
|
|
__author__ = "various" |
|
18
|
|
|
|
|
19
|
|
|
import sys |
|
20
|
|
|
import collections as cx |
|
21
|
|
|
import datetime |
|
22
|
|
|
|
|
23
|
|
|
from goatools.multiple_testing import Methods |
|
24
|
|
|
from goatools.multiple_testing import Bonferroni |
|
25
|
|
|
from goatools.multiple_testing import Sidak |
|
26
|
|
|
from goatools.multiple_testing import HolmBonferroni |
|
27
|
|
|
from goatools.multiple_testing import FDR |
|
28
|
|
|
from goatools.multiple_testing import calc_qval |
|
29
|
|
|
from goatools.ratio import get_terms, count_terms, is_ratio_different |
|
30
|
|
|
import goatools.wr_tbl as RPT |
|
31
|
|
|
from goatools.pvalcalc import FisherFactory |
|
32
|
|
|
|
|
33
|
|
|
from goatools.rpt.goea_nt_xfrm import MgrNtGOEAs |
|
34
|
|
|
|
|
35
|
|
|
class GOEnrichmentRecord(object): |
|
36
|
|
|
"""Represents one result (from a single GOTerm) in the GOEnrichmentStudy |
|
37
|
|
|
""" |
|
38
|
|
|
namespace2NS = cx.OrderedDict([ |
|
39
|
|
|
('biological_process', 'BP'), |
|
40
|
|
|
('molecular_function', 'MF'), |
|
41
|
|
|
('cellular_component', 'CC')]) |
|
42
|
|
|
|
|
43
|
|
|
# Fields seen in every enrichment result |
|
44
|
|
|
_fldsdefprt = [ |
|
45
|
|
|
"GO", |
|
46
|
|
|
"NS", |
|
47
|
|
|
"enrichment", |
|
48
|
|
|
"name", |
|
49
|
|
|
"ratio_in_study", |
|
50
|
|
|
"ratio_in_pop", |
|
51
|
|
|
"p_uncorrected", |
|
52
|
|
|
"depth", |
|
53
|
|
|
"study_count", |
|
54
|
|
|
"study_items"] |
|
55
|
|
|
_fldsdeffmt = ["%s"]*3 + ["%-30s"] + ["%d/%d"] * 2 + ["%.3g"] + ["%d"] * 2 + ["%15s"] |
|
56
|
|
|
|
|
57
|
|
|
_flds = set(_fldsdefprt).intersection( |
|
58
|
|
|
set(['study_items', 'study_count', 'study_n', 'pop_items', 'pop_count', 'pop_n'])) |
|
59
|
|
|
|
|
60
|
|
|
def __init__(self, **kwargs): |
|
61
|
|
|
# Methods seen in current enrichment result |
|
62
|
|
|
self.method_flds = [] |
|
63
|
|
|
for key, val in kwargs.items(): |
|
64
|
|
|
setattr(self, key, val) |
|
65
|
|
|
if key == 'ratio_in_study': |
|
66
|
|
|
setattr(self, 'study_count', val[0]) |
|
67
|
|
|
setattr(self, 'study_n', val[1]) |
|
68
|
|
|
if key == 'ratio_in_pop': |
|
69
|
|
|
setattr(self, 'pop_count', val[0]) |
|
70
|
|
|
setattr(self, 'pop_n', val[1]) |
|
71
|
|
|
self._init_enrichment() |
|
72
|
|
|
self.goterm = None # the reference to the GOTerm |
|
73
|
|
|
|
|
74
|
|
|
def get_method_name(self): |
|
75
|
|
|
"""Return name of first method in the method_flds list.""" |
|
76
|
|
|
return self.method_flds[0].fieldname |
|
77
|
|
|
|
|
78
|
|
|
def get_pvalue(self): |
|
79
|
|
|
"""Returns pval for 1st method, if it exists. Else returns uncorrected pval.""" |
|
80
|
|
|
if self.method_flds: |
|
81
|
|
|
return getattr(self, "p_{m}".format(m=self.get_method_name())) |
|
82
|
|
|
return getattr(self, "p_uncorrected") |
|
83
|
|
|
|
|
84
|
|
|
def set_corrected_pval(self, nt_method, pvalue): |
|
85
|
|
|
"""Add object attribute based on method name.""" |
|
86
|
|
|
self.method_flds.append(nt_method) |
|
87
|
|
|
fieldname = "".join(["p_", nt_method.fieldname]) |
|
88
|
|
|
setattr(self, fieldname, pvalue) |
|
89
|
|
|
|
|
90
|
|
|
def __str__(self, indent=False): |
|
91
|
|
|
field_data = [getattr(self, f, "n.a.") for f in self._fldsdefprt[:-1]] + \ |
|
92
|
|
|
[getattr(self, "p_{}".format(m.fieldname)) for m in self.method_flds] + \ |
|
93
|
|
|
[", ".join(sorted(getattr(self, self._fldsdefprt[-1], set())))] |
|
94
|
|
|
fldsdeffmt = self._fldsdeffmt |
|
95
|
|
|
field_formatter = fldsdeffmt[:-1] + ["%.3g"]*len(self.method_flds) + [fldsdeffmt[-1]] |
|
96
|
|
|
self._chk_fields(field_data, field_formatter) |
|
97
|
|
|
|
|
98
|
|
|
# default formatting only works for non-"n.a" data |
|
99
|
|
|
for idx, fld in enumerate(field_data): |
|
100
|
|
|
if fld == "n.a.": |
|
101
|
|
|
field_formatter[idx] = "%s" |
|
102
|
|
|
|
|
103
|
|
|
# print dots to show the level of the term |
|
104
|
|
|
dots = self.get_indent_dots() if indent else "" |
|
105
|
|
|
prtdata = "\t".join(a % b for (a, b) in zip(field_formatter, field_data)) |
|
106
|
|
|
return "".join([dots, prtdata]) |
|
107
|
|
|
|
|
108
|
|
|
def get_indent_dots(self): |
|
109
|
|
|
"""Get a string of dots ("....") representing the level of the GO term.""" |
|
110
|
|
|
return "." * self.goterm.level if self.goterm is not None else "" |
|
111
|
|
|
|
|
112
|
|
|
@staticmethod |
|
113
|
|
|
def _chk_fields(field_data, field_formatter): |
|
114
|
|
|
"""Check that expected fields are present.""" |
|
115
|
|
|
if len(field_data) == len(field_formatter): |
|
116
|
|
|
return |
|
117
|
|
|
len_dat = len(field_data) |
|
118
|
|
|
len_fmt = len(field_formatter) |
|
119
|
|
|
msg = [ |
|
120
|
|
|
"FIELD DATA({d}) != FORMATTER({f})".format(d=len_dat, f=len_fmt), |
|
121
|
|
|
"DAT({N}): {D}".format(N=len_dat, D=field_data), |
|
122
|
|
|
"FMT({N}): {F}".format(N=len_fmt, F=field_formatter)] |
|
123
|
|
|
raise Exception("\n".join(msg)) |
|
124
|
|
|
|
|
125
|
|
|
def __repr__(self): |
|
126
|
|
|
return "GOEnrichmentRecord({GO})".format(GO=self.GO) |
|
127
|
|
|
|
|
128
|
|
|
def set_goterm(self, go2obj): |
|
129
|
|
|
"""Set goterm and copy GOTerm's name and namespace.""" |
|
130
|
|
|
self.goterm = go2obj.get(self.GO, None) |
|
131
|
|
|
present = self.goterm is not None |
|
132
|
|
|
self.name = self.goterm.name if present else "n.a." |
|
133
|
|
|
self.NS = self.namespace2NS[self.goterm.namespace] if present else "XX" |
|
134
|
|
|
|
|
135
|
|
|
def _init_enrichment(self): |
|
136
|
|
|
"""Mark as 'enriched' or 'purified'.""" |
|
137
|
|
|
self.enrichment = 'e' if ((1.0 * self.study_count / self.study_n) > |
|
138
|
|
|
(1.0 * self.pop_count / self.pop_n)) else 'p' |
|
139
|
|
|
|
|
140
|
|
|
def update_remaining_fldsdefprt(self, min_ratio=None): |
|
141
|
|
|
"""Finish updating self (GOEnrichmentRecord) field, is_ratio_different.""" |
|
142
|
|
|
self.is_ratio_different = is_ratio_different(min_ratio, self.study_count, |
|
143
|
|
|
self.study_n, self.pop_count, self.pop_n) |
|
144
|
|
|
|
|
145
|
|
|
|
|
146
|
|
|
# ------------------------------------------------------------------------------------- |
|
147
|
|
|
# Methods for getting flat namedtuple values from GOEnrichmentRecord object |
|
148
|
|
|
def get_prtflds_default(self): |
|
149
|
|
|
"""Get default fields.""" |
|
150
|
|
|
return self._fldsdefprt[:-1] + \ |
|
151
|
|
|
["p_{M}".format(M=m.fieldname) for m in self.method_flds] + \ |
|
152
|
|
|
[self._fldsdefprt[-1]] |
|
153
|
|
|
|
|
154
|
|
|
def get_prtflds_all(self): |
|
155
|
|
|
"""When converting to a namedtuple, get all possible fields in their original order.""" |
|
156
|
|
|
flds = [] |
|
157
|
|
|
dont_add = set(['_parents', 'method_flds', 'relationship_rev', 'relationship']) |
|
158
|
|
|
# Fields: GO NS enrichment name ratio_in_study ratio_in_pop p_uncorrected |
|
159
|
|
|
# depth study_count p_sm_bonferroni p_fdr_bh study_items |
|
160
|
|
|
self._flds_append(flds, self.get_prtflds_default(), dont_add) |
|
161
|
|
|
# Fields: GO NS goterm |
|
162
|
|
|
# ratio_in_pop pop_n pop_count pop_items name |
|
163
|
|
|
# ratio_in_study study_n study_count study_items |
|
164
|
|
|
# method_flds enrichment p_uncorrected p_sm_bonferroni p_fdr_bh |
|
165
|
|
|
self._flds_append(flds, vars(self).keys(), dont_add) |
|
166
|
|
|
# Fields: name level is_obsolete namespace id depth parents children _parents alt_ids |
|
167
|
|
|
self._flds_append(flds, vars(self.goterm).keys(), dont_add) |
|
168
|
|
|
return flds |
|
169
|
|
|
|
|
170
|
|
|
@staticmethod |
|
171
|
|
|
def _flds_append(flds, addthese, dont_add): |
|
172
|
|
|
"""Retain order of fields as we add them once to the list.""" |
|
173
|
|
|
for fld in addthese: |
|
174
|
|
|
if fld not in flds and fld not in dont_add: |
|
175
|
|
|
flds.append(fld) |
|
176
|
|
|
|
|
177
|
|
|
def get_field_values(self, fldnames, rpt_fmt=True, itemid2name=None): |
|
178
|
|
|
"""Get flat namedtuple fields for one GOEnrichmentRecord.""" |
|
179
|
|
|
row = [] |
|
180
|
|
|
# Loop through each user field desired |
|
181
|
|
|
for fld in fldnames: |
|
182
|
|
|
# 1. Check the GOEnrichmentRecord's attributes |
|
183
|
|
|
val = getattr(self, fld, None) |
|
184
|
|
|
if val is not None: |
|
185
|
|
|
if rpt_fmt: |
|
186
|
|
|
val = self._get_rpt_fmt(fld, val, itemid2name) |
|
187
|
|
|
row.append(val) |
|
188
|
|
|
else: |
|
189
|
|
|
# 2. Check the GO object for the field |
|
190
|
|
|
val = getattr(self.goterm, fld, None) |
|
191
|
|
|
if rpt_fmt: |
|
192
|
|
|
val = self._get_rpt_fmt(fld, val, itemid2name) |
|
193
|
|
|
if val is not None: |
|
194
|
|
|
row.append(val) |
|
195
|
|
|
else: |
|
196
|
|
|
# 3. Field not found, raise Exception |
|
197
|
|
|
self._err_fld(fld, fldnames) |
|
198
|
|
|
if rpt_fmt: |
|
199
|
|
|
assert not isinstance(val, list), \ |
|
200
|
|
|
"UNEXPECTED LIST: FIELD({F}) VALUE({V}) FMT({P})".format( |
|
201
|
|
|
P=rpt_fmt, F=fld, V=val) |
|
202
|
|
|
return row |
|
203
|
|
|
|
|
204
|
|
|
@staticmethod |
|
205
|
|
|
def _get_rpt_fmt(fld, val, itemid2name=None): |
|
206
|
|
|
"""Return values in a format amenable to printing in a table.""" |
|
207
|
|
|
if fld.startswith("ratio_"): |
|
208
|
|
|
return "{N}/{TOT}".format(N=val[0], TOT=val[1]) |
|
209
|
|
|
elif fld in set(['study_items', 'pop_items', 'alt_ids']): |
|
210
|
|
|
if itemid2name is not None: |
|
211
|
|
|
val = [itemid2name.get(v, v) for v in val] |
|
212
|
|
|
return ", ".join([str(v) for v in sorted(val)]) |
|
213
|
|
|
return val |
|
214
|
|
|
|
|
215
|
|
|
def _err_fld(self, fld, fldnames): |
|
216
|
|
|
"""Unrecognized field. Print detailed Failure message.""" |
|
217
|
|
|
msg = ['ERROR. UNRECOGNIZED FIELD({F})'.format(F=fld)] |
|
218
|
|
|
actual_flds = set(self.get_prtflds_default() + self.goterm.__dict__.keys()) |
|
219
|
|
|
bad_flds = set(fldnames).difference(set(actual_flds)) |
|
220
|
|
|
if bad_flds: |
|
221
|
|
|
msg.append("\nGOEA RESULT FIELDS: {}".format(" ".join(self._fldsdefprt))) |
|
222
|
|
|
msg.append("GO FIELDS: {}".format(" ".join(self.goterm.__dict__.keys()))) |
|
223
|
|
|
msg.append("\nFATAL: {N} UNEXPECTED FIELDS({F})\n".format( |
|
224
|
|
|
N=len(bad_flds), F=" ".join(bad_flds))) |
|
225
|
|
|
msg.append(" {N} User-provided fields:".format(N=len(fldnames))) |
|
226
|
|
|
for idx, fld in enumerate(fldnames, 1): |
|
227
|
|
|
mrk = "ERROR -->" if fld in bad_flds else "" |
|
228
|
|
|
msg.append(" {M:>9} {I:>2}) {F}".format(M=mrk, I=idx, F=fld)) |
|
229
|
|
|
raise Exception("\n".join(msg)) |
|
230
|
|
|
|
|
231
|
|
|
|
|
232
|
|
|
class GOEnrichmentStudy(object): |
|
233
|
|
|
"""Runs Fisher's exact test, as well as multiple corrections |
|
234
|
|
|
""" |
|
235
|
|
|
# Default Excel table column widths for GOEA results |
|
236
|
|
|
default_fld2col_widths = { |
|
237
|
|
|
'NS' : 3, |
|
238
|
|
|
'GO' : 12, |
|
239
|
|
|
'alt' : 2, |
|
240
|
|
|
'level' : 3, |
|
241
|
|
|
'depth' : 3, |
|
242
|
|
|
'enrichment': 1, |
|
243
|
|
|
'name' : 60, |
|
244
|
|
|
'ratio_in_study': 8, |
|
245
|
|
|
'ratio_in_pop' : 12, |
|
246
|
|
|
'study_items' : 15, |
|
247
|
|
|
} |
|
248
|
|
|
|
|
249
|
|
|
def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws): |
|
250
|
|
|
self.log = kws['log'] if 'log' in kws else sys.stdout |
|
251
|
|
|
self._run_multitest = { |
|
252
|
|
|
'local':lambda iargs: self._run_multitest_local(iargs), |
|
253
|
|
|
'statsmodels':lambda iargs: self._run_multitest_statsmodels(iargs)} |
|
254
|
|
|
self.pop = set(pop) |
|
255
|
|
|
self.pop_n = len(pop) |
|
256
|
|
|
self.assoc = assoc |
|
257
|
|
|
self.obo_dag = obo_dag |
|
258
|
|
|
self.alpha = alpha |
|
259
|
|
|
if methods is None: |
|
260
|
|
|
methods = ["bonferroni", "sidak", "holm"] |
|
261
|
|
|
self.methods = Methods(methods) |
|
262
|
|
|
self.pval_obj = FisherFactory(**kws).pval_obj |
|
263
|
|
|
|
|
264
|
|
|
if propagate_counts: |
|
265
|
|
|
sys.stderr.write("Propagating term counts to parents ..\n") |
|
266
|
|
|
obo_dag.update_association(assoc) |
|
267
|
|
|
self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log) |
|
268
|
|
|
|
|
269
|
|
|
def run_study(self, study, **kws): |
|
270
|
|
|
"""Run Gene Ontology Enrichment Study (GOEA) on study ids.""" |
|
271
|
|
|
# Key-word arguments: |
|
272
|
|
|
methods = Methods(kws['methods']) if 'methods' in kws else self.methods |
|
273
|
|
|
alpha = kws['alpha'] if 'alpha' in kws else self.alpha |
|
274
|
|
|
log = kws['log'] if 'log' in kws else self.log |
|
275
|
|
|
# Calculate uncorrected pvalues |
|
276
|
|
|
results = self.get_pval_uncorr(study, log) |
|
277
|
|
|
if not results: |
|
278
|
|
|
return [] |
|
279
|
|
|
|
|
280
|
|
|
if log is not None: |
|
281
|
|
|
log.write(" {MSG}\n".format(MSG="\n ".join(self.get_results_msg(results, study)))) |
|
282
|
|
|
|
|
283
|
|
|
# Do multipletest corrections on uncorrected pvalues and update results |
|
284
|
|
|
self._run_multitest_corr(results, methods, alpha, study, log) |
|
285
|
|
|
|
|
286
|
|
|
for rec in results: |
|
287
|
|
|
# get go term for name and level |
|
288
|
|
|
rec.set_goterm(self.obo_dag) |
|
289
|
|
|
|
|
290
|
|
|
# 'keep_if' can be used to keep only significant GO terms. Example: |
|
291
|
|
|
# >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05 # if results are significant |
|
292
|
|
|
# >>> goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if) |
|
293
|
|
|
if 'keep_if' in kws: |
|
294
|
|
|
keep_if = kws['keep_if'] |
|
295
|
|
|
results = [r for r in results if keep_if(r)] |
|
296
|
|
|
|
|
297
|
|
|
# Default sort order: First, sort by BP, MF, CC. Second, sort by pval |
|
298
|
|
|
results.sort(key=lambda r: [r.NS, r.enrichment, r.p_uncorrected]) |
|
299
|
|
|
return results # list of GOEnrichmentRecord objects |
|
300
|
|
|
|
|
301
|
|
|
def run_study_nts(self, study, **kws): |
|
302
|
|
|
"""Run GOEA on study ids. Return results as a list of namedtuples.""" |
|
303
|
|
|
goea_results = self.run_study(study, **kws) |
|
304
|
|
|
return MgrNtGOEAs(goea_results).get_goea_nts_all() |
|
305
|
|
|
|
|
306
|
|
|
def get_results_msg(self, results, study): |
|
307
|
|
|
"""Return summary for GOEA results.""" |
|
308
|
|
|
# To convert msg list to string: "\n".join(msg) |
|
309
|
|
|
msg = [] |
|
310
|
|
|
if results: |
|
311
|
|
|
fmt = "{M:6,} GO terms are associated with {N:6,} of {NT:6,}" |
|
312
|
|
|
stu_items, num_gos_stu = self.get_item_cnt(results, "study_items") |
|
313
|
|
|
pop_items, num_gos_pop = self.get_item_cnt(results, "pop_items") |
|
314
|
|
|
stu_txt = fmt.format(N=len(stu_items), M=num_gos_stu, NT=len(set(study))) |
|
315
|
|
|
pop_txt = fmt.format(N=len(pop_items), M=num_gos_pop, NT=self.pop_n) |
|
316
|
|
|
msg.append("{POP} population items".format(POP=pop_txt)) |
|
317
|
|
|
msg.append("{STU} study items".format(STU=stu_txt)) |
|
318
|
|
|
return msg |
|
319
|
|
|
|
|
320
|
|
|
def get_pval_uncorr(self, study, log=sys.stdout): |
|
321
|
|
|
"""Calculate the uncorrected pvalues for study items.""" |
|
322
|
|
|
results = [] |
|
323
|
|
|
study_in_pop = self.pop.intersection(study) |
|
324
|
|
|
# " 99% 378 of 382 study items found in population" |
|
325
|
|
|
go2studyitems = get_terms("study", study_in_pop, self.assoc, self.obo_dag, log) |
|
326
|
|
|
pop_n, study_n = self.pop_n, len(study_in_pop) |
|
327
|
|
|
allterms = set(go2studyitems).union(set(self.go2popitems)) |
|
328
|
|
|
if log is not None: |
|
329
|
|
|
study_n_orig = len(study) |
|
330
|
|
|
log.write("{R:3.0f}% {N:>6,} of {M:>6,} study items found in population({P})\n".format( |
|
331
|
|
|
N=study_n, M=study_n_orig, P=pop_n, R=100.0*study_n/study_n_orig)) |
|
332
|
|
|
log.write("Calculating {N:,} uncorrected p-values using {PFNC}\n".format( |
|
333
|
|
|
N=len(allterms), PFNC=self.pval_obj.name)) |
|
334
|
|
|
calc_pvalue = self.pval_obj.calc_pvalue |
|
335
|
|
|
|
|
336
|
|
|
for goid in allterms: |
|
337
|
|
|
study_items = go2studyitems.get(goid, set()) |
|
338
|
|
|
study_count = len(study_items) |
|
339
|
|
|
pop_items = self.go2popitems.get(goid, set()) |
|
340
|
|
|
pop_count = len(pop_items) |
|
341
|
|
|
|
|
342
|
|
|
one_record = GOEnrichmentRecord( |
|
343
|
|
|
GO=goid, |
|
344
|
|
|
p_uncorrected=calc_pvalue(study_count, study_n, pop_count, pop_n), |
|
345
|
|
|
study_items=study_items, |
|
346
|
|
|
pop_items=pop_items, |
|
347
|
|
|
ratio_in_study=(study_count, study_n), |
|
348
|
|
|
ratio_in_pop=(pop_count, pop_n)) |
|
349
|
|
|
|
|
350
|
|
|
results.append(one_record) |
|
351
|
|
|
|
|
352
|
|
|
return results |
|
353
|
|
|
|
|
354
|
|
|
def _run_multitest_corr(self, results, usrmethod_flds, alpha, study, log): |
|
355
|
|
|
"""Do multiple-test corrections on uncorrected pvalues.""" |
|
356
|
|
|
assert 0 < alpha < 1, "Test-wise alpha must fall between (0, 1)" |
|
357
|
|
|
pvals = [r.p_uncorrected for r in results] |
|
358
|
|
|
ntobj = cx.namedtuple("ntobj", "results pvals alpha nt_method study") |
|
359
|
|
|
|
|
360
|
|
|
for nt_method in usrmethod_flds: |
|
361
|
|
|
ntmt = ntobj(results, pvals, alpha, nt_method, study) |
|
362
|
|
|
self._run_multitest[nt_method.source](ntmt) |
|
363
|
|
|
if log is not None: |
|
364
|
|
|
self._log_multitest_corr(log, results, ntmt, alpha) |
|
365
|
|
|
|
|
366
|
|
|
def _log_multitest_corr(self, log, results, ntmt, alpha): |
|
367
|
|
|
"""Print information regarding multitest correction results.""" |
|
368
|
|
|
ntm = ntmt.nt_method |
|
369
|
|
|
attr_mult = "p_{M}".format(M=self.methods.get_fieldname(ntm.source, ntm.method)) |
|
370
|
|
|
sig_cnt = sum(1 for r in results if getattr(r, attr_mult) < alpha) |
|
371
|
|
|
log.write("{N:8,} GO terms found significant (< {A}=alpha) after ".format( |
|
372
|
|
|
N=sig_cnt, A=alpha)) |
|
373
|
|
|
log.write("multitest correction: ") |
|
374
|
|
|
log.write("{MSRC} {METHOD}\n".format(MSRC=ntm.source, METHOD=ntm.method)) |
|
375
|
|
|
|
|
376
|
|
|
def _run_multitest_statsmodels(self, ntmt): |
|
377
|
|
|
"""Use multitest mthods that have been implemented in statsmodels.""" |
|
378
|
|
|
# Only load statsmodels if it is used |
|
379
|
|
|
multipletests = self.methods.get_statsmodels_multipletests() |
|
380
|
|
|
results = multipletests(ntmt.pvals, ntmt.alpha, ntmt.nt_method.method) |
|
381
|
|
|
pvals_corrected = results[1] # reject_lst, pvals_corrected, alphacSidak, alphacBonf |
|
382
|
|
|
self._update_pvalcorr(ntmt, pvals_corrected) |
|
383
|
|
|
|
|
384
|
|
|
def _run_multitest_local(self, ntmt): |
|
385
|
|
|
"""Use multitest mthods that have been implemented locally.""" |
|
386
|
|
|
corrected_pvals = None |
|
387
|
|
|
method = ntmt.nt_method.method |
|
388
|
|
|
if method == "bonferroni": |
|
389
|
|
|
corrected_pvals = Bonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals |
|
390
|
|
|
elif method == "sidak": |
|
391
|
|
|
corrected_pvals = Sidak(ntmt.pvals, ntmt.alpha).corrected_pvals |
|
392
|
|
|
elif method == "holm": |
|
393
|
|
|
corrected_pvals = HolmBonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals |
|
394
|
|
|
elif method == "fdr": |
|
395
|
|
|
# get the empirical p-value distributions for FDR |
|
396
|
|
|
term_pop = getattr(self, 'term_pop', None) |
|
397
|
|
|
if term_pop is None: |
|
398
|
|
|
term_pop = count_terms(self.pop, self.assoc, self.obo_dag) |
|
399
|
|
|
p_val_distribution = calc_qval(len(ntmt.study), |
|
400
|
|
|
self.pop_n, |
|
401
|
|
|
self.pop, self.assoc, |
|
402
|
|
|
term_pop, self.obo_dag) |
|
403
|
|
|
corrected_pvals = FDR(p_val_distribution, |
|
404
|
|
|
ntmt.results, ntmt.alpha).corrected_pvals |
|
405
|
|
|
|
|
406
|
|
|
self._update_pvalcorr(ntmt, corrected_pvals) |
|
407
|
|
|
|
|
408
|
|
|
@staticmethod |
|
409
|
|
|
def _update_pvalcorr(ntmt, corrected_pvals): |
|
410
|
|
|
"""Add data members to store multiple test corrections.""" |
|
411
|
|
|
if corrected_pvals is None: |
|
412
|
|
|
return |
|
413
|
|
|
for rec, val in zip(ntmt.results, corrected_pvals): |
|
414
|
|
|
rec.set_corrected_pval(ntmt.nt_method, val) |
|
415
|
|
|
|
|
416
|
|
|
# Methods for writing results into tables: text, tab-separated, Excel spreadsheets |
|
417
|
|
|
def wr_txt(self, fout_txt, goea_results, prtfmt=None, **kws): |
|
418
|
|
|
"""Print GOEA results to text file.""" |
|
419
|
|
|
if not goea_results: |
|
420
|
|
|
sys.stdout.write(" 0 GOEA results. NOT WRITING {FOUT}\n".format(FOUT=fout_txt)) |
|
421
|
|
|
return |
|
422
|
|
|
with open(fout_txt, 'w') as prt: |
|
423
|
|
|
if 'title' in kws: |
|
424
|
|
|
prt.write("{TITLE}\n".format(TITLE=kws['title'])) |
|
425
|
|
|
data_nts = self.prt_txt(prt, goea_results, prtfmt, **kws) |
|
426
|
|
|
log = self.log if self.log is not None else sys.stdout |
|
427
|
|
|
log.write(" {N:>5} GOEA results for {CUR:5} study items. WROTE: {F}\n".format( |
|
428
|
|
|
N=len(data_nts), |
|
429
|
|
|
CUR=len(MgrNtGOEAs(goea_results).get_study_items()), |
|
430
|
|
|
F=fout_txt)) |
|
431
|
|
|
|
|
432
|
|
|
def prt_txt(self, prt, goea_results, prtfmt=None, **kws): |
|
433
|
|
|
"""Print GOEA results in text format.""" |
|
434
|
|
|
if prtfmt is None: |
|
435
|
|
|
prtfmt = ("{GO} {NS} {p_uncorrected:5.2e} {ratio_in_study:>6} {ratio_in_pop:>9} " |
|
436
|
|
|
"{depth:02} {name:40} {study_items}\n") |
|
437
|
|
|
prtfmt = self.adjust_prtfmt(prtfmt) |
|
438
|
|
|
prt_flds = RPT.get_fmtflds(prtfmt) |
|
439
|
|
|
data_nts = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws) |
|
440
|
|
|
RPT.prt_txt(prt, data_nts, prtfmt, prt_flds, **kws) |
|
441
|
|
|
return data_nts |
|
442
|
|
|
|
|
443
|
|
|
def wr_xlsx(self, fout_xlsx, goea_results, **kws): |
|
444
|
|
|
"""Write a xlsx file.""" |
|
445
|
|
|
# kws: prt_if indent itemid2name(study_items) |
|
446
|
|
|
prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results)) |
|
447
|
|
|
xlsx_data = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws) |
|
448
|
|
|
if 'fld2col_widths' not in kws: |
|
449
|
|
|
kws['fld2col_widths'] = {f:self.default_fld2col_widths.get(f, 8) for f in prt_flds} |
|
450
|
|
|
RPT.wr_xlsx(fout_xlsx, xlsx_data, **kws) |
|
451
|
|
|
|
|
452
|
|
|
def wr_tsv(self, fout_tsv, goea_results, **kws): |
|
453
|
|
|
"""Write tab-separated table data to file""" |
|
454
|
|
|
prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results)) |
|
455
|
|
|
tsv_data = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws) |
|
456
|
|
|
RPT.wr_tsv(fout_tsv, tsv_data, **kws) |
|
457
|
|
|
|
|
458
|
|
|
def prt_tsv(self, prt, goea_results, **kws): |
|
459
|
|
|
"""Write tab-separated table data""" |
|
460
|
|
|
prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results)) |
|
461
|
|
|
tsv_data = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws) |
|
462
|
|
|
RPT.prt_tsv(prt, tsv_data, prt_flds, **kws) |
|
463
|
|
|
|
|
464
|
|
|
@staticmethod |
|
465
|
|
|
def adjust_prtfmt(prtfmt): |
|
466
|
|
|
"""Adjust format_strings for legal values.""" |
|
467
|
|
|
prtfmt = prtfmt.replace("{p_holm-sidak", "{p_holm_sidak") |
|
468
|
|
|
prtfmt = prtfmt.replace("{p_simes-hochberg", "{p_simes_hochberg") |
|
469
|
|
|
return prtfmt |
|
470
|
|
|
|
|
471
|
|
|
@staticmethod |
|
472
|
|
|
def get_ns2nts(results, fldnames=None, **kws): |
|
473
|
|
|
"""Get namedtuples of GOEA results, split into BP, MF, CC.""" |
|
474
|
|
|
ns2nts = cx.defaultdict(list) |
|
475
|
|
|
nts = MgrNtGOEAs(results).get_goea_nts_all(fldnames, **kws) |
|
476
|
|
|
for ntgoea in nts: |
|
477
|
|
|
ns2nts[ntgoea.NS].append(ntgoea) |
|
478
|
|
|
return ns2nts |
|
479
|
|
|
|
|
480
|
|
|
@staticmethod |
|
481
|
|
|
def get_item_cnt(results, attrname="study_items"): |
|
482
|
|
|
"""Get all study or population items (e.g., geneids).""" |
|
483
|
|
|
items = set() |
|
484
|
|
|
go_cnt = 0 |
|
485
|
|
|
for rec in results: |
|
486
|
|
|
if hasattr(rec, attrname): |
|
487
|
|
|
items_cur = getattr(rec, attrname) |
|
488
|
|
|
# Only count GO term if there are items in the set. |
|
489
|
|
|
if len(items_cur) != 0: |
|
490
|
|
|
items |= items_cur |
|
491
|
|
|
go_cnt += 1 |
|
492
|
|
|
return items, go_cnt |
|
493
|
|
|
|
|
494
|
|
|
@staticmethod |
|
495
|
|
|
def get_prtflds_default(results): |
|
496
|
|
|
"""Get default fields names. Used in printing GOEA results. |
|
497
|
|
|
|
|
498
|
|
|
Researchers can control which fields they want to print in the GOEA results |
|
499
|
|
|
or they can use the default fields. |
|
500
|
|
|
""" |
|
501
|
|
|
if results: |
|
502
|
|
|
return results[0].get_prtflds_default() |
|
503
|
|
|
return [] |
|
504
|
|
|
|
|
505
|
|
|
@staticmethod |
|
506
|
|
|
def print_summary(results, min_ratio=None, indent=False, pval=0.05): |
|
507
|
|
|
"""Print summary.""" |
|
508
|
|
|
import goatools |
|
509
|
|
|
|
|
510
|
|
|
# Header contains provenance and parameters |
|
511
|
|
|
date = datetime.date.today() |
|
512
|
|
|
print("# Generated by GOATOOLS v{0} ({1})".format(goatools.__version__, date)) |
|
513
|
|
|
print("# min_ratio={0} pval={1}".format(min_ratio, pval)) |
|
514
|
|
|
|
|
515
|
|
|
# field names for output |
|
516
|
|
|
if results: |
|
517
|
|
|
print("\t".join(GOEnrichmentStudy.get_prtflds_default(results))) |
|
518
|
|
|
|
|
519
|
|
|
for rec in results: |
|
520
|
|
|
# calculate some additional statistics |
|
521
|
|
|
# (over_under, is_ratio_different) |
|
522
|
|
|
rec.update_remaining_fldsdefprt(min_ratio=min_ratio) |
|
523
|
|
|
|
|
524
|
|
|
if pval is not None and rec.p_uncorrected >= pval: |
|
525
|
|
|
continue |
|
526
|
|
|
|
|
527
|
|
|
if rec.is_ratio_different: |
|
528
|
|
|
print(rec.__str__(indent=indent)) |
|
529
|
|
|
|
|
530
|
|
|
def wr_py_goea_results(self, fout_py, goea_results, **kws): |
|
531
|
|
|
"""Save GOEA results into Python package containing list of namedtuples.""" |
|
532
|
|
|
var_name = kws.get("var_name", "goea_results") |
|
533
|
|
|
docstring = kws.get("docstring", "") |
|
534
|
|
|
sortby = kws.get("sortby", None) |
|
535
|
|
|
if goea_results: |
|
536
|
|
|
from goatools.nt_utils import wr_py_nts |
|
537
|
|
|
nts_goea = goea_results |
|
538
|
|
|
# If list has GOEnrichmentRecords or verbose namedtuples, exclude some fields. |
|
539
|
|
|
if hasattr(goea_results[0], "_fldsdefprt") or hasattr(goea_results[0], 'goterm'): |
|
540
|
|
|
# Exclude some attributes from the namedtuple when saving results |
|
541
|
|
|
# to a Python file because the information is redundant or verbose. |
|
542
|
|
|
nts_goea = MgrNtGOEAs(goea_results).get_goea_nts_prt(**kws) |
|
543
|
|
|
docstring = "\n".join([docstring, "# {VER}\n\n".format(VER=self.obo_dag.version)]) |
|
544
|
|
|
assert hasattr(nts_goea[0], '_fields') |
|
545
|
|
|
if sortby is None: |
|
546
|
|
|
sortby = lambda nt: [getattr(nt, 'namespace'), getattr(nt, 'enrichment'), |
|
547
|
|
|
getattr(nt, 'p_uncorrected'), getattr(nt, 'depth'), |
|
548
|
|
|
getattr(nt, 'GO')] |
|
549
|
|
|
nts_goea = sorted(nts_goea, key=sortby) |
|
550
|
|
|
wr_py_nts(fout_py, nts_goea, docstring, var_name) |
|
551
|
|
|
|
|
552
|
|
|
# Copyright (C) 2010-2018, H Tang et al., All rights reserved. |
|
553
|
|
|
|