1
|
|
|
#!/usr/bin/env python |
2
|
|
|
# -*- coding: UTF-8 -*- |
3
|
|
|
|
4
|
|
|
""" |
5
|
|
|
python %prog study.file population.file gene-association.file |
6
|
|
|
|
7
|
|
|
This program returns P-values for functional enrichment in a cluster of |
8
|
|
|
study genes using Fisher's exact test, and corrected for multiple testing |
9
|
|
|
(including Bonferroni, Holm, Sidak, and false discovery rate) |
10
|
|
|
""" |
11
|
|
|
|
12
|
|
|
from __future__ import print_function |
13
|
|
|
from __future__ import absolute_import |
14
|
|
|
|
15
|
|
|
|
16
|
|
|
__copyright__ = "Copyright (C) 2010-2018, H Tang et al., All rights reserved." |
17
|
|
|
__author__ = "various" |
18
|
|
|
|
19
|
|
|
import sys |
20
|
|
|
import collections as cx |
21
|
|
|
import datetime |
22
|
|
|
|
23
|
|
|
from goatools.multiple_testing import Methods |
24
|
|
|
from goatools.multiple_testing import Bonferroni |
25
|
|
|
from goatools.multiple_testing import Sidak |
26
|
|
|
from goatools.multiple_testing import HolmBonferroni |
27
|
|
|
from goatools.multiple_testing import FDR |
28
|
|
|
from goatools.multiple_testing import calc_qval |
29
|
|
|
from goatools.ratio import get_terms, count_terms, is_ratio_different |
30
|
|
|
import goatools.wr_tbl as RPT |
31
|
|
|
from goatools.pvalcalc import FisherFactory |
32
|
|
|
|
33
|
|
|
from goatools.rpt.goea_nt_xfrm import MgrNtGOEAs |
34
|
|
|
|
35
|
|
|
class GOEnrichmentRecord(object): |
36
|
|
|
"""Represents one result (from a single GOTerm) in the GOEnrichmentStudy |
37
|
|
|
""" |
38
|
|
|
namespace2NS = cx.OrderedDict([ |
39
|
|
|
('biological_process', 'BP'), |
40
|
|
|
('molecular_function', 'MF'), |
41
|
|
|
('cellular_component', 'CC')]) |
42
|
|
|
|
43
|
|
|
# Fields seen in every enrichment result |
44
|
|
|
_fldsdefprt = [ |
45
|
|
|
"GO", |
46
|
|
|
"NS", |
47
|
|
|
"enrichment", |
48
|
|
|
"name", |
49
|
|
|
"ratio_in_study", |
50
|
|
|
"ratio_in_pop", |
51
|
|
|
"p_uncorrected", |
52
|
|
|
"depth", |
53
|
|
|
"study_count", |
54
|
|
|
"study_items"] |
55
|
|
|
_fldsdeffmt = ["%s"]*3 + ["%-30s"] + ["%d/%d"] * 2 + ["%.3g"] + ["%d"] * 2 + ["%15s"] |
56
|
|
|
|
57
|
|
|
_flds = set(_fldsdefprt).intersection( |
58
|
|
|
set(['study_items', 'study_count', 'study_n', 'pop_items', 'pop_count', 'pop_n'])) |
59
|
|
|
|
60
|
|
|
def __init__(self, **kwargs): |
61
|
|
|
# Methods seen in current enrichment result |
62
|
|
|
self.method_flds = [] |
63
|
|
|
for key, val in kwargs.items(): |
64
|
|
|
setattr(self, key, val) |
65
|
|
|
if key == 'ratio_in_study': |
66
|
|
|
setattr(self, 'study_count', val[0]) |
67
|
|
|
setattr(self, 'study_n', val[1]) |
68
|
|
|
if key == 'ratio_in_pop': |
69
|
|
|
setattr(self, 'pop_count', val[0]) |
70
|
|
|
setattr(self, 'pop_n', val[1]) |
71
|
|
|
self._init_enrichment() |
72
|
|
|
self.goterm = None # the reference to the GOTerm |
73
|
|
|
|
74
|
|
|
def get_method_name(self): |
75
|
|
|
"""Return name of first method in the method_flds list.""" |
76
|
|
|
return self.method_flds[0].fieldname |
77
|
|
|
|
78
|
|
|
def get_pvalue(self): |
79
|
|
|
"""Returns pval for 1st method, if it exists. Else returns uncorrected pval.""" |
80
|
|
|
if self.method_flds: |
81
|
|
|
return getattr(self, "p_{m}".format(m=self.get_method_name())) |
82
|
|
|
return getattr(self, "p_uncorrected") |
83
|
|
|
|
84
|
|
|
def set_corrected_pval(self, nt_method, pvalue): |
85
|
|
|
"""Add object attribute based on method name.""" |
86
|
|
|
self.method_flds.append(nt_method) |
87
|
|
|
fieldname = "".join(["p_", nt_method.fieldname]) |
88
|
|
|
setattr(self, fieldname, pvalue) |
89
|
|
|
|
90
|
|
|
def __str__(self, indent=False): |
91
|
|
|
field_data = [getattr(self, f, "n.a.") for f in self._fldsdefprt[:-1]] + \ |
92
|
|
|
[getattr(self, "p_{}".format(m.fieldname)) for m in self.method_flds] + \ |
93
|
|
|
[", ".join(sorted(getattr(self, self._fldsdefprt[-1], set())))] |
94
|
|
|
fldsdeffmt = self._fldsdeffmt |
95
|
|
|
field_formatter = fldsdeffmt[:-1] + ["%.3g"]*len(self.method_flds) + [fldsdeffmt[-1]] |
96
|
|
|
self._chk_fields(field_data, field_formatter) |
97
|
|
|
|
98
|
|
|
# default formatting only works for non-"n.a" data |
99
|
|
|
for idx, fld in enumerate(field_data): |
100
|
|
|
if fld == "n.a.": |
101
|
|
|
field_formatter[idx] = "%s" |
102
|
|
|
|
103
|
|
|
# print dots to show the level of the term |
104
|
|
|
dots = self.get_indent_dots() if indent else "" |
105
|
|
|
prtdata = "\t".join(a % b for (a, b) in zip(field_formatter, field_data)) |
106
|
|
|
return "".join([dots, prtdata]) |
107
|
|
|
|
108
|
|
|
def get_indent_dots(self): |
109
|
|
|
"""Get a string of dots ("....") representing the level of the GO term.""" |
110
|
|
|
return "." * self.goterm.level if self.goterm is not None else "" |
111
|
|
|
|
112
|
|
|
@staticmethod |
113
|
|
|
def _chk_fields(field_data, field_formatter): |
114
|
|
|
"""Check that expected fields are present.""" |
115
|
|
|
if len(field_data) == len(field_formatter): |
116
|
|
|
return |
117
|
|
|
len_dat = len(field_data) |
118
|
|
|
len_fmt = len(field_formatter) |
119
|
|
|
msg = [ |
120
|
|
|
"FIELD DATA({d}) != FORMATTER({f})".format(d=len_dat, f=len_fmt), |
121
|
|
|
"DAT({N}): {D}".format(N=len_dat, D=field_data), |
122
|
|
|
"FMT({N}): {F}".format(N=len_fmt, F=field_formatter)] |
123
|
|
|
raise Exception("\n".join(msg)) |
124
|
|
|
|
125
|
|
|
def __repr__(self): |
126
|
|
|
return "GOEnrichmentRecord({GO})".format(GO=self.GO) |
127
|
|
|
|
128
|
|
|
def set_goterm(self, go2obj): |
129
|
|
|
"""Set goterm and copy GOTerm's name and namespace.""" |
130
|
|
|
self.goterm = go2obj.get(self.GO, None) |
131
|
|
|
present = self.goterm is not None |
132
|
|
|
self.name = self.goterm.name if present else "n.a." |
133
|
|
|
self.NS = self.namespace2NS[self.goterm.namespace] if present else "XX" |
134
|
|
|
|
135
|
|
|
def _init_enrichment(self): |
136
|
|
|
"""Mark as 'enriched' or 'purified'.""" |
137
|
|
|
self.enrichment = 'e' if ((1.0 * self.study_count / self.study_n) > |
138
|
|
|
(1.0 * self.pop_count / self.pop_n)) else 'p' |
139
|
|
|
|
140
|
|
|
def update_remaining_fldsdefprt(self, min_ratio=None): |
141
|
|
|
"""Finish updating self (GOEnrichmentRecord) field, is_ratio_different.""" |
142
|
|
|
self.is_ratio_different = is_ratio_different(min_ratio, self.study_count, |
143
|
|
|
self.study_n, self.pop_count, self.pop_n) |
144
|
|
|
|
145
|
|
|
|
146
|
|
|
# ------------------------------------------------------------------------------------- |
147
|
|
|
# Methods for getting flat namedtuple values from GOEnrichmentRecord object |
148
|
|
|
def get_prtflds_default(self): |
149
|
|
|
"""Get default fields.""" |
150
|
|
|
return self._fldsdefprt[:-1] + \ |
151
|
|
|
["p_{M}".format(M=m.fieldname) for m in self.method_flds] + \ |
152
|
|
|
[self._fldsdefprt[-1]] |
153
|
|
|
|
154
|
|
|
def get_prtflds_all(self): |
155
|
|
|
"""When converting to a namedtuple, get all possible fields in their original order.""" |
156
|
|
|
flds = [] |
157
|
|
|
dont_add = set(['_parents', 'method_flds', 'relationship_rev', 'relationship']) |
158
|
|
|
# Fields: GO NS enrichment name ratio_in_study ratio_in_pop p_uncorrected |
159
|
|
|
# depth study_count p_sm_bonferroni p_fdr_bh study_items |
160
|
|
|
self._flds_append(flds, self.get_prtflds_default(), dont_add) |
161
|
|
|
# Fields: GO NS goterm |
162
|
|
|
# ratio_in_pop pop_n pop_count pop_items name |
163
|
|
|
# ratio_in_study study_n study_count study_items |
164
|
|
|
# method_flds enrichment p_uncorrected p_sm_bonferroni p_fdr_bh |
165
|
|
|
self._flds_append(flds, vars(self).keys(), dont_add) |
166
|
|
|
# Fields: name level is_obsolete namespace id depth parents children _parents alt_ids |
167
|
|
|
self._flds_append(flds, vars(self.goterm).keys(), dont_add) |
168
|
|
|
return flds |
169
|
|
|
|
170
|
|
|
@staticmethod |
171
|
|
|
def _flds_append(flds, addthese, dont_add): |
172
|
|
|
"""Retain order of fields as we add them once to the list.""" |
173
|
|
|
for fld in addthese: |
174
|
|
|
if fld not in flds and fld not in dont_add: |
175
|
|
|
flds.append(fld) |
176
|
|
|
|
177
|
|
|
def get_field_values(self, fldnames, rpt_fmt=True, itemid2name=None): |
178
|
|
|
"""Get flat namedtuple fields for one GOEnrichmentRecord.""" |
179
|
|
|
row = [] |
180
|
|
|
# Loop through each user field desired |
181
|
|
|
for fld in fldnames: |
182
|
|
|
# 1. Check the GOEnrichmentRecord's attributes |
183
|
|
|
val = getattr(self, fld, None) |
184
|
|
|
if val is not None: |
185
|
|
|
if rpt_fmt: |
186
|
|
|
val = self._get_rpt_fmt(fld, val, itemid2name) |
187
|
|
|
row.append(val) |
188
|
|
|
else: |
189
|
|
|
# 2. Check the GO object for the field |
190
|
|
|
val = getattr(self.goterm, fld, None) |
191
|
|
|
if rpt_fmt: |
192
|
|
|
val = self._get_rpt_fmt(fld, val, itemid2name) |
193
|
|
|
if val is not None: |
194
|
|
|
row.append(val) |
195
|
|
|
else: |
196
|
|
|
# 3. Field not found, raise Exception |
197
|
|
|
self._err_fld(fld, fldnames) |
198
|
|
|
if rpt_fmt: |
199
|
|
|
assert not isinstance(val, list), \ |
200
|
|
|
"UNEXPECTED LIST: FIELD({F}) VALUE({V}) FMT({P})".format( |
201
|
|
|
P=rpt_fmt, F=fld, V=val) |
202
|
|
|
return row |
203
|
|
|
|
204
|
|
|
@staticmethod |
205
|
|
|
def _get_rpt_fmt(fld, val, itemid2name=None): |
206
|
|
|
"""Return values in a format amenable to printing in a table.""" |
207
|
|
|
if fld.startswith("ratio_"): |
208
|
|
|
return "{N}/{TOT}".format(N=val[0], TOT=val[1]) |
209
|
|
|
elif fld in set(['study_items', 'pop_items', 'alt_ids']): |
210
|
|
|
if itemid2name is not None: |
211
|
|
|
val = [itemid2name.get(v, v) for v in val] |
212
|
|
|
return ", ".join([str(v) for v in sorted(val)]) |
213
|
|
|
return val |
214
|
|
|
|
215
|
|
|
def _err_fld(self, fld, fldnames): |
216
|
|
|
"""Unrecognized field. Print detailed Failure message.""" |
217
|
|
|
msg = ['ERROR. UNRECOGNIZED FIELD({F})'.format(F=fld)] |
218
|
|
|
actual_flds = set(self.get_prtflds_default() + self.goterm.__dict__.keys()) |
219
|
|
|
bad_flds = set(fldnames).difference(set(actual_flds)) |
220
|
|
|
if bad_flds: |
221
|
|
|
msg.append("\nGOEA RESULT FIELDS: {}".format(" ".join(self._fldsdefprt))) |
222
|
|
|
msg.append("GO FIELDS: {}".format(" ".join(self.goterm.__dict__.keys()))) |
223
|
|
|
msg.append("\nFATAL: {N} UNEXPECTED FIELDS({F})\n".format( |
224
|
|
|
N=len(bad_flds), F=" ".join(bad_flds))) |
225
|
|
|
msg.append(" {N} User-provided fields:".format(N=len(fldnames))) |
226
|
|
|
for idx, fld in enumerate(fldnames, 1): |
227
|
|
|
mrk = "ERROR -->" if fld in bad_flds else "" |
228
|
|
|
msg.append(" {M:>9} {I:>2}) {F}".format(M=mrk, I=idx, F=fld)) |
229
|
|
|
raise Exception("\n".join(msg)) |
230
|
|
|
|
231
|
|
|
|
232
|
|
|
class GOEnrichmentStudy(object): |
233
|
|
|
"""Runs Fisher's exact test, as well as multiple corrections |
234
|
|
|
""" |
235
|
|
|
# Default Excel table column widths for GOEA results |
236
|
|
|
default_fld2col_widths = { |
237
|
|
|
'NS' : 3, |
238
|
|
|
'GO' : 12, |
239
|
|
|
'alt' : 2, |
240
|
|
|
'level' : 3, |
241
|
|
|
'depth' : 3, |
242
|
|
|
'enrichment': 1, |
243
|
|
|
'name' : 60, |
244
|
|
|
'ratio_in_study': 8, |
245
|
|
|
'ratio_in_pop' : 12, |
246
|
|
|
'study_items' : 15, |
247
|
|
|
} |
248
|
|
|
|
249
|
|
|
def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws): |
250
|
|
|
self.log = kws['log'] if 'log' in kws else sys.stdout |
251
|
|
|
self._run_multitest = { |
252
|
|
|
'local':lambda iargs: self._run_multitest_local(iargs), |
253
|
|
|
'statsmodels':lambda iargs: self._run_multitest_statsmodels(iargs)} |
254
|
|
|
self.pop = set(pop) |
255
|
|
|
self.pop_n = len(pop) |
256
|
|
|
self.assoc = assoc |
257
|
|
|
self.obo_dag = obo_dag |
258
|
|
|
self.alpha = alpha |
259
|
|
|
if methods is None: |
260
|
|
|
methods = ["bonferroni", "sidak", "holm"] |
261
|
|
|
self.methods = Methods(methods) |
262
|
|
|
self.pval_obj = FisherFactory(**kws).pval_obj |
263
|
|
|
|
264
|
|
|
if propagate_counts: |
265
|
|
|
sys.stderr.write("Propagating term counts to parents ..\n") |
266
|
|
|
obo_dag.update_association(assoc) |
267
|
|
|
self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log) |
268
|
|
|
|
269
|
|
|
def run_study(self, study, **kws): |
270
|
|
|
"""Run Gene Ontology Enrichment Study (GOEA) on study ids.""" |
271
|
|
|
# Key-word arguments: |
272
|
|
|
methods = Methods(kws['methods']) if 'methods' in kws else self.methods |
273
|
|
|
alpha = kws['alpha'] if 'alpha' in kws else self.alpha |
274
|
|
|
log = kws['log'] if 'log' in kws else self.log |
275
|
|
|
# Calculate uncorrected pvalues |
276
|
|
|
results = self.get_pval_uncorr(study, log) |
277
|
|
|
if not results: |
278
|
|
|
return [] |
279
|
|
|
|
280
|
|
|
if log is not None: |
281
|
|
|
log.write(" {MSG}\n".format(MSG="\n ".join(self.get_results_msg(results, study)))) |
282
|
|
|
|
283
|
|
|
# Do multipletest corrections on uncorrected pvalues and update results |
284
|
|
|
self._run_multitest_corr(results, methods, alpha, study, log) |
285
|
|
|
|
286
|
|
|
for rec in results: |
287
|
|
|
# get go term for name and level |
288
|
|
|
rec.set_goterm(self.obo_dag) |
289
|
|
|
|
290
|
|
|
# 'keep_if' can be used to keep only significant GO terms. Example: |
291
|
|
|
# >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05 # if results are significant |
292
|
|
|
# >>> goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if) |
293
|
|
|
if 'keep_if' in kws: |
294
|
|
|
keep_if = kws['keep_if'] |
295
|
|
|
results = [r for r in results if keep_if(r)] |
296
|
|
|
|
297
|
|
|
# Default sort order: First, sort by BP, MF, CC. Second, sort by pval |
298
|
|
|
results.sort(key=lambda r: [r.NS, r.enrichment, r.p_uncorrected]) |
299
|
|
|
return results # list of GOEnrichmentRecord objects |
300
|
|
|
|
301
|
|
|
def run_study_nts(self, study, **kws): |
302
|
|
|
"""Run GOEA on study ids. Return results as a list of namedtuples.""" |
303
|
|
|
goea_results = self.run_study(study, **kws) |
304
|
|
|
return MgrNtGOEAs(goea_results).get_goea_nts_all() |
305
|
|
|
|
306
|
|
|
def get_results_msg(self, results, study): |
307
|
|
|
"""Return summary for GOEA results.""" |
308
|
|
|
# To convert msg list to string: "\n".join(msg) |
309
|
|
|
msg = [] |
310
|
|
|
if results: |
311
|
|
|
fmt = "{M:6,} GO terms are associated with {N:6,} of {NT:6,}" |
312
|
|
|
stu_items, num_gos_stu = self.get_item_cnt(results, "study_items") |
313
|
|
|
pop_items, num_gos_pop = self.get_item_cnt(results, "pop_items") |
314
|
|
|
stu_txt = fmt.format(N=len(stu_items), M=num_gos_stu, NT=len(set(study))) |
315
|
|
|
pop_txt = fmt.format(N=len(pop_items), M=num_gos_pop, NT=self.pop_n) |
316
|
|
|
msg.append("{POP} population items".format(POP=pop_txt)) |
317
|
|
|
msg.append("{STU} study items".format(STU=stu_txt)) |
318
|
|
|
return msg |
319
|
|
|
|
320
|
|
|
def get_pval_uncorr(self, study, log=sys.stdout): |
321
|
|
|
"""Calculate the uncorrected pvalues for study items.""" |
322
|
|
|
results = [] |
323
|
|
|
study_in_pop = self.pop.intersection(study) |
324
|
|
|
# " 99% 378 of 382 study items found in population" |
325
|
|
|
go2studyitems = get_terms("study", study_in_pop, self.assoc, self.obo_dag, log) |
326
|
|
|
pop_n, study_n = self.pop_n, len(study_in_pop) |
327
|
|
|
allterms = set(go2studyitems).union(set(self.go2popitems)) |
328
|
|
|
if log is not None: |
329
|
|
|
study_n_orig = len(study) |
330
|
|
|
log.write("{R:3.0f}% {N:>6,} of {M:>6,} study items found in population({P})\n".format( |
331
|
|
|
N=study_n, M=study_n_orig, P=pop_n, R=100.0*study_n/study_n_orig)) |
332
|
|
|
log.write("Calculating {N:,} uncorrected p-values using {PFNC}\n".format( |
333
|
|
|
N=len(allterms), PFNC=self.pval_obj.name)) |
334
|
|
|
calc_pvalue = self.pval_obj.calc_pvalue |
335
|
|
|
|
336
|
|
|
for goid in allterms: |
337
|
|
|
study_items = go2studyitems.get(goid, set()) |
338
|
|
|
study_count = len(study_items) |
339
|
|
|
pop_items = self.go2popitems.get(goid, set()) |
340
|
|
|
pop_count = len(pop_items) |
341
|
|
|
|
342
|
|
|
one_record = GOEnrichmentRecord( |
343
|
|
|
GO=goid, |
344
|
|
|
p_uncorrected=calc_pvalue(study_count, study_n, pop_count, pop_n), |
345
|
|
|
study_items=study_items, |
346
|
|
|
pop_items=pop_items, |
347
|
|
|
ratio_in_study=(study_count, study_n), |
348
|
|
|
ratio_in_pop=(pop_count, pop_n)) |
349
|
|
|
|
350
|
|
|
results.append(one_record) |
351
|
|
|
|
352
|
|
|
return results |
353
|
|
|
|
354
|
|
|
def _run_multitest_corr(self, results, usrmethod_flds, alpha, study, log): |
355
|
|
|
"""Do multiple-test corrections on uncorrected pvalues.""" |
356
|
|
|
assert 0 < alpha < 1, "Test-wise alpha must fall between (0, 1)" |
357
|
|
|
pvals = [r.p_uncorrected for r in results] |
358
|
|
|
ntobj = cx.namedtuple("ntobj", "results pvals alpha nt_method study") |
359
|
|
|
|
360
|
|
|
for nt_method in usrmethod_flds: |
361
|
|
|
ntmt = ntobj(results, pvals, alpha, nt_method, study) |
362
|
|
|
self._run_multitest[nt_method.source](ntmt) |
363
|
|
|
if log is not None: |
364
|
|
|
self._log_multitest_corr(log, results, ntmt, alpha) |
365
|
|
|
|
366
|
|
|
def _log_multitest_corr(self, log, results, ntmt, alpha): |
367
|
|
|
"""Print information regarding multitest correction results.""" |
368
|
|
|
ntm = ntmt.nt_method |
369
|
|
|
attr_mult = "p_{M}".format(M=self.methods.get_fieldname(ntm.source, ntm.method)) |
370
|
|
|
sig_cnt = sum(1 for r in results if getattr(r, attr_mult) < alpha) |
371
|
|
|
log.write("{N:8,} GO terms found significant (< {A}=alpha) after ".format( |
372
|
|
|
N=sig_cnt, A=alpha)) |
373
|
|
|
log.write("multitest correction: ") |
374
|
|
|
log.write("{MSRC} {METHOD}\n".format(MSRC=ntm.source, METHOD=ntm.method)) |
375
|
|
|
|
376
|
|
|
def _run_multitest_statsmodels(self, ntmt): |
377
|
|
|
"""Use multitest mthods that have been implemented in statsmodels.""" |
378
|
|
|
# Only load statsmodels if it is used |
379
|
|
|
multipletests = self.methods.get_statsmodels_multipletests() |
380
|
|
|
results = multipletests(ntmt.pvals, ntmt.alpha, ntmt.nt_method.method) |
381
|
|
|
pvals_corrected = results[1] # reject_lst, pvals_corrected, alphacSidak, alphacBonf |
382
|
|
|
self._update_pvalcorr(ntmt, pvals_corrected) |
383
|
|
|
|
384
|
|
|
def _run_multitest_local(self, ntmt): |
385
|
|
|
"""Use multitest mthods that have been implemented locally.""" |
386
|
|
|
corrected_pvals = None |
387
|
|
|
method = ntmt.nt_method.method |
388
|
|
|
if method == "bonferroni": |
389
|
|
|
corrected_pvals = Bonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals |
390
|
|
|
elif method == "sidak": |
391
|
|
|
corrected_pvals = Sidak(ntmt.pvals, ntmt.alpha).corrected_pvals |
392
|
|
|
elif method == "holm": |
393
|
|
|
corrected_pvals = HolmBonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals |
394
|
|
|
elif method == "fdr": |
395
|
|
|
# get the empirical p-value distributions for FDR |
396
|
|
|
term_pop = getattr(self, 'term_pop', None) |
397
|
|
|
if term_pop is None: |
398
|
|
|
term_pop = count_terms(self.pop, self.assoc, self.obo_dag) |
399
|
|
|
p_val_distribution = calc_qval(len(ntmt.study), |
400
|
|
|
self.pop_n, |
401
|
|
|
self.pop, self.assoc, |
402
|
|
|
term_pop, self.obo_dag) |
403
|
|
|
corrected_pvals = FDR(p_val_distribution, |
404
|
|
|
ntmt.results, ntmt.alpha).corrected_pvals |
405
|
|
|
|
406
|
|
|
self._update_pvalcorr(ntmt, corrected_pvals) |
407
|
|
|
|
408
|
|
|
@staticmethod |
409
|
|
|
def _update_pvalcorr(ntmt, corrected_pvals): |
410
|
|
|
"""Add data members to store multiple test corrections.""" |
411
|
|
|
if corrected_pvals is None: |
412
|
|
|
return |
413
|
|
|
for rec, val in zip(ntmt.results, corrected_pvals): |
414
|
|
|
rec.set_corrected_pval(ntmt.nt_method, val) |
415
|
|
|
|
416
|
|
|
# Methods for writing results into tables: text, tab-separated, Excel spreadsheets |
417
|
|
|
def wr_txt(self, fout_txt, goea_results, prtfmt=None, **kws): |
418
|
|
|
"""Print GOEA results to text file.""" |
419
|
|
|
if not goea_results: |
420
|
|
|
sys.stdout.write(" 0 GOEA results. NOT WRITING {FOUT}\n".format(FOUT=fout_txt)) |
421
|
|
|
return |
422
|
|
|
with open(fout_txt, 'w') as prt: |
423
|
|
|
if 'title' in kws: |
424
|
|
|
prt.write("{TITLE}\n".format(TITLE=kws['title'])) |
425
|
|
|
data_nts = self.prt_txt(prt, goea_results, prtfmt, **kws) |
426
|
|
|
log = self.log if self.log is not None else sys.stdout |
427
|
|
|
log.write(" {N:>5} GOEA results for {CUR:5} study items. WROTE: {F}\n".format( |
428
|
|
|
N=len(data_nts), |
429
|
|
|
CUR=len(MgrNtGOEAs(goea_results).get_study_items()), |
430
|
|
|
F=fout_txt)) |
431
|
|
|
|
432
|
|
|
def prt_txt(self, prt, goea_results, prtfmt=None, **kws): |
433
|
|
|
"""Print GOEA results in text format.""" |
434
|
|
|
if prtfmt is None: |
435
|
|
|
prtfmt = ("{GO} {NS} {p_uncorrected:5.2e} {ratio_in_study:>6} {ratio_in_pop:>9} " |
436
|
|
|
"{depth:02} {name:40} {study_items}\n") |
437
|
|
|
prtfmt = self.adjust_prtfmt(prtfmt) |
438
|
|
|
prt_flds = RPT.get_fmtflds(prtfmt) |
439
|
|
|
data_nts = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws) |
440
|
|
|
RPT.prt_txt(prt, data_nts, prtfmt, prt_flds, **kws) |
441
|
|
|
return data_nts |
442
|
|
|
|
443
|
|
|
def wr_xlsx(self, fout_xlsx, goea_results, **kws): |
444
|
|
|
"""Write a xlsx file.""" |
445
|
|
|
# kws: prt_if indent itemid2name(study_items) |
446
|
|
|
prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results)) |
447
|
|
|
xlsx_data = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws) |
448
|
|
|
if 'fld2col_widths' not in kws: |
449
|
|
|
kws['fld2col_widths'] = {f:self.default_fld2col_widths.get(f, 8) for f in prt_flds} |
450
|
|
|
RPT.wr_xlsx(fout_xlsx, xlsx_data, **kws) |
451
|
|
|
|
452
|
|
|
def wr_tsv(self, fout_tsv, goea_results, **kws): |
453
|
|
|
"""Write tab-separated table data to file""" |
454
|
|
|
prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results)) |
455
|
|
|
tsv_data = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws) |
456
|
|
|
RPT.wr_tsv(fout_tsv, tsv_data, **kws) |
457
|
|
|
|
458
|
|
|
def prt_tsv(self, prt, goea_results, **kws): |
459
|
|
|
"""Write tab-separated table data""" |
460
|
|
|
prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results)) |
461
|
|
|
tsv_data = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws) |
462
|
|
|
RPT.prt_tsv(prt, tsv_data, prt_flds, **kws) |
463
|
|
|
|
464
|
|
|
@staticmethod |
465
|
|
|
def adjust_prtfmt(prtfmt): |
466
|
|
|
"""Adjust format_strings for legal values.""" |
467
|
|
|
prtfmt = prtfmt.replace("{p_holm-sidak", "{p_holm_sidak") |
468
|
|
|
prtfmt = prtfmt.replace("{p_simes-hochberg", "{p_simes_hochberg") |
469
|
|
|
return prtfmt |
470
|
|
|
|
471
|
|
|
@staticmethod |
472
|
|
|
def get_ns2nts(results, fldnames=None, **kws): |
473
|
|
|
"""Get namedtuples of GOEA results, split into BP, MF, CC.""" |
474
|
|
|
ns2nts = cx.defaultdict(list) |
475
|
|
|
nts = MgrNtGOEAs(results).get_goea_nts_all(fldnames, **kws) |
476
|
|
|
for ntgoea in nts: |
477
|
|
|
ns2nts[ntgoea.NS].append(ntgoea) |
478
|
|
|
return ns2nts |
479
|
|
|
|
480
|
|
|
@staticmethod |
481
|
|
|
def get_item_cnt(results, attrname="study_items"): |
482
|
|
|
"""Get all study or population items (e.g., geneids).""" |
483
|
|
|
items = set() |
484
|
|
|
go_cnt = 0 |
485
|
|
|
for rec in results: |
486
|
|
|
if hasattr(rec, attrname): |
487
|
|
|
items_cur = getattr(rec, attrname) |
488
|
|
|
# Only count GO term if there are items in the set. |
489
|
|
|
if len(items_cur) != 0: |
490
|
|
|
items |= items_cur |
491
|
|
|
go_cnt += 1 |
492
|
|
|
return items, go_cnt |
493
|
|
|
|
494
|
|
|
@staticmethod |
495
|
|
|
def get_prtflds_default(results): |
496
|
|
|
"""Get default fields names. Used in printing GOEA results. |
497
|
|
|
|
498
|
|
|
Researchers can control which fields they want to print in the GOEA results |
499
|
|
|
or they can use the default fields. |
500
|
|
|
""" |
501
|
|
|
if results: |
502
|
|
|
return results[0].get_prtflds_default() |
503
|
|
|
return [] |
504
|
|
|
|
505
|
|
|
@staticmethod |
506
|
|
|
def print_summary(results, min_ratio=None, indent=False, pval=0.05): |
507
|
|
|
"""Print summary.""" |
508
|
|
|
import goatools |
509
|
|
|
|
510
|
|
|
# Header contains provenance and parameters |
511
|
|
|
date = datetime.date.today() |
512
|
|
|
print("# Generated by GOATOOLS v{0} ({1})".format(goatools.__version__, date)) |
513
|
|
|
print("# min_ratio={0} pval={1}".format(min_ratio, pval)) |
514
|
|
|
|
515
|
|
|
# field names for output |
516
|
|
|
if results: |
517
|
|
|
print("\t".join(GOEnrichmentStudy.get_prtflds_default(results))) |
518
|
|
|
|
519
|
|
|
for rec in results: |
520
|
|
|
# calculate some additional statistics |
521
|
|
|
# (over_under, is_ratio_different) |
522
|
|
|
rec.update_remaining_fldsdefprt(min_ratio=min_ratio) |
523
|
|
|
|
524
|
|
|
if pval is not None and rec.p_uncorrected >= pval: |
525
|
|
|
continue |
526
|
|
|
|
527
|
|
|
if rec.is_ratio_different: |
528
|
|
|
print(rec.__str__(indent=indent)) |
529
|
|
|
|
530
|
|
|
def wr_py_goea_results(self, fout_py, goea_results, **kws): |
531
|
|
|
"""Save GOEA results into Python package containing list of namedtuples.""" |
532
|
|
|
var_name = kws.get("var_name", "goea_results") |
533
|
|
|
docstring = kws.get("docstring", "") |
534
|
|
|
sortby = kws.get("sortby", None) |
535
|
|
|
if goea_results: |
536
|
|
|
from goatools.nt_utils import wr_py_nts |
537
|
|
|
nts_goea = goea_results |
538
|
|
|
# If list has GOEnrichmentRecords or verbose namedtuples, exclude some fields. |
539
|
|
|
if hasattr(goea_results[0], "_fldsdefprt") or hasattr(goea_results[0], 'goterm'): |
540
|
|
|
# Exclude some attributes from the namedtuple when saving results |
541
|
|
|
# to a Python file because the information is redundant or verbose. |
542
|
|
|
nts_goea = MgrNtGOEAs(goea_results).get_goea_nts_prt(**kws) |
543
|
|
|
docstring = "\n".join([docstring, "# {VER}\n\n".format(VER=self.obo_dag.version)]) |
544
|
|
|
assert hasattr(nts_goea[0], '_fields') |
545
|
|
|
if sortby is None: |
546
|
|
|
sortby = lambda nt: [getattr(nt, 'namespace'), getattr(nt, 'enrichment'), |
547
|
|
|
getattr(nt, 'p_uncorrected'), getattr(nt, 'depth'), |
548
|
|
|
getattr(nt, 'GO')] |
549
|
|
|
nts_goea = sorted(nts_goea, key=sortby) |
550
|
|
|
wr_py_nts(fout_py, nts_goea, docstring, var_name) |
551
|
|
|
|
552
|
|
|
# Copyright (C) 2010-2018, H Tang et al., All rights reserved. |
553
|
|
|
|