Completed
Pull Request — master (#83)
by
unknown
01:31
created

GOEnrichmentStudy._get_pval_uncorr()   C

Complexity

Conditions 7

Size

Total Lines 65

Duplication

Lines 0
Ratio 0 %

Importance

Changes 4
Bugs 0 Features 0
Metric Value
cc 7
c 4
b 0
f 0
dl 0
loc 65
rs 6.0105

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
#!/usr/bin/env python
2
# -*- coding: UTF-8 -*-
3
4
"""
5
python %prog study.file population.file gene-association.file
6
7
This program returns P-values for functional enrichment in a cluster of
8
study genes using Fisher's exact test, and corrected for multiple testing
9
(including Bonferroni, Holm, Sidak, and false discovery rate)
10
"""
11
12
from __future__ import absolute_import
13
14
__copyright__ = "Copyright (C) 2010-2017, H Tang et al., All rights reserved."
15
__author__ = "various"
16
17
import sys
18
import collections as cx
19
import datetime
20
from functools import partial
21
import multiprocessing
22
23
24
from goatools.multiple_testing import Methods, Bonferroni, Sidak, HolmBonferroni, FDR, calc_qval
25
from goatools.ratio import get_terms, count_terms, is_ratio_different
26
import goatools.wr_tbl as RPT
27
from goatools.pvalcalc import FisherFactory
28
from .multiprocessing_tools import p_map
29
30
31
def compute_pvals(allterms, calc_pvalue, go2studyitems, go2popitems,
32
                  study_n, pop_n):
33
34
    results = {}
35
    for term in allterms:
36
37
        study_items = go2studyitems.get(term, set())
38
        study_count = len(study_items)
39
        pop_items = go2popitems.get(term, set())
40
        pop_count = len(pop_items)
41
42
        results[term] = calc_pvalue(study_count, study_n, pop_count, pop_n)
43
44
    return results
45
46
47
class GOEnrichmentRecord(object):
48
    """Represents one result (from a single GOTerm) in the GOEnrichmentStudy
49
    """
50
    namespace2NS = cx.OrderedDict([
51
        ('biological_process', 'BP'),
52
        ('molecular_function', 'MF'),
53
        ('cellular_component', 'CC')])
54
55
    # Fields seen in every enrichment result
56
    _fldsdefprt = [
57
        "GO",
58
        "NS",
59
        "enrichment",
60
        "name",
61
        "ratio_in_study",
62
        "ratio_in_pop",
63
        "p_uncorrected",
64
        "depth",
65
        "study_count",
66
        "study_items"]
67
    _fldsdeffmt = ["%s"]*3 + ["%-30s"] + ["%d/%d"] * 2 + ["%.3g"] + ["%d"] * 2 + ["%15s"]
68
69
    _flds = set(_fldsdefprt).intersection(
70
        set(['study_items', 'study_count', 'study_n', 'pop_items', 'pop_count', 'pop_n']))
71
72
    def __init__(self, **kwargs):
73
        # Methods seen in current enrichment result
74
        self._methods = []
75
        for k, v in kwargs.items():
76
            setattr(self, k, v)
77
            if k == 'ratio_in_study':
78
                setattr(self, 'study_count', v[0])
79
                setattr(self, 'study_n', v[1])
80
            if k == 'ratio_in_pop':
81
                setattr(self, 'pop_count', v[0])
82
                setattr(self, 'pop_n', v[1])
83
        self._init_enrichment()
84
        self.goterm = None  # the reference to the GOTerm
85
86
    def get_method_name(self):
87
        """Return name of first method in the _methods list."""
88
        return self._methods[0].fieldname
89
90
    def get_pvalue(self):
91
        """Returns pval for 1st method, if it exists. Else returns uncorrected pval."""
92
        if self._methods:
93
            return getattr(self, "p_{m}".format(m=self.get_method_name()))
94
        return getattr(self, "p_uncorrected")
95
96
    def set_corrected_pval(self, nt_method, pvalue):
97
        """Add object attribute based on method name."""
98
        self._methods.append(nt_method)
99
        fieldname = "".join(["p_", nt_method.fieldname])
100
        setattr(self, fieldname, pvalue)
101
102
    def __str__(self, indent=False):
103
        field_data = [getattr(self, f, "n.a.") for f in self._fldsdefprt[:-1]] + \
104
                     [getattr(self, "p_{}".format(m.fieldname)) for m in self._methods] + \
105
                     [", ".join(sorted(getattr(self, self._fldsdefprt[-1], set())))]
106
        fldsdeffmt = self._fldsdeffmt
107
        field_formatter = fldsdeffmt[:-1] + ["%.3g"]*len(self._methods) + [fldsdeffmt[-1]]
108
        self._chk_fields(field_data, field_formatter)
109
110
        # default formatting only works for non-"n.a" data
111
        for i, f in enumerate(field_data):
112
            if f == "n.a.":
113
                field_formatter[i] = "%s"
114
115
        # print dots to show the level of the term
116
        dots = self.get_indent_dots() if indent else ""
117
        prtdata = "\t".join(a % b for (a, b) in zip(field_formatter, field_data))
118
        return "".join([dots, prtdata])
119
120
    def get_indent_dots(self):
121
        """Get a string of dots ("....") representing the level of the GO term."""
122
        return "." * self.goterm.level if self.goterm is not None else ""
123
124
    @staticmethod
125
    def _chk_fields(field_data, field_formatter):
126
        """Check that expected fields are present."""
127
        if len(field_data) == len(field_formatter):
128
            return
129
        len_dat = len(field_data)
130
        len_fmt = len(field_formatter)
131
        msg = [
132
            "FIELD DATA({d}) != FORMATTER({f})".format(d=len_dat, f=len_fmt),
133
            "DAT({N}): {D}".format(N=len_dat, D=field_data),
134
            "FMT({N}): {F}".format(N=len_fmt, F=field_formatter)]
135
        raise Exception("\n".join(msg))
136
137
    def __repr__(self):
138
        return "GOEnrichmentRecord({GO})".format(GO=self.GO)
139
140
    def set_goterm(self, goid):
141
        """Set goterm and copy GOTerm's name and namespace."""
142
        self.goterm = goid.get(self.GO, None)
143
        present = self.goterm is not None
144
        self.name = self.goterm.name if present else "n.a."
145
        self.NS = self.namespace2NS[self.goterm.namespace] if present else "XX"
146
147
    def _init_enrichment(self):
148
        """Mark as 'enriched' or 'purified'."""
149
        self.enrichment = 'e' if ((1.0 * self.study_count / self.study_n) >
150
                                  (1.0 * self.pop_count / self.pop_n)) else 'p'
151
152
    def update_remaining_fldsdefprt(self, min_ratio=None):
153
        """Finish updating self (GOEnrichmentRecord) field, is_ratio_different."""
154
        self.is_ratio_different = is_ratio_different(min_ratio, self.study_count,
155
                                                     self.study_n, self.pop_count, self.pop_n)
156
157
158
    # -------------------------------------------------------------------------------------
159
    # Methods for getting flat namedtuple values from GOEnrichmentRecord object
160
    def get_prtflds_default(self):
161
        """Get default fields."""
162
        return self._fldsdefprt[:-1] + \
163
               ["p_{M}".format(M=m.fieldname) for m in self._methods] + \
164
               [self._fldsdefprt[-1]]
165
166
    def get_prtflds_all(self):
167
        """When converting to a namedtuple, get all possible fields in their original order."""
168
        flds = []
169
        dont_add = set(['_parents', '_methods'])
170
        # Fields: GO NS enrichment name ratio_in_study ratio_in_pop p_uncorrected
171
        #         depth study_count p_sm_bonferroni p_fdr_bh study_items
172
        self._flds_append(flds, self.get_prtflds_default(), dont_add)
173
        # Fields: GO NS goterm
174
        #         ratio_in_pop pop_n pop_count pop_items name
175
        #         ratio_in_study study_n study_count study_items
176
        #         _methods enrichment p_uncorrected p_sm_bonferroni p_fdr_bh
177
        self._flds_append(flds, vars(self).keys(), dont_add)
178
        # Fields: name level is_obsolete namespace id depth parents children _parents alt_ids
179
        self._flds_append(flds, vars(self.goterm).keys(), dont_add)
180
        return flds
181
182
    @staticmethod
183
    def _flds_append(flds, addthese, dont_add):
184
        """Retain order of fields as we add them once to the list."""
185
        for fld in addthese:
186
            if fld not in flds and fld not in dont_add:
187
                flds.append(fld)
188
189
    def get_field_values(self, fldnames, rpt_fmt=True):
190
        """Get flat namedtuple fields for one GOEnrichmentRecord."""
191
        row = []
192
        # Loop through each user field desired
193
        for fld in fldnames:
194
            # 1. Check the GOEnrichmentRecord's attributes
195
            val = getattr(self, fld, None)
196
            if val is not None:
197
                if rpt_fmt:
198
                    val = self._get_rpt_fmt(fld, val)
199
                row.append(val)
200
            else:
201
                # 2. Check the GO object for the field
202
                val = getattr(self.goterm, fld, None)
203
                if rpt_fmt:
204
                    val = self._get_rpt_fmt(fld, val)
205
                if val is not None:
206
                    row.append(val)
207
                else:
208
                    # 3. Field not found, raise Exception
209
                    self._err_fld(fld, fldnames, row)
210
            if rpt_fmt:
211
                assert not isinstance(val, list), \
212
                   "UNEXPECTED LIST: FIELD({F}) VALUE({V}) FMT({P})".format(
213
                       P=rpt_fmt, F=fld, V=val)
214
        return row
215
216
    @staticmethod
217
    def _get_rpt_fmt(fld, val):
218
        """Return values in a format amenable to printing in a table."""
219
        if fld.startswith("ratio_"):
220
            return "{N}/{TOT}".format(N=val[0], TOT=val[1])
221
        elif fld in set(['study_items', 'pop_items', 'alt_ids']):
222
            return ", ".join([str(v) for v in sorted(val)])
223
        return val
224
225
    def _err_fld(self, fld, fldnames):
226
        """Unrecognized field. Print detailed Failure message."""
227
        msg = ['ERROR. UNRECOGNIZED FIELD({F})'.format(F=fld)]
228
        actual_flds = set(self.get_prtflds_default() + self.goterm.__dict__.keys())
229
        bad_flds = set(fldnames).difference(set(actual_flds))
230
        if bad_flds:
231
            msg.append("\nGOEA RESULT FIELDS: {}".format(" ".join(self._fldsdefprt)))
232
            msg.append("GO FIELDS: {}".format(" ".join(self.goterm.__dict__.keys())))
233
            msg.append("\nFATAL: {N} UNEXPECTED FIELDS({F})\n".format(
234
                N=len(bad_flds), F=" ".join(bad_flds)))
235
            msg.append("  {N} User-provided fields:".format(N=len(fldnames)))
236
            for idx, fld in enumerate(fldnames, 1):
237
                mrk = "ERROR -->" if fld in bad_flds else ""
238
                msg.append("  {M:>9} {I:>2}) {F}".format(M=mrk, I=idx, F=fld))
239
        raise Exception("\n".join(msg))
240
241
242
class GOEnrichmentStudy(object):
243
    """Runs Fisher's exact test, as well as multiple corrections
244
    """
245
    # Default Excel table column widths for GOEA results
246
    default_fld2col_widths = {
247
        'NS'        :  3,
248
        'GO'        : 12,
249
        'level'     :  3,
250
        'enrichment':  1,
251
        'name'      : 60,
252
        'ratio_in_study':  8,
253
        'ratio_in_pop'  : 12,
254
        'study_items'   : 15,
255
    }
256
257
    def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
258
        self.log = kws.get('log', sys.stdout)
259
        # -1 avoids freezing of the machine:
260
        self.n_cores = kws.get('n_cores', multiprocessing.cpu_count() - 1)
261
        self._run_multitest = {
262
            'local':lambda iargs: self._run_multitest_local(iargs),
263
            'statsmodels':lambda iargs: self._run_multitest_statsmodels(iargs)}
264
        self.pop = pop
265
        self.pop_n = len(pop)
266
        self.assoc = assoc
267
        self.obo_dag = obo_dag
268
        self.alpha = alpha
269
        if methods is None:
270
            methods = ["bonferroni", "sidak", "holm"]
271
        self.methods = Methods(methods)
272
        self.pval_obj = FisherFactory(**kws).pval_obj
273
274
        if propagate_counts:
275
            sys.stderr.write("Propagating term counts to parents ..\n")
276
            obo_dag.update_association(assoc)
277
        self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)
278
279
    def run_study(self, study, **kws):
280
        """Run Gene Ontology Enrichment Study (GOEA) on study ids."""
281
        # Key-word arguments:
282
        methods = Methods(kws['methods']) if 'methods' in kws else self.methods
283
        alpha = kws['alpha'] if 'alpha' in kws else self.alpha
284
        log = kws['log'] if 'log' in kws else self.log
285
        # Calculate uncorrected pvalues
286
        results = self._get_pval_uncorr(study, log)
287
        if not results:
288
            return []
289
290
        # Do multipletest corrections on uncorrected pvalues and update results
291
        self._run_multitest_corr(results, methods, alpha, study, log)
292
293
        for rec in results:
294
            # get go term for name and level
295
            rec.set_goterm(self.obo_dag)
296
297
        # 'keep_if' can be used to keep only significant GO terms. Example:
298
        #     >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05 # if results are significant
299
        #     >>> goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if)
300
        if 'keep_if' in kws:
301
            keep_if = kws['keep_if']
302
            results = [r for r in results if keep_if(r)]
303
304
        # Default sort order: First, sort by BP, MF, CC. Second, sort by pval
305
        results.sort(key=lambda r: [r.NS, r.p_uncorrected])
306
307
        if log is not None:
308
            log.write("  {MSG}\n".format(MSG="\n  ".join(self.get_results_msg(results, study))))
309
310
        return results # list of GOEnrichmentRecord objects
311
312
    def run_study_nts(self, study, **kws):
313
        """Run GOEA on study ids. Return results as a list of namedtuples."""
314
        goea_results = self.run_study(study, **kws)
315
        return get_goea_nts_all(goea_results)
316
317
    def get_results_msg(self, results, study):
318
        """Return summary for GOEA results."""
319
        # To convert msg list to string: "\n".join(msg)
320
        msg = []
321
        if results:
322
            stu_items, num_gos_stu = self.get_item_cnt(results, "study_items")
323
            pop_items, num_gos_pop = self.get_item_cnt(results, "pop_items")
324
            msg.append("{M:,} GO terms are associated with {N:,} of {NT:,} study items".format(
325
                N=len(stu_items), NT=len(set(study)), M=num_gos_stu))
326
            msg.append("{M:,} GO terms are associated with {N:,} of {NT:,} population items".format(
327
                N=len(pop_items), NT=self.pop_n, M=num_gos_pop))
328
        return msg
329
330
    def _get_pval_uncorr(self, study, log=sys.stdout):
331
        """Calculate the uncorrected pvalues for study items."""
332
        if log is not None:
333
            log.write("Calculating uncorrected p-values using {PFNC}\n".format(PFNC=self.pval_obj.name))
334
        go2studyitems = get_terms("study", study, self.assoc, self.obo_dag, log)
335
        pop_n, study_n = self.pop_n, len(study)
336
        allterms = set(go2studyitems.keys()).union(set(self.go2popitems.keys()))
337
338
        n_cores = self.n_cores
339
340
        # for smaller sizes starup/shutdown of a multiprocessing Pool is too expensive
341
        # limit is based on some heursistincs on a recent Mac.
342
        if len(allterms) < 100:
343
            n_cores = 1
344
345
        log.write("use {} cores for computing pvalues\n".format(n_cores))
346
347
        allterms = list(allterms)
348
        fragments = [allterms[i::n_cores] for i in range(n_cores)]
349
350
        # bind arguments, so that remote_func only depends on fragment of terms to process:
351
        calc_pvalue = self.pval_obj.calc_pvalue
352
        remote_func = partial(compute_pvals, calc_pvalue=calc_pvalue, go2studyitems=go2studyitems,
353
                              go2popitems=self.go2popitems, study_n=study_n, pop_n=pop_n)
354
355
356
        if n_cores == 1:
357
            all_p_values = [remote_func(fragments[0])]
358
        else:
359
            # if self.pval_obj.log is a file handle, which we can not serialize, we could not
360
            # transfer self.pval_obj.calc_pvalue to another python process with multiprocessing.
361
            # therefore we "patch" the object which will later be restored again.
362
            old = self.pval_obj.log
363
            self.pval_obj.log = None
364
            p = multiprocessing.Pool(n_cores)
365
            try:
366
                all_p_values = p_map(p, remote_func, fragments)
367
            finally:
368
                # restore patched file handle
369
                self.pval_obj.log = old
370
                # release memory
371
                p.terminate()
372
373
        results = []
374
375
        for p_values_map in all_p_values:
376
377
            for term, p_value in p_values_map.items():
378
379
                study_items = go2studyitems.get(term, set())
380
                study_count = len(study_items)
381
                pop_items = self.go2popitems.get(term, set())
382
                pop_count = len(pop_items)
383
384
                one_record = GOEnrichmentRecord(
385
                    GO=term,
386
                    p_uncorrected=p_value,
387
                    study_items=study_items,
388
                    pop_items=pop_items,
389
                    ratio_in_study=(study_count, study_n),
390
                    ratio_in_pop=(pop_count, pop_n))
391
392
                results.append(one_record)
393
394
        return results
395
396
    def _run_multitest_corr(self, results, usr_methods, alpha, study, log):
397
        """Do multiple-test corrections on uncorrected pvalues."""
398
        assert 0 < alpha < 1, "Test-wise alpha must fall between (0, 1)"
399
        pvals = [r.p_uncorrected for r in results]
400
        NtMt = cx.namedtuple("NtMt", "results pvals alpha nt_method study")
401
402
        for nt_method in usr_methods:
403
            ntmt = NtMt(results, pvals, alpha, nt_method, study)
404
            if log is not None:
405
                log.write("Running multitest correction: {MSRC} {METHOD}\n".format(
406
                    MSRC=ntmt.nt_method.source, METHOD=ntmt.nt_method.method))
407
            self._run_multitest[nt_method.source](ntmt)
408
409
    def _run_multitest_statsmodels(self, ntmt):
410
        """Use multitest mthods that have been implemented in statsmodels."""
411
        # Only load statsmodels if it is used
412
        multipletests = self.methods.get_statsmodels_multipletests()
413
        results = multipletests(ntmt.pvals, ntmt.alpha, ntmt.nt_method.method)
414
        pvals_corrected = results[1] # reject_lst, pvals_corrected, alphacSidak, alphacBonf
415
        self._update_pvalcorr(ntmt, pvals_corrected)
416
417
    def _run_multitest_local(self, ntmt):
418
        """Use multitest mthods that have been implemented locally."""
419
        corrected_pvals = None
420
        method = ntmt.nt_method.method
421
        if method == "bonferroni":
422
            corrected_pvals = Bonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals
423
        elif method == "sidak":
424
            corrected_pvals = Sidak(ntmt.pvals, ntmt.alpha).corrected_pvals
425
        elif method == "holm":
426
            corrected_pvals = HolmBonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals
427
        elif method == "fdr":
428
            # get the empirical p-value distributions for FDR
429
            term_pop = getattr(self, 'term_pop', None)
430
            if term_pop is None:
431
                term_pop = count_terms(self.pop, self.assoc, self.obo_dag)
432
            p_val_distribution = calc_qval(len(ntmt.study),
433
                                           self.pop_n,
434
                                           self.pop, self.assoc,
435
                                           term_pop, self.obo_dag)
436
            corrected_pvals = FDR(p_val_distribution,
437
                                  ntmt.results, ntmt.alpha).corrected_pvals
438
439
        self._update_pvalcorr(ntmt, corrected_pvals)
440
441
    @staticmethod
442
    def _update_pvalcorr(ntmt, corrected_pvals):
443
        """Add data members to store multiple test corrections."""
444
        if corrected_pvals is None:
445
            return
446
        for rec, val in zip(ntmt.results, corrected_pvals):
447
            rec.set_corrected_pval(ntmt.nt_method, val)
448
449
    # Methods for writing results into tables: text, tab-separated, Excel spreadsheets
450
    def wr_txt(self, fout_txt, goea_results, prtfmt=None, **kws):
451
        """Print GOEA results to text file."""
452
        if not goea_results:
453
            sys.stdout.write("      0 GOEA results. NOT WRITING {FOUT}\n".format(FOUT=fout_txt))
454
            return
455
        with open(fout_txt, 'w') as prt:
456
            data_nts = self.prt_txt(prt, goea_results, prtfmt, **kws)
457
            log = self.log if self.log is not None else sys.stdout
458
            log.write("  {N:>5} GOEA results for {CUR:5} study items. WROTE: {F}\n".format(
459
                N=len(data_nts),
460
                CUR=len(get_study_items(goea_results)),
461
                F=fout_txt))
462
463
    def prt_txt(self, prt, goea_results, prtfmt=None, **kws):
464
        """Print GOEA results in text format."""
465
        if prtfmt is None:
466
            prtfmt = "{GO} {NS} {p_uncorrected:5.2e} {study_count:>5} {name}\n"
467
        prtfmt = self.adjust_prtfmt(prtfmt)
468
        prt_flds = RPT.get_fmtflds(prtfmt)
469
        data_nts = get_goea_nts_prt(goea_results, prt_flds, **kws)
470
        RPT.prt_txt(prt, data_nts, prtfmt, prt_flds, **kws)
471
        return data_nts
472
473
    def wr_xlsx(self, fout_xlsx, goea_results, **kws):
474
        """Write a xlsx file."""
475
        # kws: prt_if indent
476
        prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results))
477
        xlsx_data = get_goea_nts_prt(goea_results, prt_flds, **kws)
478
        if 'fld2col_widths' not in kws:
479
            kws['fld2col_widths'] = {f:self.default_fld2col_widths.get(f, 8) for f in prt_flds}
480
        RPT.wr_xlsx(fout_xlsx, xlsx_data, **kws)
481
482
    def wr_tsv(self, fout_tsv, goea_results, **kws):
483
        """Write tab-separated table data to file"""
484
        prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results))
485
        tsv_data = get_goea_nts_prt(goea_results, prt_flds, **kws)
486
        RPT.wr_tsv(fout_tsv, tsv_data, **kws)
487
488
    def prt_tsv(self, prt, goea_results, **kws):
489
        """Write tab-separated table data"""
490
        prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results))
491
        tsv_data = get_goea_nts_prt(goea_results, prt_flds, **kws)
492
        RPT.prt_tsv(prt, tsv_data, prt_flds, **kws)
493
494
    @staticmethod
495
    def adjust_prtfmt(prtfmt):
496
        """Adjust format_strings for legal values."""
497
        prtfmt = prtfmt.replace("{p_holm-sidak", "{p_holm_sidak")
498
        prtfmt = prtfmt.replace("{p_simes-hochberg", "{p_simes_hochberg")
499
        return prtfmt
500
501
    @staticmethod
502
    def get_NS2nts(results, fldnames=None, **kws):
503
        """Get namedtuples of GOEA results, split into BP, MF, CC."""
504
        NS2nts = cx.defaultdict(list)
505
        nts = get_goea_nts_all(results, fldnames, **kws)
506
        for nt in nts:
507
            NS2nts[nt.NS].append(nt)
508
        return NS2nts
509
510
    @staticmethod
511
    def get_item_cnt(results, attrname="study_items"):
512
        """Get all study or population items (e.g., geneids)."""
513
        items = set()
514
        go_cnt = 0
515
        for rec in results:
516
            if hasattr(rec, attrname):
517
                items_cur = getattr(rec, attrname)
518
                # Only count GO term if there are items in the set.
519
                if len(items_cur) != 0:
520
                    items |= items_cur
521
                    go_cnt += 1
522
        return items, go_cnt
523
524
    @staticmethod
525
    def get_prtflds_default(results):
526
        """Get default fields names. Used in printing GOEA results.
527
528
           Researchers can control which fields they want to print in the GOEA results
529
           or they can use the default fields.
530
        """
531
        if results:
532
            return results[0].get_prtflds_default()
533
        return []
534
535
    @staticmethod
536
    def print_summary(results, min_ratio=None, indent=False, pval=0.05):
537
        """Print summary."""
538
        from .version import __version__ as version
539
540
        # Header contains provenance and parameters
541
        print("# Generated by GOATOOLS v{0} ({1})".format(version, datetime.date.today()))
542
        print("# min_ratio={0} pval={1}".format(min_ratio, pval))
543
544
        # field names for output
545
        if results:
546
            print("\t".join(GOEnrichmentStudy.get_prtflds_default(results)))
547
548
        for rec in results:
549
            # calculate some additional statistics
550
            # (over_under, is_ratio_different)
551
            rec.update_remaining_fldsdefprt(min_ratio=min_ratio)
552
553
            if pval is not None and rec.p_uncorrected >= pval:
554
                continue
555
556
            if rec.is_ratio_different:
557
                print(rec.__str__(indent=indent))
558
559
    def wr_py_goea_results(self, fout_py, goea_results, **kws):
560
        """Save GOEA results into Python package containing list of namedtuples."""
561
        var_name = kws.get("var_name", "goea_results")
562
        docstring = kws.get("docstring", "")
563
        sortby = kws.get("sortby", None)
564
        if goea_results:
565
            from goatools.nt_utils import wr_py_nts
566
            nts_goea = goea_results
567
            # If list has GOEnrichmentRecords or verbose namedtuples, exclude some fields.
568
            if hasattr(goea_results[0], "_fldsdefprt") or hasattr(goea_results[0], 'goterm'):
569
                # Exclude some attributes from the namedtuple when saving results
570
                # to a Python file because the information is redundant or verbose.
571
                nts_goea = get_goea_nts_prt(goea_results)
572
            docstring = "\n".join([docstring, "# {VER}\n\n".format(VER=self.obo_dag.version)])
573
            assert hasattr(nts_goea[0], '_fields')
574
            if sortby is None:
575
                sortby = lambda nt: getattr(nt, 'p_uncorrected')
576
            nts_goea = sorted(nts_goea, key=sortby)
577
            wr_py_nts(fout_py, nts_goea, docstring, var_name)
578
579
def get_study_items(goea_results):
580
    """Get all study items (e.g., geneids)."""
581
    study_items = set()
582
    for rec in goea_results:
583
        study_items |= rec.study_items
584
    return study_items
585
586
def get_goea_nts_prt(goea_results, fldnames=None, **usr_kws):
587
    """Return list of namedtuples removing fields which are redundant or verbose."""
588
    kws = usr_kws.copy()
589
    if 'not_fldnames' not in kws:
590
        kws['not_fldnames'] = ['goterm', 'parents', 'children', 'id']
591
    if 'rpt_fmt' not in kws:
592
        kws['rpt_fmt'] = True
593
    return get_goea_nts_all(goea_results, fldnames, **kws)
594
595
def get_goea_nts_all(goea_results, fldnames=None, **kws):
596
    """Get namedtuples containing user-specified (or default) data from GOEA results.
597
598
        Reformats data from GOEnrichmentRecord objects into lists of
599
        namedtuples so the generic table writers may be used.
600
    """
601
    data_nts = [] # A list of namedtuples containing GOEA results
602
    if not goea_results:
603
        return data_nts
604
    keep_if = kws.get('keep_if', None)
605
    rpt_fmt = kws.get('rpt_fmt', False)
606
    indent = kws.get('indent', False)
607
    # I. FIELD (column) NAMES
608
    not_fldnames = kws.get('not_fldnames', None)
609
    if fldnames is None:
610
        fldnames = get_fieldnames(goea_results[0])
611
    # Ia. Explicitly exclude specific fields from named tuple
612
    if not_fldnames is not None:
613
        fldnames = [f for f in fldnames if f not in not_fldnames]
614
    nttyp = cx.namedtuple("NtGoeaResults", " ".join(fldnames))
615
    goid_idx = fldnames.index("GO") if 'GO' in fldnames else None
616
    # II. Loop through GOEA results stored in a GOEnrichmentRecord object
617
    for goerec in goea_results:
618
        vals = get_field_values(goerec, fldnames, rpt_fmt)
619
        if indent:
620
            vals[goid_idx] = "".join([goerec.get_indent_dots(), vals[goid_idx]])
621
        ntobj = nttyp._make(vals)
622
        if keep_if is None or keep_if(ntobj):
623
            data_nts.append(ntobj)
624
    return data_nts
625
626
def get_field_values(item, fldnames, rpt_fmt=None):
627
    """Return fieldnames and values of either a namedtuple or GOEnrichmentRecord."""
628
    if hasattr(item, "_fldsdefprt"): # Is a GOEnrichmentRecord
629
        return item.get_field_values(fldnames, rpt_fmt)
630
    if hasattr(item, "_fields"): # Is a namedtuple
631
        return [getattr(item, f) for f in fldnames]
632
633
def get_fieldnames(item):
634
    """Return fieldnames of either a namedtuple or GOEnrichmentRecord."""
635
    if hasattr(item, "_fldsdefprt"): # Is a GOEnrichmentRecord
636
        return item.get_prtflds_all()
637
    if hasattr(item, "_fields"): # Is a namedtuple
638
        return item._fields
639
640
# Copyright (C) 2010-2017, H Tang et al., All rights reserved.
641