read_ncbi_gene2go()   D
last analyzed

Complexity

Conditions 13

Size

Total Lines 38

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 13
dl 0
loc 38
rs 4.2
c 0
b 0
f 0

How to fix   Complexity   

Complexity

Complex classes like read_ncbi_gene2go() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""
2
Routines to read in association file between genes and GO terms.
3
"""
4
5
__copyright__ = "Copyright (C) 2010-2018, H Tang et al. All rights reserved."
6
__author__ = "various"
7
8
from collections import defaultdict
9
import os
10
import sys
11
import wget
12
from goatools.base import dnld_file
13
from goatools.semantic import TermCounts
14
from goatools.anno.gaf_reader import GafReader
15
16
def dnld_assc(assc_name, go2obj, prt=sys.stdout):
17
    """Download association from http://geneontology.org/gene-associations."""
18
    # Example assc_name: "gene_association.tair"
19
    # Download the Association
20
    dirloc, assc_base = os.path.split(assc_name)
21
    if not dirloc:
22
        dirloc = os.getcwd()
23
    assc_locfile = os.path.join(dirloc, assc_base) if not dirloc else assc_name
24
    if not os.path.isfile(assc_locfile):
25
        assc_http = "http://geneontology.org/gene-associations/"
26
        for ext in ['gz']:
27
            src = os.path.join(assc_http, "{ASSC}.{EXT}".format(ASSC=assc_base, EXT=ext))
28
            dnld_file(src, assc_locfile, prt, loading_bar=None)
29
    # Read the downloaded association
30
    assc_orig = read_gaf(assc_locfile, prt)
31
    if go2obj is None:
32
        return assc_orig
33
    # If a GO DAG is provided, use only GO IDs present in the GO DAG
34
    assc = {}
35
    goids_dag = set(go2obj.keys())
36
    for gene, goids_cur in assc_orig.items():
37
        assc[gene] = goids_cur.intersection(goids_dag)
38
    return assc
39
40
def read_associations(assoc_fn, no_top=False):
41
    """
42
    Reads a gene id go term association file. The format of the file
43
    is as follows:
44
45
    AAR1	GO:0005575;GO:0003674;GO:0006970;GO:0006970;GO:0040029
46
    AAR2	GO:0005575;GO:0003674;GO:0040029;GO:0009845
47
    ACD5	GO:0005575;GO:0003674;GO:0008219
48
    ACL1	GO:0005575;GO:0003674;GO:0009965;GO:0010073
49
    ACL2	GO:0005575;GO:0003674;GO:0009826
50
    ACL3	GO:0005575;GO:0003674;GO:0009826;GO:0009965
51
52
    Also, the following format is accepted (gene ids are repeated):
53
54
    AAR1	GO:0005575
55
    AAR1    GO:0003674
56
    AAR1    GO:0006970
57
    AAR2	GO:0005575
58
    AAR2    GO:0003674
59
    AAR2    GO:0040029
60
61
    :param assoc_fn: file name of the association
62
    :return: dictionary having keys: gene id, values set of GO terms
63
    """
64
    assoc = defaultdict(set)
65
    top_terms = set(['GO:0008150', 'GO:0003674', 'GO:0005575']) # BP, MF, CC
66
    for row in open(assoc_fn, 'r'):
67
        atoms = row.split()
68
        if len(atoms) == 2:
69
            gene_id, go_terms = atoms
70
        elif len(atoms) > 2 and row.count('\t') == 1:
71
            gene_id, go_terms = row.split("\t")
72
        else:
73
            continue
74
        gos = set(go_terms.split(";"))
75
        if no_top:
76
            gos = gos.difference(top_terms)
77
        assoc[gene_id] |= gos
78
79
    return assoc
80
81
def get_assoc_ncbi_taxids(taxids, force_dnld=False, loading_bar=True, **kws):
82
    """Download NCBI's gene2go. Return annotations for user-specified taxid(s)."""
83
    fin = kws['gene2go'] if 'gene2go' in kws else os.path.join(os.getcwd(), "gene2go")
84
    dnld_ncbi_gene_file(fin, force_dnld, loading_bar=loading_bar)
85
    return read_ncbi_gene2go(fin, taxids, **kws)
86
87
def dnld_ncbi_gene_file(fin, force_dnld=False, log=sys.stdout, loading_bar=True):
88
    """Download a file from NCBI Gene's ftp server."""
89
    if not os.path.exists(fin) or force_dnld:
90
        import gzip
91
        fin_dir, fin_base = os.path.split(fin)
92
        fin_gz = "{F}.gz".format(F=fin_base)
93
        fin_gz = os.path.join(fin_dir, fin_gz)
94
        if os.path.exists(fin_gz):
95
            os.remove(fin_gz)
96
        fin_ftp = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/{F}.gz".format(F=fin_base)
97
        if log is not None:
98
            log.write("  DOWNLOADING GZIP: {GZ}\n".format(GZ=fin_ftp))
99
        if loading_bar:
100
            loading_bar = wget.bar_adaptive
101
        wget.download(fin_ftp, bar=loading_bar)
102
        with gzip.open(fin_gz, 'rb') as zstrm:
103
            if log is not None:
104
                log.write("\n  READ GZIP:  {F}\n".format(F=fin_gz))
105
            with open(fin, 'wb') as ostrm:
106
                ostrm.write(zstrm.read())
107
                if log is not None:
108
                    log.write("  WROTE UNZIPPED: {F}\n".format(F=fin))
109
110
def read_ncbi_gene2go(fin_gene2go, taxids=None, **kws):
111
    """Read NCBI's gene2go. Return gene2go data for user-specified taxids."""
112
    # Written by DV Klopfenstein
113
    # kws: taxid2asscs evidence_set
114
    # Simple associations
115
    id2gos = defaultdict(set)
116
    # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
117
    # e.g., taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))
118
    taxid2asscs = kws.get('taxid2asscs', None)
119
    evs = kws.get('evidence_set', None)
120
    # By default, return id2gos. User can cause go2geneids to be returned by:
121
    #   >>> read_ncbi_gene2go(..., go2geneids=True
122
    b_geneid2gos = not kws.get('go2geneids', False)
123
    if taxids is None: # Default taxid is Human
124
        taxids = [9606]
125
    with open(fin_gene2go) as ifstrm:
126
        for line in ifstrm:
127
            if line[0] != '#': # Line contains data. Not a comment
128
                line = line.rstrip() # chomp
129
                flds = line.split('\t')
130
                if len(flds) >= 5:
131
                    taxid_curr, geneid, go_id, evidence, qualifier = flds[:5]
132
                    taxid_curr = int(taxid_curr)
133
                    # NOT: Used when gene is expected to have function F, but does NOT.
134
                    # ND : GO function not seen after exhaustive annotation attempts to the gene.
135
                    if taxid_curr in taxids and qualifier != 'NOT' and evidence != 'ND':
136
                        # Optionally specify a subset of GOs based on their evidence.
137
                        if evs is None or evidence in evs:
138
                            geneid = int(geneid)
139
                            if b_geneid2gos:
140
                                id2gos[geneid].add(go_id)
141
                            else:
142
                                id2gos[go_id].add(geneid)
143
                            if taxid2asscs is not None:
144
                                taxid2asscs[taxid_curr]['GeneID2GOs'][geneid].add(go_id)
145
                                taxid2asscs[taxid_curr]['GO2GeneIDs'][go_id].add(geneid)
146
        sys.stdout.write("  {N:,} items READ: {ASSC}\n".format(N=len(id2gos), ASSC=fin_gene2go))
147
    return id2gos # return simple associations
148
149
def get_gaf_hdr(fin_gaf):
150
    """Read Gene Association File (GAF). Return GAF version and data info."""
151
    return GafReader(fin_gaf, hdr_only=True).hdr
152
153
def read_gaf(fin_gaf, prt=sys.stdout, **kws):
154
    """Read Gene Association File (GAF). Return data."""
155
    # keyword arguments for choosing which GO IDs to keep
156
    taxid2asscs = kws.get('taxid2asscs', None)
157
    b_geneid2gos = not kws.get('go2geneids', False)
158
    evs = kws.get('evidence_set', None)
159
    eval_nd = get_nd(kws.get('keep_ND', False))
160
    eval_not = get_not(kws.get('keep_NOT', False))
161
    # keyword arguments what is read from GAF.
162
    hdr_only = kws.get('hdr_only', None) # Read all data from GAF by default
163
    # Read GAF file
164
    # Simple associations
165
    id2gos = defaultdict(set)
166
    # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
167
    gafobj = GafReader(fin_gaf, hdr_only, prt, **kws)
168
    # Optionally specify a subset of GOs based on their evidence.
169
    # By default, return id2gos. User can cause go2geneids to be returned by:
170
    #   >>> read_ncbi_gene2go(..., go2geneids=True
171
    for ntgaf in gafobj.associations:
172
        if eval_nd(ntgaf) and eval_not(ntgaf):
173
            if evs is None or ntgaf.Evidence_Code in evs:
174
                taxid = ntgaf.Taxon[0]
175
                geneid = ntgaf.DB_ID
176
                go_id = ntgaf.GO_ID
177
                if b_geneid2gos:
178
                    id2gos[geneid].add(go_id)
179
                else:
180
                    id2gos[go_id].add(geneid)
181
                if taxid2asscs is not None:
182
                    taxid2asscs[taxid]['ID2GOs'][geneid].add(go_id)
183
                    taxid2asscs[taxid]['GO2IDs'][go_id].add(geneid)
184
    return id2gos # return simple associations
185
186
def get_nd(keep_nd):
187
    """Allow GAF values always or never."""
188
    if keep_nd:
189
        return lambda nt: True
190
    return lambda nt: nt.Evidence_Code != 'ND'
191
192
def get_not(keep_not):
193
    """Allow GAF values always or never."""
194
    if keep_not:
195
        return lambda nt: True
196
    return lambda nt: 'NOT' not in nt.Qualifier
197
198
def get_b2aset(a2bset):
199
    """Given gene2gos, return go2genes. Given go2genes, return gene2gos."""
200
    b2aset = {}
201
    for a_item, bset in a2bset.items():
202
        for b_item in bset:
203
            if b_item in b2aset:
204
                b2aset[b_item].add(a_item)
205
            else:
206
                b2aset[b_item] = set([a_item])
207
    return b2aset
208
209
def get_assc_pruned(assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys.stdout):
210
    """Remove GO IDs associated with large numbers of genes. Used in stochastic simulations."""
211
    # DEFN WAS: get_assc_pruned(assc_geneid2gos, max_genecnt=None, prt=sys.stdout):
212
    #      ADDED min_genecnt argument and functionality
213
    if max_genecnt is None and min_genecnt is None:
214
        return assc_geneid2gos, set()
215
    go2genes_orig = get_b2aset(assc_geneid2gos)
216
    # go2genes_prun = {go:gs for go, gs in go2genes_orig.items() if len(gs) <= max_genecnt}
217
    go2genes_prun = {}
218
    for goid, genes in go2genes_orig.items():
219
        num_genes = len(genes)
220
        if (min_genecnt is None or num_genes >= min_genecnt) and \
221
           (max_genecnt is None or num_genes <= max_genecnt):
222
            go2genes_prun[goid] = genes
223
    num_was = len(go2genes_orig)
224
    num_now = len(go2genes_prun)
225
    gos_rm = set(go2genes_orig.keys()).difference(set(go2genes_prun.keys()))
226
    assert num_was-num_now == len(gos_rm)
227
    if prt is not None:
228
        if min_genecnt is None:
229
            min_genecnt = 1
230
        if max_genecnt is None:
231
            max_genecnt = "Max"
232
        prt.write("{N:4} GO IDs pruned. Kept {NOW} GOs assc w/({m} to {M} genes)\n".format(
233
            m=min_genecnt, M=max_genecnt, N=num_was-num_now, NOW=num_now))
234
    return get_b2aset(go2genes_prun), gos_rm
235
236
def read_annotations(**kws):
237
    """Read annotations from either a GAF file or NCBI's gene2go file."""
238
    if 'gaf' not in kws and 'gene2go' not in kws:
239
        return
240
    gene2gos = None
241
    if 'gaf' in kws:
242
        gene2gos = read_gaf(kws['gaf'], prt=sys.stdout)
243
    elif 'gene2go' in kws:
244
        gene2gos = read_ncbi_gene2go(kws['gene2go'], taxids=[kws['taxid']])
245
    if not gene2gos:
246
        raise RuntimeError("NO ASSOCIATIONS LOADED")
247
    return gene2gos
248
249
def get_tcntobj(go2obj, **kws):
250
    """Return a TermCounts object if the user provides an annotation file, otherwise None."""
251
    # kws: gaf gene2go
252
    annots = read_annotations(**kws)
253
    if annots:
254
        return TermCounts(go2obj, annots)
255
256
257
# Copyright (C) 2010-2018, H Tang et al. All rights reserved."
258