1
|
|
|
"""Functions to read text or tsv files containing GO IDs and sections of GO IDs.""" |
2
|
|
|
|
3
|
|
|
from __future__ import print_function |
4
|
|
|
|
5
|
|
|
import os |
6
|
|
|
import sys |
7
|
|
|
import re |
8
|
|
|
from goatools.gosubdag.go_tasks import chk_goids |
9
|
|
|
from goatools.grouper.hdrgos import HdrgosSections |
10
|
|
|
from goatools.grouper.grprobj import Grouper |
11
|
|
|
from goatools.grouper.tasks import SummarySec2dHdrGos |
12
|
|
|
|
13
|
|
|
__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, All rights reserved." |
14
|
|
|
__author__ = "DV Klopfenstein" |
15
|
|
|
|
16
|
|
|
|
17
|
|
|
def read_sections(sections_file, exclude_ungrouped=False, prt=sys.stdout):
    """Get sections and their GO IDs from a sections file, if it exists.

    Args:
        sections_file: Path of the sections file, or None.
        exclude_ungrouped: Forwarded to read_goids.
        prt: Stream that receives the "CANNOT READ" message; falsy to suppress it.

    Returns:
        The value stored under 'sections' in the read_goids result, or None if
        sections_file is None, the file does not exist, nothing usable was
        read, or the file held GO IDs but no sections.
    """
    if sections_file is None:
        return None
    if not os.path.exists(sections_file):
        if prt:
            prt.write("CANNOT READ: {SEC}\n".format(SEC=sections_file))
        return None
    file_contents = read_goids(sections_file, False, exclude_ungrouped)
    # read_goids returns None when no GO IDs were found in the file;
    # calling .get() on that result would raise AttributeError.
    if file_contents is None:
        return None
    return file_contents.get('sections', None)
26
|
|
|
|
27
|
|
|
def read_goids(fin_txt, get_goids_only=False, exclude_ungrouped=False, prt=sys.stdout):
    """Read GO IDs (and sections, unless get_goids_only) from the text file fin_txt.

    Convenience wrapper: creates a fresh ReadGoids reader and returns
    whatever its read_txt method returns for the same arguments.
    """
    reader = ReadGoids()
    result = reader.read_txt(fin_txt, get_goids_only, exclude_ungrouped, prt)
    return result
30
|
|
|
|
31
|
|
|
|
32
|
|
|
class ReadGoids(object):
    """Read GO IDs, optionally grouped into named sections, from a text file.

    After a read, the results are held in:
      goids_fin      -- flat list of GO IDs; set only when the file has no sections
      sections_seen  -- section names, in the order their headers were read
      section2goids  -- section name -> list of GO IDs; empty sections are absent
    """

    # Section header line, e.g. "# SECTION: name" or "SECTION: name" (case-insensitive)
    srch_section = re.compile(r'^#?\s*SECTION:\s*(\S.*\S)\s*$', flags=re.IGNORECASE)

    def __init__(self):
        self.goids_fin = []
        self.sections_seen = []
        self.section2goids = {}

    def read_txt(self, fin_txt, get_goids_only, exclude_ungrouped, prt=sys.stdout):
        """Read a GO ID file and return its GO IDs or its sections.

        Args:
            fin_txt: Path of the text file to read.
            get_goids_only: If true, section header lines are ignored.
            exclude_ungrouped: If true, the section named HdrgosSections.secdflt
                is not recorded as a section.
            prt: Stream for informational messages; None suppresses them.

        Returns:
            {'goids': [...]} or {'sections': [(name, goids), ...]}, or None
            if neither GO IDs nor non-empty sections were found.
        """
        goids_last = self._read_txt(fin_txt, get_goids_only, exclude_ungrouped)
        # Report unused sections, if any
        if len(self.section2goids) != len(self.sections_seen):
            self._rpt_unused_sections(prt)
        # If there are no sections, then goids_last holds all GO IDs in file
        if not self.sections_seen:
            self.goids_fin = goids_last
        # Print summary of GO IDs read
        if prt is not None:
            self._prt_read_msg(prt, fin_txt, exclude_ungrouped)
        # goids_last only covers the final section of a sectioned file, so an
        # empty last section must not discard sections that were read earlier.
        if goids_last or self.section2goids:
            return self.internal_get_goids_or_sections()
        # Always emit the warning; use stdout when the caller gave no stream
        wrstrm = prt if prt is not None else sys.stdout
        wrstrm.write(
            "\n**WARNING: GO IDs MUST BE THE FIRST 10 CHARACTERS OF EACH LINE\n\n")
        return None

    def _read_txt(self, fin_txt, get_goids_only, exclude_ungrouped):
        """Read GO file. Store results in: section2goids sections_seen.

        Returns the GO IDs collected since the last section header, which is
        every GO ID in the file when no section header was recognized.
        """
        goids_sec = []
        with open(fin_txt) as istrm:
            # Lines starting with a GO ID will have that GO ID read and stored.
            #   * Lines that do not start with a GO ID will be ignored.
            #   * Text after the 10 characters in a GO ID will be ignored.
            section_name = None
            for line in istrm:
                if line[:3] == "GO:":
                    goids_sec.append(line[:10])
                elif not get_goids_only and ":" in line:
                    mtch = self.srch_section.match(line)
                    if mtch:
                        secstr = mtch.group(1)
                        # Close the previous section; GO IDs collected while no
                        # section is active are dropped when a header is found.
                        if section_name is not None and goids_sec:
                            self.section2goids[section_name] = goids_sec
                        if not exclude_ungrouped or secstr != HdrgosSections.secdflt:
                            section_name = secstr
                            self.sections_seen.append(section_name)
                        else:
                            section_name = None
                        goids_sec = []
            # Close the final section at end of file
            if section_name is not None and goids_sec:
                self.section2goids[section_name] = goids_sec
        return goids_sec

    def _rpt_unused_sections(self, prt):
        """Report sections that were seen but hold no GO IDs, in file order."""
        if prt is None:
            return
        # Names already accounted for: sections with GO IDs, then each name reported
        handled = set(self.section2goids)
        for sec in self.sections_seen:
            if sec not in handled:
                handled.add(sec)
                prt.write(" UNUSED SECTION: {SEC}\n".format(SEC=sec))

    def internal_get_goids_or_sections(self):
        """Return {'goids': list} if a flat list was read, else {'sections': 2D list}."""
        if self.goids_fin:
            chk_goids(self.goids_fin, "read_goids")
            return {'goids' : self.goids_fin}
        # Convert dict into 2D list retaining original section order
        sections_2d = []
        for section_name in self.sections_seen:
            if section_name in self.section2goids:
                goids = self.section2goids[section_name]
                chk_goids(goids, "GO IDs IN SECTION({S})".format(S=section_name))
                sections_2d.append((section_name, goids))
        return {'sections' : sections_2d}

    def _prt_read_msg(self, prt, fin_txt, exclude_ungrouped):
        """Print which file was read and the number of GO IDs found."""
        if self.sections_seen or exclude_ungrouped:
            dat = SummarySec2dHdrGos().summarize_sec2hdrgos(self.section2goids.items())
            # Write to the caller's stream, like the flat-list message below
            prt.write(Grouper.fmtsum.format(
                GO_DESC='hdr', SECs=len(dat['S']), GOs=len(dat['G']),
                UNGRP="N/A", undesc="unused", ACTION="READ: ", FILE=fin_txt))
        elif self.goids_fin:
            prt.write(" {G} GO IDs READ: {FIN}\n".format(G=len(self.goids_fin), FIN=fin_txt))
119
|
|
|
|
120
|
|
|
|
121
|
|
|
# Copyright (C) 2016-2018, DV Klopfenstein, All rights reserved. |
122
|
|
|
|