ReadGoids._read_txt()   D
last analyzed

Complexity

Conditions 13

Size

Total Lines 26

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 13
c 1
b 0
f 0
dl 0
loc 26
rs 4.2

How to fix   Complexity   

Complexity

Complex methods like ReadGoids._read_txt() often do a lot of different things. To break such a method down, we need to identify a cohesive component within its enclosing class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Functions to read text or tsv files containing GO IDs and sections of GO IDs."""
2
3
from __future__ import print_function
4
5
import os
6
import sys
7
import re
8
from goatools.gosubdag.go_tasks import chk_goids
9
from goatools.grouper.hdrgos import HdrgosSections
10
from goatools.grouper.grprobj import Grouper
11
from goatools.grouper.tasks import SummarySec2dHdrGos
12
13
__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, All rights reserved."
14
__author__ = "DV Klopfenstein"
15
16
17
def read_sections(sections_file, exclude_ungrouped=False, prt=sys.stdout):
    """Get sections and GO grouping hdrgos from file, if sections exist.

    Args:
        sections_file: Path of the sections file, or None.
        exclude_ungrouped: Forwarded unchanged to read_goids.
        prt: Stream that receives the "CANNOT READ" message when the file
            does not exist; a falsy value silences that message.

    Returns:
        The value stored under 'sections' in the dict returned by read_goids.
        None if sections_file is None, the file does not exist, read_goids
        returned nothing, or the result holds no 'sections' entry.
    """
    if sections_file is None:
        return None
    if not os.path.exists(sections_file):
        if prt:
            prt.write("CANNOT READ: {SEC}\n".format(SEC=sections_file))
        return None
    file_contents = read_goids(sections_file, False, exclude_ungrouped)
    # read_goids returns None (not a dict) when it finds nothing to return;
    # guard so callers get None instead of an AttributeError on None.get.
    if file_contents is None:
        return None
    return file_contents.get('sections', None)
26
27
def read_goids(fin_txt, get_goids_only=False, exclude_ungrouped=False, prt=sys.stdout):
    """Read the user's GO IDs (and sections, if any) from the text file fin_txt.

    Convenience wrapper: builds a fresh ReadGoids reader and returns whatever
    its read_txt method returns for the given arguments.
    """
    reader = ReadGoids()
    return reader.read_txt(fin_txt, get_goids_only, exclude_ungrouped, prt)
30
31
32
class ReadGoids(object):
    """Read GO IDs, optionally grouped under SECTION headers, from a text file."""

    # Matches a section header line such as "# SECTION: name" or "section: name";
    # group(1) is the section name with surrounding whitespace removed.
    srch_section = re.compile(r'^#?\s*SECTION:\s*(\S.*\S)\s*$', flags=re.IGNORECASE)

    def __init__(self):
        # GO IDs read from a file in which no section was recorded
        self.goids_fin = []
        # Section names, in the order their headers were read
        self.sections_seen = []
        # Section name -> list of GO IDs; only sections that hold GO IDs
        self.section2goids = {}

    def read_txt(self, fin_txt, get_goids_only, exclude_ungrouped, prt=sys.stdout):
        """Read GO IDs and sections from fin_txt and return them.

        Args:
            fin_txt: Path of the text file to read.
            get_goids_only: If true, SECTION header lines are ignored.
            exclude_ungrouped: If true, the section named
                HdrgosSections.secdflt is not recorded.
            prt: Stream for reports and warnings; None silences all of them.

        Returns:
            {'goids': [...]} or {'sections': [(name, [goids]), ...]},
            or None if neither GO IDs nor populated sections were read.
        """
        goids_fin = self._read_txt(fin_txt, get_goids_only, exclude_ungrouped)
        # Report unused sections, if any (only when there is a stream to write to)
        if prt is not None and len(self.section2goids) != len(self.sections_seen):
            self._rpt_unused_sections(prt)
        # If there are no sections, then goids_fin holds all GO IDs in file
        if not self.sections_seen:
            self.goids_fin = goids_fin
        # Print summary of GO IDs read
        if prt is not None:
            self._prt_read_msg(prt, fin_txt, exclude_ungrouped)
        # goids_fin only holds the GO IDs following the LAST section header;
        # also accept earlier populated sections when the last one is empty.
        if goids_fin or self.section2goids:
            return self.internal_get_goids_or_sections()
        if prt is not None:
            prt.write(
                "\n**WARNING: GO IDs MUST BE THE FIRST 10 CHARACTERS OF EACH LINE\n\n")
        return None

    def _read_txt(self, fin_txt, get_goids_only, exclude_ungrouped):
        """Read GO file. Store results in: section2goids sections_seen. Return goids_fin.

        The returned list holds the GO IDs read since the last section header
        (all GO IDs in the file if no header was processed).
        """
        goids_sec = []
        section_name = None
        with open(fin_txt) as istrm:
            # Lines starting with a GO ID will have that GO ID read and stored.
            #   * Lines that do not start with a GO ID will be ignored.
            #   * Text after the 10 characters in a GO ID will be ignored.
            for line in istrm:
                if line[:3] == "GO:":
                    goids_sec.append(line[:10])
                    continue
                # Cheap pre-filter: a section header always contains ":"
                if get_goids_only or ":" not in line:
                    continue
                mtch = self.srch_section.match(line)
                if mtch is None:
                    continue
                # A new header closes the previous section and opens the next
                self._store_section(section_name, goids_sec)
                section_name = self._open_section(mtch.group(1), exclude_ungrouped)
                goids_sec = []
        self._store_section(section_name, goids_sec)
        return goids_sec

    def _store_section(self, section_name, goids_sec):
        """Save a finished section's GO IDs; unnamed or empty sections are skipped."""
        if section_name is not None and goids_sec:
            self.section2goids[section_name] = goids_sec

    def _open_section(self, secstr, exclude_ungrouped):
        """Record a newly seen section name; return it, or None if it is excluded."""
        if exclude_ungrouped and secstr == HdrgosSections.secdflt:
            return None
        self.sections_seen.append(secstr)
        return secstr

    def _rpt_unused_sections(self, prt):
        """Report sections that were seen but hold no GO IDs, in file order."""
        reported = set()
        for sec in self.sections_seen:
            if sec in self.section2goids or sec in reported:
                continue
            reported.add(sec)
            prt.write("  UNUSED SECTION: {SEC}\n".format(SEC=sec))

    def internal_get_goids_or_sections(self):
        """Return GO IDs, Sections/GOs, or None."""
        if self.goids_fin:
            chk_goids(self.goids_fin, "read_goids")
            return {'goids' : self.goids_fin}
        # Convert dict into 2D list retaining original section order
        sections_2d = []
        for section_name in self.sections_seen:
            goids = self.section2goids.get(section_name)
            if goids is None:
                # Section header was seen, but no GO IDs were stored for it
                continue
            chk_goids(goids, "GO IDs IN SECTION({S})".format(S=section_name))
            sections_2d.append((section_name, goids))
        return {'sections' : sections_2d}

    def _prt_read_msg(self, prt, fin_txt, exclude_ungrouped):
        """Print which file was read and the number of GO IDs found."""
        if self.sections_seen or exclude_ungrouped:
            dat = SummarySec2dHdrGos().summarize_sec2hdrgos(self.section2goids.items())
            # Write to the caller's stream rather than unconditionally to stdout
            prt.write(Grouper.fmtsum.format(
                GO_DESC='hdr', SECs=len(dat['S']), GOs=len(dat['G']),
                UNGRP="N/A", undesc="unused", ACTION="READ: ", FILE=fin_txt))
        elif self.goids_fin:
            prt.write("  {G} GO IDs READ: {FIN}\n".format(G=len(self.goids_fin), FIN=fin_txt))
119
120
121
# Copyright (C) 2016-2018, DV Klopfenstein, All rights reserved.
122