Passed
Push — master ( 41c0d7...aa2adb )
by Marcin
07:40
created

RNAalignment._include_column()   A

Complexity

Conditions 3

Size

Total Lines 7
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 6
nop 2
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
#!/usr/bin/env python
2
3
"""RNAalignment
4
5
Example::
6
7
    # STOCKHOLM 1.0
8
9
    AACY023581040                --CGUUGGACU------AAA--------AGUCGGAAGUAAGC-----AAU-C------GCUGAAGCAACGC---
10
    AJ630128                     AUCGUUCAUUCGCUAUUCGCA-AAUAGCGAACGCAA--AAG------CCG-A-------CUGAAGGAACGGGAC
11
    target                       --CGUUGACCCAG----GAAA-----CUGGGCGGAAGUAAGGCCCAUUGCACUCCGGGCCUGAAGCAACGCG--
12
    #=GC SS_cons                 ::(((((,<<<<<<<.._____..>>>>>>>,,,,,,,,<<<<...._____.....>>>>,,,,)))))::::
13
    x                            --xxxxxxxxx-----------------xxxxxxxx--xxx------------------xxxxxxxxxxx----
14
    #=GC RF                      AUCGUUCAuCucccc..uuuuu..ggggaGaCGGAAGUAGGca....auaaa.....ugCCGAAGGAACGCguu
15
//
16
17
x line is used to pick resides to calculate RMSD.
18
19
x line could be renamed to EvoClust.  When no selector sequence is available
20
the ``#=GC RF`` reference annotation is used instead, and if neither is
21
present the selector is derived from columns without gaps."""
22
from __future__ import print_function
23
import warnings
24
warnings.filterwarnings("ignore")
25
26
from Bio import AlignIO
27
28
class RNAalignment:
29
    """RNAalignemnt"""
30
31
    def __init__(self, fn, verbose=False):
32
        """Load the alignment in the Stockholm format using biopython"""
33
        self.verbose = verbose
34
        self.alignment = AlignIO.read(open(fn), "stockholm")
35
        if self.verbose:
36
            print('Alignment records:')
37
            for record in self.alignment:
38
                print(' ', record.id, str(record.seq))
39
        self._selection_track, self._selection_source = self._get_selection_track()
40
        if self.verbose:
41
            print('Selector source:', self._selection_source)
42
            print('Selector (x-line):', self._selection_track)
43
44
    def _get_selection_track(self):
45
        """Return a string that marks positions to be used for RMSD calculation.
46
47
        Historically an "x" (or EvoClust) sequence was appended to the alignment
48
        and the x characters were used as selectors.  Modern Infernal-generated
49
        Stockholm files tend to provide a ``#=GC RF`` line instead.  Support both
50
        by checking for an explicit sequence first and falling back to the
51
        reference annotation.
52
        """
53
54
        def is_selector_record(record_id):
55
            rid = record_id.strip().lower()
56
            return rid in ("x", "evoclust")
57
58
        for record in reversed(self.alignment):
59
            if is_selector_record(record.id):
60
                return str(record.seq), "sequence"
61
62
        column_annotations = getattr(self.alignment, "column_annotations", {})
63
        reference = column_annotations.get("reference_annotation")
64
        if reference:
65
            return reference, "reference_annotation"
66
67
        auto_selector = self._build_gapless_selector()
68
        if auto_selector:
69
            return auto_selector, "gapless_columns"
70
71
        raise Exception(
72
            "Alignment must contain an EvoClust/x line, a #=GC RF reference annotation, or columns without gaps"
73
        )
74
75
    def get_range(self, seqid, offset=0, verbose=None):
76
        """Get a list of positions for selected residues based on the last line of the alignment!
77
78
        If seqis not found in the alignment, raise an exception, like ::
79
80
            Exception: Seq not found in the alignment: 'CP000879.1/21644622164546
81
82
        .. warning:: EvoClust lines has to be -1 in the alignemnt."""
83
        # evoclust/reference line
84
        x = self._selection_track  # ---(((((((----xxxxx-- or RF annotation
85
86
        if verbose is None:
87
            verbose = self.verbose
88
89
        x_range = []
90
        seq_found = False
91
        for record in self.alignment:
92
            if record.id.strip().lower() in ("x", "evoclust"):
93
                continue
94
            if record.id == seqid.strip():
95
                seq_found = True
96
                spos = 0
97
                for xi, si in zip(x, record.seq):
98
                    if si != '-':
99
                        spos += 1
100
                    if self._include_column(xi):
101
                        # print xi, si,
102
                        # print si, spos
103
                        x_range.append(spos + offset)
104
        # if verbose: print '  # selected residues:', len(x_range)
105
        if verbose:
106
            print(' Selected residues for %s: %s' % (seqid, x_range))
107
        if not seq_found:
108
            raise Exception('Seq not found in the alignment: %s' % seqid)
109
        if not x_range:
110
            raise Exception('Seq not found or selector line is malformed')
111
        return x_range
112
113
    def _include_column(self, selector_char):
114
        """Return True when the selector character marks a column to use."""
115
        if not selector_char:
116
            return False
117
        if selector_char in ('-', '.', ' '):
118
            return False
119
        return True
120
121
    def _build_gapless_selector(self):
122
        """Build a selector string that keeps columns without gaps in any sequence."""
123
        sequences = [str(record.seq) for record in self.alignment
124
                     if record.id.strip().lower() not in ("x", "evoclust")]
125
        if not sequences:
126
            return None
127
128
        selector_chars = []
129
        for column in zip(*sequences):
130
            if all(base != '-' for base in column):
131
                selector_chars.append('x')
132
            else:
133
                selector_chars.append('-')
134
        selector = ''.join(selector_chars)
135
        if 'x' not in selector:
136
            return None
137
        if self.verbose:
138
            print(' Inferred selector from gapless columns.')
139
        return selector
140
141
142
if __name__ == '__main__':
143
    ra = RNAalignment('test_data/rp13finalX.stk')
144
    print(len(ra.get_range('rp13target', offset=0)))
145
    print(len(ra.get_range('cp0016', offset=0)))
146
    print(len(ra.get_range('NZ_ABBD01000528', offset=0)))
147
148
    print(ra.get_range('rp13target', offset=0))
149
    print(ra.get_range('cp0016', offset=0))
150
    print(ra.get_range('NZ_ABBD01000528', offset=0))
151