|
1
|
|
|
#!/usr/bin/env python |
|
2
|
|
|
|
|
3
|
|
|
"""RNAalignment |
|
4
|
|
|
|
|
5
|
|
|
Example:: |
|
6
|
|
|
|
|
7
|
|
|
# STOCKHOLM 1.0 |
|
8
|
|
|
|
|
9
|
|
|
AACY023581040 --CGUUGGACU------AAA--------AGUCGGAAGUAAGC-----AAU-C------GCUGAAGCAACGC--- |
|
10
|
|
|
AJ630128 AUCGUUCAUUCGCUAUUCGCA-AAUAGCGAACGCAA--AAG------CCG-A-------CUGAAGGAACGGGAC |
|
11
|
|
|
target --CGUUGACCCAG----GAAA-----CUGGGCGGAAGUAAGGCCCAUUGCACUCCGGGCCUGAAGCAACGCG-- |
|
12
|
|
|
#=GC SS_cons ::(((((,<<<<<<<.._____..>>>>>>>,,,,,,,,<<<<...._____.....>>>>,,,,))))):::: |
|
13
|
|
|
x --xxxxxxxxx-----------------xxxxxxxx--xxx------------------xxxxxxxxxxx---- |
|
14
|
|
|
#=GC RF AUCGUUCAuCucccc..uuuuu..ggggaGaCGGAAGUAGGca....auaaa.....ugCCGAAGGAACGCguu |
|
15
|
|
|
// |
|
16
|
|
|
|
|
17
|
|
|
x line is used to pick resides to calculate RMSD. |
|
18
|
|
|
|
|
19
|
|
|
x line could be renamed to EvoClust. When no selector sequence is available |
|
20
|
|
|
the ``#=GC RF`` reference annotation is used instead, and if neither is |
|
21
|
|
|
present the selector is derived from columns without gaps.""" |
|
22
|
|
|
from __future__ import print_function |
|
23
|
|
|
import warnings |
|
24
|
|
|
warnings.filterwarnings("ignore") |
|
25
|
|
|
|
|
26
|
|
|
from Bio import AlignIO |
|
27
|
|
|
|
|
28
|
|
|
class RNAalignment: |
|
29
|
|
|
"""RNAalignemnt""" |
|
30
|
|
|
|
|
31
|
|
|
def __init__(self, fn, verbose=False): |
|
32
|
|
|
"""Load the alignment in the Stockholm format using biopython""" |
|
33
|
|
|
self.verbose = verbose |
|
34
|
|
|
self.alignment = AlignIO.read(open(fn), "stockholm") |
|
35
|
|
|
if self.verbose: |
|
36
|
|
|
print('Alignment records:') |
|
37
|
|
|
for record in self.alignment: |
|
38
|
|
|
print(' ', record.id, str(record.seq)) |
|
39
|
|
|
self._selection_track, self._selection_source = self._get_selection_track() |
|
40
|
|
|
if self.verbose: |
|
41
|
|
|
print('Selector source:', self._selection_source) |
|
42
|
|
|
print('Selector (x-line):', self._selection_track) |
|
43
|
|
|
|
|
44
|
|
|
def _get_selection_track(self): |
|
45
|
|
|
"""Return a string that marks positions to be used for RMSD calculation. |
|
46
|
|
|
|
|
47
|
|
|
Historically an "x" (or EvoClust) sequence was appended to the alignment |
|
48
|
|
|
and the x characters were used as selectors. Modern Infernal-generated |
|
49
|
|
|
Stockholm files tend to provide a ``#=GC RF`` line instead. Support both |
|
50
|
|
|
by checking for an explicit sequence first and falling back to the |
|
51
|
|
|
reference annotation. |
|
52
|
|
|
""" |
|
53
|
|
|
|
|
54
|
|
|
def is_selector_record(record_id): |
|
55
|
|
|
rid = record_id.strip().lower() |
|
56
|
|
|
return rid in ("x", "evoclust") |
|
57
|
|
|
|
|
58
|
|
|
for record in reversed(self.alignment): |
|
59
|
|
|
if is_selector_record(record.id): |
|
60
|
|
|
return str(record.seq), "sequence" |
|
61
|
|
|
|
|
62
|
|
|
column_annotations = getattr(self.alignment, "column_annotations", {}) |
|
63
|
|
|
reference = column_annotations.get("reference_annotation") |
|
64
|
|
|
if reference: |
|
65
|
|
|
return reference, "reference_annotation" |
|
66
|
|
|
|
|
67
|
|
|
auto_selector = self._build_gapless_selector() |
|
68
|
|
|
if auto_selector: |
|
69
|
|
|
return auto_selector, "gapless_columns" |
|
70
|
|
|
|
|
71
|
|
|
raise Exception( |
|
72
|
|
|
"Alignment must contain an EvoClust/x line, a #=GC RF reference annotation, or columns without gaps" |
|
73
|
|
|
) |
|
74
|
|
|
|
|
75
|
|
|
def get_range(self, seqid, offset=0, verbose=None): |
|
76
|
|
|
"""Get a list of positions for selected residues based on the last line of the alignment! |
|
77
|
|
|
|
|
78
|
|
|
If seqis not found in the alignment, raise an exception, like :: |
|
79
|
|
|
|
|
80
|
|
|
Exception: Seq not found in the alignment: 'CP000879.1/21644622164546 |
|
81
|
|
|
|
|
82
|
|
|
.. warning:: EvoClust lines has to be -1 in the alignemnt.""" |
|
83
|
|
|
# evoclust/reference line |
|
84
|
|
|
x = self._selection_track # ---(((((((----xxxxx-- or RF annotation |
|
85
|
|
|
|
|
86
|
|
|
if verbose is None: |
|
87
|
|
|
verbose = self.verbose |
|
88
|
|
|
|
|
89
|
|
|
x_range = [] |
|
90
|
|
|
seq_found = False |
|
91
|
|
|
for record in self.alignment: |
|
92
|
|
|
if record.id.strip().lower() in ("x", "evoclust"): |
|
93
|
|
|
continue |
|
94
|
|
|
if record.id == seqid.strip(): |
|
95
|
|
|
seq_found = True |
|
96
|
|
|
spos = 0 |
|
97
|
|
|
for xi, si in zip(x, record.seq): |
|
98
|
|
|
if si != '-': |
|
99
|
|
|
spos += 1 |
|
100
|
|
|
if self._include_column(xi): |
|
101
|
|
|
# print xi, si, |
|
102
|
|
|
# print si, spos |
|
103
|
|
|
x_range.append(spos + offset) |
|
104
|
|
|
# if verbose: print ' # selected residues:', len(x_range) |
|
105
|
|
|
if verbose: |
|
106
|
|
|
print(' Selected residues for %s: %s' % (seqid, x_range)) |
|
107
|
|
|
if not seq_found: |
|
108
|
|
|
raise Exception('Seq not found in the alignment: %s' % seqid) |
|
109
|
|
|
if not x_range: |
|
110
|
|
|
raise Exception('Seq not found or selector line is malformed') |
|
111
|
|
|
return x_range |
|
112
|
|
|
|
|
113
|
|
|
def _include_column(self, selector_char): |
|
114
|
|
|
"""Return True when the selector character marks a column to use.""" |
|
115
|
|
|
if not selector_char: |
|
116
|
|
|
return False |
|
117
|
|
|
if selector_char in ('-', '.', ' '): |
|
118
|
|
|
return False |
|
119
|
|
|
return True |
|
120
|
|
|
|
|
121
|
|
|
def _build_gapless_selector(self): |
|
122
|
|
|
"""Build a selector string that keeps columns without gaps in any sequence.""" |
|
123
|
|
|
sequences = [str(record.seq) for record in self.alignment |
|
124
|
|
|
if record.id.strip().lower() not in ("x", "evoclust")] |
|
125
|
|
|
if not sequences: |
|
126
|
|
|
return None |
|
127
|
|
|
|
|
128
|
|
|
selector_chars = [] |
|
129
|
|
|
for column in zip(*sequences): |
|
130
|
|
|
if all(base != '-' for base in column): |
|
131
|
|
|
selector_chars.append('x') |
|
132
|
|
|
else: |
|
133
|
|
|
selector_chars.append('-') |
|
134
|
|
|
selector = ''.join(selector_chars) |
|
135
|
|
|
if 'x' not in selector: |
|
136
|
|
|
return None |
|
137
|
|
|
if self.verbose: |
|
138
|
|
|
print(' Inferred selector from gapless columns.') |
|
139
|
|
|
return selector |
|
140
|
|
|
|
|
141
|
|
|
|
|
142
|
|
|
if __name__ == '__main__': |
|
143
|
|
|
ra = RNAalignment('test_data/rp13finalX.stk') |
|
144
|
|
|
print(len(ra.get_range('rp13target', offset=0))) |
|
145
|
|
|
print(len(ra.get_range('cp0016', offset=0))) |
|
146
|
|
|
print(len(ra.get_range('NZ_ABBD01000528', offset=0))) |
|
147
|
|
|
|
|
148
|
|
|
print(ra.get_range('rp13target', offset=0)) |
|
149
|
|
|
print(ra.get_range('cp0016', offset=0)) |
|
150
|
|
|
print(ra.get_range('NZ_ABBD01000528', offset=0)) |
|
151
|
|
|
|