RNAalignment.find_core()   B
last analyzed

Complexity

Conditions 8

Size

Total Lines 32
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 19
nop 2
dl 0
loc 32
rs 7.3333
c 0
b 0
f 0
1
#!/usr/bin/env python2
2
"""RNAalignment - a module to work with RNA sequence alignments.
3
4
To see a full demo what you can do with this util, please take a look at the jupiter notebook (https://github.com/mmagnus/rna-pdb-tools/blob/master/rna_tools/tools/rna_alignment/rna_alignment.ipynb)
5
6
Load an alignment in the Stockholm::
7
8
    alignment = ra.RNAalignment('test_data/RF00167.stockholm.sto')
9
10
 or fasta format::
11
12
     import rna_alignment as ra
13
     alignment = ra.fasta2stokholm(alignment.fasta)
14
     alignment = ra.RNAalignment
15
16
Parameters of the aligmnent::
17
18
     print(alignment.describe())
19
20
Consensus SS::
21
22
    print(alignment.ss_cons_with_pk)
23
24
Get sequnce/s from teh aligment::
25
26
    >>> seq = a.io[0]
27
28
"""
29
30
from Bio import AlignIO
31
from Bio.Phylo.TreeConstruction import DistanceCalculator
32
from rna_tools import SecondaryStructure
33
from rna_tools.rna_tools_config import RCHIE_PATH
34
from collections import OrderedDict
35
36
import tempfile
37
import subprocess
38
import os
39
import shutil
40
import re
41
import gzip
42
43
44
class RNAalignmentError(Exception):
45
    pass
46
47
48
class RChieError(Exception):
49
    pass
50
51
52
class RFAMFetchError(Exception):
53
    pass
54
55
56
class RChie:
57
    """RChie - plotting arc diagrams of RNA secondary structures.
58
59
    .. image:: ../pngs/rchie.png
60
61
    http://www.e-rna.org/r-chie/
62
63
    The offline version of R-chie, which requires first installing R4RNA is available here, or clone our git repository here
64
    How to install it:
65
66
    - Ensure R is installed already, or download it freely from http://www.r-project.org/
67
    - Download the R4RNA (https://github.com/jujubix/r-chie), open R and install the package::
68
69
            install.packages("<path_to_file>/R4RNA", repos = NULL, type="source")
70
            # Install the optparse and RColorBrewer
71
            install.packages('optparse')
72
            install.packages('RColorBrewer')
73
74
    - Go to rna_tools/rna_tools_config_local.py and set RCHIE_PATH to the folder with RChie, e.g. ``"/home/magnus/work/opt/r-chie/"``.
75
76
    To test if Rchie works on your machine (from rna_align folder)::
77
78
      <path to your rchie>/rchie.R --msafile test_data/rchie_test_files/fasta.txt test_data/rchie_test_files/helix.txt
79
80
    you should have rchie.png file in the folder.
81
82
    More at http://www.e-rna.org/r-chie/download.cgi
83
84
    Cite: Daniel Lai, Jeff R. Proctor, Jing Yun A. Zhu, and Irmtraud M. Meyer (2012) R-chie: a web server and R package for visualizing RNA secondary structures. Nucleic Acids Research, first published online March 19, 2012. doi:10.1093/nar/gks241
85
    """
86
87
    def __init__(self):
88
        pass
89
90
    def plot_cov(self, seqs, ss_cons, plot_fn='rchie.png', verbose=False):
91
        """Plot an RChie plot_conv.
92
93
        :param seqs: seqs in the fasta format
94
        :param ss_cons: a string of secondary structure consensus, use only ``().``. Works with pseuoknots.
95
        """
96
        fasta_alignment = tempfile.NamedTemporaryFile(delete=False)
97
        with open(fasta_alignment.name, 'w') as f:
98
            f.write(seqs)
99
100
        plot = tempfile.NamedTemporaryFile(delete=False)
101
        plot.name += '.png'
102
        ss = tempfile.NamedTemporaryFile(delete=False)
103
        with open(ss.name, 'w') as f:
104
            f.write(ss_cons)
105
        if not RCHIE_PATH:
106
            raise RChieError('RChie path not set up!')
107
        cmd = RCHIE_PATH + \
108
            "rchie.R --msafile='%s' --format1 vienna '%s' --output '%s'" % (
109
                fasta_alignment.name, ss.name, plot.name)
110
        if verbose:
111
            print(cmd)
112
        o = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
113
        out = o.stdout.read().strip()
114
        err = o.stderr.read().strip()
115
        # *****PROCESSING MSA FILE*****
116
        # Error in readFasta(opt$msafile, filter = TRUE) : no FASTA sequences found
117
        # Error: ERROR: Invalid FASTA file
118
        # Execution halted
119
        if "error" in str(err).lower():
120
            raise Exception('\n'.join([cmd, err]))
121
        if verbose:
122
            print('\n'.join([cmd, err]))
123
        self.plotfn = plot.name
124
        if verbose:
125
            print(self.plotfn)
126
        if plot_fn:
127
            shutil.move(plot.name, plot_fn)
128
            print('Rchie: plot saved to %s' % plot_fn)
129
        from IPython.display import Image
130
        return Image(filename=plot_fn)
131
132
    def show(self):
133
        from IPython.display import Image
134
        return Image(filename=self.plotfn)
135
136
    def write(self, outfn):
137
        shutil.copyfile(self.plotfn, outfn)
138
        print('Write to %s' % outfn)
139
140
141
class RNASeq(object):
142
    """RNASeq.
143
144
    Args:
145
146
       id (str)    : id of a sequence
147
       seq (str)   : seq, it be uppercased.
148
       ss (str)    : secondary structure, default None
149
150
    Attributes:
151
152
       seq_no_gaps(str) : seq.replace('-', '')
153
       ss_no_gaps(str)  : ss.replace('-', '')
154
155
    .. warning::
156
157
       >>> if 'EF' in s.id: print('Y')
158
       Y
159
       >>> if 'EF' in s: print('Y')       
160
       # nothing
161
       
162
    """
163
164
    def __init__(self, id, seq, ss=None):
165
        self.id = id
166
        self.seq = seq.upper()
167
168
        # self.ss_raw = ss  # this will not be changed after remove_gaps.
169
        # so maybe don't use ss_raw at call
170
        self.ss = ss
171
        self.ss = self.get_ss_std()
172
173
        self.seq_no_gaps = seq.replace('-', '')
174
        self.ss_no_gaps = ss.replace('-', '')
175
176
    #@property
177
    def get_ss_std(self):
178
        nss = ''
179
        for s in self.ss:
180
            nss += get_rfam_ss_notat_to_dot_bracket_notat(s)
181
        return nss
182
183
    def __repr__(self):
184
        return self.id
185
186
    def __len__(self):
187
        return len(self.seq)
188
189
    def __getitem__(self, i):
190
        if self.ss:
191
            return RNASeq(self.id + '_slice', self.seq[i], self.ss[i])
192
        else:
193
            return RNASeq(self.id + '_slice', self.seq[i])
194
195
    def remove_columns(self, to_remove):
196
        """indexing from 0"""
197
        nseq = ''
198
        for i, s in enumerate(self.seq):
199
            if i not in to_remove:
200
                nseq += s
201
202
        nss = ''
203
        if self.ss:
204
            for i, s in enumerate(self.ss):
205
                if i not in to_remove:
206
                    nss += s
207
        self.seq = nseq
208
        self.ss = nss
209
210
    def draw_ss(self, title='', verbose=False, resolution=1.5):
211
        """Draw secondary structure of RNA with VARNA.
212
213
        VARNA: Visualization Applet for RNA
214
        A Java lightweight component and applet for drawing the RNA secondary structure
215
216
        .. image :: ../pngs/varna.png
217
218
        Cite: VARNA: Interactive drawing and editing of the RNA secondary structure Kevin Darty, Alain Denise and Yann Ponty Bioinformatics, pp. 1974-197,, Vol. 25, no. 15, 2009
219
220
        http://varna.lri.fr/"""
221
        drawfn = tempfile.NamedTemporaryFile(delete=False).name + '.png'
222
        SecondaryStructure.draw_ss(title, self.seq, self.ss, drawfn, resolution, verbose=verbose)
223
        from IPython.display import Image
224
        return Image(filename=drawfn)
225
226
    def remove_gaps(self, check_bps=True, only_canonical=True, allow_gu=True):
227
        """Remove gaps from seq and secondary structure of the seq.
228
229
        Args:
230
231
            check_bps (bool)      : fix mistakes as
232
            only_canonical (bool) : keep in ss only pairs GC, AU
233
            allow_gu (bool)       : keep in ss also GU pair
234
235
        .. image :: ../pngs/ss_misgap.png
236
237
        A residue "paired" with a gap.
238
239
        .. image :: ../pngs/ss_misgap_wrong.png
240
241
        .. when check_bps (by default), then when removing gaps, the functions check is the gap is
242
        paired with any residues (in the blue circle). If yes, then this residues is unpair (in this case ``)`` -> ``.``).
243
244
        .. image :: ../pngs/ss_misgap_ok.png
245
246
        if ``only_canonical`` (by default) is True then only GC, AU can be paired.
247
248
249
        .. image :: ../pngs/ss_only_canonical.png
250
251
252
        If ``allow_gu`` is False (be default is True) then GU pair is also possible.
253
254
        .. image :: ../pngs/ss_no_gu.png
255
256
        If you provide seq and secondary structure such as::
257
258
             GgCcGGggG.GcggG.cc.u.aAUACAAuACCC.GaAA.GGGGAAUAaggCc.gGCc.gu......CU.......uugugcgGUuUUcaAgCccCCgGcCaCCcuuuu
259
             (((((((((....((.((............(((......)))......))))..(((.(.....................)))).......)))))))))........
260
261
        gaps will be remove as well.
262
        """
263
        GAPS = ['-', '.']
264
265
        nseq = ''
266
        nss = ''
267
        for (nt, nt_ss) in zip(self.seq, self.ss):
268
            if nt in GAPS and nt_ss in GAPS:
269
                pass
270
            else:
271
                nseq += nt
272
                nss += nt_ss
273
        self.seq = nseq
274
        self.ss = nss
275
276
        nss = list()
277
        bps = self.ss_to_bps()
278
279
        if check_bps:
280
            nss = list(self.ss)
281
            for bp in bps:
282
                nt_left = self.seq[bp[0]]
283
                nt_right = self.seq[bp[1]]
284
                if nt_left == '-' or nt_right == '-':
285
                    nss[bp[0]] = '.'
286
                    nss[bp[1]] = '.'
287
            self.ss = ''.join(nss)
288
289
        if only_canonical:
290
            nseq = list(self.seq)
291
            nss = list(self.ss)
292
            for bp in bps:
293
                nt_left = nseq[bp[0]]
294
                nt_right = nseq[bp[1]]
295
                if (nt_left == 'A' and nt_right == 'U') or (nt_left == 'U' and nt_right == 'A'):
296
                    pass
297
                elif (nt_left == 'G' and nt_right == 'C') or (nt_left == 'C' and nt_right == 'G'):
298
                    pass
299
                elif (nt_left == 'G' and nt_right == 'U') or (nt_left == 'U' and nt_right == 'G'):
300
                    if allow_gu:
301
                        pass
302
                    else:
303
                        nss[bp[0]] = '.'
304
                        nss[bp[1]] = '.'
305
                else:
306
                    nss[bp[0]] = '.'
307
                    nss[bp[1]] = '.'
308
            self.ss = ''.join(nss)
309
310
        # two?????? what is "two"?
311
        nss = []
312
        nseq = ''
313
        for i, (c, s) in enumerate(zip(self.seq, self.ss)):
314
            if c != '-':
315
                nseq += c
316
                nss.append(s)
317
318
        self.seq = nseq
319
        self.ss = ''.join(nss)
320
321
    def ss_to_bps(self):
322
        """Convert secondary structure into a list of basepairs.
323
324
        Returns:
325
326
            bps (list): a list of base pairs, e.g. [[0, 80], [1, 79], [2, 78], [4, 77], [6, 75], [7, 74], ...]
327
328
        """
329
        j = []
330
        bps = []
331
        pair_types = ['()', '[]', '<>', '{}']
332
        for pair_type in pair_types:
333
            for i, s in enumerate(self.ss):
334
                if s == pair_type[0]:
335
                    j.append(i)
336
                if s == pair_type[1]:
337
                    bps.append([j.pop(), i])
338
            if len(j):
339
                # if something left, this is a problem (!!!)
340
                raise Exception('Mis-paired secondary structure')
341
        bps.sort()
342
        return bps
343
344
    def get_conserved(self, consensus, start=0, to_pymol=True, offset=0):
345
        """Start
346
        UCGGGGUGCCCUUCUGCGUG--------------------------------------------------AAGGC-UGAGAAAUACCCGU-------------------------------------------------AUCACCUG-AUCUGGAU-AAUGC
347
        XXXXXXXXXXXXGXGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX----------------------------XXXXX-XCUGAGAXXXXXXXXXXXXXXXXXXXXXX----------------------------------XXXXXXXX-XXXXXXXX-ACXUG
348
        """
349
        c = start + offset
350
        index = []
351
        print(self.seq)
352
        print(consensus)
353
        for nt_seq, nt_consensus in zip(self.seq, consensus):
354
            if nt_consensus in ['-', 'X']:
355
                pass
356
            else:
357
                index.append(c)
358
            if nt_seq != '-':
359
                c += 1
360
        if to_pymol:
361
            return("color red, " + str(index).replace(', ', '+').replace('[','').replace(']',''))
362
        else:
363
            return index
364
365
    def get_distance_to(self, nseq):
366
        """Get distance of self.seq to nseq."""
367
        try:
368
            import Levenshtein
369
        except:
370
            print('pip install python-Levenshtein')
371
        return round(Levenshtein.ratio(self.seq, nseq), 2)
372
373
class RNAalignment(object):
374
    """RNA alignment - adapter class around BioPython to do RNA alignment stuff
375
376
    Usage (for more see IPython notebook https://github.com/mmagnus/rna-tools/blob/master/rna_tools/tools/rna_alignment/rna_alignment.ipynb)
377
378
    >>> a = RNAalignment('test_data/RF00167.stockholm.sto')
379
    >>> print(a.tail())
380
    >>> print(a.ss_cons)
381
382
    Args:
383
384
      fn (str): Filename
385
      io (Bio.AlignIO): AlignIO.read(fn, "stockholm")
386
      lines (list): List of all lines of fn
387
      seqs (list): List of all sequences as class:`RNASeq` objects
388
      rf (str): ReFerence annotation, the consensus RNA sequence
389
390
    Read more:
391
392
    - http://biopython.org/DIST/docs/api/Bio.AlignIO.StockholmIO-module.html
393
394
    and on the format itself
395
396
    - https://en.wikipedia.org/wiki/Stockholm_format
397
    - http://sonnhammer.sbc.su.se/Stockholm.html
398
399
    .. warning:: fetch requires urllib3
400
    """
401
402
    def __init__(self, fn='', fetch=''):
403
        if fetch:
404
            import urllib3
405
            http = urllib3.PoolManager()
406
            response = http.request('GET', 'http://rfam.xfam.org/family/' +
407
                                    fetch + '/alignment/stockholm?gzip=1&download=1')
408
            if not response.status == 200:
409
                raise RFAMFetchError(
410
                    "The alignment could not be downloaded. Please check the RFAM id that you requested! (don't put .stk etc in the id)")
411
            with open(fetch + '.stk.gz', 'wb') as f:
412
                f.write(response.data)
413
            with gzip.open(fetch + '.stk.gz', 'rb') as f:
414
                file_content = f.read()
415
            with open(fetch + '.stk', 'wb') as f:
416
                f.write(file_content)
417
            fn = fetch + '.stk'
418
419
        self.fn = fn
420
        self.lines = open(fn).read().split('\n')
421
        self.io = AlignIO.read(fn, "stockholm")
422
        self.ss_cons = self.get_ss_cons()
423
        self.ss_cons_pk = self.get_ss_cons_pk()
424
        self.copy_ss_cons_to_all()
425
        self._ss_cons_std = self.ss_cons
426
        self.rf = self.get_gc_rf()
427
        self.shift = self.get_shift_seq_in_align()
428
429
        # get all lines not # nor //
430
        # fix for blocked alignment
431
        seq_lines = [l for l in self.lines if (
432
            not l.startswith('#')) and (not l.startswith('//')) and (l)]
433
        seqs_dict = OrderedDict()
434
        for seq in seq_lines:
435
            seq_id, seq_seq = seq.split()
436
            if seq_id not in seqs_dict:
437
                seqs_dict[seq_id] = seq_seq
438
            else:
439
                seqs_dict[seq_id] += seq_seq
440
        self.seqs = []
441
        for seq in seqs_dict:
442
            self.seqs.append(RNASeq(seq, seqs_dict[seq], ss=self.ss_cons_with_pk_std))
443
444
        # this is sick!
445
        # I create a Cols object to be able to slice alignments
446
        class Cols:
447
            def __init__(self, alignment):
448
                self.alignment = alignment
449
450
            def __getitem__(self, i):
451
                """Return new alignment"""
452
                if type(i) is list:
453
                    pass
454
                else:
455
                    # collect "new" sequences
456
                    n_seqs = []
457
                    for s in self.alignment:
458
                        new_seq = RNASeq(s.id, s.seq[i], s.ss[i])
459
                        n_seqs.append(new_seq)
460
461
                    # this is not very smart :(
462
                    # save new seqs to a file
463
                    # and load it as RNAalignment
464
                    tf = tempfile.NamedTemporaryFile(delete=False)
465
                    tf.name += '.stk'
466
                    with open(tf.name, 'w') as f:
467
                        f.write('# STOCKHOLM 1.0\n')
468
                        for s in n_seqs:
469
                            f.write(' '.join([s.id, s.seq, '\n']))
470
                        # add ss_cons & //
471
                        f.write('#=GC SS_cons ' + self.alignment.ss_cons[i] + '\n')
472
                        if self.alignment.ss_cons_pk:
473
                            f.write('#=GC SS_cons_pk' + self.alignment.ss_cons_pk[i] + '\n')
474
                        f.write('#=GC RF ' + self.alignment.rf[i] + '\n')
475
                        f.write('//\n')
476
                    return RNAalignment(tf.name)
477
        self.cols = Cols(self)
478
        # ^^^^ sick ^^^^^^^^^^^
479
480
    def reload_alignment(self):
481
        tmpfn = tempfile.NamedTemporaryFile(delete=False).name
482
        self.write(tmpfn)
483
        self.io = AlignIO.read(tmpfn, "stockholm")
484
485
    def __len__(self):
486
        """Return length of all sequenes."""
487
        return len(self.seqs)
488
489
    def __getitem__(self, i):
490
        if type(i) is str:
491
            for s in self:
492
                if s.id == i:
493
                    return s
494
        elif type(i) is list:
495
            seqs = []
496
            for j in i:
497
                seqs.append(self.seqs[j])
498
            return seqs
499
        else:
500
            return self.seqs[i]
501
502
    @property
503
    def ss_cons_std(self):
504
        return get_rfam_ss_notat_to_dot_bracket_notat(self.ss_cons)
505
506
    @ss_cons_std.setter
507
    def ss_cons_std(self, ss):
508
        self._ss_cons_std = ss
509
        print(self._ss_cons_std)
510
511
    def subset(self, ids, verbose=False):
512
        """Get subset for ids::
513
514
            # STOCKHOLM 1.0
515
            #=GF WK Tetrahydrofolate_riboswitch
516
            ..
517
            AAQK01002704.1/947-1059              -U-GC-AAAAUAGGUUUCCAUGC..
518
            #=GC SS_cons                         .(.((.((----((((((((((...
519
            #=GC RF                              .g.gc.aGAGUAGggugccgugc..
520
            //
521
522
        """
523
        nalign = ''
524
        for l in self.lines:
525
            if l.startswith('//'):
526
                nalign += l + '\n'
527
            if l.startswith('#'):
528
                nalign += l + '\n'
529
            else:
530
                for i in ids:
531
                    if l.startswith(i):
532
                        nalign += l + '\n'
533
534
        tf = tempfile.NamedTemporaryFile(delete=False)
535
        tf.name += '.stk'
536
        print('Saved to ', tf.name)
537
        if verbose:
538
            print(nalign)
539
        f = open(tf.name, 'w')
540
        f.write(nalign)
541
        #f.write(self.rf + '\n')
542
        f.close()
543
        return RNAalignment(tf.name)
544
545
    def __add__(self, rna_seq):
546
        self.seqs.append(rna_seq)
547
        self.reload_alignment()
548
549
    def write(self, fn, verbose=False):
550
        """Write the alignment to a file"""
551
        if verbose:
552
            print('Save to ', fn)
553
        with open(fn, 'w') as f:
554
            f.write('# STOCKHOLM 1.0\n')
555
            shift = max([len(x) for x in [s.id for s in self.seqs] + ['#=GC=SS_cons']])
556
            for s in self:
557
                f.write(s.id.ljust(shift + 2, ' ') + s.seq + '\n')
558
            f.write('#=GC SS_cons'.ljust(shift + 2, ' ') + self.ss_cons + '\n')
559
            f.write('//')
560
561
    def copy_ss_cons_to_all(self, verbose=False):
562
        for s in self.io:
563
            if verbose:
564
                self.ss_cons
565
                self.io[0].seq
566
            try:
567
                s.letter_annotations['secondary_structure'] = self.ss_cons
568
            except TypeError:
569
                raise Exception(
570
                    'Please check if all your sequences and ss lines are of the same length!')
571
            s.ss = self.ss_cons
572
            s.ss_clean = self.get_clean_ss(s.ss)
573
            s.seq_nogaps = str(s.seq).replace('-', '')
574
            s.ss_nogaps = self.get_ss_remove_gaps(s.seq, s.ss_clean)
575
576
    def copy_ss_cons_to_all_editing_sequence(self, seq_id, before, after):
577
        """Change a sequence's sec structure.
578
579
        :param seq_id: string, sequence id to change, eg: ``AE009948.1/1094322-1094400``
580
        :param before: string, character to change from, eg: ``,``
581
        :param after: string, character to change to, eg: ``.``
582
583
        .. warning:: before and after has to be one character long
584
        """
585
        for s in self.io:
586
            if s.id == seq_id:
587
                s.letter_annotations['secondary_structure'] = self.ss_cons.replace(before, after)
588
            else:
589
                s.letter_annotations['secondary_structure'] = self.ss_cons
590
591
    def get_ss_remove_gaps(self, seq, ss):
592
        """
593
        :param seq: string, sequence
594
        :param ss: string, ss
595
596
        UAU-AACAUAUAAUUUUGACAAUAUGG-GUCAUAA-GUUUCUACCGGAAUACC--GUAAAUAUUCU---GACUAUG-UAUA-
597
        (((.(.((((,,,(((((((_______.))))))).,,,,,,,,(((((((__.._____))))))...),,)))).)))).
598
        """
599
        nss = ''
600
        for i, j in zip(seq, ss):
601
            if i != '-':
602
                nss += j
603
        return nss
604
605
    def plot(self, plot_fn='rchie.png'):
606
        return RChie().plot_cov(self.io.format("fasta"), self.ss_cons_std, plot_fn)
607
608
    def get_ss_cons_pk(self):
609
        """
610
        :return: SS_cons_pk line or None if there is now SS_cons_pk:"""
611
        ss_cons_pk = ''
612
        for l in self.lines:
613
            if l.startswith('#=GC SS_cons_pk'):
614
                ss_cons_pk += l.replace('#=GC SS_cons_pk', '').strip()
615
        return ss_cons_pk
616
617
    def get_ss_cons(self):
618
        """
619
        :return: SS_cons_pk line or None if there is now SS_cons_pk.
620
        """
621
        ss_cons = ''
622
        for l in self.lines:
623
            if '#=GC SS_cons' in l and '#=GC SS_cons_pk' not in l:
624
                ss_cons += l.replace('#=GC SS_cons', '').strip()
625
        return ss_cons
626
627
    @property
628
    def ss_cons_with_pk(self):
629
        """go over ss_cons and overwrite bp is there is pk (ss_cons_pk)
630
631
        ss_cons:         (((.(.((((,,,(((((((_______.))))))).,,,,,,,,(((((((__.._____))))))...),,)))).)))).
632
        ss_cons_pk:      .......................[[...............................]]........................
633
        ss_cons_with_pk: (((.(.((((,,,(((((((___[[__.))))))).,,,,,,,,(((((((__.._]]__))))))...),,)))).)))).
634
635
        "return ss_cons_with_pk: string, e.g. (((.(.((((,,,(((((((___[[__.))))"""
636
        if self.ss_cons_pk:
637
            ss_cons_with_pk = ''
638
            for i, (s, p) in enumerate(zip(self.ss_cons, self.ss_cons_pk)):
639
                if p != '.':
640
                    ss_cons_with_pk += p
641
                else:
642
                    ss_cons_with_pk += s
643
            return ss_cons_with_pk
644
        else:
645
            return self.ss_cons
646
647
    @property
648
    def ss_cons_with_pk_std(self):
649
        if self.ss_cons_pk:
650
            ss_cons_with_pk_std = ''
651
            for i, (s, p) in enumerate(zip(get_rfam_ss_notat_to_dot_bracket_notat(self.ss_cons), self.ss_cons_pk)):
652
                if p != '.':
653
                    ss_cons_with_pk_std += p
654
                else:
655
                    ss_cons_with_pk_std += s
656
            return ss_cons_with_pk_std
657
        else:
658
            return self.ss_cons
659
660
    def get_gc_rf(self):
661
        """Return (str) ``#=GC RF`` or '' if this line is not in the alignment.
662
        """
663
        for l in self.lines:
664
            if l.startswith('#=GC RF'):
665
                return l.replace('#=GC RF', '').replace('_cons', '').strip()
666
        else:
667
            return ''
668
            # raise RNAalignmentError('There is on #=GC RF in the alignment!')
669
670
    def get_shift_seq_in_align(self):
671
        """RF_cons vs '#=GC RF' ???"""
672
        for l in self.lines:
673
            if l.startswith('#=GC RF'):
674
                # #=GC RF                        .g.gc.a
675
                l = l.replace('#=GC RF', '')
676
                c = 7  # 12 # len of '#=GC RF'
677
                #                         .g.gc.a
678
                for i in l:
679
                    if i == ' ':
680
                        c += 1
681
                self.shift = c
682
                return c
683
684
    def map_seq_on_seq(self, seq_id, seq_id_target, resis, v=True):
685
        """
686
        :param seq_id: seq_id, 'AAML04000013.1/228868-228953'
687
        :param seq_id_target: seq_id of target, 'CP000721.1/2204691-2204778'
688
        :param resis: list resis, [5,6]
689
690
        map::
691
692
            [4, 5, 6]
693
            UAU-A
694
            UAU-AA
695
            UAU-AAC
696
            [5, 6, 7]
697
            CAC-U
698
            CAC-U-
699
            CAC-U-U
700
            [4, None, 5]
701
702
        """
703
        # get number for first seq
704
        print(resis)
705
        nresis = []
706
        for s in self.io:
707
            if s.id.strip() == seq_id.strip():
708
                for i in resis:
709
                    print(s.seq[:i + 1])  # UAU-A
710
                    nresis.append(i + s.seq[:i].count('-'))
711
        print(nresis)
712
713
        print(self.map_seq_on_align(seq_id_target, nresis))
714
        return
715
716
        resis_target = []
717
        for s in self.io:
718
            if s.id.strip() == seq_id_target.strip():
719
                for i in nresis:
720
                    if v:
721
                        print(s.seq[:i])
722
                    if s.seq[i - 1] == '-':
723
                        resis_target.append(None)
724
                    else:
725
                        resis_target.append(i - s.seq[:i].count('-'))
726
        return resis_target
727
728
    def map_seq_on_align(self, seq_id, resis, v=True):
729
        """
730
        :param seqid: seq_id, 'CP000721.1/2204691-2204775'
731
        :param resis: list resis, [5,6]
732
733
        maps::
734
735
            [5, 6, 8]
736
            CAC-U
737
            CAC-U-
738
            CAC-U-UA
739
            [4, None, 6]
740
741
        """
742
        if v:
743
            print(resis)
744
        nresis = []
745
        for s in self.io:
746
            if s.id.strip() == seq_id.strip():
747
                for i in resis:
748
                    if v:
749
                        print(s.seq[:i])
750
                    if s.seq[i - 1] == '-':
751
                        nresis.append(None)
752
                    else:
753
                        nresis.append(i - s.seq[:i].count('-'))
754
        return nresis
755
756
    def head(self):
757
        return '\n'.join(self.lines[:5])
758
759
    def tail(self):
760
        return '\n'.join(self.lines[-5:])
761
762
    def describe(self):
763
        """Describe the alignment.
764
765
           > print(a.describe())
766
           SingleLetterAlphabet() alignment with 13 rows and 82 columns
767
768
        """
769
        return str(self.io).split('\n')[0]
770
771
    def remove_empty_columns(self, verbose=False):
772
        """Remove empty columns in place.
773
774
        Example::
775
776
            >>> a = RNAalignment("test_data/zmp.stk")
777
            >>> print(a)
778
            SingleLetterAlphabet() alignment with 6 rows and 319 columns
779
            ---ACCUUGCGCGACUGGCGAAUCC-------------------...AAU CP001644.1/756294-756165
780
            --GCUCUCGCGCGACUGGCGACUUUG------------------...GAA CU234118.1/352539-352459
781
            UGAGUUUUCUGCGACUGACGGAUUAU------------------...CUG BAAV01000055.1/2897-2982
782
            GCCCGUUCGCGUGACUGGCGCUAGU-------------------...CGA CP000927.1/5164264-5164343
783
            -----GGGUCGUGACUGGCGAACA--------------------...--- zmp
784
            UCACCCCUGCGUGACUGGCGAUA---------------------...GUU AP009385.1/718103-718202
785
            >>> a.remove_empty_columns()
786
            >>> print(a)
787
            SingleLetterAlphabet() alignment with 6 rows and 138 columns
788
            ---ACCUUGCGCGACUGGCGAAUCC-UGAAGCUGCUUUG-AGCG...AAU CP001644.1/756294-756165
789
            --GCUCUCGCGCGACUGGCGACUUUG------------------...GAA CU234118.1/352539-352459
790
            UGAGUUUUCUGCGACUGACGGAUUAU------------------...CUG BAAV01000055.1/2897-2982
791
            GCCCGUUCGCGUGACUGGCGCUAGU-------------------...CGA CP000927.1/5164264-5164343
792
            -----GGGUCGUGACUGGCGAACA--------G-----------...--- zmp
793
            UCACCCCUGCGUGACUGGCGAUA--------GAACCCUCGGGUU...GUU AP009385.1/718103-718202
794
795
        go over all seq
796
        modifes self.nss_cons"""
797
        cols_to_rm = []
798
799
        # get only seqs
800
        for i in range(len(self[0])):
801
            gap = True
802
            for seq in self:
803
                if seq.seq[i] != '-':
804
                    gap = False
805
            if gap:
806
                cols_to_rm.append(i)
807
808
        # remove from sequences
809
        for s in self:
810
            s.remove_columns(cols_to_rm)
811
812
        # update io # hack #
813
        tmpfn = tempfile.NamedTemporaryFile(delete=False).name
814
        self.write(tmpfn, verbose=False)
815
        self.io = AlignIO.read(tmpfn, "stockholm")
816
817
        # nss_cons update
818
        nss_cons = ''
819
        for i, s in enumerate(self.ss_cons):
820
            if i not in cols_to_rm:
821
                nss_cons += s
822
        self.ss_cons = nss_cons
823
824
    def format_annotation(self, t):
825
        return self.shift * ' ' + t
826
827
    def find_core(self, ids=None):
828
        """Find common core for ids.
829
830
        .. image:: ../pngs/find_core.png
831
        Fig. By core, we understand columns that have all homologous residues. The core is here marked by `x`.
832
833
        :param id: list, ids of seq in the alignment to use
834
        """
835
        if not ids:
836
            ids = []
837
            for s in self.io:
838
                ids.append(s.id)
839
840
        xx = list(range(0, len(self.io[0])))
841
        for i in range(0, len(self.io[0])):  # if . don't use it
842
            for s in self.io:
843
                # print s.id
844
                if s.id in ids:
845
                    if s.seq[i] == '-':
846
                        xx[i] = '-'
847
                        break
848
                    else:
849
                        xx[i] = 'x'
850
851
        return ''.join(xx)
852
        shift = self.get_shift_seq_in_align()
853
854
        fnlist = open(self.fn).read().strip().split('\n')
855
        fnlist.insert(-2, 'x' + ' ' * (shift - 1) + ''.join(xx))
856
        # print fnlist
857
        for l in fnlist:
858
            print(l)
859
860
    def find_seq(self, seq, verbose=False):
861
        """Find seq (also subsequences) and reverse in the alignment.
862
863
        Args:
864
865
          seq (str): seq is upper()
866
          verbose (bool): be verbose
867
868
        ::
869
870
            seq = "ggaucgcugaacccgaaaggggcgggggacccagaaauggggcgaaucucuuccgaaaggaagaguaggguuacuccuucgacccgagcccgucagcuaaccucgcaagcguccgaaggagaauc"
871
            hit = a.find_seq(seq, verbose=False)
872
            ggaucgcugaacccgaaaggggcgggggacccagaaauggggcgaaucucuuccgaaaggaagaguaggguuacuccuucgacccgagcccgucagcuaaccucgcaagcguccgaaggagaauc
873
            Match: AL939120.1/174742-174619
874
            ID: AL939120.1/174742-174619
875
            Name: AL939120.1
876
            Description: AL939120.1/174742-174619
877
            Number of features: 0
878
            /start=174742
879
            /end=174619
880
            /accession=AL939120.1
881
            Per letter annotation for: secondary_structure
882
            Seq('CCAGGUAAGUCGCC-G-C--ACCG---------------GUCA-----------...GGA', SingleLetterAlphabet())
883
            GGAUCGCUGAACCCGAAAGGGGCGGGGGACCCAGAAAUGGGGCGAAUCUCUUCCGAAAGGAAGAGUAGGGUUACUCCUUCGACCCGAGCCCGUCAGCUAACCUCGCAAGCGUCCGAAGGAGAAUC
884
885
        """
886
        seq = seq.replace('-', '').upper()
887
        for s in self.io:
888
            seq_str = str(s.seq).replace('-', '').upper()
889
            if verbose:
890
                print(seq_str)
891
            if seq_str.find(seq) > -1 or seq.find(seq_str) > -1:
892
                print('Match:', s.id)
893
                print(s)
894
                print(seq)
895
        return s
0 ignored issues
show
introduced by
The variable s does not seem to be defined in case the for loop on line 887 is not entered. Are you sure this can never be the case?
Loading history...
896
        print('Not found')
897
898
    def find_seq_exact(self, seq, verbose=False):
899
        """Find seq (also subsequences) and reverse in the alignment.
900
901
        :param seq: string, seq, seq is upper()
902
        :param verbose: boolean, be verbose or not
903
        """
904
        seq = seq.replace('-', '').upper()
905
        for s in self.io:
906
            seq_str = str(s.seq).replace('-', '').upper()
907
            if verbose:
908
                print(seq_str)
909
            if seq_str == seq:
910
                print('Match:', s.id)
911
                print(s)
912
                print(seq)
913
                return s
914
        print('Not found')
915
916
    def get_clean_ss(self, ss):
917
        nss = ''
918
        for s in ss:
919
            nss += get_rfam_ss_notat_to_dot_bracket_notat(s)
920
        return nss
921
922
    def get_seq_ss(self, seq_id):  # seq,ss):
923
        s = self.get_seq(seq_id).seq
924
        # print seq, ss
925
        # new
926
        nseq = ''
927
        nss = ''
928
        for i, j in zip(seq, ss):
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable ss does not seem to be defined.
Loading history...
Comprehensibility Best Practice introduced by
The variable seq does not seem to be defined.
Loading history...
929
            if i != '-':  # gap
930
                # print i,j
931
                nseq += i
932
                nss += get_rfam_ss_notat_to_dot_bracket_notat(j)
933
        return nseq.strip(), nss.strip()
934
935
    def get_seq(self, seq_id):
936
        for s in self.io:
937
            if s.id == seq_id:
938
                return s
939
        raise Exception('Seq not found')
940
941
    def get_seq_with_name(self, seq_name):
942
        for s in self.io:
943
            if s.name == seq_name:
944
                return s
945
        raise Exception('Seq not found')
946
947
    def align_seq(self, seq):
948
        """Align seq to the alignment.
949
950
        Using self.rf.
951
952
        Args:
953
954
            seq (str): sequence, e.g. ``-GGAGAGUA-GAUGAUUCGCGUUAAGUGUGUGUGA-AUGGGAUGUC...``
955
956
        Returns:
957
958
           str: seq that can be inserted into alignemnt, ``.-.GG.AGAGUA-GAUGAUUCGCGUUA`` ! . -> -
959
        """
960
        seq = list(seq)
961
        seq.reverse()
962
        nseq = ''
963
        for n in self.rf:  # n nuclotide
964
            if n != '.':
965
                try:
966
                    j = seq.pop()
967
                except:
968
                    j = '.'
969
                nseq += j
970
            if n == '.':
971
                nseq += '.'  # + j
972
        return nseq.replace('.', '-')
973
974
    def __repr__(self):
975
        return (str(self.io))
976
977
    def trimmed_rf_and_ss(self):
978
        """Remove from RF and SS gaps.
979
980
        Returns:
981
982
          (str,str): trf, tss - new RF and SS
983
984
        """
985
        trf = ''
986
        tss = ''
987
        for r, s in zip(self.rf, self.ss_cons_std):
988
            if r not in ['-', '.']:
989
                trf += r
990
                tss += s
991
        return trf, tss
992
993
    def get_distances(self):
994
        """Get distances (seq identity) all-vs-all.
995
996
        With BioPython.
997
998
        blastn: ``Bad alphabet 'U' in sequence 'AE008922.1/409481-409568' at position '7'`` only for DNA?
999
1000
        read more (also about matrix at <http://biopython.org/wiki/Phylo> and
1001
        HTTP://biopython.org/DIST/docs/api/Bio.Phylo.TreeConstruction.DistanceCalculator-class.html
1002
        """
1003
        calculator = DistanceCalculator('identity')
1004
        dm = calculator.get_distance(self.io)
1005
        return dm
1006
1007
    def get_the_closest_seq_to_ref_seq(self, verbose=False):
1008
        """
1009
1010
        Example::
1011
1012
            >>> a = RNAalignment("test_data/RF02221.stockholm.sto")
1013
            >>> a.get_the_closest_seq_to_ref_seq()
1014
            AF421314.1/431-344
1015
1016
        """
1017
        self + RNASeq('ConSeq', self.rf, '')
1018
        dist = self.get_distances()
1019
        distConSeq = dist['ConSeq'][:-1]  # to remove ending 0, bc of distance to itself
1020
        minimal = min(distConSeq)
1021
1022
        index = dist['ConSeq'].index(minimal)
1023
        #id = dist.names[index]
1024
        if verbose:
1025
            print('dist:\n', str(dist))
1026
            print('distConSeq:', dist['ConSeq'])
1027
        return self[index]
1028
1029
1030
class CMAlign():
1031
    """CMAalign class around cmalign (of Inferal).
1032
1033
    cmalign - aligns  the RNA sequences in <seqfile> to the covariance model
1034
    (CM) in <cmfile>.  The new alignment is output to stdout  in  Stockholm
1035
    format.
1036
1037
    Example::
1038
1039
        cma = ra.CMAlign()
1040
        cma.run_cmalign("ade_seq.fa", "RF00167.cm")
1041
        seq = cma.get_seq()
1042
        print 'cma hit  ', seq
1043
        print 'seq      ', a.align_seq(seq)
1044
        print 'a.rf     ', a.rf
1045
1046
        cmd cmalign -g RF00167.cm ade_seq.fa
1047
1048
        # STOCKHOLM 1.0
1049
        #=GF AU Infernal 1.1.2
1050
1051
        ade          ----------------CGCUUCAUAUAAUCCUAAUGAUAUGGUUUGGGAGUUUCUACCAAGAG-CCUUAAA-CUCUUGAUUAUGAAGUGA------------
1052
        #=GR ade PP  ................99*********************************************.*******.***************999............
1053
        #=GC SS_cons :::::::::::::::::((((((((,,,<<<<<<<_______>>>>>>>,,,,,,,,<<<<<<<_______>>>>>>>,,))))))))::::::::::::::
1054
        #=GC RF      aaaaaauaaaaaaaauucccuCgUAUAAucccgggAAUAUGGcccgggaGUUUCUACCaggcagCCGUAAAcugccuGACUAcGagggaaauuuuuuuuuuu
1055
        //
1056
        cma hit   ----------------CGCUUCAUAUAAUCCUAAUGAUAUGGUUUGGGAGUUUCUACCAAGAG-CCUUAAA-CUCUUGAUUAUGAAGUGA------------
1057
        seq       ----------------CGCU-U-CAUAUAAUCCUAAUGAUAUGG-UUUGGGA-GUUUCUACCAAGAG-CC--UUAAA-CUCUU---GAUUAUG-AAGUGA-------------
1058
        a.rf      aaaaaauaaaaaaaauuccc.u.CgUAUAAucccgggAAUAUGG.cccggga.GUUUCUACCaggcagCC..GUAAAcugccu...GACUAcG.agggaaauuuuuuuuuuu.
1059
1060
    Install http://eddylab.org/infernal/
1061
1062
    Cite: Nawrocki and S. R. Eddy, Infernal 1.1: 100-fold faster RNA homology searches, Bioinformatics 29:2933-2935 (2013). """
1063
1064
    def __init__(self, outputfn=None):
1065
        """Use run_cmalign or load cmalign output from a file"""
1066
        if outputfn:
1067
            self.output = open(outputfn).read().strip().split('\n')
1068
1069 View Code Duplication
    def run_cmalign(self, seq, cm, verbose=True):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
1070
        """Run cmalign and process the result.
1071
1072
        :param seq: seq string
1073
        :param cm: cm fn
1074
1075
        Run::
1076
1077
            $ cmalign RF01831.cm 4lvv.seq
1078
            # STOCKHOLM 1.0
1079
            #=GF AU Infernal 1.1.2
1080
1081
            4lvv         -GGAGAGUA-GAUGAUUCGCGUUAAGUGUGUGUGA-AUGGGAUGUCG-UCACACAACGAAGC---GAGA---GCGCGGUGAAUCAUU-GCAUCCGCUCCA
1082
            #=GR 4lvv PP .********.******************9999998.***********.8999999******8...5555...8**************.************
1083
            #=GC SS_cons (((((----(((((((((((,,,,,<<-<<<<<<<<___________>>>>>>>>>>,,,<<<<______>>>>,,,)))))))))))-------)))))
1084
            #=GC RF      ggcaGAGUAGggugccgugcGUuAAGUGccggcgggAcGGGgaGUUGcccgccggACGAAgggcaaaauugcccGCGguacggcaccCGCAUcCgCugcc
1085
            //
1086
1087
        .. warning :: requires cmalign to be set in your shell
1088
        """
1089
        tf = tempfile.NamedTemporaryFile(delete=False)
1090
        tf.name += '.seq'
1091
1092
        with open(tf.name, 'w') as f:
1093
            f.write('>target\n')
1094
            f.write(seq + '\n')
1095
1096
        cmd = 'cmalign -g ' + cm + ' ' + tf.name  # global
1097
        if verbose:
1098
            print('cmd' + cmd)
1099
        o = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1100
        stdout = o.stdout.read().strip()
1101
        stderr = o.stderr.read().strip()
1102
        if verbose:
1103
            print(stdout)
1104
        self.output = stdout.split('\n')
1105
1106
    def get_gc_rf(self):
1107
        """Get ``#=GC RF``.
1108
1109
        :var self.output: string
1110
        """
1111
        for l in self.output:
1112
            if l.startswith('#=GC RF'):
1113
                return l.replace('#=GC RF', '').strip()
1114
1115
    def get_seq(self):
1116
        """
1117
        :var self.output: output of cmalign, string
1118
        """
1119
        for l in self.output:
1120
            if l.strip():
1121
                if not l.startswith('#'):
1122
                    #  4lvv         -GGAGAGUA-GAUGAU
1123
                    return l.split()[1].strip()
1124
1125
1126
def clean_seq_and_ss(seq, ss):
1127
    nseq = ''
1128
    nss = ''
1129
    for i, j in zip(seq, ss):
1130
        if i != '-':  # gap
1131
            # print i,j
1132
            nseq += i
1133
            nss += get_rfam_ss_notat_to_dot_bracket_notat_per_char(j)
1134
    return nseq.strip(), nss.strip()
1135
1136
1137
def get_rfam_ss_notat_to_dot_bracket_notat_per_char(c):
1138
    """Take (c)haracter and standardize ss (including pks in letter (AAaa) notation).
1139
1140
    .. warning:: DD DD will be treated as BB BB (<< >>) (it might be wrong!)"""
1141
    if c in [',', '_', ':', '-']:
1142
        return '.'
1143
    if c == '<':
1144
        return '('
1145
    if c == '{':
1146
        return '('
1147
    if c == '}':
1148
        return ')'
1149
    if c == ']':
1150
        return ')'
1151
    if c == '[':
1152
        return '('
1153
    if c == '>':
1154
        return ')'
1155
    if c == '>':
1156
        return ')'
1157
    if c == 'A':
1158
        return '['
1159
    if c == 'a':
1160
        return ']'
1161
    if c == 'B':
1162
        return '<'
1163
    if c == 'b':
1164
        return '>'
1165
    if c == 'C':
1166
        return '{'
1167
    if c == 'c':
1168
        return '}'
1169
    # !!!!!!!!!!!!!!!!!!!!!!!!!!
1170
    if c == 'D':
1171
        return '<'
1172
    if c == 'd':
1173
        return '>'
1174
    # !!!!!!!!!!!!!!!!!!!!!!!!!!
1175
    return c
1176
1177
1178
def get_rfam_ss_notat_to_dot_bracket_notat(ss):
1179
    """Change all <<>> to "standard" dot bracket notation.
1180
1181
    Works also with pseudknots AA BB CC etc."""
1182
    nss = ''
1183
    for s in ss:
1184
        ns = get_rfam_ss_notat_to_dot_bracket_notat_per_char(s)
1185
        nss += ns
1186
    return nss
1187
1188
1189
def fasta2stokholm(fn):
1190
    """Take a gapped fasta file and return an RNAalignment object.
1191
1192
    A fasta file should look like this::
1193
1194
        >AE009948.1/1094322-1094400
1195
        UAC-U-UAUUUAUGCUGAGGAU--UGG--CUUAGC-GUCUCUACAAGACA-CC--GU-AA-UGUCU---AACAAUA-AGUA-
1196
        ...
1197
        >CP000721.1/2204691-2204778
1198
        CAC-U-UAUAUAAAUCUGAUAAUAAGG-GUCGGAU-GUUUCUACCAAGCUACC--GUAAAUUGCUAUAGGACUAUA-AGUG-
1199
        >SS_cons
1200
        (((.(.((((...(((((((........))))))).........(((((((.........))))))...)..)))).)))).
1201
        >SS_cons_pk
1202
        .........................[[........................]].............................
1203
1204
    ``SS_cons_pk`` in optionally and is an extra line used to define a pseudoknot. You
1205
    can also define second psedoknot as ``<<<...>>>`` and the third one with ``{{{ }}}``.
1206
1207
    :param fn: file
1208
    :return: RNAalignment object
1209
    """
1210
    seqs = []
1211
    s = None
1212
    for l in open(fn):
1213
        if l.startswith('>'):
1214
            if s:
1215
                seqs.append(s)
1216
            s = RNASeq(l.replace('>', '').strip(), '')
1217
        else:
1218
            s.seq += l.strip()
1219
1220
    txt = ''
1221
    for l in open(fn):
1222
        if l.startswith('>'):
1223
            id = '\n' + l.replace('>', '\n').strip() + ' '
1224
            id = re.sub('ss_cons_pk', '#=GC SS_cons_pk', id, flags=re.IGNORECASE)
1225
            id = re.sub('ss_cons', '#=GC SS_cons', id, flags=re.IGNORECASE)
1226
            txt += id
1227
        else:
1228
            txt += l.strip()
1229
    txt = txt.strip() + '\n'  # clean upfront \n and add tailing \n
1230
1231
    tf = tempfile.NamedTemporaryFile(delete=False)
1232
    tf.name += '.stk'
1233
    with open(tf.name, 'w') as f:
1234
        f.write('# STOCKHOLM 1.0\n')
1235
        f.write(txt)
1236
        f.write('//\n')
1237
    return RNAalignment(tf.name)
1238
1239
1240
def fetch_stokholm(rfam_acc, dpath=None):
1241
    """Fetch Stokholm file from Rfam.
1242
1243
    :param rfam_acc: str, Rfam accession number, eg. RF00028
1244
    :param dpath: str or None, if None saves to current location, otherwise save to dpath folder
1245
1246
    .. warning :: urllib3 is required, pip install urllib3
1247
    """
1248
    import urllib3
1249
    http = urllib3.PoolManager()
1250
    # try:
1251
    print(dpath)
1252
    response = http.request('GET', url='http://rfam.xfam.org/family/' + rfam_acc.lower() +
1253
                            '/alignment?acc=' + rfam_acc.lower() + '&format=stockholm&download=1')
1254
    # except urllib3.HTTPError:
1255
    #    raise Exception('The PDB does not exists: ' + pdb_id)
1256
    txt = response.data
1257
    if dpath is None:
1258
        npath = rfam_acc + '.stk'
1259
    else:
1260
        npath = dpath + os.sep + rfam_acc + '.stk'
1261
    print('downloading...' + npath)
1262
    with open(npath, 'wb') as f:
1263
        f.write(txt)
1264
    print('ok')
1265
    return rfam_acc + '.stk'
1266
1267
1268
# def test_seq_multilines_alignment()
1269
def test_alignment_with_pk():
1270
    a = RNAalignment('test_data/RF00167.stockholm.sto')
1271
    print(a.tail())
1272
    print(a.ss_cons)
1273
    print(a.ss_cons_pk)
1274
    print(a.ss_cons_with_pk)
1275
1276
    print(a[[1, 2]])
1277
1278
1279
# main
1280
if __name__ == '__main__':
1281
    ## a = RNAalignment('test_data/RF00167.stockholm.sto')
1282
    # print a.get_shift_seq_in_align()
1283
    # print a.map_seq_on_align('CP000721.1/2204691-2204778', [5,6,8])
1284
    # print a.map_seq_on_seq('AAML04000013.1/228868-228953', 'CP000721.1/2204691-2204778', [4,5,6])
1285
1286
    # print(record.letter_annotations['secondary_structure'])
1287
    ## seq = a.get_seq('AAML04000013.1/228868-228953')
1288
    # print seq.seq
1289
    # print seq.ss
1290
    # print a.ss_cons
1291
1292
    # print 'x'
1293
    # for s in a.io:
1294
    # print s.seq
1295
    # print s.ss_clean #letter_annotations['secondary_structure']
1296
1297
    # for s in a.io:
1298
    # print s.seq_nogaps
1299
    # print s.ss_nogaps
1300
1301
    # a.write('test_output/out.sto')
1302
1303
    # a = RNAalignment("/home/magnus/work/rna-evo/rp12/seq/RF00379.stockholm.stk")##_rm85.stk')
1304
    # print a.get_seq_ss("AL939120.1/174742-174619")
1305
    # subset = a.subset(["AL939120.1/174742-174619",
1306
    # "rp12bsubtilis",
1307
    # "AAWL01000001.1/57092-56953",
1308
    # "CP000612.1/87012-87130",
1309
    # "BA000028.3/1706087-1706245"])
1310
    # % cat /var/folders/yc/ssr9692s5fzf7k165grnhpk80000gp/T/tmpTzPenx
1311
    # for s in subset:
1312
    # print s.seq
1313
    # subset.remove_empty_columns()
1314
    # subset.write('/home/magnus/out2.stk')
1315
    # for s in subset:
1316
    # print s.seq
1317
1318
    # print 'subset.ss_cons_std:', subset.ss_cons_std
1319
1320
    # a = RNAalignment("/home/magnus/work/rna-evo/rp12/seq/RF00379.stockholm.stk")##_rm85.stk')
1321
    # a.ss_cons = "::::::::::{{{{,,,<<<_..."
1322
    # print a.ss_cons
1323
    # print a.ss_cons_std
1324
    # print get_rfam_ss_notat_to_dot_bracket_notat(subset.ss_cons)
1325
1326
    ## a.ss_cons_std = 'test of setter'
1327
    # print a._ss_cons_std
1328
1329
    # pass
1330
    # slice
1331
1332
    ## slices = a.cols[0:10]
1333
    # for s in a:
1334
    # print s.seq
1335
    # add pk
1336
1337
    # for s in slices:
1338
    # print s, s.seq, s.ss
1339
1340
    #a = fasta2stokholm('test_output/ade_gapped.fa')
1341
    # print a
1342
1343
    #a = RNAalignment('test_data/RF00001.blocked.stk')
1344
    # print a
1345
    # print a.ss_cons
1346
    # a.plot('rchie.png')
1347
1348
    # a = RNAalignment("test_data/RF02221.stockholm.sto")
1349
    ## a = RNAalignment('test_data/test_data/RF02221.stockholm.sto')
1350
    ## a + RNASeq('ConSeq', '-A-GU-AGAGUA-GGUCUUAUACGUAA-----------------AGUG-UCAUCGGA-U-GGGGAGACUUCCGGUGAACGAA-G-G-----------------------------GUUA---------------------------CCGCGUUAUAUGAC-C-GCUUCCG-CUA-C-U-','')
1351
    ## dist = a.get_distances()
1352
    # distConSeq = dist['ConSeq'][:-1]  # to remove ending 0, bc of distance to itself
1353
    ## minimal = min(distConSeq)
1354
    ## index = dist['ConSeq'].index(minimal)
1355
    # print(dist.names[index])
1356
1357
    a = RNAalignment("test_data/dist_test2.stk")
1358
    rep = a.get_the_closest_seq_to_ref_seq()
1359
    rep.remove_gaps()
1360
    print(rep)
1361
    print(rep.ss)
1362
    print(rep.seq)
1363
1364
    # a.write('tmp.stk')
1365
    # s.remove_gaps()
1366
    # print(s.seq)
1367
    # print(s.ss)
1368
1369
    import doctest
1370
    doctest.testmod()
1371