Issues in RfamSearch.py (master) - Issues in master - mmagnus/rna-tools - Measure and Improve Code Quality continuously with Scrutinizer

Issues (434)

rna_tools/RfamSearch.py (1 issue)

Labels

Duplication 1

Severity

Informational 1

#!/usr/bin/env python
import subprocess
import tempfile

from rna_tools.Seq import RNASequence
from rna_tools.rna_tools_config import RFAM_DB_PATH


class RfamSearchError(Exception):
    pass


class RfamSearch():
    """RfamSearch (local).

    Rfam is a collection of multiple sequence alignments and covariance models representing non-coding RNA families. Rfam is available on the web http://rfam.xfam.org/. The website allow the user to search a query sequence against a library of covariance models, and view multiple sequence alignments and family annotation. The database can also be downloaded in flatfile form and searched locally using the INFERNAL package (http://infernal.wustl.edu/). The first release of Rfam (1.0) contains 25 families, which annotate over 50 000 non-coding RNA genes in the taxonomic divisions of the EMBL nucleotide database.

    Infernal ("INFERence of RNA ALignment") is for searching DNA sequence databases for RNA structure and sequence similarities. It is an implementation of a special case of profile stochastic context-free grammars called covariance models (CMs). A CM is like a sequence profile, but it scores a combination of sequence consensus and RNA secondary structure consensus, so in many cases, it is more capable of identifying RNA homologs that conserve their secondary structure more than their primary sequence.

    Infernal `cmscan` is used to search the CM-format Rfam database.

    Setup:

    - download the database from ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT (file: Rfam.cm.gz, ~30mb)
    - install http://eddylab.org/infernal/
    - set up ``RFAM_DB_PATH`` in the config file of rna-tools.
    - compress Rfam.cm
    
    Example of compressing the database::

         $ cmpress Rfam.cm
         Working...    done.
         Pressed and indexed 3016 CMs and p7 HMM filters (3016 names and 3016 accessions).
         Covariance models and p7 filters pressed into binary file:  Rfam.cm.i1m
         SSI index for binary covariance model file:                 Rfam.cm.i1i
         Optimized p7 filter profiles (MSV part)  pressed into:      Rfam.cm.i1f
         Optimized p7 filter profiles (remainder) pressed into:      Rfam.cm.i1p

    Cite: Nawrocki and S. R. Eddy, Infernal 1.1: 100-fold faster RNA homology searches, Bioinformatics 29:2933-2935 (2013). """  # noqa

    def __init__(self):
        pass

    def cmscan(self, seq, verbose=False):

        """Run cmscan on the seq.

        Usage::

           >>> seq = RNASequence("GGCGCGGCACCGUCCGCGGAACAAACGG")
           >>> rs = RfamSearch()
           >>> hit = rs.cmscan(seq)
           >>> print(hit)  #doctest: +ELLIPSIS
           # cmscan :: search sequence(s) against a CM database...

        :param seq: string
        :returns: result
        :rtype: string """
        #if verbose:
        print('RFAM_DB_PATH', RFAM_DB_PATH)

        # make tmp file
        tf = tempfile.NamedTemporaryFile(delete=False)
        tf.name += '.fa'
        with open(tf.name, 'w') as f:
            f.write('>target\n')
            f.write(seq.seq + '\n')

        # make output file
        of = tempfile.NamedTemporaryFile(delete=False)

        # run cmscan
        cmd = 'cmscan -E 1 ' + RFAM_DB_PATH + ' ' + tf.name + '  > ' + of.name
        print(cmd)
        o = subprocess.Popen(
            cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # out = o.stdout.read().strip()
        err = o.stderr.read().strip()
        if err:
            raise RfamSearchError(err)
        self.output = open(of.name).read()
        # os.chdir(old_pwd)
        return self.output


# main
if __name__ == '__main__':
    seq = RNASequence("GGCGCGGCACCGUCCGCGGAACAAACGG")
    rs = RfamSearch()
    hit = rs.cmscan(seq)
    print(hit)

    import doctest
    doctest.testmod()


1		#!/usr/bin/env python
2		import subprocess
3		import tempfile
4
5		from rna_tools.Seq import RNASequence
6		from rna_tools.rna_tools_config import RFAM_DB_PATH
7
8
9		class RfamSearchError(Exception):
10		pass
11
12
13		class RfamSearch():
14		"""RfamSearch (local).
15
16		Rfam is a collection of multiple sequence alignments and covariance models representing non-coding RNA families. Rfam is available on the web http://rfam.xfam.org/. The website allow the user to search a query sequence against a library of covariance models, and view multiple sequence alignments and family annotation. The database can also be downloaded in flatfile form and searched locally using the INFERNAL package (http://infernal.wustl.edu/). The first release of Rfam (1.0) contains 25 families, which annotate over 50 000 non-coding RNA genes in the taxonomic divisions of the EMBL nucleotide database.
17
18		Infernal ("INFERence of RNA ALignment") is for searching DNA sequence databases for RNA structure and sequence similarities. It is an implementation of a special case of profile stochastic context-free grammars called covariance models (CMs). A CM is like a sequence profile, but it scores a combination of sequence consensus and RNA secondary structure consensus, so in many cases, it is more capable of identifying RNA homologs that conserve their secondary structure more than their primary sequence.
19
20		Infernal `cmscan` is used to search the CM-format Rfam database.
21
22		Setup:
23
24		- download the database from ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT (file: Rfam.cm.gz, ~30mb)
25		- install http://eddylab.org/infernal/
26		- set up ``RFAM_DB_PATH`` in the config file of rna-tools.
27		- compress Rfam.cm
28
29		Example of compressing the database::
30
31		$ cmpress Rfam.cm
32		Working... done.
33		Pressed and indexed 3016 CMs and p7 HMM filters (3016 names and 3016 accessions).
34		Covariance models and p7 filters pressed into binary file: Rfam.cm.i1m
35		SSI index for binary covariance model file: Rfam.cm.i1i
36		Optimized p7 filter profiles (MSV part) pressed into: Rfam.cm.i1f
37		Optimized p7 filter profiles (remainder) pressed into: Rfam.cm.i1p
38
39		Cite: Nawrocki and S. R. Eddy, Infernal 1.1: 100-fold faster RNA homology searches, Bioinformatics 29:2933-2935 (2013). """ # noqa
40
41		def __init__(self):
42		pass
43
44	View Code Duplication	def cmscan(self, seq, verbose=False):
		0 ignored issues – show Duplication introduced 2020-06-29 19:56 UTC by Report Bug Copy Issue Report Show Similar Issues like this This code seems to be duplicated in your project. Loading history...
45		"""Run cmscan on the seq.
46
47		Usage::
48
49		>>> seq = RNASequence("GGCGCGGCACCGUCCGCGGAACAAACGG")
50		>>> rs = RfamSearch()
51		>>> hit = rs.cmscan(seq)
52		>>> print(hit) #doctest: +ELLIPSIS
53		# cmscan :: search sequence(s) against a CM database...
54
55		:param seq: string
56		:returns: result
57		:rtype: string """
58		#if verbose:
59		print('RFAM_DB_PATH', RFAM_DB_PATH)
60
61		# make tmp file
62		tf = tempfile.NamedTemporaryFile(delete=False)
63		tf.name += '.fa'
64		with open(tf.name, 'w') as f:
65		f.write('>target\n')
66		f.write(seq.seq + '\n')
67
68		# make output file
69		of = tempfile.NamedTemporaryFile(delete=False)
70
71		# run cmscan
72		cmd = 'cmscan -E 1 ' + RFAM_DB_PATH + ' ' + tf.name + ' > ' + of.name
73		print(cmd)
74		o = subprocess.Popen(
75		cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
76		# out = o.stdout.read().strip()
77		err = o.stderr.read().strip()
78		if err:
79		raise RfamSearchError(err)
80		self.output = open(of.name).read()
81		# os.chdir(old_pwd)
82		return self.output
83
84
85		# main
86		if __name__ == '__main__':
87		seq = RNASequence("GGCGCGGCACCGUCCGCGGAACAAACGG")
88		rs = RfamSearch()
89		hit = rs.cmscan(seq)
90		print(hit)
91
92		import doctest
93		doctest.testmod()
94

mmagnus / rna-tools

Issues (434)

rna_tools/RfamSearch.py (1 issue)

Labels

Severity

Introduced By

Duplication Side-by-Side

Filter issues like