build.rna_tools.tools.pdb_formatix.PDBFile.PDBFile._split_by_ters() - Code Metrics - mmagnus/rna-tools - Measure and Improve Code Quality continuously with Scrutinizer

PDBFile._split_by_ters() A
last analyzed 2025-07-29 00:29 UTC

↳ Parent: build.rna_tools.tools.pdb_formatix.PDBFile

Complexity

Conditions

Size

Total Lines	7
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	2
dl	0
loc	7
rs	10
c	0
b	0
f	0

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from rna_tools.tools.pdb_formatix.SingleLineUtils import get_res_code, get_res_num, get_atom_code, \
    set_atom_code, set_line_bfactor
import re


class PDBFile(object):
    """Class for holding data from a PDB file and modifying it.
    """

    # find 'ATOM' lines in a PDB file
    ATOM_LINE_PATTERN = re.compile('^ATOM')

    # dictionary for changing residue names from 3 letters to 1 letter
    RES3TO1 = {
        'GUA': 'G',
        'URI': 'U',
        'CYT': 'C',
        'ADE': 'A',

        'RG': 'G',
        'RU': 'U',
        'RC': 'C',
        'RA': 'A',

        'R3G': 'G',
        'R3U': 'U',
        'R3C': 'C',
        'R3A': 'A',

        'R5G': 'G',
        'R5U': 'U',
        'R5C': 'C',
        'R5A': 'A',

        'RG3': 'G',
        'RU3': 'U',
        'RC3': 'C',
        'RA3': 'A',

        'RG5': 'G',
        'RU5': 'U',
        'RC5': 'C',
        'RA5': 'A',

        'RGU': 'G',
        'URA': 'U',
        'RCY': 'C',
        'RAD': 'A',

        # just in case
        'G': 'G',
        'U': 'U',
        'C': 'C',
        'A': 'A',
    }

    AMINOACID_CODES = ("Ala", "Arg", "Asn", "Asp", "Cys", "Glu", "Gln", "Gly",
                       "His", "Ile", "Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr",
                       "Trp", "Tyr", "Val",
                       "ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "GLN", "GLY",
                       "HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR",
                       "TRP", "TYR", "VAL"
                       )

    def __init__(self, pdb_string=None, pdb_path=None, pdb_handle=None, verbose=False):
        """Constructor, should get exactly one of the arguments:
             * pdb_string = PDB as a string
             * pdb_path = string with path to a PDB file
             * pdb_handle = handle to a PDB file
        """
        self.verbose = verbose

        if len([x for x in [pdb_string, pdb_path, pdb_handle] if x is not None]) > 1:
            print('You should provide at most one source for PDB file')
            raise Exception
        input_string = ''
        if pdb_string is not None:
            input_string = pdb_string
        elif pdb_path is not None:
            with open(pdb_path) as f:
                input_string = f.read()
        elif pdb_handle is not None:
            input_string = pdb_handle.read()
        self.pdb_string = input_string
        self.pdb_lines = self.pdb_string.split('\n')
        self.fixes = []

    def save(self, file_path):
        """Save current PDB to disk

        Arguments:
          * file_path = path where it will be saved
        """
        with open(file_path, 'w') as f:
            f.write(self.pdb_string)

    def _apply_fix(self, fix_name, result_string, result_lines=None):
        """Helper function for applying fixes and saving information about them.

        Arguments:
          * fix_name = string that will be added to self.pdb_fixes, unless
            result_string is the same as pdb_string
          * result_string = string after applying this fix
          * result_lines = optional, list of lines, use this argument if you
            already have such list and don't want to lose time on splitting
            result_string
        """
        if self.pdb_string != result_string:
            self.fixes.append(fix_name)
            self.pdb_string = result_string
            if result_lines is not None:
                self.pdb_lines = result_lines
            else:
                self.pdb_lines = self.pdb_string.split('\n')

    def set_string(self, pdb_string):
        """Change PDB string stored in this instance of the class

        Arguments:
          * pdb_string = new PDB string
        """
        self.pdb_string = pdb_string
        self.pdb_lines = pdb_string.split('\n')

    def _get_atom_lines(self):
        """Get only lines with ATOM information
        """
        return [l for l in self.pdb_lines if re.match(self.ATOM_LINE_PATTERN, l)]

    def validate_pdb(self):
        """Check if file is a PDB structure

        Output:
          * True if it is a PDB, False otherwise
        """
        atom_lines = self._get_atom_lines()
        if not len(atom_lines):
            return False
        return True

    def detect_proteins(self):
        """Check if there are any aminoacid fragments in the file

        Output:
          * True if there are some, False otherwise
        """
        for l in self.pdb_lines:
            if get_res_code(l) in self.AMINOACID_CODES:
                return True
        return False

    def seq_from_pdb(self):
        """Extract sequence from a PDB and return it as a string.

        Output:
          * sequence, returned as a string
        """
        atoms = [l.split() for l in self._get_atom_lines()]
        if atoms[0][3][0] != 'r':  # it is 'r' if it's a ROSETTA PDB
            seq = [self.RES3TO1[atoms[0][3]]]
        else:
            seq = [atoms[0][3][1]]
        for a in range(1, len(atoms)):
            # atom number is different than previous one
            if atoms[a][5] != atoms[a - 1][5]:
                if atoms[a][3][0] != 'r':  # check for ROSETTA PDB
                    seq.append(self.RES3TO1[atoms[a][3]])
                else:
                    seq.append(atoms[a][3][1])
        return ''.join(seq)

    def seq_from_amber_like_pdb(self):
        """Extract sequence from a PDB and return it as a string - use it for amber-like files.

        Output:
          * sequence, returned as a string, such as: RG5 RC RU RG RG RG RC RG RC RA RG RG3 RC5 RC RU RG RA RC RG RG RU RA RC RA RG RC3
        """
        atoms = [l.split() for l in self._get_atom_lines()]
        seq = []
        seq.append(atoms[0][3])
        for a in range(1, len(atoms)):
            # atom number is different than previous one
            if self.verbose:
                print((atoms[a][5], atoms[a - 1][5]))
            if atoms[a][5] != atoms[a - 1][5]:
                seq.append(atoms[a][3])
        return ' '.join(seq)

    def get_fasta(self, name='seq', lowercase=False):
        """Format sequence in FASTA format, with a header and lines split
        at 80 characters.

        Arguments:
          * name = name of the sequence (it's put in the header line)

        Output:
          * FASTA returned as a string
        """
        seq = self.seq_from_pdb()
        if lowercase:
            seq = seq.lower()
        result = ['>' + name]
        result.extend([seq[i:i + 80] for i in range(0, len(seq), 80)])
        result.append('')
        return '\n'.join(result)

    def remove_non_atoms(self):
        """Remove all lines that are not ATOMs
        """
        result = self._get_atom_lines()
        self._apply_fix('Removed non-atom lines', '\n'.join(result), result)

    def _check_resname_3(self):
        """Check if PDB uses 3 or 1 letter residue names.

        Output:
          * bool, True if 3 letters, False otherwise
        """
        for l in self.pdb_lines:
            if l.startswith('ATOM'):
                return 3 == len(l.split()[3])

    def _resname_3to1(self):
        """Convert residue names in PDB file from 3 letters to 1 letter.
        """
        result = []
        for l in self.pdb_lines:
            if l.startswith('ATOM'):
                long_name = l.split()[3]
                try:
                    short_name = self.RES3TO1[long_name]
                except KeyError:
                    short_name = 'X'
                result.append(l[:17] + '  ' + short_name + l[20:])
            else:
                result.append(l)
        self._apply_fix('resname_3to1', '\n'.join(result), result)

    def resname_check_and_3to1(self):
        """Check if resnames are 3 letter long and if so convert them to 1 letter.
        """
        if self._check_resname_3():
            try:
                self._resname_3to1()
            except:
                print('Conversion to 1-letter residue names failed')

    def terminate_chains(self):
        """Add 'TER' at the end of chain if none 'TER's are found in PDB.
        """
        for l in self.pdb_lines:
            if l.startswith('TER'):
                return
        else:
            result = []
            chain_started = False
            ter_added = False
            for l in self.pdb_lines:
                if 'ATOM' in l:
                    chain_started = True
                else:
                    if chain_started:
                        result.append('TER')
                        ter_added = True
                        chain_started = False
                result.append(l)
        if not ter_added:
            result.append('TER')
        self._apply_fix('terminate_chains', '\n'.join(result), result)

    def _split_by_ters(self, separator='TER\n'):
        """Split a PDB string by TER lines or other separator

        Arguments:
          * separator = optional, separator dividing structures, default is 'TER\n'
        """
        return [i + separator for i in self.pdb_string.split(separator)]

    def remove_short_chains(self, threshold=9):
        """Remove chains that have chains with a small number of atoms.

        Arguments:
          * threshold = if chain has more atoms, it stays in the result
        """
        chains = self.pdb_string.split('TER\n')
        good_chains = [c for c in chains if len(c.split('\nATOM')) > threshold]
        self._apply_fix('remove_short_chains', '\n'.join(good_chains))

    def check_and_add_P_at_start(self):
        residues = {}
        for l in self.pdb_lines:
            res_num = get_res_num(l)
            if res_num in residues:
                residues[res_num].append(l)
            else:
                residues[res_num] = [l]
        first_residue = residues[min(residues.keys())]

        # check P
        has_p = False
        for l in first_residue:
            if get_atom_code(l) == 'P':
                has_p = True
        if not has_p:
            # add P
            corrected_residue = []
            for l in first_residue:
                if get_atom_code(l) == 'O5\'' or get_atom_code(l) == 'O5*':
                    corrected_residue.append(set_atom_code(l, 'P'))
                else:
                    corrected_residue.append(l)
            residues[min(residues.keys())] = corrected_residue
            residues = ['\n'.join(residues[r]) for r in residues]
            self._apply_fix('add_P_at_start', '\n'.join(residues))

    def set_residues_bfactor(self, bfactors):
        """Set B-factor to any value you want

        Arguments:
          * bfactors = list of B-factors that will be set, should be of same
            length as nucleotide sequence
        """
        ans = []
        for l in self.pdb_lines:
            if l.startswith('ATOM'):
                res_num = get_res_num(l)
                try:
                    ans.append(set_line_bfactor(l, bfactors[res_num - 1]))
                except IndexError:
                    pass
            else:
                ans.append(l)
        self.pdb_lines = ans
        self.pdb_string = '\n'.join(ans)

    def pedantic_pdb(self):
        """Do everything that's possible to fix PDB: 3-to-1, no HETATMs, TERs etc.
        """
        self.resname_check_and_3to1()
        self.terminate_chains()
        self.remove_short_chains()

    def count_models(self):
        """Count models in the PDB file

        Output:
          * number of models as an int
        """
        model_num = 0
        for l in self.pdb_lines:
            if l.startswith('MODEL'):
                model_num += 1
        return model_num

    def get_model(self, model_num):
        """Get n-th model from the file

        Arguments:
          * model_num = number of model to get, starts with 0
        """
        model_borders = list(zip(
            [i[0] for i in enumerate(self.pdb_lines) if i[1].startswith('MODEL')],
            [i[0] for i in enumerate(self.pdb_lines) if i[1].startswith('ENDMDL')]
        ))
        result = self.pdb_lines[:model_borders[0][0]]
        result.extend(self.pdb_lines[model_borders[model_num][0]:model_borders[model_num][1] + 1])
        result.extend(self.pdb_lines[model_borders[-1][1] + 1:])
        self._apply_fix('get_model', '\n'.join(result), result)

    def check_and_get_first_model(self):
        """Check if there are more than one models and get only the first one
        """
        if self.count_models() >= 2:
            self.get_model(0)


1			#!/usr/bin/env python
2			# -- coding: utf-8 --
3			from rna_tools.tools.pdb_formatix.SingleLineUtils import get_res_code, get_res_num, get_atom_code, \
4			set_atom_code, set_line_bfactor
5			import re
6
7
8			class PDBFile(object):
9			"""Class for holding data from a PDB file and modifying it.
10			"""
11
12			# find 'ATOM' lines in a PDB file
13			ATOM_LINE_PATTERN = re.compile('^ATOM')
14
15			# dictionary for changing residue names from 3 letters to 1 letter
16			RES3TO1 = {
17			'GUA': 'G',
18			'URI': 'U',
19			'CYT': 'C',
20			'ADE': 'A',
21
22			'RG': 'G',
23			'RU': 'U',
24			'RC': 'C',
25			'RA': 'A',
26
27			'R3G': 'G',
28			'R3U': 'U',
29			'R3C': 'C',
30			'R3A': 'A',
31
32			'R5G': 'G',
33			'R5U': 'U',
34			'R5C': 'C',
35			'R5A': 'A',
36
37			'RG3': 'G',
38			'RU3': 'U',
39			'RC3': 'C',
40			'RA3': 'A',
41
42			'RG5': 'G',
43			'RU5': 'U',
44			'RC5': 'C',
45			'RA5': 'A',
46
47			'RGU': 'G',
48			'URA': 'U',
49			'RCY': 'C',
50			'RAD': 'A',
51
52			# just in case
53			'G': 'G',
54			'U': 'U',
55			'C': 'C',
56			'A': 'A',
57			}
58
59			AMINOACID_CODES = ("Ala", "Arg", "Asn", "Asp", "Cys", "Glu", "Gln", "Gly",
60			"His", "Ile", "Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr",
61			"Trp", "Tyr", "Val",
62			"ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "GLN", "GLY",
63			"HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR",
64			"TRP", "TYR", "VAL"
65			)
66
67			def __init__(self, pdb_string=None, pdb_path=None, pdb_handle=None, verbose=False):
68			"""Constructor, should get exactly one of the arguments:
69			* pdb_string = PDB as a string
70			* pdb_path = string with path to a PDB file
71			* pdb_handle = handle to a PDB file
72			"""
73			self.verbose = verbose
74
75			if len([x for x in [pdb_string, pdb_path, pdb_handle] if x is not None]) > 1:
76			print('You should provide at most one source for PDB file')
77			raise Exception
78			input_string = ''
79			if pdb_string is not None:
80			input_string = pdb_string
81			elif pdb_path is not None:
82			with open(pdb_path) as f:
83			input_string = f.read()
84			elif pdb_handle is not None:
85			input_string = pdb_handle.read()
86			self.pdb_string = input_string
87			self.pdb_lines = self.pdb_string.split('\n')
88			self.fixes = []
89
90			def save(self, file_path):
91			"""Save current PDB to disk
92
93			Arguments:
94			* file_path = path where it will be saved
95			"""
96			with open(file_path, 'w') as f:
97			f.write(self.pdb_string)
98
99			def _apply_fix(self, fix_name, result_string, result_lines=None):
100			"""Helper function for applying fixes and saving information about them.
101
102			Arguments:
103			* fix_name = string that will be added to self.pdb_fixes, unless
104			result_string is the same as pdb_string
105			* result_string = string after applying this fix
106			* result_lines = optional, list of lines, use this argument if you
107			already have such list and don't want to lose time on splitting
108			result_string
109			"""
110			if self.pdb_string != result_string:
111			self.fixes.append(fix_name)
112			self.pdb_string = result_string
113			if result_lines is not None:
114			self.pdb_lines = result_lines
115			else:
116			self.pdb_lines = self.pdb_string.split('\n')
117
118			def set_string(self, pdb_string):
119			"""Change PDB string stored in this instance of the class
120
121			Arguments:
122			* pdb_string = new PDB string
123			"""
124			self.pdb_string = pdb_string
125			self.pdb_lines = pdb_string.split('\n')
126
127			def _get_atom_lines(self):
128			"""Get only lines with ATOM information
129			"""
130			return [l for l in self.pdb_lines if re.match(self.ATOM_LINE_PATTERN, l)]
131
132			def validate_pdb(self):
133			"""Check if file is a PDB structure
134
135			Output:
136			* True if it is a PDB, False otherwise
137			"""
138			atom_lines = self._get_atom_lines()
139			if not len(atom_lines):
140			return False
141			return True
142
143			def detect_proteins(self):
144			"""Check if there are any aminoacid fragments in the file
145
146			Output:
147			* True if there are some, False otherwise
148			"""
149			for l in self.pdb_lines:
150			if get_res_code(l) in self.AMINOACID_CODES:
151			return True
152			return False
153
154			def seq_from_pdb(self):
155			"""Extract sequence from a PDB and return it as a string.
156
157			Output:
158			* sequence, returned as a string
159			"""
160			atoms = [l.split() for l in self._get_atom_lines()]
161			if atoms[0][3][0] != 'r': # it is 'r' if it's a ROSETTA PDB
162			seq = [self.RES3TO1[atoms[0][3]]]
163			else:
164			seq = [atoms[0][3][1]]
165			for a in range(1, len(atoms)):
166			# atom number is different than previous one
167			if atoms[a][5] != atoms[a - 1][5]:
168			if atoms[a][3][0] != 'r': # check for ROSETTA PDB
169			seq.append(self.RES3TO1[atoms[a][3]])
170			else:
171			seq.append(atoms[a][3][1])
172			return ''.join(seq)
173
174			def seq_from_amber_like_pdb(self):
175			"""Extract sequence from a PDB and return it as a string - use it for amber-like files.
176
177			Output:
178			* sequence, returned as a string, such as: RG5 RC RU RG RG RG RC RG RC RA RG RG3 RC5 RC RU RG RA RC RG RG RU RA RC RA RG RC3
179			"""
180			atoms = [l.split() for l in self._get_atom_lines()]
181			seq = []
182			seq.append(atoms[0][3])
183			for a in range(1, len(atoms)):
184			# atom number is different than previous one
185			if self.verbose:
186			print((atoms[a][5], atoms[a - 1][5]))
187			if atoms[a][5] != atoms[a - 1][5]:
188			seq.append(atoms[a][3])
189			return ' '.join(seq)
190
191			def get_fasta(self, name='seq', lowercase=False):
192			"""Format sequence in FASTA format, with a header and lines split
193			at 80 characters.
194
195			Arguments:
196			* name = name of the sequence (it's put in the header line)
197
198			Output:
199			* FASTA returned as a string
200			"""
201			seq = self.seq_from_pdb()
202			if lowercase:
203			seq = seq.lower()
204			result = ['>' + name]
205			result.extend([seq[i:i + 80] for i in range(0, len(seq), 80)])
206			result.append('')
207			return '\n'.join(result)
208
209			def remove_non_atoms(self):
210			"""Remove all lines that are not ATOMs
211			"""
212			result = self._get_atom_lines()
213			self._apply_fix('Removed non-atom lines', '\n'.join(result), result)
214
215			def _check_resname_3(self):
216			"""Check if PDB uses 3 or 1 letter residue names.
217
218			Output:
219			* bool, True if 3 letters, False otherwise
220			"""
221			for l in self.pdb_lines:
222			if l.startswith('ATOM'):
223			return 3 == len(l.split()[3])
224
225			def _resname_3to1(self):
226			"""Convert residue names in PDB file from 3 letters to 1 letter.
227			"""
228			result = []
229			for l in self.pdb_lines:
230			if l.startswith('ATOM'):
231			long_name = l.split()[3]
232			try:
233			short_name = self.RES3TO1[long_name]
234			except KeyError:
235			short_name = 'X'
236			result.append(l[:17] + ' ' + short_name + l[20:])
237			else:
238			result.append(l)
239			self._apply_fix('resname_3to1', '\n'.join(result), result)
240
241			def resname_check_and_3to1(self):
242			"""Check if resnames are 3 letter long and if so convert them to 1 letter.
243			"""
244			if self._check_resname_3():
245			try:
246			self._resname_3to1()
247			except:
248			print('Conversion to 1-letter residue names failed')
249
250			def terminate_chains(self):
251			"""Add 'TER' at the end of chain if none 'TER's are found in PDB.
252			"""
253			for l in self.pdb_lines:
254			if l.startswith('TER'):
255			return
256			else:
257			result = []
258			chain_started = False
259			ter_added = False
260			for l in self.pdb_lines:
261			if 'ATOM' in l:
262			chain_started = True
263			else:
264			if chain_started:
265			result.append('TER')
266			ter_added = True
267			chain_started = False
268			result.append(l)
269			if not ter_added:
270			result.append('TER')
271			self._apply_fix('terminate_chains', '\n'.join(result), result)
272
273			def _split_by_ters(self, separator='TER\n'):
274			"""Split a PDB string by TER lines or other separator
275
276			Arguments:
277			* separator = optional, separator dividing structures, default is 'TER\n'
278			"""
279			return [i + separator for i in self.pdb_string.split(separator)]
280
281			def remove_short_chains(self, threshold=9):
282			"""Remove chains that have chains with a small number of atoms.
283
284			Arguments:
285			* threshold = if chain has more atoms, it stays in the result
286			"""
287			chains = self.pdb_string.split('TER\n')
288			good_chains = [c for c in chains if len(c.split('\nATOM')) > threshold]
289			self._apply_fix('remove_short_chains', '\n'.join(good_chains))
290
291			def check_and_add_P_at_start(self):
292			residues = {}
293			for l in self.pdb_lines:
294			res_num = get_res_num(l)
295			if res_num in residues:
296			residues[res_num].append(l)
297			else:
298			residues[res_num] = [l]
299			first_residue = residues[min(residues.keys())]
			0 ignored issues – show Comprehensibility Best Practice introduced 2020-06-29 19:56 UTC by Report Bug Copy Issue Report The variable `min` does not seem to be defined. Loading history...
300			# check P
301			has_p = False
302			for l in first_residue:
303			if get_atom_code(l) == 'P':
304			has_p = True
305			if not has_p:
306			# add P
307			corrected_residue = []
308			for l in first_residue:
309			if get_atom_code(l) == 'O5\'' or get_atom_code(l) == 'O5*':
310			corrected_residue.append(set_atom_code(l, 'P'))
311			else:
312			corrected_residue.append(l)
313			residues[min(residues.keys())] = corrected_residue
314			residues = ['\n'.join(residues[r]) for r in residues]
315			self._apply_fix('add_P_at_start', '\n'.join(residues))
316
317			def set_residues_bfactor(self, bfactors):
318			"""Set B-factor to any value you want
319
320			Arguments:
321			* bfactors = list of B-factors that will be set, should be of same
322			length as nucleotide sequence
323			"""
324			ans = []
325			for l in self.pdb_lines:
326			if l.startswith('ATOM'):
327			res_num = get_res_num(l)
328			try:
329			ans.append(set_line_bfactor(l, bfactors[res_num - 1]))
330			except IndexError:
331			pass
332			else:
333			ans.append(l)
334			self.pdb_lines = ans
335			self.pdb_string = '\n'.join(ans)
336
337			def pedantic_pdb(self):
338			"""Do everything that's possible to fix PDB: 3-to-1, no HETATMs, TERs etc.
339			"""
340			self.resname_check_and_3to1()
341			self.terminate_chains()
342			self.remove_short_chains()
343
344			def count_models(self):
345			"""Count models in the PDB file
346
347			Output:
348			* number of models as an int
349			"""
350			model_num = 0
351			for l in self.pdb_lines:
352			if l.startswith('MODEL'):
353			model_num += 1
354			return model_num
355
356			def get_model(self, model_num):
357			"""Get n-th model from the file
358
359			Arguments:
360			* model_num = number of model to get, starts with 0
361			"""
362			model_borders = list(zip(
363			[i[0] for i in enumerate(self.pdb_lines) if i[1].startswith('MODEL')],
364			[i[0] for i in enumerate(self.pdb_lines) if i[1].startswith('ENDMDL')]
365			))
366			result = self.pdb_lines[:model_borders[0][0]]
367			result.extend(self.pdb_lines[model_borders[model_num][0]:model_borders[model_num][1] + 1])
368			result.extend(self.pdb_lines[model_borders[-1][1] + 1:])
369			self._apply_fix('get_model', '\n'.join(result), result)
370
371			def check_and_get_first_model(self):
372			"""Check if there are more than one models and get only the first one
373			"""
374			if self.count_models() >= 2:
375			self.get_model(0)
376

mmagnus / rna-tools

PDBFile._split_by_ters() A last analyzed 2025-07-29 00:29 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

PDBFile._split_by_ters() A
last analyzed 2025-07-29 00:29 UTC