PDBFile._split_by_ters()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 7
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 2
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
from rna_tools.tools.pdb_formatix.SingleLineUtils import get_res_code, get_res_num, get_atom_code, \
4
    set_atom_code, set_line_bfactor
5
import re
6
7
8
class PDBFile(object):
9
    """Class for holding data from a PDB file and modifying it.
10
    """
11
12
    # find 'ATOM' lines in a PDB file
13
    ATOM_LINE_PATTERN = re.compile('^ATOM')
14
15
    # dictionary for changing residue names from 3 letters to 1 letter
16
    RES3TO1 = {
17
        'GUA': 'G',
18
        'URI': 'U',
19
        'CYT': 'C',
20
        'ADE': 'A',
21
22
        'RG': 'G',
23
        'RU': 'U',
24
        'RC': 'C',
25
        'RA': 'A',
26
27
        'R3G': 'G',
28
        'R3U': 'U',
29
        'R3C': 'C',
30
        'R3A': 'A',
31
32
        'R5G': 'G',
33
        'R5U': 'U',
34
        'R5C': 'C',
35
        'R5A': 'A',
36
37
        'RG3': 'G',
38
        'RU3': 'U',
39
        'RC3': 'C',
40
        'RA3': 'A',
41
42
        'RG5': 'G',
43
        'RU5': 'U',
44
        'RC5': 'C',
45
        'RA5': 'A',
46
47
        'RGU': 'G',
48
        'URA': 'U',
49
        'RCY': 'C',
50
        'RAD': 'A',
51
52
        # just in case
53
        'G': 'G',
54
        'U': 'U',
55
        'C': 'C',
56
        'A': 'A',
57
    }
58
59
    AMINOACID_CODES = ("Ala", "Arg", "Asn", "Asp", "Cys", "Glu", "Gln", "Gly",
60
                       "His", "Ile", "Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr",
61
                       "Trp", "Tyr", "Val",
62
                       "ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "GLN", "GLY",
63
                       "HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR",
64
                       "TRP", "TYR", "VAL"
65
                       )
66
67
    def __init__(self, pdb_string=None, pdb_path=None, pdb_handle=None, verbose=False):
68
        """Constructor, should get exactly one of the arguments:
69
             * pdb_string = PDB as a string
70
             * pdb_path = string with path to a PDB file
71
             * pdb_handle = handle to a PDB file
72
        """
73
        self.verbose = verbose
74
75
        if len([x for x in [pdb_string, pdb_path, pdb_handle] if x is not None]) > 1:
76
            print('You should provide at most one source for PDB file')
77
            raise Exception
78
        input_string = ''
79
        if pdb_string is not None:
80
            input_string = pdb_string
81
        elif pdb_path is not None:
82
            with open(pdb_path) as f:
83
                input_string = f.read()
84
        elif pdb_handle is not None:
85
            input_string = pdb_handle.read()
86
        self.pdb_string = input_string
87
        self.pdb_lines = self.pdb_string.split('\n')
88
        self.fixes = []
89
90
    def save(self, file_path):
91
        """Save current PDB to disk
92
93
        Arguments:
94
          * file_path = path where it will be saved
95
        """
96
        with open(file_path, 'w') as f:
97
            f.write(self.pdb_string)
98
99
    def _apply_fix(self, fix_name, result_string, result_lines=None):
100
        """Helper function for applying fixes and saving information about them.
101
102
        Arguments:
103
          * fix_name = string that will be added to self.pdb_fixes, unless
104
            result_string is the same as pdb_string
105
          * result_string = string after applying this fix
106
          * result_lines = optional, list of lines, use this argument if you
107
            already have such list and don't want to lose time on splitting
108
            result_string
109
        """
110
        if self.pdb_string != result_string:
111
            self.fixes.append(fix_name)
112
            self.pdb_string = result_string
113
            if result_lines is not None:
114
                self.pdb_lines = result_lines
115
            else:
116
                self.pdb_lines = self.pdb_string.split('\n')
117
118
    def set_string(self, pdb_string):
119
        """Change PDB string stored in this instance of the class
120
121
        Arguments:
122
          * pdb_string = new PDB string
123
        """
124
        self.pdb_string = pdb_string
125
        self.pdb_lines = pdb_string.split('\n')
126
127
    def _get_atom_lines(self):
128
        """Get only lines with ATOM information
129
        """
130
        return [l for l in self.pdb_lines if re.match(self.ATOM_LINE_PATTERN, l)]
131
132
    def validate_pdb(self):
133
        """Check if file is a PDB structure
134
135
        Output:
136
          * True if it is a PDB, False otherwise
137
        """
138
        atom_lines = self._get_atom_lines()
139
        if not len(atom_lines):
140
            return False
141
        return True
142
143
    def detect_proteins(self):
144
        """Check if there are any aminoacid fragments in the file
145
146
        Output:
147
          * True if there are some, False otherwise
148
        """
149
        for l in self.pdb_lines:
150
            if get_res_code(l) in self.AMINOACID_CODES:
151
                return True
152
        return False
153
154
    def seq_from_pdb(self):
155
        """Extract sequence from a PDB and return it as a string.
156
157
        Output:
158
          * sequence, returned as a string
159
        """
160
        atoms = [l.split() for l in self._get_atom_lines()]
161
        if atoms[0][3][0] != 'r':  # it is 'r' if it's a ROSETTA PDB
162
            seq = [self.RES3TO1[atoms[0][3]]]
163
        else:
164
            seq = [atoms[0][3][1]]
165
        for a in range(1, len(atoms)):
166
            # atom number is different than previous one
167
            if atoms[a][5] != atoms[a - 1][5]:
168
                if atoms[a][3][0] != 'r':  # check for ROSETTA PDB
169
                    seq.append(self.RES3TO1[atoms[a][3]])
170
                else:
171
                    seq.append(atoms[a][3][1])
172
        return ''.join(seq)
173
174
    def seq_from_amber_like_pdb(self):
175
        """Extract sequence from a PDB and return it as a string - use it for amber-like files.
176
177
        Output:
178
          * sequence, returned as a string, such as: RG5 RC RU RG RG RG RC RG RC RA RG RG3 RC5 RC RU RG RA RC RG RG RU RA RC RA RG RC3
179
        """
180
        atoms = [l.split() for l in self._get_atom_lines()]
181
        seq = []
182
        seq.append(atoms[0][3])
183
        for a in range(1, len(atoms)):
184
            # atom number is different than previous one
185
            if self.verbose:
186
                print((atoms[a][5], atoms[a - 1][5]))
187
            if atoms[a][5] != atoms[a - 1][5]:
188
                seq.append(atoms[a][3])
189
        return ' '.join(seq)
190
191
    def get_fasta(self, name='seq', lowercase=False):
192
        """Format sequence in FASTA format, with a header and lines split
193
        at 80 characters.
194
195
        Arguments:
196
          * name = name of the sequence (it's put in the header line)
197
198
        Output:
199
          * FASTA returned as a string
200
        """
201
        seq = self.seq_from_pdb()
202
        if lowercase:
203
            seq = seq.lower()
204
        result = ['>' + name]
205
        result.extend([seq[i:i + 80] for i in range(0, len(seq), 80)])
206
        result.append('')
207
        return '\n'.join(result)
208
209
    def remove_non_atoms(self):
210
        """Remove all lines that are not ATOMs
211
        """
212
        result = self._get_atom_lines()
213
        self._apply_fix('Removed non-atom lines', '\n'.join(result), result)
214
215
    def _check_resname_3(self):
216
        """Check if PDB uses 3 or 1 letter residue names.
217
218
        Output:
219
          * bool, True if 3 letters, False otherwise
220
        """
221
        for l in self.pdb_lines:
222
            if l.startswith('ATOM'):
223
                return 3 == len(l.split()[3])
224
225
    def _resname_3to1(self):
226
        """Convert residue names in PDB file from 3 letters to 1 letter.
227
        """
228
        result = []
229
        for l in self.pdb_lines:
230
            if l.startswith('ATOM'):
231
                long_name = l.split()[3]
232
                try:
233
                    short_name = self.RES3TO1[long_name]
234
                except KeyError:
235
                    short_name = 'X'
236
                result.append(l[:17] + '  ' + short_name + l[20:])
237
            else:
238
                result.append(l)
239
        self._apply_fix('resname_3to1', '\n'.join(result), result)
240
241
    def resname_check_and_3to1(self):
242
        """Check if resnames are 3 letter long and if so convert them to 1 letter.
243
        """
244
        if self._check_resname_3():
245
            try:
246
                self._resname_3to1()
247
            except:
248
                print('Conversion to 1-letter residue names failed')
249
250
    def terminate_chains(self):
251
        """Add 'TER' at the end of chain if none 'TER's are found in PDB.
252
        """
253
        for l in self.pdb_lines:
254
            if l.startswith('TER'):
255
                return
256
        else:
257
            result = []
258
            chain_started = False
259
            ter_added = False
260
            for l in self.pdb_lines:
261
                if 'ATOM' in l:
262
                    chain_started = True
263
                else:
264
                    if chain_started:
265
                        result.append('TER')
266
                        ter_added = True
267
                        chain_started = False
268
                result.append(l)
269
        if not ter_added:
270
            result.append('TER')
271
        self._apply_fix('terminate_chains', '\n'.join(result), result)
272
273
    def _split_by_ters(self, separator='TER\n'):
274
        """Split a PDB string by TER lines or other separator
275
276
        Arguments:
277
          * separator = optional, separator dividing structures, default is 'TER\n'
278
        """
279
        return [i + separator for i in self.pdb_string.split(separator)]
280
281
    def remove_short_chains(self, threshold=9):
282
        """Remove chains that have chains with a small number of atoms.
283
284
        Arguments:
285
          * threshold = if chain has more atoms, it stays in the result
286
        """
287
        chains = self.pdb_string.split('TER\n')
288
        good_chains = [c for c in chains if len(c.split('\nATOM')) > threshold]
289
        self._apply_fix('remove_short_chains', '\n'.join(good_chains))
290
291
    def check_and_add_P_at_start(self):
292
        residues = {}
293
        for l in self.pdb_lines:
294
            res_num = get_res_num(l)
295
            if res_num in residues:
296
                residues[res_num].append(l)
297
            else:
298
                residues[res_num] = [l]
299
        first_residue = residues[min(residues.keys())]
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable min does not seem to be defined.
Loading history...
300
        # check P
301
        has_p = False
302
        for l in first_residue:
303
            if get_atom_code(l) == 'P':
304
                has_p = True
305
        if not has_p:
306
            # add P
307
            corrected_residue = []
308
            for l in first_residue:
309
                if get_atom_code(l) == 'O5\'' or get_atom_code(l) == 'O5*':
310
                    corrected_residue.append(set_atom_code(l, 'P'))
311
                else:
312
                    corrected_residue.append(l)
313
            residues[min(residues.keys())] = corrected_residue
314
            residues = ['\n'.join(residues[r]) for r in residues]
315
            self._apply_fix('add_P_at_start', '\n'.join(residues))
316
317
    def set_residues_bfactor(self, bfactors):
318
        """Set B-factor to any value you want
319
320
        Arguments:
321
          * bfactors = list of B-factors that will be set, should be of same
322
            length as nucleotide sequence
323
        """
324
        ans = []
325
        for l in self.pdb_lines:
326
            if l.startswith('ATOM'):
327
                res_num = get_res_num(l)
328
                try:
329
                    ans.append(set_line_bfactor(l, bfactors[res_num - 1]))
330
                except IndexError:
331
                    pass
332
            else:
333
                ans.append(l)
334
        self.pdb_lines = ans
335
        self.pdb_string = '\n'.join(ans)
336
337
    def pedantic_pdb(self):
338
        """Do everything that's possible to fix PDB: 3-to-1, no HETATMs, TERs etc.
339
        """
340
        self.resname_check_and_3to1()
341
        self.terminate_chains()
342
        self.remove_short_chains()
343
344
    def count_models(self):
345
        """Count models in the PDB file
346
347
        Output:
348
          * number of models as an int
349
        """
350
        model_num = 0
351
        for l in self.pdb_lines:
352
            if l.startswith('MODEL'):
353
                model_num += 1
354
        return model_num
355
356
    def get_model(self, model_num):
357
        """Get n-th model from the file
358
359
        Arguments:
360
          * model_num = number of model to get, starts with 0
361
        """
362
        model_borders = list(zip(
363
            [i[0] for i in enumerate(self.pdb_lines) if i[1].startswith('MODEL')],
364
            [i[0] for i in enumerate(self.pdb_lines) if i[1].startswith('ENDMDL')]
365
        ))
366
        result = self.pdb_lines[:model_borders[0][0]]
367
        result.extend(self.pdb_lines[model_borders[model_num][0]:model_borders[model_num][1] + 1])
368
        result.extend(self.pdb_lines[model_borders[-1][1] + 1:])
369
        self._apply_fix('get_model', '\n'.join(result), result)
370
371
    def check_and_get_first_model(self):
372
        """Check if there are more than one models and get only the first one
373
        """
374
        if self.count_models() >= 2:
375
            self.get_model(0)
376