RNAStructure.set_atom_occupancy()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 3
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""rna_tools_lib.py - main lib file, many tools in this lib is using this file."""
4
5
import os
6
import sys
7
from collections import OrderedDict
8
import re
9
import string
10
import time
11
import gzip
12
import tempfile
13
import shutil
14
import subprocess
15
16
from rna_tools.tools.extra_functions.select_fragment import select_pdb_fragment_pymol_style, select_pdb_fragment
17
18
from icecream import ic
19
import sys
20
ic.configureOutput(outputFunction=lambda *a: print(*a, file=sys.stderr), includeContext=True)
21
ic.configureOutput(prefix='')
22
23
import logging
24
logger = logging.getLogger('rna-tools')
25
handler = logging.StreamHandler()
26
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
27
handler.setFormatter(formatter)
28
logger.addHandler(handler)
29
30
# Don't fix OP3, ignore it
31
ignore_op3 = False
32
33
# Settings: what is what in a PDB file
34
AMINOACID_CODES = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "GLN", "GLY",
35
                   "HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR",
36
                   "TRP", "TYR", "VAL"]
37
38
def generate_conect_records(residue_type, shift=0, no_junction=True):
39
    """
40
    Generate CONECT records for an RNA residue (U, A, C, or G).
41
    
42
    Args:
43
        residue_type (str): Type of residue ('U', 'A', 'C', or 'G').
44
        shift (int): Value to shift all atom serial numbers by.
45
    
46
    Returns:
47
        list: List of CONECT records as strings.
48
    """
49
    # Define connections for each residue type
50
    if residue_type == "U":
51
        connections = [
52
            (1, 2),   # P -> OP1
53
            (1, 3),   # P -> OP2
54
            (1, 4),   # P -> O5'
55
            (4, 5),   # O5' -> C5'
56
            (5, 6),   # C5' -> C4'
57
            (6, 7),   # C4' -> O4'
58
            (6, 8),   # C4' -> C3'
59
            (8, 9),   # C3' -> O3'
60
            (8, 10),  # C3' -> C2'
61
            (10, 11), # C2' -> O2'
62
            (10, 12), # C2' -> C1'
63
            (12, 13), # C1' -> N1
64
            (13, 14), # N1 -> C2
65
            (14, 15), # C2 -> O2
66
            (14, 16), # C2 -> N3
67
            (16, 17), # N3 -> C4
68
            (17, 18), # C4 -> O4
69
            (17, 19), # C4 -> C5
70
            (19, 20), # C5 -> C6
71
            (20, 13), # C6 -> N1
72
            (7, 12),  # O4' -> C1'
73
        ]
74
        if not no_junction:
75
            connections.append((9, 21))
76
        print(connections)
77
    elif residue_type == "A":
78
        connections = [
79
            (1, 2),   # P -> OP1
80
            (1, 3),   # P -> OP2
81
            (1, 4),   # P -> O5'
82
            (4, 5),   # O5' -> C5'
83
            (5, 6),   # C5' -> C4'
84
            (6, 7),   # C4' -> O4'
85
            (6, 8),   # C4' -> C3'
86
            (8, 9),   # C3' -> O3'
87
            (8, 10),  # C3' -> C2'
88
            (10, 11), # C2' -> O2'
89
            (10, 12), # C2' -> C1'
90
            (12, 13), # C1' -> N9
91
            (13, 14), # N9 -> C8
92
            (14, 15), # C8 -> N7
93
            (15, 16), # N7 -> C5
94
            (16, 17), # C5 -> C6
95
            (17, 18), # C6 -> N6
96
            (17, 19), # C6 -> N1
97
            (19, 20), # N1 -> C2
98
            (20, 21), # C2 -> N3
99
            (21, 22), # N3 -> C4
100
            (22, 16), # C4 -> C5
101
            (22, 13), # C4 -> N9
102
            (7, 12),  # O4' -> C1'
103
        ]
104
        if not no_junction:
105
            connections.append((9, 23))
106
    elif residue_type == "C":
107
        connections = [
108
            (1, 2),   # P -> OP1
109
            (1, 3),   # P -> OP2
110
            (1, 4),   # P -> O5'
111
            (4, 5),   # O5' -> C5'
112
            (5, 6),   # C5' -> C4'
113
            (6, 7),   # C4' -> O4'
114
            (6, 8),   # C4' -> C3'
115
            (8, 9),   # C3' -> O3'
116
            (8, 10),  # C3' -> C2'
117
            (10, 11), # C2' -> O2'
118
            (10, 12), # C2' -> C1'
119
            (12, 13), # C1' -> N1
120
            (13, 14), # N1 -> C2
121
            (14, 15), # C2 -> O2
122
            (14, 16), # C2 -> N3
123
            (16, 17), # N3 -> C4
124
            (17, 18), # C4 -> N4
125
            (17, 19), # C4 -> C5
126
            (19, 20), # C5 -> C6
127
            (20, 13), # C6 -> N1
128
            (7, 12),  # O4' -> C1' (additional connection)
129
        ]
130
        if not no_junction:
131
            connections.append((9, 21))
132
    elif residue_type == "G":
133
        connections = [
134
            (1, 2),   # P -> OP1
135
            (1, 3),   # P -> OP2
136
            (1, 4),   # P -> O5'
137
            (4, 5),   # O5' -> C5'
138
            (5, 6),   # C5' -> C4'
139
            (6, 7),   # C4' -> O4'
140
            (6, 8),   # C4' -> C3'
141
            (8, 9),   # C3' -> O3'
142
            (8, 10),  # C3' -> C2'
143
            (10, 11), # C2' -> O2'
144
            (10, 12), # C2' -> C1'
145
            (12, 13), # C1' -> N9
146
            (13, 14), # N9 -> C8
147
            (14, 15), # C8 -> N7
148
            (15, 16), # N7 -> C5
149
            (16, 17), # C5 -> C6
150
            (17, 18), # C6 -> O6
151
            (17, 19), # C6 -> N1
152
            (19, 20), # N1 -> C2
153
            (20, 21), # C2 -> N2
154
            (20, 22), # C2 -> N3
155
            (22, 23), # N3 -> C4
156
            (23, 16), # C4 -> C5
157
            (23, 13), # C4 -> N9
158
            (7, 12),  # O4' -> C1' (additional connection)
159
        ]
160
        if not no_junction:
161
            connections.append((9, 24))
162
    else:
163
        raise ValueError("Invalid residue type. Choose 'U', 'A', 'C', or 'G'.")
164
165
    # Generate CONECT records with shifted serial numbers
166
    conect_records = []
167
    for conn in connections:
168
        atom1_serial, atom2_serial = conn
169
        # Shift the serial numbers
170
        atom1_serial += shift
171
        atom2_serial += shift
172
        conect_records.append(f"CONECT{atom1_serial:>5}{atom2_serial:>5}")
173
    return conect_records
174
175
    
176
def aa3to1(aaa):
177
    """based on https://pymolwiki.org/index.php/Aa_codes"""
178
    if len(aaa) != 3:  # aaa is 'G', like for RNA ;-)
179
        return '' # dont do it for rna test aaa
180
    one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', \
181
                 'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y',    \
182
                 'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A',    \
183
                 'GLY':'G', 'PRO':'P', 'CYS':'C'}
184
    return one_letter[aaa]
185
186
187
RES = ['DA', 'DG', 'DT', 'DC']
188
RES += ['A', 'G', 'U', 'C']
189
190
RESS = ['A', 'C', 'G', 'U', 'ADE', 'CYT', 'GUA', 'URY', 'URI', 'U34', 'U31', 'C31', '4SU', 'H2U', 'QUO', 'G7M', '5MU',
191
        '5MC', 'PSU', '2MG', '1MG', '1MA', 'M2G', '5BU', 'FHU', 'FMU', 'IU', 'OMG', 'OMC', 'OMU', 'A2M', 'A23', 'CCC',
192
        'I'] + ['RC', 'RU', 'RA', 'RG', 'RT']
193
DNA = ['DA', 'DG', 'DT', 'DC']
194
RNA = ['A', 'G', 'U', 'C']
195
IONS = ['NA', 'MG', 'MN', 'JOS'] # and ligands JOS
196
HYDROGEN_NAMES = ["H", "H5'", "H5''", "H4'", "H3'", "H2'", "HO2'", "H1'", "H3", "H5", "H6", "H5T", "H41", "1H5'",
197
                  "2H5'", "HO2'", "1H4", "2H4", "1H2", "2H2", "H1", "H8", "H2", "1H6", "2H6", "HO5'", "H21", "H22",
198
                  "H61", "H62", "H42", "HO3'", "1H2'", "2HO'", "HO'2", "H2'1", "HO'2", "HO'2", "H2", "H2'1", "H1", "H2",
199
                  "1H5*", "2H5*", "H4*", "H3*", "H1*", "1H2*", "2HO*", "1H2", "2H2", "1H4", "2H4", "1H6", "2H6", "H1",
200
                  "H2", "H3", "H5", "H6", "H8", "H5'1", "H5'2", "H3T"]
201
202
203
class PDBFetchError(Exception):
204
    pass
205
206
class MethodUnknown(Exception):
207
    pass
208
209
210
try:
211
    from Bio.PDB import *
212
except ImportError:
213
    print("Biopython is not detected. It is required for some functions.")
214
215
import warnings
216
warnings.filterwarnings('ignore', '.*Invalid or missing.*',)
217
warnings.filterwarnings('ignore', '.*with given element *',)
218
warnings.filterwarnings('ignore', '.*is discontinuous*',)
219
warnings.filterwarnings('ignore', '.*Some atoms or residues may be missing in the data structure.*',)
220
warnings.filterwarnings('ignore', '.*Ignoring unrecognized record.*',)
221
222
223
import subprocess
224
def exe(cmd):
225
    o = subprocess.Popen(
226
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
227
    out = o.stdout.read().strip().decode()
228
    err = o.stderr.read().strip().decode()
229
    return out, err
230
231
232
def get_rna_tools_path():
233
    """Return path to the rt."""
234
    import inspect
235
    import rna_tools
236
    return os.path.dirname(inspect.getfile(rna_tools))
237
238
def get_version(currfn='', verbose=False):
239
    import rna_tools
240
    return rna_tools.__version__
241
242
243
def sort_strings(l):
244
    """ Sort the given list in the way that humans expect.
245
    http://blog.codinghorror.com/sorting-for-humans-natural-sort-order/
246
    """
247
    def convert(text): return int(text) if text.isdigit() else text
248
249
    def alphanum_key(key): return [convert(c) for c in re.split('([0-9]+)', key)]
250
    l.sort(key=alphanum_key)
251
    return l
252
253
254
class RNAStructure:
255
    """RNAStructure - handles an RNA pdb file.
256
257
    Attributes:
258
259
        fn (string)  : path to the structural file, e.g., "../rna_tools/input/4ts2.pdb"
260
        name(string) : filename of the structural file, "4ts2.pdb"
261
        lines (list) : the PDB file is loaded and ATOM/HETATM/TER/END go to self.lines
262
263
    """
264
265
    def __init__(self, fn=''):
266
        if not fn: # if empty
267
            tf = tempfile.NamedTemporaryFile(delete=False)
268
            fn = tf.name
269
270
        self.fn = os.path.abspath(fn) # ../rna_tools/input/4ts2.pdb
271
        # add name attribute with filename 4ts2.pdb
272
        self.name = self.fn.split(os.sep)[-1] # OS.path.splitext(self.fn)[0]
273
274
        self.report = []
275
        self.report.append('The RNARNAStructure report: %s ' % self.fn)
276
277
        self.mol2_format = False
278
279
        self.lines = []
280
281
        try:
282
            # lines = open(fn).read().strip().split('\n') # don't strip?, good or bed?
283
            lines = open(fn).read().split('\n')
284
        except UnicodeDecodeError:
285
            print("Can't open a binary file")
286
            self.lines = ''
287
            return
288
289
        self.has_many_models = False
290
291
        for l in lines:
292
            # multi-models pdb files
293
            if l.startswith('MODEL'):
294
                self.has_many_models = True
295
            if l.startswith('ENDMDL'):
296
                break
297
298
            if l.startswith('ATOM') or l.startswith('HETATM') or l.startswith('TER') or l.startswith('END'):
299
                self.lines.append(l) # don't strip .strip())
300
            if l.startswith("@<TRIPOS>"):
301
                self.mol2_format = True
302
                self.report.append('This is mol2 format')
303
304
        self.res = self.get_resn_uniq()
305
306
    def reload(self):
307
        """Reload the object."""
308
        self.__init__(self.fn)
309
        
310
    def is_pdb(self):
311
        """Return True if the files is in PDB format.
312
313
        If self.lines is empty it means that nothing was parsed into the PDB format."""
314
        if len(self.lines):
315
            return True
316
        else:
317
            return False
318
319
    def is_nmr(self):
320
        """True if the file is an NMR-style multiple model pdb
321
322
        :returns: True or Fo
323
        :rtype: boolean
324
        """
325
        return self.has_many_models
326
327
    def un_nmr(self, startwith1=True, directory='', verbose=False):
328
        """Un NMR - Split NMR-style multiple model pdb files into individual models.
329
330
        Take self.fn and create new files in the way::
331
332
            input/1a9l_NMR_1_2_models.pdb
333
               input/1a9l_NMR_1_2_models_0.pdb
334
               input/1a9l_NMR_1_2_models_1.pdb
335
336
        .. warning:: This function requires biopython.
337
338
        rna_pdb_tools.py --un-nmr AF-Q5TCX8-F1-model_v1_core_Ctrim_mdr_MD.pdb
339
36
340
        cat *MD_* > md.pdb
341
        """
342
        #
343
        npdb = ''
344
        c = 0
345
346
        nmodels = 0
347
        for l in open(self.fn):
348
            if l.startswith('MODEL'):
349
                nmodels += 1
350
        print(nmodels)
351
352
        for l in open(self.fn):
353
            if 1:
354
                if not l.startswith('HETATM'):
355
                    npdb += l
356
            if l.startswith('MODEL'):
357
                c += 1
358
            if l.startswith('ENDMDL'):
359
                id = str(c).zfill(len(str(nmodels)))
360
                nf = self.fn.replace('.pdb', '_%s.pdb' % id)
361
                verbose = 1
362
                if verbose:
363
                    print(nf)
364
                with open(nf, 'w') as f:
365
                    f.write(npdb)
366
                npdb = ''
367
        biopython = 0
368
        
369
        if biopython:
370
            parser = PDBParser()
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable PDBParser does not seem to be defined.
Loading history...
371
            structure = parser.get_structure('', self.fn)
372
            for c, m in enumerate(structure):
373
                if verbose:
374
                    print(m)
375
                io = PDBIO()
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable PDBIO does not seem to be defined.
Loading history...
376
                io.set_structure(m)
377
                if startwith1:
378
                    c += 1
379
                if directory:
380
                    io.save(directory + os.sep + self.fn.replace('.pdb', '_%i.pdb' % c))
381
                else:
382
                    io.save(self.fn.replace('.pdb', '_%i.pdb' % c))                
383
384
    def is_mol2(self):
385
        """Return True if is_mol2 based on the presence of ```@<TRIPOS>```."""
386
        return self.mol2_format
387
388
    def decap_gtp(self):
389
        lines = []
390
        for l in self.lines:
391
            if l.startswith('ATOM') or l.startswith('HETATM'):
392
                if l[12:16].strip() in ['PG', 'O1G', 'O2G', 'O3G', 'O3B', 'PB', 'O1B', 'O2B', 'O3A']:
393
                    continue
394
                if l[12:16].strip() == 'PA':
395
                    l = l.replace('PA', 'P ')
396
                if l[12:16].strip() == 'O1A':
397
                    l = l.replace('O1A', 'O1P')
398
                if l[12:16].strip() == 'O2A':
399
                    l = l.replace('O2A', 'O2P')
400
                if l[17:20].strip() == 'GTP':
401
                    l = l[:17] + '  G' + l[20:]
402
                    l = l.replace('HETATM', 'ATOM  ')
403
                lines.append(l)
404
        self.lines = lines
405
406
    def is_amber_like(self):
407
        """Use self.lines and check if there is XX line
408
        """
409
        for l in self.lines:
410
            if l.startswith('ATOM') or l.startswith('HETATM'):
411
                rn = l[17:20]
412
                if rn in ['RU5', 'RC5', 'RA5', 'RT5', 'RG5']:
413
                    self.report.append('This is amber-like format')
414
                    return True
415
        return False
416
417
    def replace_hetatms(self):
418
        nlines = []
419
        for l in self.lines:
420
            l = l.replace('HETATM', 'ATOM  ')
421
            nlines.append(l)
422
        self.lines = nlines
423
424
    def fix_with_qrnas(self, outfn="", verbose=False):
425
        """Add missing heavy atom.
426
427
        A residue is recognized base on a residue names.
428
429
        Copy QRNAS folder to curr folder, run QRNAS and remove QRNAS.
430
431
        .. warning:: QRNAS required (http://genesilico.pl/QRNAS/QRNAS.tgz)
432
        """
433
        from rpt_config import QRNAS_PATH
434
435
        # prepare folder get ff folder
436
        to_go = os.path.abspath(os.path.dirname(self.fn))
437
        curr = os.getcwd()
438
439
        # set occupancy to 0
440
        s = RNAStructure(self.fn)
441
        s.set_occupancy_atoms(0.00)
442
        s.write(self.fn)
443
444
        os.chdir(to_go)
445
        try:
446
            shutil.copytree(QRNAS_PATH, to_go + os.sep + "QRNAS")
447
        except OSError:
448
            pass
449
        # prepare config file
450
        with open('qrna_config.txt', 'w') as f:
451
            f.write("WRITEFREQ   1\n")
452
            f.write("NSTEPS      1\n")
453
        # run qrnas
454
        print('QRNAS...')
455
456
        if not outfn:
457
            cmd = "QRNAS -c qrna_config.txt -i " + os.path.basename(self.fn)
458
        else:
459
            cmd = "QRNAS -c qrna_config.txt -i " + \
460
                os.path.basename(self.fn) + " -o " + curr + os.sep + outfn
461
        #o = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
462
        os.system(cmd)
463
        if False:
464
            out = o.stdout.read().strip()
465
            err = o.stderr.read().strip()
466
            if verbose:
467
                print(out)
468
                print(err)
469
                shutil.rmtree(to_go + os.sep + "QRNAS")
470
        # post cleaning
471
        if outfn:
472
            print('Cleaning...')
473
            s = RNAStructure(curr + os.sep + outfn)
474
            s.remove_hydrogen()
475
            s.std_resn()
476
            s.write(curr + os.sep + outfn)
477
        os.chdir(curr)
478
479
    def mol2toPDB(self, outfn=""):
480
        try:
481
            import pybel
482
        except ImportError:
483
            print ('pybel is needed for mol2 to pdb convertion')
484
            # sys.exit(1)
485
            sys.exit(0)
486
487
        if not outfn:
488
            outfn = self.fn.replace('.mol2', '.pdb')
489
490
        for mol in pybel.readfile("mol2", self.fn):
491
            mol.write("pdb", outfn, overwrite=True)
492
493
        print('outfn: ', outfn)
494
        self.report.append('  Converted from mol2 to PDB')
495
        return outfn
496
497
    def get_no_lines(self):
498
        return len(self.lines)
499
        
500
    def check_geometry(self, verbose=False):
501
        """Check for correct "Polymer linkage, it should be around 1.6Å with a sigma around 0.01.
502
        
503
        Carrascoza, F., Antczak, M., Miao, Z., Westhof, E. & Szachniuk, M. Evaluation of the stereochemical quality of predicted RNA 3D models in the RNA-Puzzles submissions. Rna 28, 250–262 (2022).
504
505
        Values for 1xjr.pdb::
506
507
            op.mean(): 1.599316
508
            op.std(): 0.009274074
509
510
            po.mean(): 1.5984017
511
            po.std(): 0.0069191623
512
        
513
        requires biopython
514
        """
515
        if verbose:
516
            logger.setLevel(logging.DEBUG)
517
518
        try:
519
            from Bio import PDB
520
            from Bio.PDB import PDBIO
521
            import warnings
522
            warnings.filterwarnings('ignore', '.*Invalid or missing.*',)
523
            warnings.filterwarnings('ignore', '.*with given element *',)
524
        except:
525
            sys.exit('Error: Install biopython to use this function (pip biopython)')
526
527
        pdb = self.fn
528
        struc = PDB.PDBParser().get_structure('struc', pdb)
529
        resi_prev = {}
530
        op = []
531
        po = []
532
        angles = []
533
        angles2 = []
534
535
        p_ps = []
536
        sigma = 0.009274074
537
        mean =  1.599316
538
        po3p_p  = []
539
        import numpy as np
540
541
        dist_o3pp= []
542
        for s in struc:
543
            for c in s:
544
                # for new chain reset resi_prev
545
                resi_prev = None
546
                for r in c:
547
                    if resi_prev:
548
                        p_p = resi_prev["P"] - r['P']
549
                        p_ps.append(p_p)
550
                        
551
                        v = resi_prev["O3'"] - r['P']
552
                        po3p_p = resi_prev["O3'"] - r['P']
553
                        op.append(v)
554
                        import math
555
                        dist_o3pp.append(abs(1.6 - v))
556
                        x = mean - 12 * sigma
557
                        y = mean + 12 * sigma
558
559
                        v2 = r["P"] - r["O5'"]
560
                        #ic(v2)
561
                        po.append(v2)
562
563
                        #ic(x, v, y)
564
                        if not (x <= v <= y):
565
                            print(f' ! lack of connectivity between {r}, {resi_prev}, distance between residues: {v:.2f}')
566
                            pass
567
568
                        # angle o3' p o5'
569
                        pc3p = resi_prev["C3'"].get_vector()
570
                        po3p = resi_prev["O3'"].get_vector()
571
                        p = r['P'].get_vector()
572
573
                        v1 = po3p - p
574
                        v2 = po3p - pc3p
575
                        angle = np.rad2deg(v1.angle(v2))#o5p)) 
576
                        angles.append(angle)
577
578
                        # agnle o3'-p-o5' angle
579
                        o5p = r["O5'"].get_vector()
580
                        v1 = po3p - p
581
                        v2 = o5p - p
582
                        angle2 = np.rad2deg(v2.angle(v1))
583
                        angles2.append(angle2)
584
585
                    resi_prev = r
586
        #ic(dist_o3pp, len(dist_o3pp), np.mean(dist_o3pp))
587
        # Colors
588
        RED = "\033[31m"
589
        GREEN = "\033[32m"
590
        YELLOW = "\033[33m"
591
        BLUE = "\033[34m"
592
        RESET = "\033[0m"
593
        
594
        ic(np.array(op))
595
596
        # Create a boxplot
597
        import numpy as np
598
        import pandas as pd
599
        import seaborn as sns
600
        import matplotlib.pyplot as plt
601
        df = pd.DataFrame(op, columns=["Values"])
602
        
603
        # Assuming df is your DataFrame
604
        sns.histplot(data=df, x="Values")#, bins=200)#, kde=True)  # kde=True adds a density curve
605
        plt.title("Histogram of O3'-P distances for 2o44.pdb")
606
        plt.xlabel("Values")
607
        plt.ylabel("Frequency")
608
        percentile_95 = np.percentile(df['Values'], 95)
609
        percentile_05 = np.percentile(df['Values'], 5)
610
        # Add a vertical line for the 95th percentile
611
        plt.axvline(percentile_95, color='red', linestyle='--', linewidth=1, label=f"95th Percentile: {percentile_95:.2f}")
612
        plt.axvline(percentile_05, color='blue', linestyle='--', linewidth=1, label=f"05th Percentile: {percentile_05:.2f}")
613
        plt.legend()
614
        plt.show()
615
        #ic(dist_o3pp)
616
        #aaa
617
618
        print(f'Connectivity Score; {GREEN} {sum(dist_o3pp) } {RESET}')
619
        print('Connectivity Score; ', sum(dist_o3pp) / len(dist_o3pp))
620
        p_ps = np.array(p_ps)
621
622
        plots = True
623
624
        ic(v)
0 ignored issues
show
introduced by
The variable v does not seem to be defined for all execution paths.
Loading history...
625
626
        plots = False
627
628
        if plots:
629
            import matplotlib.pyplot as plt
630
            plt.hist(op, bins='auto')
631
            plt.xlim(-1, 10)
632
            plt.title("prev-o3' p")
633
            plt.show()
634
635
        #np.set_printoptions(threshold=np.inf)
636
        # ic.disable()
637
        ic(p_ps)
638
        ic(p_ps.mean(), p_ps.std())
639
640
        if plots:
641
            import matplotlib.pyplot as plt
642
            plt.hist(p_ps, bins='auto')
643
            plt.title("p-ps")
644
            plt.xlim(0, 10)
645
            plt.show()
646
647
            #plt.savefig("histogram.png")  # Save the figure as a PNG file
648
        
649
        op = np.array(op)
650
        ic(op.mean(), op.std())
651
652
        po = np.array(po)
653
        ic(po.mean(), po.std())
654
655
        print("c3'-o3'-p")
656
        angles = np.array(angles)
657
        ic(angles)
658
        ic(angles.mean(), angles.std())
659
660
        import math
661
        angle_degrees = angles.mean()
662
        angle_radians = math.radians(angle_degrees)  # Convert to radians
663
        cos_value = math.cos(angle_radians)  # Calculate cosine
664
        ic(cos_value)
665
        
666
        coss = []
667
        for a in angles:
668
            angle_radians = math.radians(a)
669
            cos_value = math.cos(angle_radians)  # Calculate cosine
670
            coss.append(cos_value)
671
        coss = np.array(coss)
672
        ic(coss.std())
673
674
        if plots:
675
            import matplotlib.pyplot as plt
676
            plt.hist(angles, bins='auto')
677
            plt.title("Histogram of angles o3'-p-o5'")
678
            plt.xlim(0, 360)
679
            plt.show()
680
681
        angles2 = np.array(angles2)
682
        print("o3'-p-o5'")
683
        ic(angles2)
684
        ic(angles2.mean(), angles2.std())
685
686
        angle_degrees = angles2.mean()
687
        angle_radians = math.radians(angle_degrees)  # Convert to radians
688
        cos_value = math.cos(angle_radians)  # Calculate cosine
689
690
        ic(cos_value)
691
692
        coss = []
693
        for a in angles2:
694
            angle_radians = math.radians(a)
695
            cos_value = math.cos(angle_radians)  # Calculate cosine
696
            coss.append(cos_value)
697
        coss = np.array(coss)
698
        ic(coss.std())
699
700
        if plots:
701
            import matplotlib.pyplot as plt
702
            plt.hist(angles2, bins='auto')
703
            plt.title("Histogram of angles c3'-o3'-p")
704
            plt.xlim(0, 360)
705
            plt.show()
706
        #ic.enable()
707
        
708
    def get_text(self, add_end=True):
709
        """works on self.lines."""
710
        txt = ''
711
        for l in self.lines:
712
            if l.startswith('END'):
713
                continue  # skip end
714
            txt += l + '\n' # .strip()
715
        if add_end:
716
            if not l.startswith('END'):
0 ignored issues
show
introduced by
The variable l does not seem to be defined in case the for loop on line 711 is not entered. Are you sure this can never be the case?
Loading history...
717
                txt += 'END'
718
        return txt.strip()
719
720
    def get_chain(self, chain_id='A'):
721
        txt = ''
722
        for l in self.lines:
723
            if l.startswith('ATOM') or l.startswith('HETATM'):
724
                if l[21] == chain_id:
725
                    txt += l.strip() + '\n'
726
            if l.startswith('TER'):
727
                if len(l) < 21: # this is only TER
728
                    #                     ter = 'TER   ' +  str(self.get_atom_num(txt) + 1).rjust(5) + '\n'
729
                    raise Exception('TER line has no chain id')
730
                else:     
731
                    if l[21] == chain_id:
732
                          txt += l.strip() + '\n'
733
        ## ll = txt.strip().split('\n')[-1] # last line = ll
734
        ## print(ll)
735
        ## #if ll.startswith('TER'):  # if the last line does not start with ter
736
        ## ter = self.set_res_code(ter, self.get_res_code(ll)) # + self.get_chain_id(l).rjust(11)
737
        ## print(ter)
738
        return txt.strip()
739
740
    def rename_chain(self, chain_id_old, chain_id_new, debug=False):
741
        """Rename chains
742
743
        Args:
744
            chain_id_old (str): e.g., A
745
            chain_id_new (str): e.g., B
746
            debug (bool): show some diagnostics
747
748
        Returns:
749
            pdb content (txt)
750
            self.lines is updated with new lines
751
        """
752
        txt = ''
753
        lines = []
754
        for l in self.lines:
755
            if l.startswith('ATOM') or l.startswith('HETATM') or l.startswith('TER'):
756
                # TER
757
                try:
758
                    l[21]
759
                except IndexError:
760
                    continue
761
                if l[21] == chain_id_old:
762
                    l = l[:21] + chain_id_new + l[22:]
763
            if debug: print(l)
764
            txt += l.strip() + '\n'  # ok, actually keep all lines as it was
765
            lines.append(l)
766
        self.lines = lines
767
        return txt
768
769
770
    def get_resn_uniq(self):
771
        res = set()
772
        for l in self.lines:
773
            r = l[17:20].strip().upper()
774
            res.add(r)
775
        return res
776
777
    def check_res_if_std_na(self):
778
        wrong = []
779
780
        for r in self.res:
781
            if r not in RES:
782
                wrong.append(r)
783
        return wrong
784
785
    def get_seq(self, compact=False, chainfirst=False, fasta=False, addfn='', color=False):
786
        """Get seq (v2) gets segments of chains with correct numbering
787
788
        Run::
789
790
            python rna_pdb_seq.py input/1ykq_clx.pdb
791
            > 1ykq_clx A:101-111
792
            GGAGCUCGCCC
793
            > 1ykq_clx B:201-238
794
            GGGCGAGGCCGUGCCAGCUCUUCGGAGCAAUACUCGGC
795
796
            > 6_solution_0 A:1-19 26-113 117-172
797
            GGCGGCAGGUGCUCCCGACGUCGGGAGUUAAAAGGGAAG
798
799
        Chains is ``{'A': {'header': 'A:1-19 26-113 117-172', 'resi': [1, 2, 3, ..., \
800
        19, 26, 27, ..., 172], 'seq': ['G', 'G', 'C', 'G', ... C', 'G', 'U', 'C']}}``
801
802
        Chains are in other as the appear in the file.
803
804
        .. warning :: take only ATOM and HETATM lines.
805
        """
806
        if addfn:
807
            addfn += ' ' # add space to file name
808
        seq = self.lines[0][19]
809
        chains = OrderedDict()
810
        resi_prev = None
811
        chain_prev = None
812
813
        for l in self.lines:
814
            if l.startswith('ATOM') or l.startswith('HETATM'):
815
                resi = int(l[22:26])
816
                if resi_prev != resi:
817
                    resname = l[17:20].strip()
818
                    chain_curr = l[21]
819
                    if resname in AMINOACID_CODES:
820
                        resname = aa3to1(resname)
821
                    if len(resname) == 'GTP':  # DG -> g GTP
822
                        resname = 'g'
823
                    if len(resname) > 1:  # DG -> g GTP
824
                        resname = resname[-1].lower()
825
                    try:
826
                        chains[chain_curr]['resi'].append(resi)
827
                        chains[chain_curr]['seq'].append(resname)
828
                    except KeyError:
829
                        chains[chain_curr] = {}
830
                        chains[chain_curr]['resi'] = []
831
                        chains[chain_curr]['resi'].append(resi)
832
                        chains[chain_curr]['seq'] = []
833
                        chains[chain_curr]['seq'].append(resname)
834
835
                    resi_prev = resi
836
                    chain_prev = chain_curr
837
                    
838
        def color_seq(seq, color):
839
            if not color:
840
                return seq
841
            else:
842
                from termcolor import colored
843
                seqc = ''
844
                for s in seq:
845
                    if s in ['G']:
846
                        seqc += colored(s, 'green')
847
                    if s in ['G']:
848
                        seqc += colored(s, 'red')
849
                    if s in ['T', 'U']:
850
                        seqc += colored(s, 'blue')                        
851
                    if s in ['C']:
852
                        seqc += colored(s, 'yellow')                        
853
                return seqc
854
            
855
856
        for c in list(chains.keys()):
857
            header = c + ':' + str(chains[c]['resi'][0]) + '-'  # add A:1-
858
            for i in range(1, len(chains[c]['resi'])):  # start from second element
859
                if chains[c]['resi'][i] - chains[c]['resi'][i - 1] > 1:
860
                    header += '' + str(chains[c]['resi'][i - 1]) + ' '
861
                    header += '' + str(chains[c]['resi'][i]) + '-'
862
            header += '' + str(chains[c]['resi'][-1])
863
            chains[c]['header'] = header  # add -163 (last element)
864
865
        if compact:
866
            txt = ''
867
            for c in list(chains.keys()):
868
                if chainfirst:
869
                    txt += '' + chains[c]['header'].ljust(15) + color_seq(''.join(chains[c]['seq']), color) + ' '
870
                elif fasta:
871
                    txt += color_seq(''.join(chains[c]['seq']), color) + ' '
872
                else:
873
                    txt += color_seq(''.join(chains[c]['seq']), color) + ' # ' + chains[c]['header'] + ' '
874
            return txt.strip()
875
        else:
876
            txt = ''
877
            for c in list(chains.keys()):
878
                txt += '>' + addfn + chains[c]['header'] + '\n'
879
                txt += color_seq(''.join(chains[c]['seq']), color) + '\n' # color ;-)
880
            return txt.strip()
881
882
    def __get_seq(self):
883
        """get_seq DEPRECATED
884
885
        You get `chains` such as:
886
        OrderedDict([('A', {'header': 'A:1-47', 'seq': 'CGUGGUUAGGGCCACGUUAAAUAGUUGCUUAAGCCCUAAGCGUUGAU'}), ('B', {'header': 'B:48-58', 'seq': 'AUCAGGUGCAA'})])
887
888
        .. warning:: take only ATOM and HETATM lines.
889
        """
890
        seq = ''
891
        curri = int(self.lines[0][22:26])
892
        seq = self.lines[0][19]
893
        chains = OrderedDict()
894
        curri = -100000000000000000000000000000000  # ugly
895
        chain_prev = None
896
897
        for l in self.lines:
898
            if l.startswith('ATOM') or l.startswith('HETATM'):
899
                resi = int(l[22:26])
900
                if curri != resi:
901
                    print(l)
902
                    resname = l[17:20].strip()
903
                    if len(resname) == 'GTP':  # DG -> g GTP
904
                        resname = 'g'
905
                    if len(resname) > 1:  # DG -> g GTP
906
                        resname = resname[-1].lower()
907
908
                    seq += resname
909
                    chain_curr = l[21]
910
911
                    # if distances between curr res and previevs is bigger than 1, then show it as a fragment
912
                    if resi - curri > 1 and resi - curri < 100000000000000000000000000000000:  # ugly hack
913
                        print(resi - curri)
914
                        chains[chain_prev]['header'] += '-' + str(resi_prev)
0 ignored issues
show
introduced by
The variable resi_prev does not seem to be defined for all execution paths.
Loading history...
915
                    if chain_prev != chain_curr and chain_prev:
916
                        chains[chain_prev]['header'] += '-' + str(resi_prev)
917
                    if chaoin_curr in chains:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable chaoin_curr does not seem to be defined.
Loading history...
918
                        chains[chain_curr]['seq'] += resname
919
                    else:
920
                        chains[chain_curr] = dict()
921
                        chains[chain_curr]['header'] = chain_curr + ':' + str(resi)  # resi_prev)
922
                        chains[chain_curr]['seq'] = resname
923
                    resi_prev = resi
924
                    chain_prev = chain_curr
925
                curri = resi
926
        chains[chain_prev]['header'] += '-' + str(resi_prev)
927
        seq = ''
928
        for c in list(chains.keys()):
929
            seq += '> ' + os.path.basename(self.fn) + ' ' + chains[c]['header'] + '\n'
930
            seq += chains[c]['seq'] + '\n'
931
        return seq.strip()
932
933
    def get_info_chains(self):
934
        """return A:3-21 B:22-32
935
        """
936
        seq = ''
937
        curri = int(self.lines[0][22:26])
938
        seq = self.lines[0][19]
939
        chains = OrderedDict()
940
        curri = -100000000000000  # ugly
941
        chain_prev = None
942
        for l in self.lines:
943
            if l.startswith('ATOM') or l.startswith('HETATM'):
944
                resi = int(l[22:26])
945
                if curri != resi:
946
                    resname = l[17:20].strip()
947
                    if len(resname) == 'GTP':  # DG -> g GTP
948
                        resname = 'g'
949
                    if len(resname) > 1:  # DG -> g GTP
950
                        resname = resname[-1].lower()
951
                    seq += resname
952
                    chain_curr = l[21]
953
                    if chain_prev != chain_curr and chain_prev:
954
                        chains[chain_prev]['header'] += '-' + str(resi_prev)
0 ignored issues
show
introduced by
The variable resi_prev does not seem to be defined for all execution paths.
Loading history...
955
                    if chain_curr in chains:
956
                        chains[chain_curr]['seq'] += resname
957
                    else:
958
                        chains[chain_curr] = dict()
959
                        chains[chain_curr]['header'] = chain_curr + ':' + str(resi)  # resi_prev)
960
                        chains[chain_curr]['seq'] = resname
961
                    resi_prev = resi
962
                    chain_prev = chain_curr
963
                curri = resi
964
        chains[chain_prev]['header'] += '-' + str(resi_prev)
965
        seq = ''
966
        for c in list(chains.keys()):
967
            seq += chains[c]['header'] + ' '
968
        return seq.strip()
969
970
    def detect_file_format(self):
971
        pass
972
973
    def detect_molecule_type(self):
974
        aa = []
975
        na = []
976
        for r in self.res:
977
            if r in AMINOACID_CODES:
978
                aa.append(r)
979
            if r in RESS:
980
                na.append(r)
981
982
        aa = float(len(aa)) / len(self.res)
983
        na = float(len(na)) / len(self.res)
984
985
        if aa == 0 and na == 0:
986
            return 'error'
987
        if aa > na:
988
            return '>protein< vs na', aa, na
989
        else:
990
            return 'protein vs >na<', aa, na
991
992
    def get_head(self):
993
        return '\n'.join(self.lines[:5])
994
995
    def get_tail(self):
996
        return '\n'.join(self.lines[-5:])
997
998
    def get_preview(self):
999
        t = '\n'.join(self.lines[:5])
1000
        t += '\n-------------------------------------------------------------------\n'
1001
        t += '\n'.join(self.lines[-5:])
1002
        return t
1003
1004
    def remove_hydrogen(self):
1005
        lines = []
1006
        for l in self.lines:
1007
            if l[77:79].strip() == 'H':
1008
                continue
1009
            if l[12:16].strip() in HYDROGEN_NAMES:
1010
                # if l[12:16].strip().startswith('H'):
1011
                continue
1012
            else:
1013
                # print l[12:16]
1014
                lines.append(l)
1015
        self.lines = lines
1016
1017
    def remove_water(self):
1018
        """Remove HOH and TIP3"""
1019
        lines = []
1020
        for l in self.lines:
1021
            if l[17:21].strip() in ['HOH', 'TIP3', 'WAT']:
1022
                continue
1023
            else:
1024
                lines.append(l)
1025
        self.lines = lines
1026
1027
    def remove_ion(self):
1028
        """
1029
    TER    1025        U A  47
1030
    HETATM 1026 MG    MG A 101      42.664  34.395  50.249  1.00 70.99          MG
1031
    HETATM 1027 MG    MG A 201      47.865  33.919  48.090  1.00 67.09          MG
1032
        :rtype: object
1033
        """
1034
        lines = []
1035
        for l in self.lines:
1036
            element = l[76:78].strip().upper()
1037
            element2 = l[17:20].strip().upper()
1038
            if element in IONS:
1039
                continue
1040
            if element2 in IONS:
1041
                continue
1042
            else:
1043
                lines.append(l)
1044
        self.lines = lines
1045
1046
    def fixU__to__U(self):
1047
        lines = []
1048
        for l in self.lines:
1049
            if l.startswith('ATOM') or l.startswith('HETATM'):
1050
                rn = l[17:20]
1051
                rn = rn.replace('G  ', '  G')
1052
                rn = rn.replace('U  ', '  U')
1053
                rn = rn.replace('C  ', '  C')
1054
                rn = rn.replace('A  ', '  A')
1055
                l = l[:16] + ' ' + rn + ' ' + l[21:]
1056
            # print l.strip()
1057
            # print l2
1058
            #l = l.replace(' U   ', '   U ')
1059
            #l = l.replace(' G   ', '   G ')
1060
            #l = l.replace(' A   ', '   A ')
1061
            #l = l.replace(' C   ', '   C ')
1062
            lines.append(l)
1063
        print ('fixU__to__U OK')
1064
        self.report.append('  Fix: U__ -> __U')
1065
        self.lines = lines
1066
1067
    def resn_as_dna(self):
1068
        lines = []
1069
        for l in self.lines:
1070
            if l.startswith('ATOM') or l.startswith('HETATM'):
1071
                # print l
1072
                nl = l.replace('DA5', ' DA')  # RA should be the last!!!!
1073
                nl = nl.replace('DA3', ' DA')
1074
                nl = nl.replace(' DA', ' DA')
1075
                nl = nl.replace(' rA', ' DA')
1076
1077
                nl = nl.replace('DC5', ' DC')
1078
                nl = nl.replace('DC3', ' DC')
1079
                nl = nl.replace(' DC', ' DC')
1080
                nl = nl.replace(' rC', ' DC')
1081
1082
                nl = nl.replace('DG5', ' DG')
1083
                nl = nl.replace('DG3', ' DG')
1084
                nl = nl.replace(' DG', ' DG')
1085
                nl = nl.replace(' rG', ' DG')
1086
1087
                nl = nl.replace('DU5', ' DU')
1088
                nl = nl.replace('DU3', ' DU')
1089
                nl = nl.replace(' DU', ' DU')
1090
                nl = nl.replace(' rU', ' DU')
1091
1092
                nl = nl.replace('DT5', ' DT')
1093
                nl = nl.replace('DT3', ' DT')
1094
                nl = nl.replace(' DT', ' DT')
1095
                nl = nl.replace(' rT', ' DT')
1096
1097
                nl = nl.replace('C5M', 'C7 ')
1098
1099
                if l[17:20].strip() == 'G':
1100
                    nl = nl[:17] + ' DG' + nl[20:]
1101
1102
                if l[17:20].strip() == 'C':
1103
                    nl = nl[:17] + ' DC' + nl[20:]
1104
1105
                if l[17:20].strip() == 'T':
1106
                    nl = nl[:17] + ' DT' + nl[20:]
1107
1108
                if l[17:20].strip() == 'U':
1109
                    nl = nl[:17] + ' DU' + nl[20:]
1110
1111
                if l[17:20].strip() == 'A':
1112
                    nl = nl[:17] + ' DA' + nl[20:]
1113
1114
                lines.append(nl)
1115
            if l.startswith("END") or l.startswith("TER"):
1116
                lines.append(l)
1117
1118
        print ('resn_as_dna')
1119
        self.report.append('  resn_as_dna')
1120
        self.lines = lines
1121
1122
    def fix_O_in_UC(self):
1123
        """.. warning: remove RU names before using this function"""
1124
        lines = []
1125
        for l in self.lines:
1126
            # if l[12:16].strip() in
1127
            # if l[12:16].strip().startswith('H'):
1128
            nl = l.replace('O     U',
1129
                           'O2    U')
1130
            nl = nl.replace('O     C',
1131
                            'O2    C')
1132
            lines.append(nl)
1133
        self.lines = lines
1134
1135
    def fix_op_atoms(self):
1136
        """Replace OXP' to OPX1, e.g ('O1P' -> 'OP1')"""
1137
        lines = []
1138
        for l in self.lines:
1139
            nl = l.replace('*', '\'')
1140
            nl = nl.replace('O1P', 'OP1')
1141
            nl = nl.replace('O2P', 'OP2')
1142
            nl = nl.replace('O3P', 'OP3')
1143
            lines.append(nl)
1144
        self.lines = lines
1145
1146
    def get_report(self):
1147
        """
1148
        Returns:
1149
            string: report, messages collected on the way of parsing this file
1150
        """
1151
        return '\n'.join(self.report)
1152
1153
    def is_rna(self):
1154
        wrong = []
1155
        for r in self.res:
1156
            if r.upper().strip() in ['RC', 'RU', 'RA', 'RG', 'RT']:
1157
                if r not in wrong_res:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable wrong_res does not seem to be defined.
Loading history...
1158
                    wrong_res.append(r)
1159
        return wrong_res
1160
1161
    def check_res_if_std_dna(self):
1162
        wrong_res = []
1163
        for r in self.res:
1164
            if r.upper().strip() in ['A', 'T', 'C', 'G']:
1165
                if r not in wrong_res:
1166
                    wrong_res.append(r)
1167
        return wrong_res
1168
1169
    def check_res_if_supid_rna(self):
1170
        wrong_res = []
1171
        for r in self.res:
1172
            if r.upper().strip() in ['RC', 'RU', 'RA', 'RG', 'RT']:
1173
                if r not in wrong_res:
1174
                    wrong_res.append(r)
1175
        return wrong_res
1176
1177
1178
    def is_rna(self):
1179
        for r in self.res:
1180
            if r.upper().strip() in ['RC', 'RU', 'RA', 'RG', 'RT']:
1181
                if r not in wrong_res:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable wrong_res does not seem to be defined.
Loading history...
1182
                    wrong_res.append(r)
1183
        return wrong_res
1184
1185
    def renum_atoms(self):
1186
        """Renum atoms, from 1 to X for line; ATOM/HETATM"""
1187
        lines = []
1188
        c = 1
1189
        for l in self.lines:
1190
            l = l[:6] + str(c).rjust(5) + l[11:]
1191
            c += 1
1192
            lines.append(l)
1193
        self.lines = lines
1194
1195
    def std_resn(self):
1196
        """'Fix' residue names which means to change them to standard, e.g. RA5 -> A
1197
1198
        Works on self.lines, and returns the result to self.lines.
1199
1200
        Will change things like::
1201
1202
            # URI -> U, URA -> U
1203
            1xjr_clx_charmm.pdb:ATOM    101  P   URA A   5      58.180  39.153  30.336  1.00 70.94
1204
            rp13_Dokholyan_1_URI_CYT_ADE_GUA_hydrogens.pdb:ATOM  82  P   URI A   4     501.633 506.561 506.256  1.00  0.00         P
1205
        """
1206
        lines = []
1207
        for l in self.lines:
1208
            nl = l.replace('RA5', '  A')  # RA should be the last!!!!
1209
            nl = nl.replace('RA3', '  A')
1210
            nl = nl.replace('ADE', '  A')
1211
            nl = nl.replace(' RA', '  A')
1212
            nl = nl.replace(' rA', '  A')
1213
1214
            nl = nl.replace('RC5', '  C')
1215
            nl = nl.replace('RC3', '  C')
1216
            nl = nl.replace('CYT', '  C')
1217
            nl = nl.replace(' RC', '  C')
1218
            nl = nl.replace(' rC', '  C')
1219
1220
            nl = nl.replace('RG5', '  G')
1221
            nl = nl.replace('RG3', '  G')
1222
            nl = nl.replace('GUA', '  G')
1223
            nl = nl.replace(' RG', '  G')
1224
            nl = nl.replace(' rG', '  G')
1225
1226
            nl = nl.replace('RU5', '  U')
1227
            nl = nl.replace('RU3', '  U')
1228
            nl = nl.replace('URA', '  U')
1229
            nl = nl.replace('URI', '  U')
1230
            nl = nl.replace('URY', '  U')
1231
            nl = nl.replace(' RU', '  U')
1232
            nl = nl.replace(' rU', '  U')
1233
1234
            nl = nl.replace('RT5', '  T')
1235
            nl = nl.replace('RT3', '  T')
1236
            nl = nl.replace('THY', '  T')
1237
            nl = nl.replace(' RT', '  T')
1238
            nl = nl.replace(' rT', '  T')
1239
1240
            lines.append(nl)
1241
1242
        self.lines = lines
1243
1244
    def check_res_if_std_prot(self):
1245
        wrong = []
1246
        for r in self.res:
1247
            if r not in AMINOACID_CODES:
1248
                wrong.append(r)
1249
        return wrong
1250
1251
    def write(self, outfn='', verbose=False):
1252
        """Write ```self.lines``` to a file (and add END file)
1253
1254
        Args:
1255
            outfn (str): file to save, if outfn is '', then simply use self.fn
1256
            verbose (Boolen): be verbose or not
1257
1258
        Returns:
1259
            None
1260
1261
        """
1262
        if outfn == '':
1263
            outfn = self.fn
1264
        f = open(outfn, 'w')
1265
        # test if there is anything to write, if not, it's likely that the
1266
        # file is not a PDB file, e.g. .outCR
1267
        if not self.lines:
1268
            raise Exception('Nothing to write. Is the input a PDB file? ', self.fn)
1269
        for l in self.lines:
1270
            f.write(l + '\n')
1271
        if not l.startswith('END'):
0 ignored issues
show
introduced by
The variable l does not seem to be defined in case the for loop on line 1269 is not entered. Are you sure this can never be the case?
Loading history...
1272
            f.write('END')
1273
        f.close()
1274
        if verbose:
1275
            print('Write %s' % outfn)
1276
1277
    def get_atom_num(self, line):
1278
        """Extract atom number from a line of PDB file
1279
        Arguments:
1280
          * line = ATOM line from a PDB file
1281
        Output:
1282
          * atom number (int)
1283
        """
1284
        return int(''.join([x for x in line[6:11] if x.isdigit()]))
1285
1286
    def get_res_num(self, line):
1287
        """Extract residue number from a line of PDB file
1288
        Arguments:
1289
          * line = ATOM line from a PDB file
1290
        Output:
1291
          * residue number as an integer
1292
        """
1293
        return int(''.join([x for x in line[22:27] if x.isdigit()]))
1294
1295
    def get_res_code(self, line):
1296
        """Get residue code from a line of a PDB file
1297
        """
1298
        #if not line.startswith('ATOM'):
1299
        #    return None
1300
        return line[17:20]
1301
1302
    def shift_atom_names(self):
1303
        nlines = []
1304
        for l in self.lines:
1305
            if l.startswith('ATOM'):
1306
                atom_name = self.get_atom_code(l)
1307
                l = self.set_atom_code(l, atom_name)
1308
            nlines.append(l)
1309
        self.lines = nlines
1310
1311
    def prune_elements(self):
1312
        nlines = []
1313
        for l in self.lines:
1314
            if l.startswith('ATOM'):
1315
                l = l[:76] + ' ' + l[78:]
1316
            nlines.append(l)
1317
        self.lines = nlines
1318
1319
    def get_atom_code(self, line):
1320
        """Get atom code from a line of a PDB file
1321
        """
1322
        if not line.startswith('ATOM'):
1323
            return None
1324
        return line[12:16].replace(' ', '').strip()
1325
1326
    def get_atom_coords(self, line):
1327
        """Get atom coordinates from a line of a PDB file
1328
        """
1329
        if not line.startswith('ATOM'):
1330
            return None
1331
        return tuple(map(float, line[31:54].split()))
1332
1333
    def set_atom_coords(self, line, x, y, z):
1334
        """Get atom coordinates from a line of a PDB file
1335
        """
1336
        line = line[:30] + (f"{x:>8.3f}") + line[38:]
1337
        line = line[:38] + (f"{y:>8.3f}") + line[46:]
1338
        line = line[:46] + (f"{z:>8.3f}") + line[54:]
1339
        return  line
1340
1341
    def set_line_bfactor(self, line, bfactor):
1342
        if not line.startswith('ATOM'):
1343
            return None
1344
        return line[:60] + (" %5.2f" % bfactor) + line[66:]
1345
1346
    def set_atom_occupancy(self, line, occupancy):
1347
        """set occupancy for line"""
1348
        return line[:54] + (" %5.2f" % occupancy) + line[60:]
1349
1350
    def set_atom_code(self, line, code):
1351
        """Add atom name/code:
1352
        
1353
           ATOM      1  OP2   C A   1      29.615  36.892  42.657  1.00  1.00          O
1354
                        ^^^                                                            ^ and element
1355
        """
1356
        return line[:12] + ' ' + code + ' ' * (3 - len(code)) + line[16:77] + code[0] + line[78:]
1357
1358
    def set_res_code(self, line, code):
1359
        """
1360
        Args:
1361
            lines
1362
            code
1363
        path (str): The path of the file to wrap
1364
        field_storage (FileStorage): The :class:Y instance to wrap
1365
            temporary (bool): Whether or not to delete the file when the File
1366
            instance is destructed
1367
1368
        Returns:
1369
            BufferedFileStorage: A buffered writable file descriptor
1370
        """
1371
        return line[:17] + code.rjust(3) + line[20:]
1372
1373
    def get_chain_id(self, line):
1374
        return line[21:22]
1375
1376
    def get_all_chain_ids(self):
1377
        """
1378
        Returns:
1379
           set: chain ids, e.g. set(['A', 'B'])
1380
        """
1381
        chain_ids = []
1382
        for l in self.lines:
1383
            chain_id = self.get_chain_id(l)
1384
            if chain_id and chain_id not in chain_ids:
1385
                chain_ids.append(chain_id)
1386
        return chain_ids
1387
1388
    def get_atom_index(self, line):
1389
        try:
1390
            return int(line[6:11])
1391
        except:
1392
            return None
1393
1394
    def set_atom_index(self, line, index):
1395
        return line[:6] + str(index).rjust(5) + line[11:]
1396
1397
    def reindex_atom_index(self):
1398
        self.nlines = []
1399
        for i, l in enumerate(self.lines):
1400
            nl = self.set_atom_index(l, i + 1)  # start from 1
1401
            self.nlines.append(nl)
1402
        self.lines = self.nlines
1403
        return self.nlines
1404
1405
    def get_res_index(self, line):
1406
        return int(line[22:26])
1407
1408
    def set_res_index(self, line, index):
1409
        return line[:23] + str(index).rjust(3) + line[26:]
1410
1411
    def set_chain_id(self, line, chain_id):
1412
        return line[:21] + chain_id + line[22:]
1413
1414
    def get_rnapuzzle_ready(self, renumber_residues=True, fix_missing_atoms=True,
1415
                            rename_chains=True,
1416
                            ignore_op3 = False,
1417
                            report_missing_atoms=True, keep_hetatm=False, backbone_only=False,
1418
                            no_backbone = False, bases_only = False, save_single_res = False,
1419
                            ref_frame_only = False,
1420
                            p_only = False,
1421
                            check_geometry = False,
1422
                            conect = False,
1423
                            conect_no_linkage = False,
1424
                            verbose=False):  # :, ready_for="RNAPuzzle"):
1425
        """Get rnapuzzle (SimRNA) ready structure.
1426
1427
        Clean up a structure, get current order of atoms.
1428
1429
        :param renumber_residues: boolean, from 1 to ..., second chain starts from 1 etc.
1430
        :param fix_missing_atoms: boolean, superimpose motifs from the minilibrary and copy-paste missing atoms, this is super crude, so should be used with caution.
1431
1432
        Submission format @http://ahsoka.u-strasbg.fr/rnapuzzles/
1433
1434
        Run :func:`rna_tools.rna_tools.lib.RNAStructure.std_resn` before this function to fix names.
1435
        
1436
        .. image:: ../pngs/rebuild_op1op2_backbone.png
1437
        Figure: (Starting from left) input structure, structure with rebuilded atoms, and reference. The B fragment is observed in the reference used here as a “benchmark”, fragment A is reconstructed atoms (not observed in the reference"). 201122
1438
1439
        - 170305 Merged with get_simrna_ready and fixing OP3 terminal added
1440
        - 170308 Fix missing atoms for bases, and O2'
1441
1442
        .. image:: ../pngs/fix_missing_o_before_after.png
1443
        Fig. Add missing O2' atom (before and after).
1444
1445
        .. image:: ../pngs/fix_missing_superposition.png
1446
        Fig. The residue to fix is in cyan. The G base from the library in red. Atoms O4', C2', C1' are shared between the sugar (in cyan) and the G base from the library (in red). These atoms are used to superimpose the G base on the sugar, and then all atoms from the base are copied to the residues.
1447
1448
        .. image:: ../pngs/fix_missing_bases.png
1449
        **Fig.** Rebuild ACGU base-less. It's not perfect but good enough for some applications.
1450
1451
        .. warning:: It was only tested with the whole base missing!
1452
1453
        .. warning:: requires: Biopython
1454
1455
        Selection of atoms:
1456
        
1457
        - posphate group (3x, OP1 ,P, OP2),
1458
        - connector (2x O5', C5'), /5x
1459
        - sugar (5x, C4', O4', C3', O3', C1', C2'), /10
1460
        - extra oxygens from sugar (2x, O2' O3'), for now it's /12!
1461
        - A (10x), G (11x), C (8x), U(8x), max 12+11=23
1462
1463
        And 27 unique atoms: {'N9', 'O2', 'OP2', "O2'", "O4'", 'C8', "O3'", "C1'", 'C2', 'C6', "C5'", 'N6', 'C5', "C4'", 'C4', "O5'", "C3'", 'O6', 'N2', 'N7', 'OP1', 'N1', 'N4', 'P', "C2'", 'N3', 'O4'}.
1464
        """
1465
1466
        if verbose:
1467
            logger.setLevel(logging.DEBUG)
1468
1469
        try:
1470
            from Bio import PDB
1471
            from Bio.PDB import PDBIO
1472
            import warnings
1473
            warnings.filterwarnings('ignore', '.*Invalid or missing.*',)
1474
            warnings.filterwarnings('ignore', '.*with given element *',)
1475
        except:
1476
            sys.exit('Error: Install biopython to use this function (pip biopython)')
1477
1478
        import copy
1479
        # for debugging
1480
        #renumber_residues = True
1481
        # if ready_for == "RNAPuzzle":
1482
1483
        if p_only:
1484
            G_ATOMS = "P".split()
1485
            A_ATOMS = "P".split()
1486
            U_ATOMS = "P".split()
1487
            C_ATOMS = "P".split()
1488
        elif backbone_only:
1489
            G_ATOMS = "P OP1 OP2 O5' C5'".split()
1490
            A_ATOMS = "P OP1 OP2 O5' C5'".split()
1491
            U_ATOMS = "P OP1 OP2 O5' C5'".split()
1492
            C_ATOMS = "P OP1 OP2 O5' C5'".split()
1493
            #G_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split()
1494
            #A_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split()
1495
            #U_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split()
1496
            #C_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split()
1497
1498
        elif ref_frame_only:
1499
            G_ATOMS = "P OP1 OP2".split()
1500
            A_ATOMS = "P OP1 OP2".split()
1501
            U_ATOMS = "P OP1 OP2".split()
1502
            C_ATOMS = "P OP1 OP2".split()
1503
            
1504
        elif no_backbone:
1505
            G_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split() 
1506
            A_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split()     
1507
            U_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 O4 C5 C6".split()
1508
            C_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 N4 C5 C6".split()
1509
1510
        elif bases_only:
1511
            G_ATOMS = "N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split()
1512
            A_ATOMS = "N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split()
1513
            U_ATOMS = "N1 C2 O2 N3 C4 O4 C5 C6".split()
1514
            C_ATOMS = "N1 C2 O2 N3 C4 N4 C5 C6".split()
1515
        else:
1516
            G_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split() # 23 
1517
            A_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split()    # 22
1518
            U_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 O4 C5 C6".split()          # 20
1519
            C_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 N4 C5 C6".split()          # 20
1520
1521
        # hmm.. is it the same as RNApuzzle???
1522
        # if ready_for == "SimRNA":
1523
        #    G_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split()
1524
        #    A_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split()
1525
        #    U_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 O4 C5 C6".split()
1526
        #    C_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 N4 C5 C6".split()
1527
1528
        tf = tempfile.NamedTemporaryFile(delete=False)
1529
        ftmp = tf.name
1530
        self.write(ftmp, verbose=False)
1531
1532
        parser = PDB.PDBParser()
1533
        #try:
1534
        struct = parser.get_structure('', ftmp)
1535
        #except:
1536
        #    print('Error in ' + self.fn)
1537
        #    sys.exit(1)
1538
        model = struct[0]
1539
1540
        s2 = PDB.Structure.Structure(struct.id)
1541
        m2 = PDB.Model.Model(model.id)
1542
1543
        chains2 = []
1544
1545
        missing = []
1546
        fixed = []
1547
        protein_chains_remmoved = []
1548
1549
        new_chains = list(string.ascii_uppercase)
1550
        remarks = []
1551
        if ignore_op3:
1552
             remarks.append("REMARK 250  don't add OP3 at the start of chain")
1553
        # get path to mini db with models to rebuilt structures
1554
        currfn = __file__
1555
        if currfn == '':
1556
            path = '.'
1557
        else:
1558
            path = os.path.dirname(currfn)
1559
        if os.path.islink(currfn):  # path + os.sep + os.path.basename(__file__)):
1560
            path = os.path.dirname(os.readlink(
1561
                path + os.sep + os.path.basename(currfn)))
1562
1563
        # for each chain #
1564
        for chain in model.get_list():
1565
            logger.debug('chain: %s' % chain)
1566
1567
            # is it RNA? ############################
1568
            protein_like = 0
1569
            for c, r in enumerate(chain, 1):
1570
                if r.resname in AMINOACID_CODES:
1571
                    protein_like += 1
1572
            if (protein_like / float(c + 1)) > .8:  # 90%
0 ignored issues
show
introduced by
The variable c does not seem to be defined in case the for loop on line 1564 is not entered. Are you sure this can never be the case?
Loading history...
1573
                protein_chains_remmoved.append(chain.get_id())
1574
            # ######################################
1575
1576
            res = []
1577
            for r in chain:
1578
                res.append(r)
1579
1580
            res = copy.copy(res)
1581
1582
            # start chains from A..BCD. etc
1583
1584
            if rename_chains:
1585
                try:
1586
                    chain.id = new_chains.pop(0)
1587
                except ValueError:
1588
                    # ValueError: Cannot change id from `A` to `A`.
1589
                    # The id `A` is already used for a sibling of this entity.
1590
                    # so keep it as it was
1591
                    pass
1592
1593
            c2 = PDB.Chain.Chain(chain.id)
1594
1595
            c = 1  # new chain, goes from 1 !!! if renumber True
1596
            prev_r = '' # init prev_r 
1597
1598
            conects = []
1599
            atom_index = 0
1600
            for r in res:
1601
                #
1602
                #  deal with heteratm
1603
                #
1604
                if r.id[0].startswith('H_') or r.id[0].startswith('W'):
1605
                    if keep_hetatm:
1606
                        c2.add(r)
1607
                    else:
1608
                        # str(r) % r.id[0].replace('H_', '').strip())
1609
                        remarks.append('REMARK 250 Hetatom to be removed from file: ' + str(r))
1610
                    continue
1611
                # hack for amber/qrna
1612
                r.resname = r.resname.strip()
1613
                if r.resname == 'RC3':
1614
                    r.resname = 'C'
1615
                if r.resname == 'RU3':
1616
                    r.resname = 'U'
1617
                if r.resname == 'RG3':
1618
                    r.resname = 'G'
1619
                if r.resname == 'RA3':
1620
                    r.resname = 'A'
1621
1622
                if r.resname == 'C3':
1623
                    r.resname = 'C'
1624
                if r.resname == 'U3':
1625
                    r.resname = 'U'
1626
                if r.resname == 'G3':
1627
                    r.resname = 'G'
1628
                if r.resname == 'A3':
1629
                    r.resname = 'A'
1630
1631
                if r.resname == 'RC5':
1632
                    r.resname = 'C'
1633
                if r.resname == 'RU5':
1634
                    r.resname = 'U'
1635
                if r.resname == 'RG5':
1636
                    r.resname = 'G'
1637
                if r.resname == 'RA5':
1638
                    r.resname = 'A'
1639
1640
                if r.resname == 'C5':
1641
                    r.resname = 'C'
1642
                if r.resname == 'U5':
1643
                    r.resname = 'U'
1644
                if r.resname == 'G5':
1645
                    r.resname = 'G'
1646
                if r.resname == 'A5':
1647
                    r.resname = 'A'
1648
1649
                if r.resname.strip() == 'RC':
1650
                    r.resname = 'C'
1651
                if r.resname.strip() == 'RU':
1652
                    r.resname = 'U'
1653
                if r.resname.strip() == 'RG':
1654
                    r.resname = 'G'
1655
                if r.resname.strip() == 'RA':
1656
                    r.resname = 'A'
1657
1658
                # unmodified rna 2MG -> G and take only G atoms
1659
                if (r.resname.strip() not in ['C', 'U', 'G', 'A']) and \
1660
                        (r.resname.strip()[-1] in ['C', 'U', 'G', 'A']):
1661
                    r.resname = r.resname.strip()[-1].strip()
1662
1663
                r2 = PDB.Residue.Residue(r.id, r.resname.strip(), r.segid)
1664
                if renumber_residues:
1665
                    r2.id = (r2.id[0], c, r2.id[2])  # renumber residues
1666
                #
1667
                # experimental: fixing missing OP3.
1668
                # Only for the first residues.
1669
                #
1670
                if c == 1:
1671
                    # if p_missing
1672
                    p_missing = True
1673
                    # if p_missing:
1674
                    #    try:
1675
                    #        x = r["O5'"]
1676
                    #        x.id =       ' P'
1677
                    #        x.name =     ' P'
1678
                    #        x.fullname = ' P'
1679
                    #        print "REMARK 000 FIX O5' -> P fix in chain ", chain.id
1680
                    #    except:
1681
                    #        pass
1682
                    for a in r:
1683
                        if a.id == 'P':
1684
                            p_missing = False
1685
                    logger.debug('p_missing %s' % p_missing)
1686
1687
                    if p_missing and fix_missing_atoms:
1688
                        currfn = __file__
1689
                        if currfn == '':
1690
                            path = '.'
1691
                        else:
1692
                            path = os.path.dirname(currfn)
1693
                        if os.path.islink(currfn):  # path + os.sep + os.path.basename(__file__)):
1694
                            path = os.path.dirname(os.readlink(
1695
                                path + os.sep + os.path.basename(currfn)))
1696
1697
                        po3_struc = PDB.PDBParser().get_structure('', path + '/data/PO3_inner.pdb')
1698
                        po3 = [po3_atom for po3_atom in po3_struc[0].get_residues()][0]
1699
1700
                        r_atoms = [r["O4'"], r["C4'"], r["C3'"]]
1701
                        po3_atoms = [po3["O4'"], po3["C4'"], po3["C3'"]]
1702
1703
                        sup = PDB.Superimposer()
1704
                        sup.set_atoms(r_atoms, po3_atoms)
1705
                        rms = round(sup.rms, 3)
1706
1707
                        sup.apply(po3_struc.get_atoms())  # to all atoms of po3
1708
1709
                        r.add(po3['P'])
1710
                        r.add(po3['OP1'])
1711
                        r.add(po3['OP2'])
1712
                        try:
1713
                            r.add(po3["O5'"])
1714
                        except:
1715
                            del r["O5'"]
1716
                            r.add(po3["O5'"])
1717
1718
                        fixed.append(['add OP3 at the beginning of the chain ', chain.id, r, r.id[1]])
1719
1720
                    p_missing = False  # off this function
1721
1722
                    # save it
1723
                    #io = PDB.PDBIO()
1724
                    #io.set_structure( po3_struc )
1725
                    # io.save("po3.pdb")
1726
1727
                #
1728
                # fix missing O2'
1729
                #
1730
                o2p_missing = True
1731
                for a in r:
1732
                    logger.debug('o2p_missing: %s %s %s' % (r, o2p_missing, a.id))
1733
                    if a.id == "O2'":
1734
                        o2p_missing = False
1735
                logger.debug('o2p_missing: %s', o2p_missing)
1736
1737
                if o2p_missing and fix_missing_atoms:
1738
                    currfn = __file__
1739
                    if currfn == '':
1740
                        path = '.'
1741
                    else:
1742
                        path = os.path.dirname(currfn)
1743
                    if os.path.islink(currfn):  # path + os.sep + os.path.basename(__file__)):
1744
                        path = os.path.dirname(os.readlink(
1745
                            path + os.sep + os.path.basename(currfn)))
1746
1747
                    o2p_struc = PDB.PDBParser().get_structure('', path + '/data/o2prim.pdb')
1748
                    o2p = [o2p_atom for o2p_atom in o2p_struc[0].get_residues()][0]
1749
1750
                    try:
1751
                        r_atoms = [r["C3'"], r["C2'"], r["C1'"]]
1752
                    except:
1753
                        ic(r)
1754
                    o2p_atoms = [o2p["C3'"], o2p["C2'"], o2p["C1'"]]
1755
1756
                    sup = PDB.Superimposer()
1757
                    sup.set_atoms(r_atoms, o2p_atoms)
1758
                    rms = round(sup.rms, 3)
1759
1760
                    sup.apply(o2p_struc.get_atoms())  # to all atoms of o2p
1761
1762
                    r.add(o2p["O2'"])
1763
                    logger.debug('fixing o2p for ' % r)
1764
                    fixed.append(['add O2\' ', chain.id, r, c])
1765
1766
                o2p_missing = False  # off this function
1767
                ######## fixing of missing OP1 and OP2 atoms in backbone ###########
1768
                if fix_missing_atoms:
1769
                    if 'OP1' not in r:
1770
                        if prev_r:
1771
                            op4 = PDB.PDBParser().get_structure('', path + '/data/op4.pdb')
1772
                            op4a = [a for a in op4[0].get_residues()][0]
1773
                            if 'P' not in r:
1774
                                print("Error missing P of residue " + chain.id +  ':' + str(r.id[1]) + ". Sorry, I can't rebuild missing atoms!")
1775
                                exit()
1776
                            r_atoms = [r["O5'"], r["P"], prev_r["O3'"]] #  r["C5'"], 
1777
                            op4_atoms = [op4a["O5'"], op4a["P"], op4a["O3'"]] # op4a["C5'"]] #, 
1778
1779
                            sup = PDB.Superimposer()
1780
                            sup.set_atoms(r_atoms, op4_atoms)
1781
                            sup.apply(op4.get_atoms())
1782
1783
                            r.add(op4a["OP1"])
1784
                            r.add(op4a["OP2"])
1785
                            fixed.append(['add the missing OP1 and OP2', chain.id, r, r.id[1]])
1786
1787
                        else:
1788
                            remarks.append("REMARK 250 Warning: Can't fix missing OP1 or/and OP2 atoms of " + str(r))
1789
                #
1790
                # fix missing C (the whole base at the moment)
1791
                #
1792 View Code Duplication
                if str(r.get_resname()).strip() == "C" and fix_missing_atoms:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
1793
                    for a in r:
1794
                        if a.id == "N1":
1795
                            break
1796
                    else:
1797
                        C_struc = PDB.PDBParser().get_structure('', path + '/data/C.pdb')
1798
                        C = [C_atom for C_atom in C_struc[0].get_residues()][0]
1799
1800
                        r_atoms = [r["O4'"], r["C2'"], r["C1'"]]
1801
                        C_atoms = [C["O4'"], C["C2'"], C["C1'"]]
1802
1803
                        sup = PDB.Superimposer()
1804
                        sup.set_atoms(r_atoms, C_atoms)
1805
                        rms = round(sup.rms, 3)
1806
1807
                        sup.apply(C_struc.get_atoms())  # to all atoms of C
1808
1809
                        r.add(C["N1"])
1810
                        r.add(C["C2"])
1811
                        r.add(C["O2"])
1812
                        r.add(C["N3"])
1813
                        r.add(C["C4"])
1814
                        r.add(C["N4"])
1815
                        r.add(C["C5"])
1816
                        r.add(C["C6"])
1817
1818
                        fixed.append(['add the whole base C', chain.id, r, r.id[1]])
1819
1820
                #
1821
                # fix missing U (the whole base at the moment)
1822
                #
1823 View Code Duplication
                if str(r.get_resname()).strip() == "U" and fix_missing_atoms:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
1824
                    for a in r:
1825
                        if a.id == "N1":
1826
                            break
1827
                    else:
1828
                        U_struc = PDB.PDBParser().get_structure('', path + '/data/U.pdb')
1829
                        U = [U_atom for U_atom in U_struc[0].get_residues()][0]
1830
1831
                        r_atoms = [r["O4'"], r["C2'"], r["C1'"]]
1832
                        U_atoms = [U["O4'"], U["C2'"], U["C1'"]]
1833
1834
                        sup = PDB.Superimposer()
1835
                        sup.set_atoms(r_atoms, U_atoms)
1836
                        rms = round(sup.rms, 3)
1837
1838
                        sup.apply(U_struc.get_atoms())  # to all atoms of U
1839
1840
                        r.add(U["N1"])
1841
                        r.add(U["C2"])
1842
                        r.add(U["O2"])
1843
                        r.add(U["N3"])
1844
                        r.add(U["C4"])
1845
                        r.add(U["O4"])
1846
                        r.add(U["C5"])
1847
                        r.add(U["C6"])
1848
1849
                        fixed.append(['add the whole base U', chain.id, r, r.id[1]])
1850
                #
1851
                # fix missing G (the whole base at the moment)
1852
                #
1853 View Code Duplication
                if str(r.get_resname()).strip() == "G" and fix_missing_atoms:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
1854
                    for a in r:
1855
                        if a.id == "N1":
1856
                            break
1857
                    else:
1858
                        G_struc = PDB.PDBParser().get_structure('', path + '/data/G.pdb')
1859
                        G = [G_atom for G_atom in G_struc[0].get_residues()][0]
1860
1861
                        r_atoms = [r["O4'"], r["C2'"], r["C1'"]]
1862
                        G_atoms = [G["O4'"], G["C2'"], G["C1'"]]
1863
1864
                        sup = PDB.Superimposer()
1865
                        sup.set_atoms(r_atoms, G_atoms)
1866
                        rms = round(sup.rms, 3)
1867
1868
                        sup.apply(G_struc.get_atoms())  # to all atoms of G
1869
1870
                        r.add(G["N9"])
1871
                        r.add(G["C8"])
1872
                        r.add(G["N7"])
1873
                        r.add(G["C5"])
1874
                        r.add(G["C6"])
1875
                        r.add(G["O6"])
1876
                        r.add(G["N1"])
1877
                        r.add(G["C2"])
1878
                        r.add(G["N2"])
1879
                        r.add(G["N3"])
1880
                        r.add(G["C4"])
1881
1882
                        fixed.append(['add the whole base G', chain.id, r, r.id[1]])
1883
                #
1884
                # fix missing A (the whole base at the moment)
1885
                #
1886 View Code Duplication
                if str(r.get_resname()).strip() == "A" and fix_missing_atoms:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
1887
                    for a in r:
1888
                        if a.id == "N1":
1889
                            break
1890
                    else:
1891
                        A_struc = PDB.PDBParser().get_structure('', path + '/data/A.pdb')
1892
                        A = [A_atom for A_atom in A_struc[0].get_residues()][0]
1893
1894
                        r_atoms = [r["O4'"], r["C2'"], r["C1'"]]
1895
                        A_atoms = [A["O4'"], A["C2'"], A["C1'"]]
1896
1897
                        sup = PDB.Superimposer()
1898
                        sup.set_atoms(r_atoms, A_atoms)
1899
                        rms = round(sup.rms, 3)
1900
1901
                        sup.apply(A_struc.get_atoms())  # to all atoms of A
1902
1903
                        r.add(A["N9"])
1904
                        r.add(A["C8"])
1905
                        r.add(A["N7"])
1906
                        r.add(A["C5"])
1907
                        r.add(A["C6"])
1908
                        r.add(A["N6"])
1909
                        r.add(A["N1"])
1910
                        r.add(A["C2"])
1911
                        r.add(A["N3"])
1912
                        r.add(A["C4"])
1913
1914
                        fixed.append(['add the whole base A', chain.id, r, r.id[1]])
1915
1916
                #
1917
                # strip residues of extra atoms, not in G_ATOMS in this case
1918
                #
1919
                if str(r.get_resname()).strip() == "G":
1920
                    for an in G_ATOMS:
1921
                        if c == 1 and ignore_op3:
1922
                            if an in ['P', 'OP1', 'OP2']:
1923
                                continue
1924
                        try:
1925
                            if c == 1 and an == "O5'" and p_missing:
0 ignored issues
show
introduced by
The variable p_missing does not seem to be defined for all execution paths.
Loading history...
1926
                                r2.add(x)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable x does not seem to be defined.
Loading history...
1927
                            else:
1928
                                r2.add(r[an])
1929
                        except KeyError:
1930
                            # print 'Missing:', an, r, ' new resi', c
1931
                            missing.append([an, chain.id, r, r.id[1]])
1932
                    c2.add(r2)
1933
1934
                elif str(r.get_resname()).strip() == "A":
1935
                    for an in A_ATOMS:
1936
                        if c == 1 and ignore_op3:
1937
                            if an in ['P', 'OP1', 'OP2']:
1938
                                continue
1939
                        try:
1940
                            if c == 1 and an == "O5'" and p_missing:
1941
                                r2.add(x)
1942
                            else:
1943
                                r2.add(r[an])
1944
                        except KeyError:
1945
                            # print 'Missing:', an, r, ' new resi', c
1946
                            missing.append([an, chain.id, r, r.id[1]])
1947
                    c2.add(r2)
1948
1949
                elif str(r.get_resname()).strip() == "C":
1950
                    for an in C_ATOMS:
1951
                        if c == 1 and ignore_op3:
1952
                            if an in ['P', 'OP1', 'OP2']:
1953
                                continue
1954
                        try:
1955
                            if c == 1 and an == "O5'" and p_missing:
1956
                                r2.add(x)
1957
                            else:
1958
                                r2.add(r[an])
1959
                        except:
1960
                            # print 'Missing:', an, r, ' new resi', c
1961
                            missing.append([an, chain.id, r, r.id[1]])
1962
                    c2.add(r2)
1963
1964
                elif str(r.get_resname()).strip() == "U":
1965
                    for an in U_ATOMS:
1966
                        if c == 1 and ignore_op3:
1967
                            if an in ['P', 'OP1', 'OP2']:
1968
                                continue
1969
                        try:
1970
                            if c == 1 and an == "O5'" and p_missing:
1971
                                r2.add(x)
1972
                            else:
1973
                                r2.add(r[an])
1974
                        except KeyError:
1975
                            # print 'Missing:', an, r,' new resi', c
1976
                            missing.append([an, chain.id, r, r.id[1]])
1977
                    c2.add(r2)
1978
1979
1980
                # save each residue a sa single ife
1981
                if save_single_res:
1982
                    from Bio.PDB import Structure
1983
                    # Create an empty structure
1984
                    structure = Structure.Structure("my_structure")
1985
                    model = Model.Model(0)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable Model does not seem to be defined.
Loading history...
1986
                    chain = Chain.Chain('A')
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable Chain does not seem to be defined.
Loading history...
1987
                    chain.add(r2)
1988
                    model.add(chain)
1989
                    structure.add(model)
1990
1991
                    io = PDBIO()
1992
                    io.set_structure(structure)
1993
                    io.save(f"{self.fn}_{r2.id[1]}.pdb")
1994
1995
                # add CONET
1996
                if conect:
1997
                    ic(atom_index, r2)
1998
                    conects.extend(generate_conect_records(r2.get_resname().strip(), atom_index, conect_no_linkage))
1999
                    atom_index += len(r2)
2000
                    print(conect)
2001
2002
                prev_r = r  # hack to keep preview residue to be used in the function
2003
                            # e.g., to get an atom from this residue
2004
                c += 1
2005
            chains2.append(c2)
2006
2007
        io = PDBIO()
2008
        s2.add(m2)
2009
        for chain2 in chains2:
2010
            m2.add(chain2)
2011
        # print c2
2012
        # print m2
2013
        io.set_structure(s2)
2014
        
2015
        tf = tempfile.NamedTemporaryFile(delete=False)
2016
        fout = tf.name
2017
        io.save(fout)
2018
2019
        if fixed:
2020
            remarks.append('REMARK 250 Fixed atoms/residues:')
2021
            for i in fixed:
2022
                remarks.append(
2023
                    ' '.join(['REMARK 250  -', str(i[0]), 'in chain:', str(i[1]), str(i[2]), 'residue #', str(i[3])]))
2024
2025
        if missing and report_missing_atoms:
2026
            remarks.append('REMARK 250 Missing atoms:')
2027
            for i in missing:
2028
                remarks.append(' '.join(['REMARK 250   +', str(i[0]),
2029
                                         'chain:', str(i[1]), str(i[2]), 'residue #', str(i[3])]))
2030
            #raise Exception('Missing atoms in %s' % self.fn)
2031
2032
        if protein_chains_remmoved:
2033
            remarks.append('REMARK 250 Chains that seem to be proteins removed and : ' +
2034
                           ' '.join(protein_chains_remmoved))
2035
        #
2036
2037
        # fix ter 'TER' -> TER    1528        G A  71
2038
        #
2039
        s = RNAStructure(fout)
2040
        if check_geometry:
2041
            s.check_geometry()
2042
        self.lines = s.lines
2043
        c = 0
2044
        # ATOM   1527  C4    G A  71       0.000   0.000   0.000  1.00  0.00           C
2045
        nlines = []
2046
        no_ters = 0
2047
        for l in self.lines:
2048
            ## align atoms to the left #######################################################
2049
            # ATOM   3937    P   C B 185      11.596  -7.045  26.165  1.00  0.00           P
2050
            # ATOM   3937  P     C B 185      11.596  -7.045  26.165  1.00  0.00           P
2051
            if l.startswith('ATOM'):
2052
                atom_code = self.get_atom_code(l)
2053
                l = self.set_atom_code(l, atom_code)
2054
            ##################################################################################
2055
            if l.startswith('TER'):
2056
                #                pass  # leave it for now this
2057
                atom_l = self.lines[c - 1]
2058
                new_l = 'TER'.ljust(80)   # TER    1528        G A  71 <<<'
2059
                new_l = self.set_atom_index(new_l, str(self.get_atom_index(atom_l) + 1 + no_ters))
2060
                new_l = self.set_res_code(new_l, self.get_res_code(atom_l))
2061
                new_l = self.set_chain_id(new_l, self.get_chain_id(atom_l))
2062
                new_l = self.set_res_index(new_l, self.get_res_index(atom_l))
2063
                nlines.append(new_l)
2064
                #nlines.append(l)file:/Users/magnus/Desktop/3nt_rpr.pdb
2065
                no_ters += 1
2066
            else:
2067
                if self.get_atom_index(l):
2068
                    l = self.set_atom_index(l, self.get_atom_index(
2069
                        l) + no_ters)  # 1 ter +1 2 ters +2 etc
2070
                nlines.append(l)
2071
            c += 1
2072
        self.lines = nlines
2073
        if conect:
2074
            self.lines += conects
0 ignored issues
show
introduced by
The variable conects does not seem to be defined in case the for loop on line 1564 is not entered. Are you sure this can never be the case?
Loading history...
2075
        return remarks
2076
2077
    def set_occupancy_atoms(self, occupancy):
2078
        """
2079
        :param occupancy:
2080
        """
2081
        nlines = []
2082
        for l in self.lines:
2083
            if l.startswith('ATOM'):
2084
                l = self.set_atom_occupancy(l, 0.00)
2085
                nlines.append(l)
2086
            else:
2087
                nlines.append(l)
2088
        self.lines = nlines
2089
2090
    def edit_occupancy_of_pdb(txt, pdb, pdb_out, v=False):
2091
        """Make all atoms 1 (flexi) and then set occupancy 0 for seletected atoms.
2092
        Return False if error. True if OK
2093
        """
2094
        struc = PDB.PDBParser().get_structure('struc', pdb)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable PDB does not seem to be defined.
Loading history...
2095
2096
        txt = txt.replace(' ', '')
2097
        if v:
2098
            print (txt)
2099
        l = re.split('[,:;]', txt)
2100
        if v:
2101
            print (l)
2102
2103
        for s in struc:
2104
            for c in s:
2105
                for r in c:
2106
                    for a in r:
2107
                        a.set_occupancy(1)  # make it flaxi
2108
2109
        for i in l:  # ['A', '1-10', '15', '25-30', 'B', '1-10']
2110
2111
            if i in string.ascii_letters:
2112
                if v:
2113
                    print('chain', i)
2114
                chain_curr = i
2115
                continue
2116
2117
            if i.find('-') > -1:
2118
                start, ends = i.split('-')
2119
                if start > ends:
2120
                    print('Error: range start > end ' + i)  # >>sys.stderr
2121
                    return False
2122
                index = list(range(int(start), int(ends) + 1))
2123
            else:
2124
                index = [int(i)]
2125
2126
            for i in index:
2127
                # change b_factor
2128
                try:
2129
                    atoms = struc[0][chain_curr][i]
0 ignored issues
show
introduced by
The variable chain_curr does not seem to be defined in case the for loop on line 2109 is not entered. Are you sure this can never be the case?
Loading history...
2130
                except KeyError:
2131
                    if i == chain_curr:
2132
                        print('Error: Chain ' + chain_curr +
2133
                              ' not found in the PDB structure')  # >>sys.stderr,
2134
                    else:
2135
                        print('Error: Residue ' + chain_curr + ':' + str(i) +
2136
                              ' found in the PDB structure')  # >>sys.stderr,
2137
                        return False
2138
                for a in atoms:
2139
                    a.set_occupancy(0)
2140
2141
        io = PDBIO()
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable PDBIO does not seem to be defined.
Loading history...
2142
        io.set_structure(struc)
2143
        io.save(pdb_out)
2144
        print('Saved ', pdb_out)
2145
        return True
2146
2147
    def view(self):
2148
        os.system('pymol ' + self.fn)
2149
2150
    def remove(self, verbose):
2151
        """Delete file, self.fn"""
2152
        os.remove(self.fn)
2153
        if verbose:
2154
            'File %s removed' % self.fn
2155
2156
    def __repr__(self):
2157
        return 'RNAStructure %s' % self.fn
2158
2159
2160
    def get_remarks_text(self):
2161
        """Get remarks as text for given file.
2162
        This function re-open files, as define as self.fn to get remarks.
2163
2164
        Example::
2165
2166
            r = RNAStructure(fout)
2167
            remarks = r.get_remarks_txt()
2168
            r1 = r.get_res_txt('A', 1)
2169
            r2 = r.get_res_txt('A', 2)
2170
            r3 = r.get_res_txt('A', 3)
2171
            with open(fout, 'w') as f:
2172
                f.write(remarks)
2173
                f.write(r1)
2174
                f.write(r2)
2175
                f.write(r3)
2176
2177
            remarks is
2178
2179
            REMARK 250 Model edited with rna-tools
2180
            REMARK 250  ver 3.5.4+63.g4338516.dirty 
2181
            REMARK 250  https://github.com/mmagnus/rna-tools 
2182
            REMARK 250  Fri Nov 13 10:15:19 2020
2183
2184
        """
2185
        txt = ''
2186
        for l in open(self.fn):
2187
            if l.startswith("REMARK"):
2188
                txt += l
2189
        return txt
2190
2191
    def get_res_text(self, chain_id, resi):
2192
        """Get a residue of given resi of chain_id and return as a text
2193
2194
        Args:
2195
            chain_id (str): e.g., 'A'
2196
            resi (int): e.g., 1
2197
2198
        Returns:
2199
             txt: 
2200
2201
        Example::
2202
2203
            r = RNAStructure(fn)
2204
            print(r.get_res_txt('A', 1))
2205
2206
            ATOM      1  O5'   G A   1      78.080 -14.909  -0.104  1.00  9.24           O  
2207
            ATOM      2  C5'   G A   1      79.070 -15.499  -0.956  1.00  9.70           C  
2208
            ATOM      3  C4'   G A   1      78.597 -16.765  -1.648  1.00  9.64           C  
2209
            ATOM      4  O4'   G A   1      78.180 -17.761  -0.672  1.00  9.88           O  
2210
            (...)
2211
2212
        """
2213
        txt = ''
2214
        for l in self.lines:
2215
            if l.startswith("ATOM"):
2216
                if self.get_res_num(l) == resi and self.get_chain_id(l) == chain_id:
2217
                    txt += l + '\n'
2218
        return txt
2219
2220
    def mq(self, method="RASP", verbose=False):
2221
2222
        if method.lower() == "rasp":
2223
            from rna_tools.tools.mq.RASP import RASP
2224
            r = RASP.RASP()
2225
            return [float(i) for i in r.run(self.fn, potentials=['all'])]
2226
            # ['-42058.4', '466223', '-0.0902109', '0', '0', '0']
2227
2228
        if method.lower() == "dfire":
2229
            from rna_tools.tools.mq.Dfire import Dfire
2230
            r = Dfire.Dfire()
2231
            return r.run(self.fn, verbose)
2232
        
2233
        if method.lower() == "rna3dcnn":
2234
            from rna_tools.tools.mq.RNA3DCNN import RNA3DCNN
2235
            r = RNA3DCNN.RNA3DCNN()
2236
            return r.run(self.fn, verbose)
2237
2238
        if method.lower() == "qrna":
2239
            from rna_tools.tools.mq.QRNA import QRNA
2240
            r = QRNA.QRNA()
2241
            return r.run(self.fn, verbose)
2242
2243
        if method.lower() == "clashscore":
2244
            from rna_tools.tools.mq.ClashScore import ClashScore
2245
            r = ClashScore.ClashScore()
2246
            return r.run(self.fn, verbose)
2247
2248
        if method.lower() == "analyzegeometry":
2249
            from rna_tools.tools.mq.AnalyzeGeometry import AnalyzeGeometry
2250
            r = AnalyzeGeometry.AnalyzeGeometry()
2251
            return r.run(self.fn, verbose)
2252
2253
        raise MethodUnknown('Define corrent method')
2254
2255
    def get_empty_line(self):
2256
        l = "ATOM      1  C5'   A A   3      25.674  19.091   3.459  1.00  1.00           X"
2257
        return l
2258
    
2259
    def add_line(self, l):
2260
        self.lines.append(l)
2261
2262
def add_header(version=None):
2263
    now = time.strftime("%c")
2264
    txt = 'REMARK 250 Model edited with rna-tools\n'
2265
    txt += 'REMARK 250  ver %s \nREMARK 250  https://github.com/mmagnus/rna-tools \nREMARK 250  %s' % (
2266
        version, now)
2267
    return txt
2268
2269
2270
def edit_pdb(f, args):
2271
    """Edit your structure.
2272
2273
    The function can take ``A:3-21>A:1-19`` or even syntax like this
2274
    ``A:3-21>A:1-19,B:22-32>B:20-30`` and will do an editing.
2275
2276
    The output is printed, line by line. Only ATOM lines are edited!
2277
2278
    Examples::
2279
2280
        $ rna_pdb_tools.py --edit 'A:3-21>A:1-19' 1f27_clean.pdb > 1f27_clean_A1-19.pdb
2281
2282
    or even::
2283
2284
        $ rna_pdb_tools.py --edit 'A:3-21>A:1-19,B:22-32>B:20-30' 1f27_clean.pdb > 1f27_clean_renumb.pdb
2285
2286
    """
2287
    # open a new file
2288
    s = RNAStructure(f)
2289
    output = ''
2290
    if not args.no_hr:
2291
        output += add_header() + '\n'
2292
        output += 'REMARK 250    HEADER --edit ' + args.edit + '\n'
2293
2294
    # --edit 'A:3-21>A:1-19,B:22-32>B:20-30'
2295
    if args.edit.find(',') > -1:
2296
        # more than one edits
2297
        edits = args.edit.split(',')  # ['A:3-21>A:1-19', 'B:22-32>B:20-30']
2298
        selects = []
2299
        for e in edits:
2300
            selection_from, selection_to = select_pdb_fragment(
2301
                e.split('>')[0]), select_pdb_fragment(e.split('>')[1])
2302
            if len(selection_to) != len(selection_from):
2303
                raise Exception('len(selection_to) != len(selection_from)')
2304
            selects.append([selection_from, selection_to])
2305
    else:
2306
        # one edit
2307
        e = args.edit
2308
        selection_from, selection_to = select_pdb_fragment(
2309
            e.split('>')[0]), select_pdb_fragment(e.split('>')[1])
2310
        if len(selection_to) != len(selection_from):
2311
            raise Exception('len(selection_to) != len(selection_from)')
2312
        selects = [[selection_from, selection_to]]
2313
2314
    # go ever all edits: ['A:3-21>A:1-19','B:22-32>B:20-30']
2315
    for l in s.lines:
2316
        if l.startswith('ATOM'):
2317
                # get chain and resi
2318
            chain = l[21:22].strip()
2319
            resi = int(l[22:26].strip())
2320
2321
            if_selected_dont_print = False
2322
            # for selections
2323
            for select in selects:
2324
                selection_from, selection_to = select
2325
                if chain in selection_from:
2326
                    if resi in selection_from[chain]:
2327
                            # [1,2,3] mapping from [4,5,10], you want to know how to map 1
2328
                            # 1 is [0] element of first list, so you have to index first list
2329
                            # to get 0, with this 0 you can get 4 out of second list [4,5,10][0] -> 4
2330
                        nl = list(l)
2331
                        chain_new = list(selection_to.keys())[0]  # chain form second list
2332
                        nl[21] = chain_new  # new chain
2333
                        index = selection_from[chain].index(int(resi))  # get index of 1
2334
                        resi_new = str(selection_to[chain_new][index]).rjust(
2335
                            4)  # 'A' [1,2,3] -> '  1'
2336
                        nl[22:26] = resi_new
2337
                        nl = ''.join(nl)
2338
                        if_selected_dont_print = True
2339
                        output += nl + '\n'
2340
            if not if_selected_dont_print:
2341
                output += l + '\n'
2342
        else:  # if not atom
2343
            output += l + '\n'
2344
    return output
2345
2346
2347
def collapsed_view(args):
2348
    """Collapsed view of pdb file. Only lines with C5' atoms are shown and TER, MODEL, END.
2349
2350
    example::
2351
2352
        [mm] rna_tools git:(master) $ python rna-pdb-tools.py --cv input/1f27.pdb
2353
        ATOM      1  C5'   A A   3      25.674  19.091   3.459  1.00 16.99           C
2354
        ATOM     23  C5'   C A   4      19.700  19.206   5.034  1.00 12.65           C
2355
        ATOM     43  C5'   C A   5      14.537  16.130   6.444  1.00  8.74           C
2356
        ATOM     63  C5'   G A   6      11.726  11.579   9.544  1.00  9.81           C
2357
        ATOM     86  C5'   U A   7      12.007   7.281  13.726  1.00 11.35           C
2358
        ATOM    106  C5'   C A   8      12.087   6.601  18.999  1.00 12.74           C
2359
        TER"""
2360
    r = RNAStructure(args.file)
2361
    for l in r.lines:
2362
        at = r.get_atom_code(l)
2363
        if at == "P": # C5'":
2364
            print(l)
2365
        if l.startswith('TER') or l.startswith('MODEL') or l.startswith('END'):
2366
            print(l)
2367
2368
2369
def fetch(pdb_id, path="."):
2370
    """fetch pdb file from RCSB.org
2371
    https://files.rcsb.org/download/1Y26.pdb
2372
2373
    Args:
2374
    - pdb_id, but also a chain can be specified, 1jj2:A+B+C
2375
    
2376
    Returns:
2377
    - a path to a file
2378
2379
    TODO: na_pdb_tools.py --extract A:1-25+B:30-57 1jj2.pdb"""
2380
2381
    chains = ''
2382
    pdb_id = pdb_id.replace('_', ':') # to accept also 1jj2_A
2383
    if ':' in pdb_id:
2384
        pdb_id, chains = pdb_id.split(':') # >>> 'X:A+B+C'.split(':') ['X', 'A+B+C']
2385
2386
    if pdb_id == 'rp':
2387
        os.system('wget https://github.com/RNA-Puzzles/standardized_dataset/archive/master.tar.gz -O - | tar -xz')
2388
        return
2389
    
2390
    import urllib3
2391
    http = urllib3.PoolManager()
2392
2393
    # try:
2394
    pdb_id = pdb_id.replace('.pdb', '')
2395
    response = http.request('GET', 'https://files.rcsb.org/download/' + pdb_id + '.pdb')
2396
    if not response.status == 200:
2397
        raise PDBFetchError()
2398
2399
    # except urllib3.HTTPError:
2400
    #    raise Exception('The PDB does not exists: ' + pdb_id)
2401
    txt = response.data
2402
2403
    if path != '.':
2404
        npath = path + os.sep + pdb_id + '.pdb'
2405
    else:
2406
        npath = pdb_id + '.pdb'
2407
    print('downloading... ' + npath)
2408
    with open(npath, 'wb') as f:
2409
        f.write(txt)
2410
    if chains:
2411
        for chain in chains.split('+'):
2412
            cmd = f'rna_pdb_tools.py --extract-chain {chain} {pdb_id}.pdb > {pdb_id}_{chain}.pdb'
2413
            print(cmd)
2414
            exe(cmd)
2415
    # print('ok')
2416
    return npath
2417
2418
2419 View Code Duplication
def fetch_ba(pdb_id, path="."):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
2420
    """fetch biological assembly pdb file from RCSB.org
2421
2422
    >>> fetch_ba('1xjr')
2423
    ...
2424
    """
2425
    try:
2426
        import urllib3
2427
    except ImportError:
2428
        print('urllib3 is required')
2429
        return
2430
    http = urllib3.PoolManager()
2431
    # try:
2432
    response = http.request('GET', url='https://files.rcsb.org/download/' +
2433
                            pdb_id.lower() + '.pdb1')
2434
    if not response.status == 200:
2435
        raise PDBFetchError()
2436
    txt = response.data
2437
2438
    npath = path + os.sep + pdb_id + '_ba.pdb'
2439
    print('downloading...' + npath)
2440
    with open(npath, 'wb') as f:
2441
        f.write(txt)
2442
    print('ok')
2443
    return pdb_id + '_ba.pdb'
2444
2445
2446 View Code Duplication
def fetch_cif_ba(cif_id, path="."):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
2447
    """fetch biological assembly cif file from RCSB.org"""
2448
    import urrlib3
2449
    http = urllib3.PoolManager()
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable urllib3 does not seem to be defined.
Loading history...
2450
    # try:
2451
    response = http.request('GET', url='https://files.rcsb.org/download/' +
2452
                            cif_id.lower() + '-assembly1.cif')
2453
    if not response.status == 200:
2454
        raise PDBFetchError()
2455
    txt = response.data
2456
2457
    npath = path + os.sep + cif_id + '_ba.cif'
2458
    print('downloading...' + npath)
2459
    with open(npath, 'wb') as f:
2460
        f.write(txt)
2461
    print('ok')
2462
    return cif_id + '_ba.cif'
2463
2464
def fetch_cif(cif_id, path="."):
2465
    """fetch biological assembly cif file from RCSB.org"""
2466
    import urllib3
2467
    http = urllib3.PoolManager()
2468
    # try:
2469
    if '_' in cif_id:
2470
        cif_id = cif_id.split('_')[0]  
2471
    ic(cif_id)
2472
    cif_id = cif_id.replace('.cif', '')
2473
    url = 'https://files.rcsb.org/download/' + cif_id.lower() + '.cif'
2474
    print(url)
2475
    response = http.request('GET', url)
2476
    if not response.status == 200:
2477
        raise PDBFetchError()
2478
    txt = response.data
2479
2480
    npath = path + os.sep + cif_id + '.cif'
2481
    #print('downloading... ' + npath)
2482
    with open(npath, 'wb') as f:
2483
        f.write(txt)
2484
    return cif_id + '.cif'
2485
2486
def replace_chain(struc_fn, insert_fn, chain_id):
2487
    """Replace chain of the main file (struc_fn) with some new chain (insert_fn) of given chain id.
2488
2489
    Args:
2490
       struc_fn (str): path to the main PDB file
2491
       insert_fn (str): path to the file that will be injected in into the main PDB file
2492
       chain_id (str): chain that will be inserted into the main PDB file
2493
2494
    Returns:
2495
       string: text in the PDB format
2496
    """
2497
    struc = RNAStructure(struc_fn)
2498
    insert = RNAStructure(insert_fn)
2499
2500
    output = ''
2501
    inserted = False
2502
    for l in struc.lines:
2503
        if l.startswith('ATOM'):
2504
            chain = l[21]
2505
            if chain == chain_id:
2506
                if not inserted:
2507
                    for insertl in insert.lines:
2508
                        if not insertl.startswith('HEADER') and not insertl.startswith('END'):
2509
                            output += insertl + '\n'
2510
                    inserted = True
2511
                continue
2512
            # insert pdb
2513
            output += l + '\n'
2514
    return output.strip()
2515
2516
2517
def replace_atoms(struc_fn, insert_fn, verbose=False):
2518
    """Replace XYZ coordinate of the file (struc_fn)  with XYZ from another file (insert_fn).
2519
2520
    This can be useful if you want to replace positions of atoms, for example, one base only. The lines are muted based on atom name, residue name, chain, residue index (marked with XXXX below).::
2521
    
2522
    ATOM     11  N1    A 2  27     303.441 273.472 301.457  1.00  0.00           N   # file
2523
    ATOM      1  N1    A 2  27     300.402 273.627 303.188  1.00 99.99           N   # insert 
2524
    ATOM     11  N1    A 2  27     300.402 273.627 303.188  1.00  0.00           N   # inserted
2525
                 XXXXXXXXXXXXXXXX    # part used to find lines to be replaced
2526
2527
     ATOM      1  P     A 2  27     295.653 270.783 300.135  1.00119.29           P   # next line
2528
2529
    Args:
2530
       struc_fn (str): path to the main PDB file
2531
       insert_fn (str): path to the file that will be injected in into the main PDB file
2532
2533
    Returns:
2534
       string: text in the PDB format
2535
    """
2536
    struc = RNAStructure(struc_fn)
2537
    insert = RNAStructure(insert_fn)
2538
2539
    output = ''
2540
    for l in struc.lines:
2541
        inserted = False
2542
        if l.startswith('ATOM'):
2543
            idx = l[13:26]#.strip() # P     A 2  27
2544
            for li in insert.lines:
2545
                idxi = li[13:26]#.strip()
2546
                if idx == idxi:
2547
                    ln = l[:30] + li[30:54] + l[54:] + '\n' # only coords replace (!)
2548
                    if verbose:
2549
                        print(l)
2550
                        print(li)
2551
                        print(ln)
2552
                        print()
2553
                    inserted = True
2554
                    output += ln
2555
            if not inserted:
2556
                output += l + '\n'
2557
                inserted = False
2558
    return output.strip()
2559
2560
def set_chain_for_struc(struc_fn, chain_id, save_file_inplace=False, skip_ter=True):
2561
    """Quick & dirty function to set open a fn PDB format, set chain_id
2562
    and save it to a file. Takes only lines with ATOM and TER."""
2563
    txt = ''
2564
    for line in open(struc_fn):
2565
        if skip_ter:
2566
            if line.startswith('ATOM') or line.startswith('TER'):
2567
                # if TER line is only TER, not like TER atom chain etc
2568
                # then just keep TER there and don't add any chain to id
2569
                if line == 'TER':
2570
                    pass
2571
                else:
2572
                    l = line[:21] + chain_id + line[22:]
2573
                txt += l
0 ignored issues
show
introduced by
The variable l does not seem to be defined for all execution paths.
Loading history...
2574
            else:
2575
                txt += line
2576
        else:
2577
            if line.startswith('ATOM'):
2578
                l = line[:21] + chain_id + line[22:]
2579
                txt += l
2580
            elif line.startswith('TER'):
2581
                continue
2582
            else:
2583
                txt += line
2584
    if save_file_inplace:
2585
        with open(struc_fn, 'w') as f:
2586
             f.write(txt)
2587
    return txt.strip()
2588
2589
def load_rnas(path, verbose=True):
2590
    """Load structural files (via glob) and return a list of RNAStructure objects.
2591
    
2592
    Examples::
2593
2594
        rnas = rtl.load_rnas('../rna_tools/input/mq/*.pdb')
2595
    
2596
    """
2597
    rs = []
2598
    import glob
2599
    for f in glob.glob(path):
2600
        r = RNAStructure(f)
2601
        rs.append(r)
2602
    return rs
2603
2604
# main
2605
if '__main__' == __name__:
2606
    f1 = "input/t2-4-ACA.pdb"
2607
    f2 = "input/to-replace.pdb"
2608
    r1 = RNAStructure(f1)
2609
    r2 = RNAStructure(f2)    
2610
    t = replace_atoms(f1, f2)
2611
    with open('output/replaced.pdb', 'w') as f: f.write(t)
2612
2613
    fn = "input/1a9l_NMR_1_2_models.pdb"
2614
    r = RNAStructure(fn)
2615
    t = r.get_res_text('A', 1)
2616
    with open('output/1a9l_NMR_1_2_models_R1.pdb', 'w') as f: f.write(t)
2617
2618
    fn = "input/remarks.pdb"
2619
    r = RNAStructure(fn)
2620
    t = r.get_remarks_text()
2621
    with open('output/remarks_only.pdb', 'w') as f: f.write(t)
2622
2623
    fn = 'input/image'
2624
    print('fn:', fn)
2625
    struc = RNAStructure(fn)
2626
    print(' pdb?:', struc.is_pdb())
2627
    # print( atoms:', struc.get_no_lines())
2628
2629
    fn = 'input/na.pdb'
2630
    s = RNAStructure(fn)
2631
    print(s.detect_molecule_type())
2632
    #res = get_all_res(na)
2633
    # print 'what is?', what_is(res)
2634
    # print res
2635
    print('non standard:', s.check_res_if_std_na())
2636
    print('is protein:', s.detect_molecule_type())
2637
2638
    fn = 'input/prot.pdb'
2639
    s = RNAStructure(fn)
2640
    print('non standard:', s.check_res_if_std_prot())
2641
    print('is protein:',  s.detect_molecule_type())
2642
2643
    fn = 'input/rna-ru.pdb'
2644
    s = RNAStructure(fn)
2645
    print('non standard:', s.check_res_if_supid_rna())
2646
    print('is protein:', s.detect_molecule_type())
2647
2648
    fn = 'input/na_highAtomNum.pdb'
2649
    print(fn)
2650
    s = RNAStructure(fn)
2651
    s.renum_atoms()
2652
    s.write('output/na_highAtomNum.pdb')
2653
2654
    fn = 'input/na_solvet_old_format.pdb'
2655
    print(fn)
2656
    s = RNAStructure(fn)
2657
    s.fix_op_atoms()
2658
    s.remove_hydrogen()
2659
    s.remove_ion()
2660
    s.remove_water()
2661
    s.write('output/na_solvet_old_format.pdb')
2662
2663
    fn = 'input/na_solvet_old_format.pdb'
2664
    print(fn)
2665
    s = RNAStructure(fn)
2666
    s.std_resn()
2667
    s.remove_hydrogen()
2668
    s.remove_ion()
2669
    s.remove_water()
2670
    s.write('output/na_solvet_old_format.pdb')
2671
2672
    #fn = 'input/na_solvet_old_format__.pdb'
2673
    #s = RNAStructure(fn)
2674
    # s.std_resn()
2675
    # s.remove_hydrogen()
2676
    # s.remove_ion()
2677
    # s.remove_water()
2678
    # s.renum_atoms()
2679
    # s.fix_op_atoms()
2680
    # s.write('output/na_solvet_old_format__.pdb')
2681
2682
    fn = 'input/1xjr.pdb'
2683
    s.std_resn()
2684
    s.remove_hydrogen()
2685
    s.remove_ion()
2686
    s.remove_water()
2687
    s.renum_atoms()
2688
    s.fix_op_atoms()
2689
    s.write('output/1xjr.pdb')
2690
2691
    fn = 'input/decoy0165_amb.pdb'
2692
    print(fn)
2693
    s = RNAStructure(fn)
2694
    s.std_resn()
2695
    s.remove_hydrogen()
2696
    s.remove_ion()
2697
    s.remove_water()
2698
    s.renum_atoms()
2699
    s.fix_O_in_UC()
2700
    s.fix_op_atoms()
2701
    s.write('output/decoy0165_amb_clx.pdb')
2702
2703
    fn = 'input/farna.pdb'
2704
    print(fn)
2705
    s = RNAStructure(fn)
2706
    s.std_resn()
2707
    s.remove_hydrogen()
2708
    s.remove_ion()
2709
    s.remove_water()
2710
    s.fix_op_atoms()
2711
    s.renum_atoms()
2712
    s.write('output/farna.pdb')
2713
2714
    fn = 'input/farna.pdb'
2715
    print(fn)
2716
2717
    r = RNAStructure(fn)
2718
    print(r.is_mol2())
2719
2720
    if True:
2721
        print('================================================')
2722
        print ("input/1xjr_clx_fChimera_noIncludeNumbers.mol2")
2723
        r = RNAStructure("input/1xjr_clx_fChimera_noIncludeNumbers.mol2")
2724
        print(r.is_mol2())
2725
        r.mol2toPDB('/tmp/x.pdb')
2726
2727
        r = RNAStructure('/tmp/x.pdb')
2728
        print(r.get_report)
2729
        r.std_resn()
2730
        r.remove_hydrogen()
2731
        r.remove_ion()
2732
        r.remove_water()
2733
        r.fix_op_atoms()
2734
        r.renum_atoms()
2735
        r.fixU__to__U()
2736
        r.write("output/1xjr_clx_fChimera_noIncludeNumbers.mol2")
2737
2738
    if True:
2739
        r = RNAStructure("input/2du3_prot_bound.mol2")
2740
        print(r.is_mol2())
2741
        outfn = r.mol2toPDB()
2742
        print(r.get_report)
2743
2744
    print('================================================')
2745
    fn = "input/3e5fA-nogtp_processed_zephyr.pdb"
2746
    r = RNAStructure(fn)
2747
    print(r.is_mol2())
2748
    #outfn = r.mol2toPDB()
2749
    print(r.is_amber_like())
2750
    print(r.get_report)
2751
2752
    print(r.get_preview())
2753
2754
    r.std_resn()
2755
2756
    print(r.get_preview())
2757
2758
    r.remove_hydrogen()
2759
    r.remove_ion()
2760
    r.remove_water()
2761
    #renum_atoms(t, t)
2762
    #fix_O_in_UC(t, t)
2763
    #fix_op_atoms(t, t)
2764
    r.write('output/3e5fA-nogtp_processed_zephyr.pdb')
2765
2766
    print()
2767
    fn = "input/1xjr_clx_charmm.pdb"
2768
    print(fn)
2769
    s = RNAStructure(fn)
2770
    s.std_resn()
2771
    s.remove_hydrogen()
2772
    s.remove_ion()
2773
    s.remove_water()
2774
    s.write('output/1xjr_clx_charmm.pdb')
2775
2776
    #renum_atoms(t, t)
2777
    #fix_O_in_UC(t, t)
2778
    #fix_op_atoms(t, t)
2779
2780
    print()
2781
    fn = "input/dna_fconvpdb_charmm22.pdb"
2782
    print(fn)
2783
    r = RNAStructure(fn)
2784
    r.get_preview()
2785
    r.resn_as_dna()
2786
    r.remove_hydrogen()
2787
    r.remove_ion()
2788
    r.remove_water()
2789
    r.std_resn()
2790
    print(r.get_head())
2791
    print(r.get_tail())
2792
    print(r.get_preview())
2793
    r.write("output/dna_fconvpdb_charmm22.pdb")
2794
2795
    print()
2796
    fn = "input/1a9l_NMR_1_2_models.pdb"
2797
    print(fn)
2798
    r = RNAStructure(fn)
2799
    r.write("output/1a9l_NMR_1_2_models_lib.pdb")
2800
    # r.get_text() # get #1 model
2801
2802
    import doctest
2803
    doctest.testmod()
2804