build.rna_tools.rna_tools_lib.add_header()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 6
nop 1
dl 0
loc 6
rs 10
c 0
b 0
f 0
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""rna_tools_lib.py - main lib file, many tools in this lib is using this file."""
4
5
import os
6
import sys
7
from collections import OrderedDict
8
import re
9
import string
10
import time
11
import gzip
12
import tempfile
13
import shutil
14
import subprocess
15
16
from rna_tools.tools.extra_functions.select_fragment import select_pdb_fragment_pymol_style, select_pdb_fragment
17
18
from icecream import ic
19
import sys
20
ic.configureOutput(outputFunction=lambda *a: print(*a, file=sys.stderr), includeContext=True)
21
ic.configureOutput(prefix='')
22
23
import logging
24
logger = logging.getLogger('rna-tools')
25
handler = logging.StreamHandler()
26
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
27
handler.setFormatter(formatter)
28
logger.addHandler(handler)
29
30
# Don't fix OP3, ignore it
31
ignore_op3 = False
32
33
# Settings: what is what in a PDB file
34
AMINOACID_CODES = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "GLN", "GLY",
35
                   "HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR",
36
                   "TRP", "TYR", "VAL"]
37
38
def generate_conect_records(residue_type, shift=0, no_junction=True):
39
    """
40
    Generate CONECT records for an RNA residue (U, A, C, or G).
41
    
42
    Args:
43
        residue_type (str): Type of residue ('U', 'A', 'C', or 'G').
44
        shift (int): Value to shift all atom serial numbers by.
45
    
46
    Returns:
47
        list: List of CONECT records as strings.
48
    """
49
    # Define connections for each residue type
50
    if residue_type == "U":
51
        connections = [
52
            (1, 2),   # P -> OP1
53
            (1, 3),   # P -> OP2
54
            (1, 4),   # P -> O5'
55
            (4, 5),   # O5' -> C5'
56
            (5, 6),   # C5' -> C4'
57
            (6, 7),   # C4' -> O4'
58
            (6, 8),   # C4' -> C3'
59
            (8, 9),   # C3' -> O3'
60
            (8, 10),  # C3' -> C2'
61
            (10, 11), # C2' -> O2'
62
            (10, 12), # C2' -> C1'
63
            (12, 13), # C1' -> N1
64
            (13, 14), # N1 -> C2
65
            (14, 15), # C2 -> O2
66
            (14, 16), # C2 -> N3
67
            (16, 17), # N3 -> C4
68
            (17, 18), # C4 -> O4
69
            (17, 19), # C4 -> C5
70
            (19, 20), # C5 -> C6
71
            (20, 13), # C6 -> N1
72
            (7, 12),  # O4' -> C1'
73
        ]
74
        if not no_junction:
75
            connections.append((9, 21))
76
        print(connections)
77
    elif residue_type == "A":
78
        connections = [
79
            (1, 2),   # P -> OP1
80
            (1, 3),   # P -> OP2
81
            (1, 4),   # P -> O5'
82
            (4, 5),   # O5' -> C5'
83
            (5, 6),   # C5' -> C4'
84
            (6, 7),   # C4' -> O4'
85
            (6, 8),   # C4' -> C3'
86
            (8, 9),   # C3' -> O3'
87
            (8, 10),  # C3' -> C2'
88
            (10, 11), # C2' -> O2'
89
            (10, 12), # C2' -> C1'
90
            (12, 13), # C1' -> N9
91
            (13, 14), # N9 -> C8
92
            (14, 15), # C8 -> N7
93
            (15, 16), # N7 -> C5
94
            (16, 17), # C5 -> C6
95
            (17, 18), # C6 -> N6
96
            (17, 19), # C6 -> N1
97
            (19, 20), # N1 -> C2
98
            (20, 21), # C2 -> N3
99
            (21, 22), # N3 -> C4
100
            (22, 16), # C4 -> C5
101
            (22, 13), # C4 -> N9
102
            (7, 12),  # O4' -> C1'
103
        ]
104
        if not no_junction:
105
            connections.append((9, 23))
106
    elif residue_type == "C":
107
        connections = [
108
            (1, 2),   # P -> OP1
109
            (1, 3),   # P -> OP2
110
            (1, 4),   # P -> O5'
111
            (4, 5),   # O5' -> C5'
112
            (5, 6),   # C5' -> C4'
113
            (6, 7),   # C4' -> O4'
114
            (6, 8),   # C4' -> C3'
115
            (8, 9),   # C3' -> O3'
116
            (8, 10),  # C3' -> C2'
117
            (10, 11), # C2' -> O2'
118
            (10, 12), # C2' -> C1'
119
            (12, 13), # C1' -> N1
120
            (13, 14), # N1 -> C2
121
            (14, 15), # C2 -> O2
122
            (14, 16), # C2 -> N3
123
            (16, 17), # N3 -> C4
124
            (17, 18), # C4 -> N4
125
            (17, 19), # C4 -> C5
126
            (19, 20), # C5 -> C6
127
            (20, 13), # C6 -> N1
128
            (7, 12),  # O4' -> C1' (additional connection)
129
        ]
130
        if not no_junction:
131
            connections.append((9, 21))
132
    elif residue_type == "G":
133
        connections = [
134
            (1, 2),   # P -> OP1
135
            (1, 3),   # P -> OP2
136
            (1, 4),   # P -> O5'
137
            (4, 5),   # O5' -> C5'
138
            (5, 6),   # C5' -> C4'
139
            (6, 7),   # C4' -> O4'
140
            (6, 8),   # C4' -> C3'
141
            (8, 9),   # C3' -> O3'
142
            (8, 10),  # C3' -> C2'
143
            (10, 11), # C2' -> O2'
144
            (10, 12), # C2' -> C1'
145
            (12, 13), # C1' -> N9
146
            (13, 14), # N9 -> C8
147
            (14, 15), # C8 -> N7
148
            (15, 16), # N7 -> C5
149
            (16, 17), # C5 -> C6
150
            (17, 18), # C6 -> O6
151
            (17, 19), # C6 -> N1
152
            (19, 20), # N1 -> C2
153
            (20, 21), # C2 -> N2
154
            (20, 22), # C2 -> N3
155
            (22, 23), # N3 -> C4
156
            (23, 16), # C4 -> C5
157
            (23, 13), # C4 -> N9
158
            (7, 12),  # O4' -> C1' (additional connection)
159
        ]
160
        if not no_junction:
161
            connections.append((9, 24))
162
    else:
163
        raise ValueError("Invalid residue type. Choose 'U', 'A', 'C', or 'G'.")
164
165
    # Generate CONECT records with shifted serial numbers
166
    conect_records = []
167
    for conn in connections:
168
        atom1_serial, atom2_serial = conn
169
        # Shift the serial numbers
170
        atom1_serial += shift
171
        atom2_serial += shift
172
        conect_records.append(f"CONECT{atom1_serial:>5}{atom2_serial:>5}")
173
    return conect_records
174
175
    
176
def aa3to1(aaa):
177
    """based on https://pymolwiki.org/index.php/Aa_codes"""
178
    if len(aaa) != 3:  # aaa is 'G', like for RNA ;-)
179
        return '' # dont do it for rna test aaa
180
    one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', \
181
                 'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y',    \
182
                 'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A',    \
183
                 'GLY':'G', 'PRO':'P', 'CYS':'C'}
184
    return one_letter[aaa]
185
186
187
RES = ['DA', 'DG', 'DT', 'DC']
188
RES += ['A', 'G', 'U', 'C']
189
190
RESS = ['A', 'C', 'G', 'U', 'ADE', 'CYT', 'GUA', 'URY', 'URI', 'U34', 'U31', 'C31', '4SU', 'H2U', 'QUO', 'G7M', '5MU',
191
        '5MC', 'PSU', '2MG', '1MG', '1MA', 'M2G', '5BU', 'FHU', 'FMU', 'IU', 'OMG', 'OMC', 'OMU', 'A2M', 'A23', 'CCC',
192
        'I'] + ['RC', 'RU', 'RA', 'RG', 'RT']
193
DNA = ['DA', 'DG', 'DT', 'DC']
194
RNA = ['A', 'G', 'U', 'C']
195
IONS = ['NA', 'MG', 'MN', 'JOS'] # and ligands JOS
196
HYDROGEN_NAMES = ["H", "H5'", "H5''", "H4'", "H3'", "H2'", "HO2'", "H1'", "H3", "H5", "H6", "H5T", "H41", "1H5'",
197
                  "2H5'", "HO2'", "1H4", "2H4", "1H2", "2H2", "H1", "H8", "H2", "1H6", "2H6", "HO5'", "H21", "H22",
198
                  "H61", "H62", "H42", "HO3'", "1H2'", "2HO'", "HO'2", "H2'1", "HO'2", "HO'2", "H2", "H2'1", "H1", "H2",
199
                  "1H5*", "2H5*", "H4*", "H3*", "H1*", "1H2*", "2HO*", "1H2", "2H2", "1H4", "2H4", "1H6", "2H6", "H1",
200
                  "H2", "H3", "H5", "H6", "H8", "H5'1", "H5'2", "H3T"]
201
202
203
class PDBFetchError(Exception):
204
    pass
205
206
class MethodUnknown(Exception):
207
    pass
208
209
210
try:
211
    from Bio.PDB import *
212
except ImportError:
213
    print("Biopython is not detected. It is required for some functions.")
214
215
import warnings
216
warnings.filterwarnings('ignore', '.*Invalid or missing.*',)
217
warnings.filterwarnings('ignore', '.*with given element *',)
218
warnings.filterwarnings('ignore', '.*is discontinuous*',)
219
warnings.filterwarnings('ignore', '.*Some atoms or residues may be missing in the data structure.*',)
220
warnings.filterwarnings('ignore', '.*Ignoring unrecognized record.*',)
221
222
223
import subprocess
224
def exe(cmd):
225
    o = subprocess.Popen(
226
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
227
    out = o.stdout.read().strip().decode()
228
    err = o.stderr.read().strip().decode()
229
    return out, err
230
231
232
def get_rna_tools_path():
233
    """Return path to the rt."""
234
    import inspect
235
    import rna_tools
236
    return os.path.dirname(inspect.getfile(rna_tools))
237
238
def get_version(currfn='', verbose=False):
239
    import rna_tools
240
    return rna_tools.__version__
241
242
243
def sort_strings(l):
244
    """ Sort the given list in the way that humans expect.
245
    http://blog.codinghorror.com/sorting-for-humans-natural-sort-order/
246
    """
247
    def convert(text): return int(text) if text.isdigit() else text
248
249
    def alphanum_key(key): return [convert(c) for c in re.split('([0-9]+)', key)]
250
    l.sort(key=alphanum_key)
251
    return l
252
253
254
class RNAStructure:
255
    """RNAStructure - handles an RNA pdb file.
256
257
    Attributes:
258
259
        fn (string)  : path to the structural file, e.g., "../rna_tools/input/4ts2.pdb"
260
        name(string) : filename of the structural file, "4ts2.pdb"
261
        lines (list) : the PDB file is loaded and ATOM/HETATM/TER/END go to self.lines
262
263
    """
264
265
    def __init__(self, fn=''):
266
        if not fn: # if empty
267
            tf = tempfile.NamedTemporaryFile(delete=False)
268
            fn = tf.name
269
270
        self.fn = os.path.abspath(fn) # ../rna_tools/input/4ts2.pdb
271
        # add name attribute with filename 4ts2.pdb
272
        self.name = self.fn.split(os.sep)[-1] # OS.path.splitext(self.fn)[0]
273
274
        self.report = []
275
        self.report.append('The RNARNAStructure report: %s ' % self.fn)
276
277
        self.mol2_format = False
278
279
        self.lines = []
280
281
        try:
282
            # lines = open(fn).read().strip().split('\n') # don't strip?, good or bed?
283
            lines = open(fn).read().split('\n')
284
        except UnicodeDecodeError:
285
            print("Can't open a binary file")
286
            self.lines = ''
287
            return
288
289
        self.has_many_models = False
290
291
        for l in lines:
292
            # multi-models pdb files
293
            if l.startswith('MODEL'):
294
                self.has_many_models = True
295
            if l.startswith('ENDMDL'):
296
                break
297
298
            if l.startswith('ATOM') or l.startswith('HETATM') or l.startswith('TER') or l.startswith('END'):
299
                self.lines.append(l) # don't strip .strip())
300
            if l.startswith("@<TRIPOS>"):
301
                self.mol2_format = True
302
                self.report.append('This is mol2 format')
303
304
        self.res = self.get_resn_uniq()
305
306
    def reload(self):
307
        """Reload the object."""
308
        self.__init__(self.fn)
309
        
310
    def is_pdb(self):
311
        """Return True if the files is in PDB format.
312
313
        If self.lines is empty it means that nothing was parsed into the PDB format."""
314
        if len(self.lines):
315
            return True
316
        else:
317
            return False
318
319
    def is_nmr(self):
320
        """True if the file is an NMR-style multiple model pdb
321
322
        :returns: True or Fo
323
        :rtype: boolean
324
        """
325
        return self.has_many_models
326
327
    def un_nmr(self, startwith1=True, directory='', verbose=False):
328
        """Un NMR - Split NMR-style multiple model pdb files into individual models.
329
330
        Take self.fn and create new files in the way::
331
332
            input/1a9l_NMR_1_2_models.pdb
333
               input/1a9l_NMR_1_2_models_0.pdb
334
               input/1a9l_NMR_1_2_models_1.pdb
335
336
        .. warning:: This function requires biopython.
337
338
        rna_pdb_tools.py --un-nmr AF-Q5TCX8-F1-model_v1_core_Ctrim_mdr_MD.pdb
339
36
340
        cat *MD_* > md.pdb
341
        """
342
        #
343
        npdb = ''
344
        c = 0
345
346
        nmodels = 0
347
        for l in open(self.fn):
348
            if l.startswith('MODEL'):
349
                nmodels += 1
350
        print(nmodels)
351
352
        for l in open(self.fn):
353
            if 1:
354
                if not l.startswith('HETATM'):
355
                    npdb += l
356
            if l.startswith('MODEL'):
357
                c += 1
358
            if l.startswith('ENDMDL'):
359
                id = str(c).zfill(len(str(nmodels)))
360
                nf = self.fn.replace('.pdb', '_%s.pdb' % id)
361
                verbose = 1
362
                if verbose:
363
                    print(nf)
364
                with open(nf, 'w') as f:
365
                    f.write(npdb)
366
                npdb = ''
367
        biopython = 0
368
        
369
        if biopython:
370
            parser = PDBParser()
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable PDBParser does not seem to be defined.
Loading history...
371
            structure = parser.get_structure('', self.fn)
372
            for c, m in enumerate(structure):
373
                if verbose:
374
                    print(m)
375
                io = PDBIO()
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable PDBIO does not seem to be defined.
Loading history...
376
                io.set_structure(m)
377
                if startwith1:
378
                    c += 1
379
                if directory:
380
                    io.save(directory + os.sep + self.fn.replace('.pdb', '_%i.pdb' % c))
381
                else:
382
                    io.save(self.fn.replace('.pdb', '_%i.pdb' % c))                
383
384
    def is_mol2(self):
385
        """Return True if is_mol2 based on the presence of ```@<TRIPOS>```."""
386
        return self.mol2_format
387
388
    def decap_gtp(self):
389
        lines = []
390
        for l in self.lines:
391
            if l.startswith('ATOM') or l.startswith('HETATM'):
392
                if l[12:16].strip() in ['PG', 'O1G', 'O2G', 'O3G', 'O3B', 'PB', 'O1B', 'O2B', 'O3A']:
393
                    continue
394
                if l[12:16].strip() == 'PA':
395
                    l = l.replace('PA', 'P ')
396
                if l[12:16].strip() == 'O1A':
397
                    l = l.replace('O1A', 'O1P')
398
                if l[12:16].strip() == 'O2A':
399
                    l = l.replace('O2A', 'O2P')
400
                if l[17:20].strip() == 'GTP':
401
                    l = l[:17] + '  G' + l[20:]
402
                    l = l.replace('HETATM', 'ATOM  ')
403
                lines.append(l)
404
        self.lines = lines
405
406
    def is_amber_like(self):
407
        """Use self.lines and check if there is XX line
408
        """
409
        for l in self.lines:
410
            if l.startswith('ATOM') or l.startswith('HETATM'):
411
                rn = l[17:20]
412
                if rn in ['RU5', 'RC5', 'RA5', 'RT5', 'RG5']:
413
                    self.report.append('This is amber-like format')
414
                    return True
415
        return False
416
417
    def replace_hetatms(self):
418
        nlines = []
419
        for l in self.lines:
420
            l = l.replace('HETATM', 'ATOM  ')
421
            nlines.append(l)
422
        self.lines = nlines
423
424
    def fix_with_qrnas(self, outfn="", verbose=False):
425
        """Add missing heavy atom.
426
427
        A residue is recognized base on a residue names.
428
429
        Copy QRNAS folder to curr folder, run QRNAS and remove QRNAS.
430
431
        .. warning:: QRNAS required (http://genesilico.pl/QRNAS/QRNAS.tgz)
432
        """
433
        from rpt_config import QRNAS_PATH
434
435
        # prepare folder get ff folder
436
        to_go = os.path.abspath(os.path.dirname(self.fn))
437
        curr = os.getcwd()
438
439
        # set occupancy to 0
440
        s = RNAStructure(self.fn)
441
        s.set_occupancy_atoms(0.00)
442
        s.write(self.fn)
443
444
        os.chdir(to_go)
445
        try:
446
            shutil.copytree(QRNAS_PATH, to_go + os.sep + "QRNAS")
447
        except OSError:
448
            pass
449
        # prepare config file
450
        with open('qrna_config.txt', 'w') as f:
451
            f.write("WRITEFREQ   1\n")
452
            f.write("NSTEPS      1\n")
453
        # run qrnas
454
        print('QRNAS...')
455
456
        if not outfn:
457
            cmd = "QRNAS -c qrna_config.txt -i " + os.path.basename(self.fn)
458
        else:
459
            cmd = "QRNAS -c qrna_config.txt -i " + \
460
                os.path.basename(self.fn) + " -o " + curr + os.sep + outfn
461
        #o = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
462
        os.system(cmd)
463
        if False:
464
            out = o.stdout.read().strip()
465
            err = o.stderr.read().strip()
466
            if verbose:
467
                print(out)
468
                print(err)
469
                shutil.rmtree(to_go + os.sep + "QRNAS")
470
        # post cleaning
471
        if outfn:
472
            print('Cleaning...')
473
            s = RNAStructure(curr + os.sep + outfn)
474
            s.remove_hydrogen()
475
            s.std_resn()
476
            s.write(curr + os.sep + outfn)
477
        os.chdir(curr)
478
479
    def mol2toPDB(self, outfn=""):
480
        try:
481
            import pybel
482
        except ImportError:
483
            print ('pybel is needed for mol2 to pdb convertion')
484
            # sys.exit(1)
485
            sys.exit(0)
486
487
        if not outfn:
488
            outfn = self.fn.replace('.mol2', '.pdb')
489
490
        for mol in pybel.readfile("mol2", self.fn):
491
            mol.write("pdb", outfn, overwrite=True)
492
493
        print('outfn: ', outfn)
494
        self.report.append('  Converted from mol2 to PDB')
495
        return outfn
496
497
    def get_no_lines(self):
498
        return len(self.lines)
499
        
500
    def check_geometry(self, verbose=False):
501
        """Check for correct "Polymer linkage, it should be around 1.6Å with a sigma around 0.01.
502
        
503
        Carrascoza, F., Antczak, M., Miao, Z., Westhof, E. & Szachniuk, M. Evaluation of the stereochemical quality of predicted RNA 3D models in the RNA-Puzzles submissions. Rna 28, 250–262 (2022).
504
505
        Values for 1xjr.pdb::
506
507
            op.mean(): 1.599316
508
            op.std(): 0.009274074
509
510
            po.mean(): 1.5984017
511
            po.std(): 0.0069191623
512
        
513
        requires biopython
514
        """
515
        if verbose:
516
            logger.setLevel(logging.DEBUG)
517
518
        try:
519
            from Bio import PDB
520
            from Bio.PDB import PDBIO
521
            import warnings
522
            warnings.filterwarnings('ignore', '.*Invalid or missing.*',)
523
            warnings.filterwarnings('ignore', '.*with given element *',)
524
        except:
525
            sys.exit('Error: Install biopython to use this function (pip biopython)')
526
527
        pdb = self.fn
528
        struc = PDB.PDBParser().get_structure('struc', pdb)
529
        resi_prev = {}
530
        op = []
531
        po = []
532
        angles = []
533
        angles2 = []
534
535
        p_ps = []
536
        sigma = 0.009274074
537
        mean =  1.599316
538
        po3p_p  = []
539
        import numpy as np
540
541
        dist_o3pp= []
542
        for s in struc:
543
            for c in s:
544
                # for new chain reset resi_prev
545
                resi_prev = None
546
                for r in c:
547
                    if resi_prev:
548
                        p_p = resi_prev["P"] - r['P']
549
                        p_ps.append(p_p)
550
                        
551
                        v = resi_prev["O3'"] - r['P']
552
                        po3p_p = resi_prev["O3'"] - r['P']
553
                        op.append(v)
554
                        import math
555
                        dist_o3pp.append(abs(1.6 - v))
556
                        x = mean - 6 * sigma
557
                        y = mean + 6 * sigma
558
559
                        v2 = r["P"] - r["O5'"]
560
                        #ic(v2)
561
                        po.append(v2)
562
563
                        if not (x <= v <= y):
564
                            #print(f' ! lack of connectivity between {r}, {resi_prev}, distance between residues: {v:.2f}')
565
                            pass
566
567
                        # angle o3' p o5'
568
                        pc3p = resi_prev["C3'"].get_vector()
569
                        po3p = resi_prev["O3'"].get_vector()
570
                        p = r['P'].get_vector()
571
572
                        v1 = po3p - p
573
                        v2 = po3p - pc3p
574
                        angle = np.rad2deg(v1.angle(v2))#o5p)) 
575
                        angles.append(angle)
576
577
                        # agnle o3'-p-o5' angle
578
                        o5p = r["O5'"].get_vector()
579
                        v1 = po3p - p
580
                        v2 = o5p - p
581
                        angle2 = np.rad2deg(v2.angle(v1))
582
                        angles2.append(angle2)
583
584
                    resi_prev = r
585
        #ic(dist_o3pp, len(dist_o3pp), np.mean(dist_o3pp))
586
        # Colors
587
        RED = "\033[31m"
588
        GREEN = "\033[32m"
589
        YELLOW = "\033[33m"
590
        BLUE = "\033[34m"
591
        RESET = "\033[0m"
592
593
        print(f'Connectivity Score; {GREEN} {sum(dist_o3pp) } {RESET}')
594
        print('Connectivity Score; ', sum(dist_o3pp) / len(dist_o3pp))
595
        p_ps = np.array(p_ps)
596
597
        plots = True
598
599
        ic(v)
0 ignored issues
show
introduced by
The variable v does not seem to be defined for all execution paths.
Loading history...
600
601
        plots = False
602
603
        if plots:
604
            import matplotlib.pyplot as plt
605
            plt.hist(op, bins='auto')
606
            plt.xlim(-1, 10)
607
            plt.title("prev-o3' p")
608
            plt.show()
609
610
        #np.set_printoptions(threshold=np.inf)
611
        # ic.disable()
612
        ic(p_ps)
613
        ic(p_ps.mean(), p_ps.std())
614
615
        if plots:
616
            import matplotlib.pyplot as plt
617
            plt.hist(p_ps, bins='auto')
618
            plt.title("p-ps")
619
            plt.xlim(0, 10)
620
            plt.show()
621
622
            #plt.savefig("histogram.png")  # Save the figure as a PNG file
623
        
624
        op = np.array(op)
625
        ic(op.mean(), op.std())
626
627
        po = np.array(po)
628
        ic(po.mean(), po.std())
629
630
        print("c3'-o3'-p")
631
        angles = np.array(angles)
632
        ic(angles)
633
        ic(angles.mean(), angles.std())
634
635
        import math
636
        angle_degrees = angles.mean()
637
        angle_radians = math.radians(angle_degrees)  # Convert to radians
638
        cos_value = math.cos(angle_radians)  # Calculate cosine
639
        ic(cos_value)
640
        
641
        coss = []
642
        for a in angles:
643
            angle_radians = math.radians(a)
644
            cos_value = math.cos(angle_radians)  # Calculate cosine
645
            coss.append(cos_value)
646
        coss = np.array(coss)
647
        ic(coss.std())
648
649
        if plots:
650
            import matplotlib.pyplot as plt
651
            plt.hist(angles, bins='auto')
652
            plt.title("Histogram of angles o3'-p-o5'")
653
            plt.xlim(0, 360)
654
            plt.show()
655
656
        angles2 = np.array(angles2)
657
        print("o3'-p-o5'")
658
        ic(angles2)
659
        ic(angles2.mean(), angles2.std())
660
661
        angle_degrees = angles2.mean()
662
        angle_radians = math.radians(angle_degrees)  # Convert to radians
663
        cos_value = math.cos(angle_radians)  # Calculate cosine
664
665
        ic(cos_value)
666
667
        coss = []
668
        for a in angles2:
669
            angle_radians = math.radians(a)
670
            cos_value = math.cos(angle_radians)  # Calculate cosine
671
            coss.append(cos_value)
672
        coss = np.array(coss)
673
        ic(coss.std())
674
675
        if plots:
676
            import matplotlib.pyplot as plt
677
            plt.hist(angles2, bins='auto')
678
            plt.title("Histogram of angles c3'-o3'-p")
679
            plt.xlim(0, 360)
680
            plt.show()
681
        #ic.enable()
682
        
683
    def get_text(self, add_end=True):
684
        """works on self.lines."""
685
        txt = ''
686
        for l in self.lines:
687
            if l.startswith('END'):
688
                continue  # skip end
689
            txt += l + '\n' # .strip()
690
        if add_end:
691
            if not l.startswith('END'):
0 ignored issues
show
introduced by
The variable l does not seem to be defined in case the for loop on line 686 is not entered. Are you sure this can never be the case?
Loading history...
692
                txt += 'END'
693
        return txt.strip()
694
695
    def get_chain(self, chain_id='A'):
696
        txt = ''
697
        for l in self.lines:
698
            if l.startswith('ATOM') or l.startswith('HETATM'):
699
                if l[21] == chain_id:
700
                    txt += l.strip() + '\n'
701
            if l.startswith('TER'):
702
                if len(l) < 21: # this is only TER
703
                    #                     ter = 'TER   ' +  str(self.get_atom_num(txt) + 1).rjust(5) + '\n'
704
                    raise Exception('TER line has no chain id')
705
                else:     
706
                    if l[21] == chain_id:
707
                          txt += l.strip() + '\n'
708
        ## ll = txt.strip().split('\n')[-1] # last line = ll
709
        ## print(ll)
710
        ## #if ll.startswith('TER'):  # if the last line does not start with ter
711
        ## ter = self.set_res_code(ter, self.get_res_code(ll)) # + self.get_chain_id(l).rjust(11)
712
        ## print(ter)
713
        return txt.strip()
714
715
    def rename_chain(self, chain_id_old, chain_id_new, debug=False):
716
        """Rename chains
717
718
        Args:
719
            chain_id_old (str): e.g., A
720
            chain_id_new (str): e.g., B
721
            debug (bool): show some diagnostics
722
723
        Returns:
724
            pdb content (txt)
725
            self.lines is updated with new lines
726
        """
727
        txt = ''
728
        lines = []
729
        for l in self.lines:
730
            if l.startswith('ATOM') or l.startswith('HETATM') or l.startswith('TER'):
731
                # TER
732
                try:
733
                    l[21]
734
                except IndexError:
735
                    continue
736
                if l[21] == chain_id_old:
737
                    l = l[:21] + chain_id_new + l[22:]
738
            if debug: print(l)
739
            txt += l.strip() + '\n'  # ok, actually keep all lines as it was
740
            lines.append(l)
741
        self.lines = lines
742
        return txt
743
744
745
    def get_resn_uniq(self):
746
        res = set()
747
        for l in self.lines:
748
            r = l[17:20].strip().upper()
749
            res.add(r)
750
        return res
751
752
    def check_res_if_std_na(self):
753
        wrong = []
754
755
        for r in self.res:
756
            if r not in RES:
757
                wrong.append(r)
758
        return wrong
759
760
    def get_seq(self, compact=False, chainfirst=False, fasta=False, addfn='', color=False):
761
        """Get seq (v2) gets segments of chains with correct numbering
762
763
        Run::
764
765
            python rna_pdb_seq.py input/1ykq_clx.pdb
766
            > 1ykq_clx A:101-111
767
            GGAGCUCGCCC
768
            > 1ykq_clx B:201-238
769
            GGGCGAGGCCGUGCCAGCUCUUCGGAGCAAUACUCGGC
770
771
            > 6_solution_0 A:1-19 26-113 117-172
772
            GGCGGCAGGUGCUCCCGACGUCGGGAGUUAAAAGGGAAG
773
774
        Chains is ``{'A': {'header': 'A:1-19 26-113 117-172', 'resi': [1, 2, 3, ..., \
775
        19, 26, 27, ..., 172], 'seq': ['G', 'G', 'C', 'G', ... C', 'G', 'U', 'C']}}``
776
777
        Chains are in other as the appear in the file.
778
779
        .. warning :: take only ATOM and HETATM lines.
780
        """
781
        if addfn:
782
            addfn += ' ' # add space to file name
783
        seq = self.lines[0][19]
784
        chains = OrderedDict()
785
        resi_prev = None
786
        chain_prev = None
787
788
        for l in self.lines:
789
            if l.startswith('ATOM') or l.startswith('HETATM'):
790
                resi = int(l[22:26])
791
                if resi_prev != resi:
792
                    resname = l[17:20].strip()
793
                    chain_curr = l[21]
794
                    if resname in AMINOACID_CODES:
795
                        resname = aa3to1(resname)
796
                    if len(resname) == 'GTP':  # DG -> g GTP
797
                        resname = 'g'
798
                    if len(resname) > 1:  # DG -> g GTP
799
                        resname = resname[-1].lower()
800
                    try:
801
                        chains[chain_curr]['resi'].append(resi)
802
                        chains[chain_curr]['seq'].append(resname)
803
                    except KeyError:
804
                        chains[chain_curr] = {}
805
                        chains[chain_curr]['resi'] = []
806
                        chains[chain_curr]['resi'].append(resi)
807
                        chains[chain_curr]['seq'] = []
808
                        chains[chain_curr]['seq'].append(resname)
809
810
                    resi_prev = resi
811
                    chain_prev = chain_curr
812
                    
813
        def color_seq(seq, color):
814
            if not color:
815
                return seq
816
            else:
817
                from termcolor import colored
818
                seqc = ''
819
                for s in seq:
820
                    if s in ['G']:
821
                        seqc += colored(s, 'green')
822
                    if s in ['G']:
823
                        seqc += colored(s, 'red')
824
                    if s in ['T', 'U']:
825
                        seqc += colored(s, 'blue')                        
826
                    if s in ['C']:
827
                        seqc += colored(s, 'yellow')                        
828
                return seqc
829
            
830
831
        for c in list(chains.keys()):
832
            header = c + ':' + str(chains[c]['resi'][0]) + '-'  # add A:1-
833
            for i in range(1, len(chains[c]['resi'])):  # start from second element
834
                if chains[c]['resi'][i] - chains[c]['resi'][i - 1] > 1:
835
                    header += '' + str(chains[c]['resi'][i - 1]) + ' '
836
                    header += '' + str(chains[c]['resi'][i]) + '-'
837
            header += '' + str(chains[c]['resi'][-1])
838
            chains[c]['header'] = header  # add -163 (last element)
839
840
        if compact:
841
            txt = ''
842
            for c in list(chains.keys()):
843
                if chainfirst:
844
                    txt += '' + chains[c]['header'].ljust(15) + color_seq(''.join(chains[c]['seq']), color) + ' '
845
                elif fasta:
846
                    txt += color_seq(''.join(chains[c]['seq']), color) + ' '
847
                else:
848
                    txt += color_seq(''.join(chains[c]['seq']), color) + ' # ' + chains[c]['header'] + ' '
849
            return txt.strip()
850
        else:
851
            txt = ''
852
            for c in list(chains.keys()):
853
                txt += '>' + addfn + chains[c]['header'] + '\n'
854
                txt += color_seq(''.join(chains[c]['seq']), color) + '\n' # color ;-)
855
            return txt.strip()
856
857
    def __get_seq(self):
858
        """get_seq DEPRECATED
859
860
        You get `chains` such as:
861
        OrderedDict([('A', {'header': 'A:1-47', 'seq': 'CGUGGUUAGGGCCACGUUAAAUAGUUGCUUAAGCCCUAAGCGUUGAU'}), ('B', {'header': 'B:48-58', 'seq': 'AUCAGGUGCAA'})])
862
863
        .. warning:: take only ATOM and HETATM lines.
864
        """
865
        seq = ''
866
        curri = int(self.lines[0][22:26])
867
        seq = self.lines[0][19]
868
        chains = OrderedDict()
869
        curri = -100000000000000000000000000000000  # ugly
870
        chain_prev = None
871
872
        for l in self.lines:
873
            if l.startswith('ATOM') or l.startswith('HETATM'):
874
                resi = int(l[22:26])
875
                if curri != resi:
876
                    print(l)
877
                    resname = l[17:20].strip()
878
                    if len(resname) == 'GTP':  # DG -> g GTP
879
                        resname = 'g'
880
                    if len(resname) > 1:  # DG -> g GTP
881
                        resname = resname[-1].lower()
882
883
                    seq += resname
884
                    chain_curr = l[21]
885
886
                    # if distances between curr res and previevs is bigger than 1, then show it as a fragment
887
                    if resi - curri > 1 and resi - curri < 100000000000000000000000000000000:  # ugly hack
888
                        print(resi - curri)
889
                        chains[chain_prev]['header'] += '-' + str(resi_prev)
0 ignored issues
show
introduced by
The variable resi_prev does not seem to be defined for all execution paths.
Loading history...
890
                    if chain_prev != chain_curr and chain_prev:
891
                        chains[chain_prev]['header'] += '-' + str(resi_prev)
892
                    if chaoin_curr in chains:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable chaoin_curr does not seem to be defined.
Loading history...
893
                        chains[chain_curr]['seq'] += resname
894
                    else:
895
                        chains[chain_curr] = dict()
896
                        chains[chain_curr]['header'] = chain_curr + ':' + str(resi)  # resi_prev)
897
                        chains[chain_curr]['seq'] = resname
898
                    resi_prev = resi
899
                    chain_prev = chain_curr
900
                curri = resi
901
        chains[chain_prev]['header'] += '-' + str(resi_prev)
902
        seq = ''
903
        for c in list(chains.keys()):
904
            seq += '> ' + os.path.basename(self.fn) + ' ' + chains[c]['header'] + '\n'
905
            seq += chains[c]['seq'] + '\n'
906
        return seq.strip()
907
908
    def get_info_chains(self):
909
        """return A:3-21 B:22-32
910
        """
911
        seq = ''
912
        curri = int(self.lines[0][22:26])
913
        seq = self.lines[0][19]
914
        chains = OrderedDict()
915
        curri = -100000000000000  # ugly
916
        chain_prev = None
917
        for l in self.lines:
918
            if l.startswith('ATOM') or l.startswith('HETATM'):
919
                resi = int(l[22:26])
920
                if curri != resi:
921
                    resname = l[17:20].strip()
922
                    if len(resname) == 'GTP':  # DG -> g GTP
923
                        resname = 'g'
924
                    if len(resname) > 1:  # DG -> g GTP
925
                        resname = resname[-1].lower()
926
                    seq += resname
927
                    chain_curr = l[21]
928
                    if chain_prev != chain_curr and chain_prev:
929
                        chains[chain_prev]['header'] += '-' + str(resi_prev)
0 ignored issues
show
introduced by
The variable resi_prev does not seem to be defined for all execution paths.
Loading history...
930
                    if chain_curr in chains:
931
                        chains[chain_curr]['seq'] += resname
932
                    else:
933
                        chains[chain_curr] = dict()
934
                        chains[chain_curr]['header'] = chain_curr + ':' + str(resi)  # resi_prev)
935
                        chains[chain_curr]['seq'] = resname
936
                    resi_prev = resi
937
                    chain_prev = chain_curr
938
                curri = resi
939
        chains[chain_prev]['header'] += '-' + str(resi_prev)
940
        seq = ''
941
        for c in list(chains.keys()):
942
            seq += chains[c]['header'] + ' '
943
        return seq.strip()
944
945
    def detect_file_format(self):
946
        pass
947
948
    def detect_molecule_type(self):
949
        aa = []
950
        na = []
951
        for r in self.res:
952
            if r in AMINOACID_CODES:
953
                aa.append(r)
954
            if r in RESS:
955
                na.append(r)
956
957
        aa = float(len(aa)) / len(self.res)
958
        na = float(len(na)) / len(self.res)
959
960
        if aa == 0 and na == 0:
961
            return 'error'
962
        if aa > na:
963
            return '>protein< vs na', aa, na
964
        else:
965
            return 'protein vs >na<', aa, na
966
967
    def get_head(self):
968
        return '\n'.join(self.lines[:5])
969
970
    def get_tail(self):
971
        return '\n'.join(self.lines[-5:])
972
973
    def get_preview(self):
974
        t = '\n'.join(self.lines[:5])
975
        t += '\n-------------------------------------------------------------------\n'
976
        t += '\n'.join(self.lines[-5:])
977
        return t
978
979
    def remove_hydrogen(self):
980
        lines = []
981
        for l in self.lines:
982
            if l[77:79].strip() == 'H':
983
                continue
984
            if l[12:16].strip() in HYDROGEN_NAMES:
985
                # if l[12:16].strip().startswith('H'):
986
                continue
987
            else:
988
                # print l[12:16]
989
                lines.append(l)
990
        self.lines = lines
991
992
    def remove_water(self):
993
        """Remove HOH and TIP3"""
994
        lines = []
995
        for l in self.lines:
996
            if l[17:21].strip() in ['HOH', 'TIP3', 'WAT']:
997
                continue
998
            else:
999
                lines.append(l)
1000
        self.lines = lines
1001
1002
    def remove_ion(self):
1003
        """
1004
    TER    1025        U A  47
1005
    HETATM 1026 MG    MG A 101      42.664  34.395  50.249  1.00 70.99          MG
1006
    HETATM 1027 MG    MG A 201      47.865  33.919  48.090  1.00 67.09          MG
1007
        :rtype: object
1008
        """
1009
        lines = []
1010
        for l in self.lines:
1011
            element = l[76:78].strip().upper()
1012
            element2 = l[17:20].strip().upper()
1013
            if element in IONS:
1014
                continue
1015
            if element2 in IONS:
1016
                continue
1017
            else:
1018
                lines.append(l)
1019
        self.lines = lines
1020
1021
    def fixU__to__U(self):
1022
        lines = []
1023
        for l in self.lines:
1024
            if l.startswith('ATOM') or l.startswith('HETATM'):
1025
                rn = l[17:20]
1026
                rn = rn.replace('G  ', '  G')
1027
                rn = rn.replace('U  ', '  U')
1028
                rn = rn.replace('C  ', '  C')
1029
                rn = rn.replace('A  ', '  A')
1030
                l = l[:16] + ' ' + rn + ' ' + l[21:]
1031
            # print l.strip()
1032
            # print l2
1033
            #l = l.replace(' U   ', '   U ')
1034
            #l = l.replace(' G   ', '   G ')
1035
            #l = l.replace(' A   ', '   A ')
1036
            #l = l.replace(' C   ', '   C ')
1037
            lines.append(l)
1038
        print ('fixU__to__U OK')
1039
        self.report.append('  Fix: U__ -> __U')
1040
        self.lines = lines
1041
1042
    def resn_as_dna(self):
1043
        lines = []
1044
        for l in self.lines:
1045
            if l.startswith('ATOM') or l.startswith('HETATM'):
1046
                # print l
1047
                nl = l.replace('DA5', ' DA')  # RA should be the last!!!!
1048
                nl = nl.replace('DA3', ' DA')
1049
                nl = nl.replace(' DA', ' DA')
1050
                nl = nl.replace(' rA', ' DA')
1051
1052
                nl = nl.replace('DC5', ' DC')
1053
                nl = nl.replace('DC3', ' DC')
1054
                nl = nl.replace(' DC', ' DC')
1055
                nl = nl.replace(' rC', ' DC')
1056
1057
                nl = nl.replace('DG5', ' DG')
1058
                nl = nl.replace('DG3', ' DG')
1059
                nl = nl.replace(' DG', ' DG')
1060
                nl = nl.replace(' rG', ' DG')
1061
1062
                nl = nl.replace('DU5', ' DU')
1063
                nl = nl.replace('DU3', ' DU')
1064
                nl = nl.replace(' DU', ' DU')
1065
                nl = nl.replace(' rU', ' DU')
1066
1067
                nl = nl.replace('DT5', ' DT')
1068
                nl = nl.replace('DT3', ' DT')
1069
                nl = nl.replace(' DT', ' DT')
1070
                nl = nl.replace(' rT', ' DT')
1071
1072
                nl = nl.replace('C5M', 'C7 ')
1073
1074
                if l[17:20].strip() == 'G':
1075
                    nl = nl[:17] + ' DG' + nl[20:]
1076
1077
                if l[17:20].strip() == 'C':
1078
                    nl = nl[:17] + ' DC' + nl[20:]
1079
1080
                if l[17:20].strip() == 'T':
1081
                    nl = nl[:17] + ' DT' + nl[20:]
1082
1083
                if l[17:20].strip() == 'U':
1084
                    nl = nl[:17] + ' DU' + nl[20:]
1085
1086
                if l[17:20].strip() == 'A':
1087
                    nl = nl[:17] + ' DA' + nl[20:]
1088
1089
                lines.append(nl)
1090
            if l.startswith("END") or l.startswith("TER"):
1091
                lines.append(l)
1092
1093
        print ('resn_as_dna')
1094
        self.report.append('  resn_as_dna')
1095
        self.lines = lines
1096
1097
    def fix_O_in_UC(self):
1098
        """.. warning: remove RU names before using this function"""
1099
        lines = []
1100
        for l in self.lines:
1101
            # if l[12:16].strip() in
1102
            # if l[12:16].strip().startswith('H'):
1103
            nl = l.replace('O     U',
1104
                           'O2    U')
1105
            nl = nl.replace('O     C',
1106
                            'O2    C')
1107
            lines.append(nl)
1108
        self.lines = lines
1109
1110
    def fix_op_atoms(self):
1111
        """Replace OXP' to OPX1, e.g ('O1P' -> 'OP1')"""
1112
        lines = []
1113
        for l in self.lines:
1114
            nl = l.replace('*', '\'')
1115
            nl = nl.replace('O1P', 'OP1')
1116
            nl = nl.replace('O2P', 'OP2')
1117
            nl = nl.replace('O3P', 'OP3')
1118
            lines.append(nl)
1119
        self.lines = lines
1120
1121
    def get_report(self):
1122
        """
1123
        Returns:
1124
            string: report, messages collected on the way of parsing this file
1125
        """
1126
        return '\n'.join(self.report)
1127
1128
    def is_rna(self):
1129
        wrong = []
1130
        for r in self.res:
1131
            if r.upper().strip() in ['RC', 'RU', 'RA', 'RG', 'RT']:
1132
                if r not in wrong_res:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable wrong_res does not seem to be defined.
Loading history...
1133
                    wrong_res.append(r)
1134
        return wrong_res
1135
1136
    def check_res_if_std_dna(self):
1137
        wrong_res = []
1138
        for r in self.res:
1139
            if r.upper().strip() in ['A', 'T', 'C', 'G']:
1140
                if r not in wrong_res:
1141
                    wrong_res.append(r)
1142
        return wrong_res
1143
1144
    def check_res_if_supid_rna(self):
1145
        wrong_res = []
1146
        for r in self.res:
1147
            if r.upper().strip() in ['RC', 'RU', 'RA', 'RG', 'RT']:
1148
                if r not in wrong_res:
1149
                    wrong_res.append(r)
1150
        return wrong_res
1151
1152
1153
    def is_rna(self):
1154
        for r in self.res:
1155
            if r.upper().strip() in ['RC', 'RU', 'RA', 'RG', 'RT']:
1156
                if r not in wrong_res:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable wrong_res does not seem to be defined.
Loading history...
1157
                    wrong_res.append(r)
1158
        return wrong_res
1159
1160
    def renum_atoms(self):
1161
        """Renum atoms, from 1 to X for line; ATOM/HETATM"""
1162
        lines = []
1163
        c = 1
1164
        for l in self.lines:
1165
            l = l[:6] + str(c).rjust(5) + l[11:]
1166
            c += 1
1167
            lines.append(l)
1168
        self.lines = lines
1169
1170
    def std_resn(self):
1171
        """'Fix' residue names which means to change them to standard, e.g. RA5 -> A
1172
1173
        Works on self.lines, and returns the result to self.lines.
1174
1175
        Will change things like::
1176
1177
            # URI -> U, URA -> U
1178
            1xjr_clx_charmm.pdb:ATOM    101  P   URA A   5      58.180  39.153  30.336  1.00 70.94
1179
            rp13_Dokholyan_1_URI_CYT_ADE_GUA_hydrogens.pdb:ATOM  82  P   URI A   4     501.633 506.561 506.256  1.00  0.00         P
1180
        """
1181
        lines = []
1182
        for l in self.lines:
1183
            nl = l.replace('RA5', '  A')  # RA should be the last!!!!
1184
            nl = nl.replace('RA3', '  A')
1185
            nl = nl.replace('ADE', '  A')
1186
            nl = nl.replace(' RA', '  A')
1187
            nl = nl.replace(' rA', '  A')
1188
1189
            nl = nl.replace('RC5', '  C')
1190
            nl = nl.replace('RC3', '  C')
1191
            nl = nl.replace('CYT', '  C')
1192
            nl = nl.replace(' RC', '  C')
1193
            nl = nl.replace(' rC', '  C')
1194
1195
            nl = nl.replace('RG5', '  G')
1196
            nl = nl.replace('RG3', '  G')
1197
            nl = nl.replace('GUA', '  G')
1198
            nl = nl.replace(' RG', '  G')
1199
            nl = nl.replace(' rG', '  G')
1200
1201
            nl = nl.replace('RU5', '  U')
1202
            nl = nl.replace('RU3', '  U')
1203
            nl = nl.replace('URA', '  U')
1204
            nl = nl.replace('URI', '  U')
1205
            nl = nl.replace('URY', '  U')
1206
            nl = nl.replace(' RU', '  U')
1207
            nl = nl.replace(' rU', '  U')
1208
1209
            nl = nl.replace('RT5', '  T')
1210
            nl = nl.replace('RT3', '  T')
1211
            nl = nl.replace('THY', '  T')
1212
            nl = nl.replace(' RT', '  T')
1213
            nl = nl.replace(' rT', '  T')
1214
1215
            lines.append(nl)
1216
1217
        self.lines = lines
1218
1219
    def check_res_if_std_prot(self):
1220
        wrong = []
1221
        for r in self.res:
1222
            if r not in AMINOACID_CODES:
1223
                wrong.append(r)
1224
        return wrong
1225
1226
    def write(self, outfn='', verbose=False):
1227
        """Write ```self.lines``` to a file (and add END file)
1228
1229
        Args:
1230
            outfn (str): file to save, if outfn is '', then simply use self.fn
1231
            verbose (Boolen): be verbose or not
1232
1233
        Returns:
1234
            None
1235
1236
        """
1237
        if outfn == '':
1238
            outfn = self.fn
1239
        f = open(outfn, 'w')
1240
        # test if there is anything to write, if not, it's likely that the
1241
        # file is not a PDB file, e.g. .outCR
1242
        if not self.lines:
1243
            raise Exception('Nothing to write. Is the input a PDB file? ', self.fn)
1244
        for l in self.lines:
1245
            f.write(l + '\n')
1246
        if not l.startswith('END'):
0 ignored issues
show
introduced by
The variable l does not seem to be defined in case the for loop on line 1244 is not entered. Are you sure this can never be the case?
Loading history...
1247
            f.write('END')
1248
        f.close()
1249
        if verbose:
1250
            print('Write %s' % outfn)
1251
1252
    def get_atom_num(self, line):
1253
        """Extract atom number from a line of PDB file
1254
        Arguments:
1255
          * line = ATOM line from a PDB file
1256
        Output:
1257
          * atom number (int)
1258
        """
1259
        return int(''.join([x for x in line[6:11] if x.isdigit()]))
1260
1261
    def get_res_num(self, line):
1262
        """Extract residue number from a line of PDB file
1263
        Arguments:
1264
          * line = ATOM line from a PDB file
1265
        Output:
1266
          * residue number as an integer
1267
        """
1268
        return int(''.join([x for x in line[22:27] if x.isdigit()]))
1269
1270
    def get_res_code(self, line):
1271
        """Get residue code from a line of a PDB file
1272
        """
1273
        #if not line.startswith('ATOM'):
1274
        #    return None
1275
        return line[17:20]
1276
1277
    def shift_atom_names(self):
1278
        nlines = []
1279
        for l in self.lines:
1280
            if l.startswith('ATOM'):
1281
                atom_name = self.get_atom_code(l)
1282
                l = self.set_atom_code(l, atom_name)
1283
            nlines.append(l)
1284
        self.lines = nlines
1285
1286
    def prune_elements(self):
1287
        nlines = []
1288
        for l in self.lines:
1289
            if l.startswith('ATOM'):
1290
                l = l[:76] + ' ' + l[78:]
1291
            nlines.append(l)
1292
        self.lines = nlines
1293
1294
    def get_atom_code(self, line):
1295
        """Get atom code from a line of a PDB file
1296
        """
1297
        if not line.startswith('ATOM'):
1298
            return None
1299
        return line[12:16].replace(' ', '').strip()
1300
1301
    def get_atom_coords(self, line):
1302
        """Get atom coordinates from a line of a PDB file
1303
        """
1304
        if not line.startswith('ATOM'):
1305
            return None
1306
        return tuple(map(float, line[31:54].split()))
1307
1308
    def set_atom_coords(self, line, x, y, z):
1309
        """Get atom coordinates from a line of a PDB file
1310
        """
1311
        line = line[:30] + (f"{x:>8.3f}") + line[38:]
1312
        line = line[:38] + (f"{y:>8.3f}") + line[46:]
1313
        line = line[:46] + (f"{z:>8.3f}") + line[54:]
1314
        return  line
1315
1316
    def set_line_bfactor(self, line, bfactor):
1317
        if not line.startswith('ATOM'):
1318
            return None
1319
        return line[:60] + (" %5.2f" % bfactor) + line[66:]
1320
1321
    def set_atom_occupancy(self, line, occupancy):
1322
        """set occupancy for line"""
1323
        return line[:54] + (" %5.2f" % occupancy) + line[60:]
1324
1325
    def set_atom_code(self, line, code):
1326
        """Add atom name/code:
1327
        
1328
           ATOM      1  OP2   C A   1      29.615  36.892  42.657  1.00  1.00          O
1329
                        ^^^                                                            ^ and element
1330
        """
1331
        return line[:12] + ' ' + code + ' ' * (3 - len(code)) + line[16:77] + code[0] + line[78:]
1332
1333
    def set_res_code(self, line, code):
1334
        """
1335
        Args:
1336
            lines
1337
            code
1338
        path (str): The path of the file to wrap
1339
        field_storage (FileStorage): The :class:Y instance to wrap
1340
            temporary (bool): Whether or not to delete the file when the File
1341
            instance is destructed
1342
1343
        Returns:
1344
            BufferedFileStorage: A buffered writable file descriptor
1345
        """
1346
        return line[:17] + code.rjust(3) + line[20:]
1347
1348
    def get_chain_id(self, line):
1349
        return line[21:22]
1350
1351
    def get_all_chain_ids(self):
1352
        """
1353
        Returns:
1354
           set: chain ids, e.g. set(['A', 'B'])
1355
        """
1356
        chain_ids = []
1357
        for l in self.lines:
1358
            chain_id = self.get_chain_id(l)
1359
            if chain_id and chain_id not in chain_ids:
1360
                chain_ids.append(chain_id)
1361
        return chain_ids
1362
1363
    def get_atom_index(self, line):
1364
        try:
1365
            return int(line[6:11])
1366
        except:
1367
            return None
1368
1369
    def set_atom_index(self, line, index):
1370
        return line[:6] + str(index).rjust(5) + line[11:]
1371
1372
    def reindex_atom_index(self):
1373
        self.nlines = []
1374
        for i, l in enumerate(self.lines):
1375
            nl = self.set_atom_index(l, i + 1)  # start from 1
1376
            self.nlines.append(nl)
1377
        self.lines = self.nlines
1378
        return self.nlines
1379
1380
    def get_res_index(self, line):
1381
        return int(line[22:26])
1382
1383
    def set_res_index(self, line, index):
1384
        return line[:23] + str(index).rjust(3) + line[26:]
1385
1386
    def set_chain_id(self, line, chain_id):
1387
        return line[:21] + chain_id + line[22:]
1388
1389
    def get_rnapuzzle_ready(self, renumber_residues=True, fix_missing_atoms=True,
1390
                            rename_chains=True,
1391
                            ignore_op3 = False,
1392
                            report_missing_atoms=True, keep_hetatm=False, backbone_only=False,
1393
                            no_backbone = False, bases_only = False, save_single_res = False,
1394
                            ref_frame_only = False,
1395
                            p_only = False,
1396
                            check_geometry = False,
1397
                            conect = False,
1398
                            conect_no_linkage = False,
1399
                            verbose=False):  # :, ready_for="RNAPuzzle"):
1400
        """Get rnapuzzle (SimRNA) ready structure.
1401
1402
        Clean up a structure, get current order of atoms.
1403
1404
        :param renumber_residues: boolean, from 1 to ..., second chain starts from 1 etc.
1405
        :param fix_missing_atoms: boolean, superimpose motifs from the minilibrary and copy-paste missing atoms, this is super crude, so should be used with caution.
1406
1407
        Submission format @http://ahsoka.u-strasbg.fr/rnapuzzles/
1408
1409
        Run :func:`rna_tools.rna_tools.lib.RNAStructure.std_resn` before this function to fix names.
1410
        
1411
        .. image:: ../pngs/rebuild_op1op2_backbone.png
1412
        Figure: (Starting from left) input structure, structure with rebuilded atoms, and reference. The B fragment is observed in the reference used here as a “benchmark”, fragment A is reconstructed atoms (not observed in the reference"). 201122
1413
1414
        - 170305 Merged with get_simrna_ready and fixing OP3 terminal added
1415
        - 170308 Fix missing atoms for bases, and O2'
1416
1417
        .. image:: ../pngs/fix_missing_o_before_after.png
1418
        Fig. Add missing O2' atom (before and after).
1419
1420
        .. image:: ../pngs/fix_missing_superposition.png
1421
        Fig. The residue to fix is in cyan. The G base from the library in red. Atoms O4', C2', C1' are shared between the sugar (in cyan) and the G base from the library (in red). These atoms are used to superimpose the G base on the sugar, and then all atoms from the base are copied to the residues.
1422
1423
        .. image:: ../pngs/fix_missing_bases.png
1424
        **Fig.** Rebuild ACGU base-less. It's not perfect but good enough for some applications.
1425
1426
        .. warning:: It was only tested with the whole base missing!
1427
1428
        .. warning:: requires: Biopython
1429
1430
        Selection of atoms:
1431
        
1432
        - posphate group (3x, OP1 ,P, OP2),
1433
        - connector (2x O5', C5'), /5x
1434
        - sugar (5x, C4', O4', C3', O3', C1', C2'), /10
1435
        - extra oxygens from sugar (2x, O2' O3'), for now it's /12!
1436
        - A (10x), G (11x), C (8x), U(8x), max 12+11=23
1437
1438
        And 27 unique atoms: {'N9', 'O2', 'OP2', "O2'", "O4'", 'C8', "O3'", "C1'", 'C2', 'C6', "C5'", 'N6', 'C5', "C4'", 'C4', "O5'", "C3'", 'O6', 'N2', 'N7', 'OP1', 'N1', 'N4', 'P', "C2'", 'N3', 'O4'}.
1439
        """
1440
1441
        if verbose:
1442
            logger.setLevel(logging.DEBUG)
1443
1444
        try:
1445
            from Bio import PDB
1446
            from Bio.PDB import PDBIO
1447
            import warnings
1448
            warnings.filterwarnings('ignore', '.*Invalid or missing.*',)
1449
            warnings.filterwarnings('ignore', '.*with given element *',)
1450
        except:
1451
            sys.exit('Error: Install biopython to use this function (pip biopython)')
1452
1453
        import copy
1454
        # for debugging
1455
        #renumber_residues = True
1456
        # if ready_for == "RNAPuzzle":
1457
1458
        if p_only:
1459
            G_ATOMS = "P".split()
1460
            A_ATOMS = "P".split()
1461
            U_ATOMS = "P".split()
1462
            C_ATOMS = "P".split()
1463
        elif backbone_only:
1464
            G_ATOMS = "P OP1 OP2 O5' C5'".split()
1465
            A_ATOMS = "P OP1 OP2 O5' C5'".split()
1466
            U_ATOMS = "P OP1 OP2 O5' C5'".split()
1467
            C_ATOMS = "P OP1 OP2 O5' C5'".split()
1468
            #G_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split()
1469
            #A_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split()
1470
            #U_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split()
1471
            #C_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split()
1472
1473
        elif ref_frame_only:
1474
            G_ATOMS = "P OP1 OP2".split()
1475
            A_ATOMS = "P OP1 OP2".split()
1476
            U_ATOMS = "P OP1 OP2".split()
1477
            C_ATOMS = "P OP1 OP2".split()
1478
            
1479
        elif no_backbone:
1480
            G_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split() 
1481
            A_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split()     
1482
            U_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 O4 C5 C6".split()
1483
            C_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 N4 C5 C6".split()
1484
1485
        elif bases_only:
1486
            G_ATOMS = "N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split()
1487
            A_ATOMS = "N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split()
1488
            U_ATOMS = "N1 C2 O2 N3 C4 O4 C5 C6".split()
1489
            C_ATOMS = "N1 C2 O2 N3 C4 N4 C5 C6".split()
1490
        else:
1491
            G_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split() # 23 
1492
            A_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split()    # 22
1493
            U_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 O4 C5 C6".split()          # 20
1494
            C_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 N4 C5 C6".split()          # 20
1495
1496
        # hmm.. is it the same as RNApuzzle???
1497
        # if ready_for == "SimRNA":
1498
        #    G_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split()
1499
        #    A_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split()
1500
        #    U_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 O4 C5 C6".split()
1501
        #    C_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 N4 C5 C6".split()
1502
1503
        tf = tempfile.NamedTemporaryFile(delete=False)
1504
        ftmp = tf.name
1505
        self.write(ftmp, verbose=False)
1506
1507
        parser = PDB.PDBParser()
1508
        #try:
1509
        struct = parser.get_structure('', ftmp)
1510
        #except:
1511
        #    print('Error in ' + self.fn)
1512
        #    sys.exit(1)
1513
        model = struct[0]
1514
1515
        s2 = PDB.Structure.Structure(struct.id)
1516
        m2 = PDB.Model.Model(model.id)
1517
1518
        chains2 = []
1519
1520
        missing = []
1521
        fixed = []
1522
        protein_chains_remmoved = []
1523
1524
        new_chains = list(string.ascii_uppercase)
1525
        remarks = []
1526
        if ignore_op3:
1527
             remarks.append("REMARK 250  don't add OP3 at the start of chain")
1528
        # get path to mini db with models to rebuilt structures
1529
        currfn = __file__
1530
        if currfn == '':
1531
            path = '.'
1532
        else:
1533
            path = os.path.dirname(currfn)
1534
        if os.path.islink(currfn):  # path + os.sep + os.path.basename(__file__)):
1535
            path = os.path.dirname(os.readlink(
1536
                path + os.sep + os.path.basename(currfn)))
1537
1538
        # for each chain #
1539
        for chain in model.get_list():
1540
            logger.debug('chain: %s' % chain)
1541
1542
            # is it RNA? ############################
1543
            protein_like = 0
1544
            for c, r in enumerate(chain, 1):
1545
                if r.resname in AMINOACID_CODES:
1546
                    protein_like += 1
1547
            if (protein_like / float(c + 1)) > .8:  # 90%
0 ignored issues
show
introduced by
The variable c does not seem to be defined in case the for loop on line 1539 is not entered. Are you sure this can never be the case?
Loading history...
1548
                protein_chains_remmoved.append(chain.get_id())
1549
            # ######################################
1550
1551
            res = []
1552
            for r in chain:
1553
                res.append(r)
1554
1555
            res = copy.copy(res)
1556
1557
            # start chains from A..BCD. etc
1558
1559
            if rename_chains:
1560
                try:
1561
                    chain.id = new_chains.pop(0)
1562
                except ValueError:
1563
                    # ValueError: Cannot change id from `A` to `A`.
1564
                    # The id `A` is already used for a sibling of this entity.
1565
                    # so keep it as it was
1566
                    pass
1567
1568
            c2 = PDB.Chain.Chain(chain.id)
1569
1570
            c = 1  # new chain, goes from 1 !!! if renumber True
1571
            prev_r = '' # init prev_r 
1572
1573
            conects = []
1574
            atom_index = 0
1575
            for r in res:
1576
                #
1577
                #  deal with heteratm
1578
                #
1579
                if r.id[0].startswith('H_') or r.id[0].startswith('W'):
1580
                    if keep_hetatm:
1581
                        c2.add(r)
1582
                    else:
1583
                        # str(r) % r.id[0].replace('H_', '').strip())
1584
                        remarks.append('REMARK 250 Hetatom to be removed from file: ' + str(r))
1585
                    continue
1586
                # hack for amber/qrna
1587
                r.resname = r.resname.strip()
1588
                if r.resname == 'RC3':
1589
                    r.resname = 'C'
1590
                if r.resname == 'RU3':
1591
                    r.resname = 'U'
1592
                if r.resname == 'RG3':
1593
                    r.resname = 'G'
1594
                if r.resname == 'RA3':
1595
                    r.resname = 'A'
1596
1597
                if r.resname == 'C3':
1598
                    r.resname = 'C'
1599
                if r.resname == 'U3':
1600
                    r.resname = 'U'
1601
                if r.resname == 'G3':
1602
                    r.resname = 'G'
1603
                if r.resname == 'A3':
1604
                    r.resname = 'A'
1605
1606
                if r.resname == 'RC5':
1607
                    r.resname = 'C'
1608
                if r.resname == 'RU5':
1609
                    r.resname = 'U'
1610
                if r.resname == 'RG5':
1611
                    r.resname = 'G'
1612
                if r.resname == 'RA5':
1613
                    r.resname = 'A'
1614
1615
                if r.resname == 'C5':
1616
                    r.resname = 'C'
1617
                if r.resname == 'U5':
1618
                    r.resname = 'U'
1619
                if r.resname == 'G5':
1620
                    r.resname = 'G'
1621
                if r.resname == 'A5':
1622
                    r.resname = 'A'
1623
1624
                if r.resname.strip() == 'RC':
1625
                    r.resname = 'C'
1626
                if r.resname.strip() == 'RU':
1627
                    r.resname = 'U'
1628
                if r.resname.strip() == 'RG':
1629
                    r.resname = 'G'
1630
                if r.resname.strip() == 'RA':
1631
                    r.resname = 'A'
1632
1633
                # unmodified rna 2MG -> G and take only G atoms
1634
                if (r.resname.strip() not in ['C', 'U', 'G', 'A']) and \
1635
                        (r.resname.strip()[-1] in ['C', 'U', 'G', 'A']):
1636
                    r.resname = r.resname.strip()[-1].strip()
1637
1638
                r2 = PDB.Residue.Residue(r.id, r.resname.strip(), r.segid)
1639
                if renumber_residues:
1640
                    r2.id = (r2.id[0], c, r2.id[2])  # renumber residues
1641
                #
1642
                # experimental: fixing missing OP3.
1643
                # Only for the first residues.
1644
                #
1645
                if c == 1:
1646
                    # if p_missing
1647
                    p_missing = True
1648
                    # if p_missing:
1649
                    #    try:
1650
                    #        x = r["O5'"]
1651
                    #        x.id =       ' P'
1652
                    #        x.name =     ' P'
1653
                    #        x.fullname = ' P'
1654
                    #        print "REMARK 000 FIX O5' -> P fix in chain ", chain.id
1655
                    #    except:
1656
                    #        pass
1657
                    for a in r:
1658
                        if a.id == 'P':
1659
                            p_missing = False
1660
                    logger.debug('p_missing %s' % p_missing)
1661
1662
                    if p_missing and fix_missing_atoms:
1663
                        currfn = __file__
1664
                        if currfn == '':
1665
                            path = '.'
1666
                        else:
1667
                            path = os.path.dirname(currfn)
1668
                        if os.path.islink(currfn):  # path + os.sep + os.path.basename(__file__)):
1669
                            path = os.path.dirname(os.readlink(
1670
                                path + os.sep + os.path.basename(currfn)))
1671
1672
                        po3_struc = PDB.PDBParser().get_structure('', path + '/data/PO3_inner.pdb')
1673
                        po3 = [po3_atom for po3_atom in po3_struc[0].get_residues()][0]
1674
1675
                        r_atoms = [r["O4'"], r["C4'"], r["C3'"]]
1676
                        po3_atoms = [po3["O4'"], po3["C4'"], po3["C3'"]]
1677
1678
                        sup = PDB.Superimposer()
1679
                        sup.set_atoms(r_atoms, po3_atoms)
1680
                        rms = round(sup.rms, 3)
1681
1682
                        sup.apply(po3_struc.get_atoms())  # to all atoms of po3
1683
1684
                        r.add(po3['P'])
1685
                        r.add(po3['OP1'])
1686
                        r.add(po3['OP2'])
1687
                        try:
1688
                            r.add(po3["O5'"])
1689
                        except:
1690
                            del r["O5'"]
1691
                            r.add(po3["O5'"])
1692
1693
                        fixed.append(['add OP3 at the beginning of the chain ', chain.id, r, r.id[1]])
1694
1695
                    p_missing = False  # off this function
1696
1697
                    # save it
1698
                    #io = PDB.PDBIO()
1699
                    #io.set_structure( po3_struc )
1700
                    # io.save("po3.pdb")
1701
1702
                #
1703
                # fix missing O2'
1704
                #
1705
                o2p_missing = True
1706
                for a in r:
1707
                    logger.debug('o2p_missing: %s %s %s' % (r, o2p_missing, a.id))
1708
                    if a.id == "O2'":
1709
                        o2p_missing = False
1710
                logger.debug('o2p_missing: %s', o2p_missing)
1711
1712
                if o2p_missing and fix_missing_atoms:
1713
                    currfn = __file__
1714
                    if currfn == '':
1715
                        path = '.'
1716
                    else:
1717
                        path = os.path.dirname(currfn)
1718
                    if os.path.islink(currfn):  # path + os.sep + os.path.basename(__file__)):
1719
                        path = os.path.dirname(os.readlink(
1720
                            path + os.sep + os.path.basename(currfn)))
1721
1722
                    o2p_struc = PDB.PDBParser().get_structure('', path + '/data/o2prim.pdb')
1723
                    o2p = [o2p_atom for o2p_atom in o2p_struc[0].get_residues()][0]
1724
1725
                    try:
1726
                        r_atoms = [r["C3'"], r["C2'"], r["C1'"]]
1727
                    except:
1728
                        ic(r)
1729
                    o2p_atoms = [o2p["C3'"], o2p["C2'"], o2p["C1'"]]
1730
1731
                    sup = PDB.Superimposer()
1732
                    sup.set_atoms(r_atoms, o2p_atoms)
1733
                    rms = round(sup.rms, 3)
1734
1735
                    sup.apply(o2p_struc.get_atoms())  # to all atoms of o2p
1736
1737
                    r.add(o2p["O2'"])
1738
                    logger.debug('fixing o2p for ' % r)
1739
                    fixed.append(['add O2\' ', chain.id, r, c])
1740
1741
                o2p_missing = False  # off this function
1742
                ######## fixing of missing OP1 and OP2 atoms in backbone ###########
1743
                if fix_missing_atoms:
1744
                    if 'OP1' not in r:
1745
                        if prev_r:
1746
                            op4 = PDB.PDBParser().get_structure('', path + '/data/op4.pdb')
1747
                            op4a = [a for a in op4[0].get_residues()][0]
1748
                            if 'P' not in r:
1749
                                print("Error missing P of residue " + chain.id +  ':' + str(r.id[1]) + ". Sorry, I can't rebuild missing atoms!")
1750
                                exit()
1751
                            r_atoms = [r["O5'"], r["P"], prev_r["O3'"]] #  r["C5'"], 
1752
                            op4_atoms = [op4a["O5'"], op4a["P"], op4a["O3'"]] # op4a["C5'"]] #, 
1753
1754
                            sup = PDB.Superimposer()
1755
                            sup.set_atoms(r_atoms, op4_atoms)
1756
                            sup.apply(op4.get_atoms())
1757
1758
                            r.add(op4a["OP1"])
1759
                            r.add(op4a["OP2"])
1760
                            fixed.append(['add the missing OP1 and OP2', chain.id, r, r.id[1]])
1761
1762
                        else:
1763
                            remarks.append("REMARK 250 Warning: Can't fix missing OP1 or/and OP2 atoms of " + str(r))
1764
                #
1765
                # fix missing C (the whole base at the moment)
1766
                #
1767 View Code Duplication
                if str(r.get_resname()).strip() == "C" and fix_missing_atoms:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
1768
                    for a in r:
1769
                        if a.id == "N1":
1770
                            break
1771
                    else:
1772
                        C_struc = PDB.PDBParser().get_structure('', path + '/data/C.pdb')
1773
                        C = [C_atom for C_atom in C_struc[0].get_residues()][0]
1774
1775
                        r_atoms = [r["O4'"], r["C2'"], r["C1'"]]
1776
                        C_atoms = [C["O4'"], C["C2'"], C["C1'"]]
1777
1778
                        sup = PDB.Superimposer()
1779
                        sup.set_atoms(r_atoms, C_atoms)
1780
                        rms = round(sup.rms, 3)
1781
1782
                        sup.apply(C_struc.get_atoms())  # to all atoms of C
1783
1784
                        r.add(C["N1"])
1785
                        r.add(C["C2"])
1786
                        r.add(C["O2"])
1787
                        r.add(C["N3"])
1788
                        r.add(C["C4"])
1789
                        r.add(C["N4"])
1790
                        r.add(C["C5"])
1791
                        r.add(C["C6"])
1792
1793
                        fixed.append(['add the whole base C', chain.id, r, r.id[1]])
1794
1795
                #
1796
                # fix missing U (the whole base at the moment)
1797
                #
1798 View Code Duplication
                if str(r.get_resname()).strip() == "U" and fix_missing_atoms:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
1799
                    for a in r:
1800
                        if a.id == "N1":
1801
                            break
1802
                    else:
1803
                        U_struc = PDB.PDBParser().get_structure('', path + '/data/U.pdb')
1804
                        U = [U_atom for U_atom in U_struc[0].get_residues()][0]
1805
1806
                        r_atoms = [r["O4'"], r["C2'"], r["C1'"]]
1807
                        U_atoms = [U["O4'"], U["C2'"], U["C1'"]]
1808
1809
                        sup = PDB.Superimposer()
1810
                        sup.set_atoms(r_atoms, U_atoms)
1811
                        rms = round(sup.rms, 3)
1812
1813
                        sup.apply(U_struc.get_atoms())  # to all atoms of U
1814
1815
                        r.add(U["N1"])
1816
                        r.add(U["C2"])
1817
                        r.add(U["O2"])
1818
                        r.add(U["N3"])
1819
                        r.add(U["C4"])
1820
                        r.add(U["O4"])
1821
                        r.add(U["C5"])
1822
                        r.add(U["C6"])
1823
1824
                        fixed.append(['add the whole base U', chain.id, r, r.id[1]])
1825
                #
1826
                # fix missing G (the whole base at the moment)
1827
                #
1828 View Code Duplication
                if str(r.get_resname()).strip() == "G" and fix_missing_atoms:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
1829
                    for a in r:
1830
                        if a.id == "N1":
1831
                            break
1832
                    else:
1833
                        G_struc = PDB.PDBParser().get_structure('', path + '/data/G.pdb')
1834
                        G = [G_atom for G_atom in G_struc[0].get_residues()][0]
1835
1836
                        r_atoms = [r["O4'"], r["C2'"], r["C1'"]]
1837
                        G_atoms = [G["O4'"], G["C2'"], G["C1'"]]
1838
1839
                        sup = PDB.Superimposer()
1840
                        sup.set_atoms(r_atoms, G_atoms)
1841
                        rms = round(sup.rms, 3)
1842
1843
                        sup.apply(G_struc.get_atoms())  # to all atoms of G
1844
1845
                        r.add(G["N9"])
1846
                        r.add(G["C8"])
1847
                        r.add(G["N7"])
1848
                        r.add(G["C5"])
1849
                        r.add(G["C6"])
1850
                        r.add(G["O6"])
1851
                        r.add(G["N1"])
1852
                        r.add(G["C2"])
1853
                        r.add(G["N2"])
1854
                        r.add(G["N3"])
1855
                        r.add(G["C4"])
1856
1857
                        fixed.append(['add the whole base G', chain.id, r, r.id[1]])
1858
                #
1859
                # fix missing A (the whole base at the moment)
1860
                #
1861 View Code Duplication
                if str(r.get_resname()).strip() == "A" and fix_missing_atoms:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
1862
                    for a in r:
1863
                        if a.id == "N1":
1864
                            break
1865
                    else:
1866
                        A_struc = PDB.PDBParser().get_structure('', path + '/data/A.pdb')
1867
                        A = [A_atom for A_atom in A_struc[0].get_residues()][0]
1868
1869
                        r_atoms = [r["O4'"], r["C2'"], r["C1'"]]
1870
                        A_atoms = [A["O4'"], A["C2'"], A["C1'"]]
1871
1872
                        sup = PDB.Superimposer()
1873
                        sup.set_atoms(r_atoms, A_atoms)
1874
                        rms = round(sup.rms, 3)
1875
1876
                        sup.apply(A_struc.get_atoms())  # to all atoms of A
1877
1878
                        r.add(A["N9"])
1879
                        r.add(A["C8"])
1880
                        r.add(A["N7"])
1881
                        r.add(A["C5"])
1882
                        r.add(A["C6"])
1883
                        r.add(A["N6"])
1884
                        r.add(A["N1"])
1885
                        r.add(A["C2"])
1886
                        r.add(A["N3"])
1887
                        r.add(A["C4"])
1888
1889
                        fixed.append(['add the whole base A', chain.id, r, r.id[1]])
1890
1891
                #
1892
                # strip residues of extra atoms, not in G_ATOMS in this case
1893
                #
1894
                if str(r.get_resname()).strip() == "G":
1895
                    for an in G_ATOMS:
1896
                        if c == 1 and ignore_op3:
1897
                            if an in ['P', 'OP1', 'OP2']:
1898
                                continue
1899
                        try:
1900
                            if c == 1 and an == "O5'" and p_missing:
0 ignored issues
show
introduced by
The variable p_missing does not seem to be defined for all execution paths.
Loading history...
1901
                                r2.add(x)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable x does not seem to be defined.
Loading history...
1902
                            else:
1903
                                r2.add(r[an])
1904
                        except KeyError:
1905
                            # print 'Missing:', an, r, ' new resi', c
1906
                            missing.append([an, chain.id, r, r.id[1]])
1907
                    c2.add(r2)
1908
1909
                elif str(r.get_resname()).strip() == "A":
1910
                    for an in A_ATOMS:
1911
                        if c == 1 and ignore_op3:
1912
                            if an in ['P', 'OP1', 'OP2']:
1913
                                continue
1914
                        try:
1915
                            if c == 1 and an == "O5'" and p_missing:
1916
                                r2.add(x)
1917
                            else:
1918
                                r2.add(r[an])
1919
                        except KeyError:
1920
                            # print 'Missing:', an, r, ' new resi', c
1921
                            missing.append([an, chain.id, r, r.id[1]])
1922
                    c2.add(r2)
1923
1924
                elif str(r.get_resname()).strip() == "C":
1925
                    for an in C_ATOMS:
1926
                        if c == 1 and ignore_op3:
1927
                            if an in ['P', 'OP1', 'OP2']:
1928
                                continue
1929
                        try:
1930
                            if c == 1 and an == "O5'" and p_missing:
1931
                                r2.add(x)
1932
                            else:
1933
                                r2.add(r[an])
1934
                        except:
1935
                            # print 'Missing:', an, r, ' new resi', c
1936
                            missing.append([an, chain.id, r, r.id[1]])
1937
                    c2.add(r2)
1938
1939
                elif str(r.get_resname()).strip() == "U":
1940
                    for an in U_ATOMS:
1941
                        if c == 1 and ignore_op3:
1942
                            if an in ['P', 'OP1', 'OP2']:
1943
                                continue
1944
                        try:
1945
                            if c == 1 and an == "O5'" and p_missing:
1946
                                r2.add(x)
1947
                            else:
1948
                                r2.add(r[an])
1949
                        except KeyError:
1950
                            # print 'Missing:', an, r,' new resi', c
1951
                            missing.append([an, chain.id, r, r.id[1]])
1952
                    c2.add(r2)
1953
1954
1955
                # save each residue a sa single ife
1956
                if save_single_res:
1957
                    from Bio.PDB import Structure
1958
                    # Create an empty structure
1959
                    structure = Structure.Structure("my_structure")
1960
                    model = Model.Model(0)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable Model does not seem to be defined.
Loading history...
1961
                    chain = Chain.Chain('A')
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable Chain does not seem to be defined.
Loading history...
1962
                    chain.add(r2)
1963
                    model.add(chain)
1964
                    structure.add(model)
1965
1966
                    io = PDBIO()
1967
                    io.set_structure(structure)
1968
                    io.save(f"{self.fn}_{r2.id[1]}.pdb")
1969
1970
                # add CONET
1971
                if conect:
1972
                    ic(atom_index, r2)
1973
                    conects.extend(generate_conect_records(r2.get_resname().strip(), atom_index, conect_no_linkage))
1974
                    atom_index += len(r2)
1975
                    print(conect)
1976
1977
                prev_r = r  # hack to keep preview residue to be used in the function
1978
                            # e.g., to get an atom from this residue
1979
                c += 1
1980
            chains2.append(c2)
1981
1982
        io = PDBIO()
1983
        s2.add(m2)
1984
        for chain2 in chains2:
1985
            m2.add(chain2)
1986
        # print c2
1987
        # print m2
1988
        io.set_structure(s2)
1989
1990
        tf = tempfile.NamedTemporaryFile(delete=False)
1991
        fout = tf.name
1992
        io.save(fout)
1993
1994
        if fixed:
1995
            remarks.append('REMARK 250 Fixed atoms/residues:')
1996
            for i in fixed:
1997
                remarks.append(
1998
                    ' '.join(['REMARK 250  -', str(i[0]), 'in chain:', str(i[1]), str(i[2]), 'residue #', str(i[3])]))
1999
2000
        if missing and report_missing_atoms:
2001
            remarks.append('REMARK 250 Missing atoms:')
2002
            for i in missing:
2003
                remarks.append(' '.join(['REMARK 250   +', str(i[0]),
2004
                                         'chain:', str(i[1]), str(i[2]), 'residue #', str(i[3])]))
2005
            #raise Exception('Missing atoms in %s' % self.fn)
2006
2007
        if protein_chains_remmoved:
2008
            remarks.append('REMARK 250 Chains that seem to be proteins removed and : ' +
2009
                           ' '.join(protein_chains_remmoved))
2010
        #
2011
2012
        # fix ter 'TER' -> TER    1528        G A  71
2013
        #
2014
        s = RNAStructure(fout)
2015
        if check_geometry:
2016
            s.check_geometry()
2017
        self.lines = s.lines
2018
        c = 0
2019
        # ATOM   1527  C4    G A  71       0.000   0.000   0.000  1.00  0.00           C
2020
        nlines = []
2021
        no_ters = 0
2022
        for l in self.lines:
2023
            ## align atoms to the left #######################################################
2024
            # ATOM   3937    P   C B 185      11.596  -7.045  26.165  1.00  0.00           P
2025
            # ATOM   3937  P     C B 185      11.596  -7.045  26.165  1.00  0.00           P
2026
            if l.startswith('ATOM'):
2027
                atom_code = self.get_atom_code(l)
2028
                l = self.set_atom_code(l, atom_code)
2029
            ##################################################################################
2030
            if l.startswith('TER'):
2031
                #                pass  # leave it for now this
2032
                atom_l = self.lines[c - 1]
2033
                new_l = 'TER'.ljust(80)   # TER    1528        G A  71 <<<'
2034
                new_l = self.set_atom_index(new_l, str(self.get_atom_index(atom_l) + 1 + no_ters))
2035
                new_l = self.set_res_code(new_l, self.get_res_code(atom_l))
2036
                new_l = self.set_chain_id(new_l, self.get_chain_id(atom_l))
2037
                new_l = self.set_res_index(new_l, self.get_res_index(atom_l))
2038
                nlines.append(new_l)
2039
                #nlines.append(l)
2040
                no_ters += 1
2041
            else:
2042
                if self.get_atom_index(l):
2043
                    l = self.set_atom_index(l, self.get_atom_index(
2044
                        l) + no_ters)  # 1 ter +1 2 ters +2 etc
2045
                nlines.append(l)
2046
            c += 1
2047
        self.lines = nlines
2048
        if conect:
2049
            self.lines += conects
0 ignored issues
show
introduced by
The variable conects does not seem to be defined in case the for loop on line 1539 is not entered. Are you sure this can never be the case?
Loading history...
2050
        return remarks
2051
2052
    def set_occupancy_atoms(self, occupancy):
2053
        """
2054
        :param occupancy:
2055
        """
2056
        nlines = []
2057
        for l in self.lines:
2058
            if l.startswith('ATOM'):
2059
                l = self.set_atom_occupancy(l, 0.00)
2060
                nlines.append(l)
2061
            else:
2062
                nlines.append(l)
2063
        self.lines = nlines
2064
2065
    def edit_occupancy_of_pdb(txt, pdb, pdb_out, v=False):
2066
        """Make all atoms 1 (flexi) and then set occupancy 0 for seletected atoms.
2067
        Return False if error. True if OK
2068
        """
2069
        struc = PDB.PDBParser().get_structure('struc', pdb)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable PDB does not seem to be defined.
Loading history...
2070
2071
        txt = txt.replace(' ', '')
2072
        if v:
2073
            print (txt)
2074
        l = re.split('[,:;]', txt)
2075
        if v:
2076
            print (l)
2077
2078
        for s in struc:
2079
            for c in s:
2080
                for r in c:
2081
                    for a in r:
2082
                        a.set_occupancy(1)  # make it flaxi
2083
2084
        for i in l:  # ['A', '1-10', '15', '25-30', 'B', '1-10']
2085
2086
            if i in string.ascii_letters:
2087
                if v:
2088
                    print('chain', i)
2089
                chain_curr = i
2090
                continue
2091
2092
            if i.find('-') > -1:
2093
                start, ends = i.split('-')
2094
                if start > ends:
2095
                    print('Error: range start > end ' + i)  # >>sys.stderr
2096
                    return False
2097
                index = list(range(int(start), int(ends) + 1))
2098
            else:
2099
                index = [int(i)]
2100
2101
            for i in index:
2102
                # change b_factor
2103
                try:
2104
                    atoms = struc[0][chain_curr][i]
0 ignored issues
show
introduced by
The variable chain_curr does not seem to be defined in case the for loop on line 2084 is not entered. Are you sure this can never be the case?
Loading history...
2105
                except KeyError:
2106
                    if i == chain_curr:
2107
                        print('Error: Chain ' + chain_curr +
2108
                              ' not found in the PDB structure')  # >>sys.stderr,
2109
                    else:
2110
                        print('Error: Residue ' + chain_curr + ':' + str(i) +
2111
                              ' found in the PDB structure')  # >>sys.stderr,
2112
                        return False
2113
                for a in atoms:
2114
                    a.set_occupancy(0)
2115
2116
        io = PDBIO()
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable PDBIO does not seem to be defined.
Loading history...
2117
        io.set_structure(struc)
2118
        io.save(pdb_out)
2119
        print('Saved ', pdb_out)
2120
        return True
2121
2122
    def view(self):
2123
        os.system('pymol ' + self.fn)
2124
2125
    def remove(self, verbose):
2126
        """Delete file, self.fn"""
2127
        os.remove(self.fn)
2128
        if verbose:
2129
            'File %s removed' % self.fn
2130
2131
    def __repr__(self):
2132
        return 'RNAStructure %s' % self.fn
2133
2134
2135
    def get_remarks_text(self):
2136
        """Get remarks as text for given file.
2137
        This function re-open files, as define as self.fn to get remarks.
2138
2139
        Example::
2140
2141
            r = RNAStructure(fout)
2142
            remarks = r.get_remarks_txt()
2143
            r1 = r.get_res_txt('A', 1)
2144
            r2 = r.get_res_txt('A', 2)
2145
            r3 = r.get_res_txt('A', 3)
2146
            with open(fout, 'w') as f:
2147
                f.write(remarks)
2148
                f.write(r1)
2149
                f.write(r2)
2150
                f.write(r3)
2151
2152
            remarks is
2153
2154
            REMARK 250 Model edited with rna-tools
2155
            REMARK 250  ver 3.5.4+63.g4338516.dirty 
2156
            REMARK 250  https://github.com/mmagnus/rna-tools 
2157
            REMARK 250  Fri Nov 13 10:15:19 2020
2158
2159
        """
2160
        txt = ''
2161
        for l in open(self.fn):
2162
            if l.startswith("REMARK"):
2163
                txt += l
2164
        return txt
2165
2166
    def get_res_text(self, chain_id, resi):
2167
        """Get a residue of given resi of chain_id and return as a text
2168
2169
        Args:
2170
            chain_id (str): e.g., 'A'
2171
            resi (int): e.g., 1
2172
2173
        Returns:
2174
             txt: 
2175
2176
        Example::
2177
2178
            r = RNAStructure(fn)
2179
            print(r.get_res_txt('A', 1))
2180
2181
            ATOM      1  O5'   G A   1      78.080 -14.909  -0.104  1.00  9.24           O  
2182
            ATOM      2  C5'   G A   1      79.070 -15.499  -0.956  1.00  9.70           C  
2183
            ATOM      3  C4'   G A   1      78.597 -16.765  -1.648  1.00  9.64           C  
2184
            ATOM      4  O4'   G A   1      78.180 -17.761  -0.672  1.00  9.88           O  
2185
            (...)
2186
2187
        """
2188
        txt = ''
2189
        for l in self.lines:
2190
            if l.startswith("ATOM"):
2191
                if self.get_res_num(l) == resi and self.get_chain_id(l) == chain_id:
2192
                    txt += l + '\n'
2193
        return txt
2194
2195
    def mq(self, method="RASP", verbose=False):
2196
2197
        if method.lower() == "rasp":
2198
            from rna_tools.tools.mq.RASP import RASP
2199
            r = RASP.RASP()
2200
            return [float(i) for i in r.run(self.fn, potentials=['all'])]
2201
            # ['-42058.4', '466223', '-0.0902109', '0', '0', '0']
2202
2203
        if method.lower() == "dfire":
2204
            from rna_tools.tools.mq.Dfire import Dfire
2205
            r = Dfire.Dfire()
2206
            return r.run(self.fn, verbose)
2207
        
2208
        if method.lower() == "rna3dcnn":
2209
            from rna_tools.tools.mq.RNA3DCNN import RNA3DCNN
2210
            r = RNA3DCNN.RNA3DCNN()
2211
            return r.run(self.fn, verbose)
2212
2213
        if method.lower() == "qrna":
2214
            from rna_tools.tools.mq.QRNA import QRNA
2215
            r = QRNA.QRNA()
2216
            return r.run(self.fn, verbose)
2217
2218
        if method.lower() == "clashscore":
2219
            from rna_tools.tools.mq.ClashScore import ClashScore
2220
            r = ClashScore.ClashScore()
2221
            return r.run(self.fn, verbose)
2222
2223
        if method.lower() == "analyzegeometry":
2224
            from rna_tools.tools.mq.AnalyzeGeometry import AnalyzeGeometry
2225
            r = AnalyzeGeometry.AnalyzeGeometry()
2226
            return r.run(self.fn, verbose)
2227
2228
        raise MethodUnknown('Define corrent method')
2229
2230
    def get_empty_line(self):
2231
        l = "ATOM      1  C5'   A A   3      25.674  19.091   3.459  1.00  1.00           X"
2232
        return l
2233
    
2234
    def add_line(self, l):
2235
        self.lines.append(l)
2236
2237
def add_header(version=None):
2238
    now = time.strftime("%c")
2239
    txt = 'REMARK 250 Model edited with rna-tools\n'
2240
    txt += 'REMARK 250  ver %s \nREMARK 250  https://github.com/mmagnus/rna-tools \nREMARK 250  %s' % (
2241
        version, now)
2242
    return txt
2243
2244
2245
def edit_pdb(f, args):
2246
    """Edit your structure.
2247
2248
    The function can take ``A:3-21>A:1-19`` or even syntax like this
2249
    ``A:3-21>A:1-19,B:22-32>B:20-30`` and will do an editing.
2250
2251
    The output is printed, line by line. Only ATOM lines are edited!
2252
2253
    Examples::
2254
2255
        $ rna_pdb_tools.py --edit 'A:3-21>A:1-19' 1f27_clean.pdb > 1f27_clean_A1-19.pdb
2256
2257
    or even::
2258
2259
        $ rna_pdb_tools.py --edit 'A:3-21>A:1-19,B:22-32>B:20-30' 1f27_clean.pdb > 1f27_clean_renumb.pdb
2260
2261
    """
2262
    # open a new file
2263
    s = RNAStructure(f)
2264
    output = ''
2265
    if not args.no_hr:
2266
        output += add_header() + '\n'
2267
        output += 'REMARK 250    HEADER --edit ' + args.edit + '\n'
2268
2269
    # --edit 'A:3-21>A:1-19,B:22-32>B:20-30'
2270
    if args.edit.find(',') > -1:
2271
        # more than one edits
2272
        edits = args.edit.split(',')  # ['A:3-21>A:1-19', 'B:22-32>B:20-30']
2273
        selects = []
2274
        for e in edits:
2275
            selection_from, selection_to = select_pdb_fragment(
2276
                e.split('>')[0]), select_pdb_fragment(e.split('>')[1])
2277
            if len(selection_to) != len(selection_from):
2278
                raise Exception('len(selection_to) != len(selection_from)')
2279
            selects.append([selection_from, selection_to])
2280
    else:
2281
        # one edit
2282
        e = args.edit
2283
        selection_from, selection_to = select_pdb_fragment(
2284
            e.split('>')[0]), select_pdb_fragment(e.split('>')[1])
2285
        if len(selection_to) != len(selection_from):
2286
            raise Exception('len(selection_to) != len(selection_from)')
2287
        selects = [[selection_from, selection_to]]
2288
2289
    # go ever all edits: ['A:3-21>A:1-19','B:22-32>B:20-30']
2290
    for l in s.lines:
2291
        if l.startswith('ATOM'):
2292
                # get chain and resi
2293
            chain = l[21:22].strip()
2294
            resi = int(l[22:26].strip())
2295
2296
            if_selected_dont_print = False
2297
            # for selections
2298
            for select in selects:
2299
                selection_from, selection_to = select
2300
                if chain in selection_from:
2301
                    if resi in selection_from[chain]:
2302
                            # [1,2,3] mapping from [4,5,10], you want to know how to map 1
2303
                            # 1 is [0] element of first list, so you have to index first list
2304
                            # to get 0, with this 0 you can get 4 out of second list [4,5,10][0] -> 4
2305
                        nl = list(l)
2306
                        chain_new = list(selection_to.keys())[0]  # chain form second list
2307
                        nl[21] = chain_new  # new chain
2308
                        index = selection_from[chain].index(int(resi))  # get index of 1
2309
                        resi_new = str(selection_to[chain_new][index]).rjust(
2310
                            4)  # 'A' [1,2,3] -> '  1'
2311
                        nl[22:26] = resi_new
2312
                        nl = ''.join(nl)
2313
                        if_selected_dont_print = True
2314
                        output += nl + '\n'
2315
            if not if_selected_dont_print:
2316
                output += l + '\n'
2317
        else:  # if not atom
2318
            output += l + '\n'
2319
    return output
2320
2321
2322
def collapsed_view(args):
2323
    """Collapsed view of pdb file. Only lines with C5' atoms are shown and TER, MODEL, END.
2324
2325
    example::
2326
2327
        [mm] rna_tools git:(master) $ python rna-pdb-tools.py --cv input/1f27.pdb
2328
        ATOM      1  C5'   A A   3      25.674  19.091   3.459  1.00 16.99           C
2329
        ATOM     23  C5'   C A   4      19.700  19.206   5.034  1.00 12.65           C
2330
        ATOM     43  C5'   C A   5      14.537  16.130   6.444  1.00  8.74           C
2331
        ATOM     63  C5'   G A   6      11.726  11.579   9.544  1.00  9.81           C
2332
        ATOM     86  C5'   U A   7      12.007   7.281  13.726  1.00 11.35           C
2333
        ATOM    106  C5'   C A   8      12.087   6.601  18.999  1.00 12.74           C
2334
        TER"""
2335
    r = RNAStructure(args.file)
2336
    for l in r.lines:
2337
        at = r.get_atom_code(l)
2338
        if at == "P": # C5'":
2339
            print(l)
2340
        if l.startswith('TER') or l.startswith('MODEL') or l.startswith('END'):
2341
            print(l)
2342
2343
2344
def fetch(pdb_id, path="."):
2345
    """fetch pdb file from RCSB.org
2346
    https://files.rcsb.org/download/1Y26.pdb
2347
2348
    Args:
2349
    - pdb_id, but also a chain can be specified, 1jj2:A+B+C
2350
    
2351
    Returns:
2352
    - a path to a file
2353
2354
    TODO: na_pdb_tools.py --extract A:1-25+B:30-57 1jj2.pdb"""
2355
2356
    chains = ''
2357
    pdb_id = pdb_id.replace('_', ':') # to accept also 1jj2_A
2358
    if ':' in pdb_id:
2359
        pdb_id, chains = pdb_id.split(':') # >>> 'X:A+B+C'.split(':') ['X', 'A+B+C']
2360
2361
    if pdb_id == 'rp':
2362
        os.system('wget https://github.com/RNA-Puzzles/standardized_dataset/archive/master.tar.gz -O - | tar -xz')
2363
        return
2364
    
2365
    import urllib3
2366
    http = urllib3.PoolManager()
2367
2368
    # try:
2369
    pdb_id = pdb_id.replace('.pdb', '')
2370
    response = http.request('GET', 'https://files.rcsb.org/download/' + pdb_id + '.pdb')
2371
    if not response.status == 200:
2372
        raise PDBFetchError()
2373
2374
    # except urllib3.HTTPError:
2375
    #    raise Exception('The PDB does not exists: ' + pdb_id)
2376
    txt = response.data
2377
2378
    if path != '.':
2379
        npath = path + os.sep + pdb_id + '.pdb'
2380
    else:
2381
        npath = pdb_id + '.pdb'
2382
    print('downloading... ' + npath)
2383
    with open(npath, 'wb') as f:
2384
        f.write(txt)
2385
    if chains:
2386
        for chain in chains.split('+'):
2387
            cmd = f'rna_pdb_tools.py --extract-chain {chain} {pdb_id}.pdb > {pdb_id}_{chain}.pdb'
2388
            print(cmd)
2389
            exe(cmd)
2390
    # print('ok')
2391
    return npath
2392
2393
2394 View Code Duplication
def fetch_ba(pdb_id, path="."):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
2395
    """fetch biological assembly pdb file from RCSB.org
2396
2397
    >>> fetch_ba('1xjr')
2398
    ...
2399
    """
2400
    try:
2401
        import urllib3
2402
    except ImportError:
2403
        print('urllib3 is required')
2404
        return
2405
    http = urllib3.PoolManager()
2406
    # try:
2407
    response = http.request('GET', url='https://files.rcsb.org/download/' +
2408
                            pdb_id.lower() + '.pdb1')
2409
    if not response.status == 200:
2410
        raise PDBFetchError()
2411
    txt = response.data
2412
2413
    npath = path + os.sep + pdb_id + '_ba.pdb'
2414
    print('downloading...' + npath)
2415
    with open(npath, 'wb') as f:
2416
        f.write(txt)
2417
    print('ok')
2418
    return pdb_id + '_ba.pdb'
2419
2420
2421 View Code Duplication
def fetch_cif_ba(cif_id, path="."):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
2422
    """fetch biological assembly cif file from RCSB.org"""
2423
    import urrlib3
2424
    http = urllib3.PoolManager()
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable urllib3 does not seem to be defined.
Loading history...
2425
    # try:
2426
    response = http.request('GET', url='https://files.rcsb.org/download/' +
2427
                            cif_id.lower() + '-assembly1.cif')
2428
    if not response.status == 200:
2429
        raise PDBFetchError()
2430
    txt = response.data
2431
2432
    npath = path + os.sep + cif_id + '_ba.cif'
2433
    print('downloading...' + npath)
2434
    with open(npath, 'wb') as f:
2435
        f.write(txt)
2436
    print('ok')
2437
    return cif_id + '_ba.cif'
2438
2439
def fetch_cif(cif_id, path="."):
2440
    """fetch biological assembly cif file from RCSB.org"""
2441
    import urllib3
2442
    http = urllib3.PoolManager()
2443
    # try:
2444
    if '_' in cif_id:
2445
        cif_id = cif_id.split('_')[0]  
2446
    ic(cif_id)
2447
    cif_id = cif_id.replace('.cif', '')
2448
    url = 'https://files.rcsb.org/download/' + cif_id.lower() + '.cif'
2449
    print(url)
2450
    response = http.request('GET', url)
2451
    if not response.status == 200:
2452
        raise PDBFetchError()
2453
    txt = response.data
2454
2455
    npath = path + os.sep + cif_id + '.cif'
2456
    #print('downloading... ' + npath)
2457
    with open(npath, 'wb') as f:
2458
        f.write(txt)
2459
    return cif_id + '.cif'
2460
2461
def replace_chain(struc_fn, insert_fn, chain_id):
2462
    """Replace chain of the main file (struc_fn) with some new chain (insert_fn) of given chain id.
2463
2464
    Args:
2465
       struc_fn (str): path to the main PDB file
2466
       insert_fn (str): path to the file that will be injected in into the main PDB file
2467
       chain_id (str): chain that will be inserted into the main PDB file
2468
2469
    Returns:
2470
       string: text in the PDB format
2471
    """
2472
    struc = RNAStructure(struc_fn)
2473
    insert = RNAStructure(insert_fn)
2474
2475
    output = ''
2476
    inserted = False
2477
    for l in struc.lines:
2478
        if l.startswith('ATOM'):
2479
            chain = l[21]
2480
            if chain == chain_id:
2481
                if not inserted:
2482
                    for insertl in insert.lines:
2483
                        if not insertl.startswith('HEADER') and not insertl.startswith('END'):
2484
                            output += insertl + '\n'
2485
                    inserted = True
2486
                continue
2487
            # insert pdb
2488
            output += l + '\n'
2489
    return output.strip()
2490
2491
2492
def replace_atoms(struc_fn, insert_fn, verbose=False):
2493
    """Replace XYZ coordinate of the file (struc_fn)  with XYZ from another file (insert_fn).
2494
2495
    This can be useful if you want to replace positions of atoms, for example, one base only. The lines are muted based on atom name, residue name, chain, residue index (marked with XXXX below).::
2496
    
2497
    ATOM     11  N1    A 2  27     303.441 273.472 301.457  1.00  0.00           N   # file
2498
    ATOM      1  N1    A 2  27     300.402 273.627 303.188  1.00 99.99           N   # insert 
2499
    ATOM     11  N1    A 2  27     300.402 273.627 303.188  1.00  0.00           N   # inserted
2500
                 XXXXXXXXXXXXXXXX    # part used to find lines to be replaced
2501
2502
     ATOM      1  P     A 2  27     295.653 270.783 300.135  1.00119.29           P   # next line
2503
2504
    Args:
2505
       struc_fn (str): path to the main PDB file
2506
       insert_fn (str): path to the file that will be injected in into the main PDB file
2507
2508
    Returns:
2509
       string: text in the PDB format
2510
    """
2511
    struc = RNAStructure(struc_fn)
2512
    insert = RNAStructure(insert_fn)
2513
2514
    output = ''
2515
    for l in struc.lines:
2516
        inserted = False
2517
        if l.startswith('ATOM'):
2518
            idx = l[13:26]#.strip() # P     A 2  27
2519
            for li in insert.lines:
2520
                idxi = li[13:26]#.strip()
2521
                if idx == idxi:
2522
                    ln = l[:30] + li[30:54] + l[54:] + '\n' # only coords replace (!)
2523
                    if verbose:
2524
                        print(l)
2525
                        print(li)
2526
                        print(ln)
2527
                        print()
2528
                    inserted = True
2529
                    output += ln
2530
            if not inserted:
2531
                output += l + '\n'
2532
                inserted = False
2533
    return output.strip()
2534
2535
def set_chain_for_struc(struc_fn, chain_id, save_file_inplace=False, skip_ter=True):
2536
    """Quick & dirty function to set open a fn PDB format, set chain_id
2537
    and save it to a file. Takes only lines with ATOM and TER."""
2538
    txt = ''
2539
    for line in open(struc_fn):
2540
        if skip_ter:
2541
            if line.startswith('ATOM') or line.startswith('TER'):
2542
                # if TER line is only TER, not like TER atom chain etc
2543
                # then just keep TER there and don't add any chain to id
2544
                if line == 'TER':
2545
                    pass
2546
                else:
2547
                    l = line[:21] + chain_id + line[22:]
2548
                txt += l
0 ignored issues
show
introduced by
The variable l does not seem to be defined for all execution paths.
Loading history...
2549
            else:
2550
                txt += line
2551
        else:
2552
            if line.startswith('ATOM'):
2553
                l = line[:21] + chain_id + line[22:]
2554
                txt += l
2555
            elif line.startswith('TER'):
2556
                continue
2557
            else:
2558
                txt += line
2559
    if save_file_inplace:
2560
        with open(struc_fn, 'w') as f:
2561
             f.write(txt)
2562
    return txt.strip()
2563
2564
def load_rnas(path, verbose=True):
2565
    """Load structural files (via glob) and return a list of RNAStructure objects.
2566
    
2567
    Examples::
2568
2569
        rnas = rtl.load_rnas('../rna_tools/input/mq/*.pdb')
2570
    
2571
    """
2572
    rs = []
2573
    import glob
2574
    for f in glob.glob(path):
2575
        r = RNAStructure(f)
2576
        rs.append(r)
2577
    return rs
2578
2579
# main
2580
if '__main__' == __name__:
2581
    f1 = "input/t2-4-ACA.pdb"
2582
    f2 = "input/to-replace.pdb"
2583
    r1 = RNAStructure(f1)
2584
    r2 = RNAStructure(f2)    
2585
    t = replace_atoms(f1, f2)
2586
    with open('output/replaced.pdb', 'w') as f: f.write(t)
2587
2588
    fn = "input/1a9l_NMR_1_2_models.pdb"
2589
    r = RNAStructure(fn)
2590
    t = r.get_res_text('A', 1)
2591
    with open('output/1a9l_NMR_1_2_models_R1.pdb', 'w') as f: f.write(t)
2592
2593
    fn = "input/remarks.pdb"
2594
    r = RNAStructure(fn)
2595
    t = r.get_remarks_text()
2596
    with open('output/remarks_only.pdb', 'w') as f: f.write(t)
2597
2598
    fn = 'input/image'
2599
    print('fn:', fn)
2600
    struc = RNAStructure(fn)
2601
    print(' pdb?:', struc.is_pdb())
2602
    # print( atoms:', struc.get_no_lines())
2603
2604
    fn = 'input/na.pdb'
2605
    s = RNAStructure(fn)
2606
    print(s.detect_molecule_type())
2607
    #res = get_all_res(na)
2608
    # print 'what is?', what_is(res)
2609
    # print res
2610
    print('non standard:', s.check_res_if_std_na())
2611
    print('is protein:', s.detect_molecule_type())
2612
2613
    fn = 'input/prot.pdb'
2614
    s = RNAStructure(fn)
2615
    print('non standard:', s.check_res_if_std_prot())
2616
    print('is protein:',  s.detect_molecule_type())
2617
2618
    fn = 'input/rna-ru.pdb'
2619
    s = RNAStructure(fn)
2620
    print('non standard:', s.check_res_if_supid_rna())
2621
    print('is protein:', s.detect_molecule_type())
2622
2623
    fn = 'input/na_highAtomNum.pdb'
2624
    print(fn)
2625
    s = RNAStructure(fn)
2626
    s.renum_atoms()
2627
    s.write('output/na_highAtomNum.pdb')
2628
2629
    fn = 'input/na_solvet_old_format.pdb'
2630
    print(fn)
2631
    s = RNAStructure(fn)
2632
    s.fix_op_atoms()
2633
    s.remove_hydrogen()
2634
    s.remove_ion()
2635
    s.remove_water()
2636
    s.write('output/na_solvet_old_format.pdb')
2637
2638
    fn = 'input/na_solvet_old_format.pdb'
2639
    print(fn)
2640
    s = RNAStructure(fn)
2641
    s.std_resn()
2642
    s.remove_hydrogen()
2643
    s.remove_ion()
2644
    s.remove_water()
2645
    s.write('output/na_solvet_old_format.pdb')
2646
2647
    #fn = 'input/na_solvet_old_format__.pdb'
2648
    #s = RNAStructure(fn)
2649
    # s.std_resn()
2650
    # s.remove_hydrogen()
2651
    # s.remove_ion()
2652
    # s.remove_water()
2653
    # s.renum_atoms()
2654
    # s.fix_op_atoms()
2655
    # s.write('output/na_solvet_old_format__.pdb')
2656
2657
    fn = 'input/1xjr.pdb'
2658
    s.std_resn()
2659
    s.remove_hydrogen()
2660
    s.remove_ion()
2661
    s.remove_water()
2662
    s.renum_atoms()
2663
    s.fix_op_atoms()
2664
    s.write('output/1xjr.pdb')
2665
2666
    fn = 'input/decoy0165_amb.pdb'
2667
    print(fn)
2668
    s = RNAStructure(fn)
2669
    s.std_resn()
2670
    s.remove_hydrogen()
2671
    s.remove_ion()
2672
    s.remove_water()
2673
    s.renum_atoms()
2674
    s.fix_O_in_UC()
2675
    s.fix_op_atoms()
2676
    s.write('output/decoy0165_amb_clx.pdb')
2677
2678
    fn = 'input/farna.pdb'
2679
    print(fn)
2680
    s = RNAStructure(fn)
2681
    s.std_resn()
2682
    s.remove_hydrogen()
2683
    s.remove_ion()
2684
    s.remove_water()
2685
    s.fix_op_atoms()
2686
    s.renum_atoms()
2687
    s.write('output/farna.pdb')
2688
2689
    fn = 'input/farna.pdb'
2690
    print(fn)
2691
2692
    r = RNAStructure(fn)
2693
    print(r.is_mol2())
2694
2695
    if True:
2696
        print('================================================')
2697
        print ("input/1xjr_clx_fChimera_noIncludeNumbers.mol2")
2698
        r = RNAStructure("input/1xjr_clx_fChimera_noIncludeNumbers.mol2")
2699
        print(r.is_mol2())
2700
        r.mol2toPDB('/tmp/x.pdb')
2701
2702
        r = RNAStructure('/tmp/x.pdb')
2703
        print(r.get_report)
2704
        r.std_resn()
2705
        r.remove_hydrogen()
2706
        r.remove_ion()
2707
        r.remove_water()
2708
        r.fix_op_atoms()
2709
        r.renum_atoms()
2710
        r.fixU__to__U()
2711
        r.write("output/1xjr_clx_fChimera_noIncludeNumbers.mol2")
2712
2713
    if True:
2714
        r = RNAStructure("input/2du3_prot_bound.mol2")
2715
        print(r.is_mol2())
2716
        outfn = r.mol2toPDB()
2717
        print(r.get_report)
2718
2719
    print('================================================')
2720
    fn = "input/3e5fA-nogtp_processed_zephyr.pdb"
2721
    r = RNAStructure(fn)
2722
    print(r.is_mol2())
2723
    #outfn = r.mol2toPDB()
2724
    print(r.is_amber_like())
2725
    print(r.get_report)
2726
2727
    print(r.get_preview())
2728
2729
    r.std_resn()
2730
2731
    print(r.get_preview())
2732
2733
    r.remove_hydrogen()
2734
    r.remove_ion()
2735
    r.remove_water()
2736
    #renum_atoms(t, t)
2737
    #fix_O_in_UC(t, t)
2738
    #fix_op_atoms(t, t)
2739
    r.write('output/3e5fA-nogtp_processed_zephyr.pdb')
2740
2741
    print()
2742
    fn = "input/1xjr_clx_charmm.pdb"
2743
    print(fn)
2744
    s = RNAStructure(fn)
2745
    s.std_resn()
2746
    s.remove_hydrogen()
2747
    s.remove_ion()
2748
    s.remove_water()
2749
    s.write('output/1xjr_clx_charmm.pdb')
2750
2751
    #renum_atoms(t, t)
2752
    #fix_O_in_UC(t, t)
2753
    #fix_op_atoms(t, t)
2754
2755
    print()
2756
    fn = "input/dna_fconvpdb_charmm22.pdb"
2757
    print(fn)
2758
    r = RNAStructure(fn)
2759
    r.get_preview()
2760
    r.resn_as_dna()
2761
    r.remove_hydrogen()
2762
    r.remove_ion()
2763
    r.remove_water()
2764
    r.std_resn()
2765
    print(r.get_head())
2766
    print(r.get_tail())
2767
    print(r.get_preview())
2768
    r.write("output/dna_fconvpdb_charmm22.pdb")
2769
2770
    print()
2771
    fn = "input/1a9l_NMR_1_2_models.pdb"
2772
    print(fn)
2773
    r = RNAStructure(fn)
2774
    r.write("output/1a9l_NMR_1_2_models_lib.pdb")
2775
    # r.get_text() # get #1 model
2776
2777
    import doctest
2778
    doctest.testmod()
2779