1
|
|
|
#!/usr/bin/env python3 |
2
|
|
|
# -*- coding: utf-8 -*- |
3
|
|
|
"""rna_tools_lib.py - main lib file, many tools in this lib is using this file.""" |
4
|
|
|
|
5
|
|
|
import os |
6
|
|
|
import sys |
7
|
|
|
from collections import OrderedDict |
8
|
|
|
import re |
9
|
|
|
import string |
10
|
|
|
import time |
11
|
|
|
import gzip |
12
|
|
|
import tempfile |
13
|
|
|
import shutil |
14
|
|
|
import subprocess |
15
|
|
|
|
16
|
|
|
from rna_tools.tools.extra_functions.select_fragment import select_pdb_fragment_pymol_style, select_pdb_fragment |
17
|
|
|
|
18
|
|
|
from icecream import ic |
19
|
|
|
import sys |
20
|
|
|
ic.configureOutput(outputFunction=lambda *a: print(*a, file=sys.stderr), includeContext=True) |
21
|
|
|
ic.configureOutput(prefix='') |
22
|
|
|
|
23
|
|
|
import logging |
24
|
|
|
logger = logging.getLogger('rna-tools') |
25
|
|
|
handler = logging.StreamHandler() |
26
|
|
|
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') |
27
|
|
|
handler.setFormatter(formatter) |
28
|
|
|
logger.addHandler(handler) |
29
|
|
|
|
30
|
|
|
# Don't fix OP3, ignore it |
31
|
|
|
ignore_op3 = False |
32
|
|
|
|
33
|
|
|
# Settings: what is what in a PDB file |
34
|
|
|
AMINOACID_CODES = ["ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "GLN", "GLY", |
35
|
|
|
"HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", |
36
|
|
|
"TRP", "TYR", "VAL"] |
37
|
|
|
|
38
|
|
|
def aa3to1(aaa): |
39
|
|
|
"""based on https://pymolwiki.org/index.php/Aa_codes""" |
40
|
|
|
if len(aaa) != 3: # aaa is 'G', like for RNA ;-) |
41
|
|
|
return '' # dont do it for rna test aaa |
42
|
|
|
one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', \ |
43
|
|
|
'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y', \ |
44
|
|
|
'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A', \ |
45
|
|
|
'GLY':'G', 'PRO':'P', 'CYS':'C'} |
46
|
|
|
return one_letter[aaa] |
47
|
|
|
|
48
|
|
|
|
49
|
|
|
RES = ['DA', 'DG', 'DT', 'DC'] |
50
|
|
|
RES += ['A', 'G', 'U', 'C'] |
51
|
|
|
|
52
|
|
|
RESS = ['A', 'C', 'G', 'U', 'ADE', 'CYT', 'GUA', 'URY', 'URI', 'U34', 'U31', 'C31', '4SU', 'H2U', 'QUO', 'G7M', '5MU', |
53
|
|
|
'5MC', 'PSU', '2MG', '1MG', '1MA', 'M2G', '5BU', 'FHU', 'FMU', 'IU', 'OMG', 'OMC', 'OMU', 'A2M', 'A23', 'CCC', |
54
|
|
|
'I'] + ['RC', 'RU', 'RA', 'RG', 'RT'] |
55
|
|
|
DNA = ['DA', 'DG', 'DT', 'DC'] |
56
|
|
|
RNA = ['A', 'G', 'U', 'C'] |
57
|
|
|
IONS = ['NA', 'MG', 'MN', 'JOS'] # and ligands JOS |
58
|
|
|
HYDROGEN_NAMES = ["H", "H5'", "H5''", "H4'", "H3'", "H2'", "HO2'", "H1'", "H3", "H5", "H6", "H5T", "H41", "1H5'", |
59
|
|
|
"2H5'", "HO2'", "1H4", "2H4", "1H2", "2H2", "H1", "H8", "H2", "1H6", "2H6", "HO5'", "H21", "H22", |
60
|
|
|
"H61", "H62", "H42", "HO3'", "1H2'", "2HO'", "HO'2", "H2'1", "HO'2", "HO'2", "H2", "H2'1", "H1", "H2", |
61
|
|
|
"1H5*", "2H5*", "H4*", "H3*", "H1*", "1H2*", "2HO*", "1H2", "2H2", "1H4", "2H4", "1H6", "2H6", "H1", |
62
|
|
|
"H2", "H3", "H5", "H6", "H8", "H5'1", "H5'2", "H3T"] |
63
|
|
|
|
64
|
|
|
|
65
|
|
|
class PDBFetchError(Exception): |
66
|
|
|
pass |
67
|
|
|
|
68
|
|
|
class MethodUnknown(Exception): |
69
|
|
|
pass |
70
|
|
|
|
71
|
|
|
|
72
|
|
|
try: |
73
|
|
|
from Bio.PDB import * |
74
|
|
|
except ImportError: |
75
|
|
|
print("Biopython is not detected. It is required for some functions.") |
76
|
|
|
|
77
|
|
|
import warnings |
78
|
|
|
warnings.filterwarnings('ignore', '.*Invalid or missing.*',) |
79
|
|
|
warnings.filterwarnings('ignore', '.*with given element *',) |
80
|
|
|
warnings.filterwarnings('ignore', '.*is discontinuous*',) |
81
|
|
|
warnings.filterwarnings('ignore', '.*Some atoms or residues may be missing in the data structure.*',) |
82
|
|
|
warnings.filterwarnings('ignore', '.*Ignoring unrecognized record.*',) |
83
|
|
|
|
84
|
|
|
|
85
|
|
|
import subprocess |
86
|
|
|
def exe(cmd): |
87
|
|
|
o = subprocess.Popen( |
88
|
|
|
cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
89
|
|
|
out = o.stdout.read().strip().decode() |
90
|
|
|
err = o.stderr.read().strip().decode() |
91
|
|
|
return out, err |
92
|
|
|
|
93
|
|
|
|
94
|
|
|
def get_rna_tools_path(): |
95
|
|
|
"""Return path to the rt.""" |
96
|
|
|
import inspect |
97
|
|
|
import rna_tools |
98
|
|
|
return os.path.dirname(inspect.getfile(rna_tools)) |
99
|
|
|
|
100
|
|
|
def get_version(currfn='', verbose=False): |
101
|
|
|
import rna_tools |
102
|
|
|
return rna_tools.__version__ |
103
|
|
|
|
104
|
|
|
|
105
|
|
|
def sort_strings(l): |
106
|
|
|
""" Sort the given list in the way that humans expect. |
107
|
|
|
http://blog.codinghorror.com/sorting-for-humans-natural-sort-order/ |
108
|
|
|
""" |
109
|
|
|
def convert(text): return int(text) if text.isdigit() else text |
110
|
|
|
|
111
|
|
|
def alphanum_key(key): return [convert(c) for c in re.split('([0-9]+)', key)] |
112
|
|
|
l.sort(key=alphanum_key) |
113
|
|
|
return l |
114
|
|
|
|
115
|
|
|
|
116
|
|
|
class RNAStructure: |
117
|
|
|
"""RNAStructure - handles an RNA pdb file. |
118
|
|
|
|
119
|
|
|
Attributes: |
120
|
|
|
|
121
|
|
|
fn (string) : path to the structural file, e.g., "../rna_tools/input/4ts2.pdb" |
122
|
|
|
name(string) : filename of the structural file, "4ts2.pdb" |
123
|
|
|
lines (list) : the PDB file is loaded and ATOM/HETATM/TER/END go to self.lines |
124
|
|
|
|
125
|
|
|
""" |
126
|
|
|
|
127
|
|
|
def __init__(self, fn=''): |
128
|
|
|
if not fn: # if empty |
129
|
|
|
tf = tempfile.NamedTemporaryFile(delete=False) |
130
|
|
|
fn = tf.name |
131
|
|
|
|
132
|
|
|
self.fn = os.path.abspath(fn) # ../rna_tools/input/4ts2.pdb |
133
|
|
|
# add name attribute with filename 4ts2.pdb |
134
|
|
|
self.name = self.fn.split(os.sep)[-1] # OS.path.splitext(self.fn)[0] |
135
|
|
|
|
136
|
|
|
self.report = [] |
137
|
|
|
self.report.append('The RNARNAStructure report: %s ' % self.fn) |
138
|
|
|
|
139
|
|
|
self.mol2_format = False |
140
|
|
|
|
141
|
|
|
self.lines = [] |
142
|
|
|
|
143
|
|
|
try: |
144
|
|
|
# lines = open(fn).read().strip().split('\n') # don't strip?, good or bed? |
145
|
|
|
lines = open(fn).read().split('\n') |
146
|
|
|
except UnicodeDecodeError: |
147
|
|
|
print("Can't open a binary file") |
148
|
|
|
self.lines = '' |
149
|
|
|
return |
150
|
|
|
|
151
|
|
|
self.has_many_models = False |
152
|
|
|
|
153
|
|
|
for l in lines: |
154
|
|
|
# multi-models pdb files |
155
|
|
|
if l.startswith('MODEL'): |
156
|
|
|
self.has_many_models = True |
157
|
|
|
if l.startswith('ENDMDL'): |
158
|
|
|
break |
159
|
|
|
|
160
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM') or l.startswith('TER') or l.startswith('END'): |
161
|
|
|
self.lines.append(l) # don't strip .strip()) |
162
|
|
|
if l.startswith("@<TRIPOS>"): |
163
|
|
|
self.mol2_format = True |
164
|
|
|
self.report.append('This is mol2 format') |
165
|
|
|
|
166
|
|
|
self.res = self.get_resn_uniq() |
167
|
|
|
|
168
|
|
|
def reload(self): |
169
|
|
|
"""Reload the object.""" |
170
|
|
|
self.__init__(self.fn) |
171
|
|
|
|
172
|
|
|
def is_pdb(self): |
173
|
|
|
"""Return True if the files is in PDB format. |
174
|
|
|
|
175
|
|
|
If self.lines is empty it means that nothing was parsed into the PDB format.""" |
176
|
|
|
if len(self.lines): |
177
|
|
|
return True |
178
|
|
|
else: |
179
|
|
|
return False |
180
|
|
|
|
181
|
|
|
def is_nmr(self): |
182
|
|
|
"""True if the file is an NMR-style multiple model pdb |
183
|
|
|
|
184
|
|
|
:returns: True or Fo |
185
|
|
|
:rtype: boolean |
186
|
|
|
""" |
187
|
|
|
return self.has_many_models |
188
|
|
|
|
189
|
|
|
def un_nmr(self, startwith1=True, directory='', verbose=False): |
190
|
|
|
"""Un NMR - Split NMR-style multiple model pdb files into individual models. |
191
|
|
|
|
192
|
|
|
Take self.fn and create new files in the way:: |
193
|
|
|
|
194
|
|
|
input/1a9l_NMR_1_2_models.pdb |
195
|
|
|
input/1a9l_NMR_1_2_models_0.pdb |
196
|
|
|
input/1a9l_NMR_1_2_models_1.pdb |
197
|
|
|
|
198
|
|
|
.. warning:: This function requires biopython. |
199
|
|
|
|
200
|
|
|
rna_pdb_tools.py --un-nmr AF-Q5TCX8-F1-model_v1_core_Ctrim_mdr_MD.pdb |
201
|
|
|
36 |
202
|
|
|
cat *MD_* > md.pdb |
203
|
|
|
""" |
204
|
|
|
# |
205
|
|
|
npdb = '' |
206
|
|
|
c = 0 |
207
|
|
|
|
208
|
|
|
nmodels = 0 |
209
|
|
|
for l in open(self.fn): |
210
|
|
|
if l.startswith('MODEL'): |
211
|
|
|
nmodels += 1 |
212
|
|
|
print(nmodels) |
213
|
|
|
|
214
|
|
|
for l in open(self.fn): |
215
|
|
|
if 1: |
216
|
|
|
if not l.startswith('HETATM'): |
217
|
|
|
npdb += l |
218
|
|
|
if l.startswith('MODEL'): |
219
|
|
|
c += 1 |
220
|
|
|
if l.startswith('ENDMDL'): |
221
|
|
|
id = str(c).zfill(len(str(nmodels))) |
222
|
|
|
nf = self.fn.replace('.pdb', '_%s.pdb' % id) |
223
|
|
|
verbose = 1 |
224
|
|
|
if verbose: |
225
|
|
|
print(nf) |
226
|
|
|
with open(nf, 'w') as f: |
227
|
|
|
f.write(npdb) |
228
|
|
|
npdb = '' |
229
|
|
|
biopython = 0 |
230
|
|
|
|
231
|
|
|
if biopython: |
232
|
|
|
parser = PDBParser() |
|
|
|
|
233
|
|
|
structure = parser.get_structure('', self.fn) |
234
|
|
|
for c, m in enumerate(structure): |
235
|
|
|
if verbose: |
236
|
|
|
print(m) |
237
|
|
|
io = PDBIO() |
|
|
|
|
238
|
|
|
io.set_structure(m) |
239
|
|
|
if startwith1: |
240
|
|
|
c += 1 |
241
|
|
|
if directory: |
242
|
|
|
io.save(directory + os.sep + self.fn.replace('.pdb', '_%i.pdb' % c)) |
243
|
|
|
else: |
244
|
|
|
io.save(self.fn.replace('.pdb', '_%i.pdb' % c)) |
245
|
|
|
|
246
|
|
|
def is_mol2(self): |
247
|
|
|
"""Return True if is_mol2 based on the presence of ```@<TRIPOS>```.""" |
248
|
|
|
return self.mol2_format |
249
|
|
|
|
250
|
|
|
def decap_gtp(self): |
251
|
|
|
lines = [] |
252
|
|
|
for l in self.lines: |
253
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM'): |
254
|
|
|
if l[12:16].strip() in ['PG', 'O1G', 'O2G', 'O3G', 'O3B', 'PB', 'O1B', 'O2B', 'O3A']: |
255
|
|
|
continue |
256
|
|
|
if l[12:16].strip() == 'PA': |
257
|
|
|
l = l.replace('PA', 'P ') |
258
|
|
|
if l[12:16].strip() == 'O1A': |
259
|
|
|
l = l.replace('O1A', 'O1P') |
260
|
|
|
if l[12:16].strip() == 'O2A': |
261
|
|
|
l = l.replace('O2A', 'O2P') |
262
|
|
|
if l[17:20].strip() == 'GTP': |
263
|
|
|
l = l[:17] + ' G' + l[20:] |
264
|
|
|
l = l.replace('HETATM', 'ATOM ') |
265
|
|
|
lines.append(l) |
266
|
|
|
self.lines = lines |
267
|
|
|
|
268
|
|
|
def is_amber_like(self): |
269
|
|
|
"""Use self.lines and check if there is XX line |
270
|
|
|
""" |
271
|
|
|
for l in self.lines: |
272
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM'): |
273
|
|
|
rn = l[17:20] |
274
|
|
|
if rn in ['RU5', 'RC5', 'RA5', 'RT5', 'RG5']: |
275
|
|
|
self.report.append('This is amber-like format') |
276
|
|
|
return True |
277
|
|
|
return False |
278
|
|
|
|
279
|
|
|
def replace_hetatms(self): |
280
|
|
|
nlines = [] |
281
|
|
|
for l in self.lines: |
282
|
|
|
l = l.replace('HETATM', 'ATOM ') |
283
|
|
|
nlines.append(l) |
284
|
|
|
self.lines = nlines |
285
|
|
|
|
286
|
|
|
def fix_with_qrnas(self, outfn="", verbose=False): |
287
|
|
|
"""Add missing heavy atom. |
288
|
|
|
|
289
|
|
|
A residue is recognized base on a residue names. |
290
|
|
|
|
291
|
|
|
Copy QRNAS folder to curr folder, run QRNAS and remove QRNAS. |
292
|
|
|
|
293
|
|
|
.. warning:: QRNAS required (http://genesilico.pl/QRNAS/QRNAS.tgz) |
294
|
|
|
""" |
295
|
|
|
from rpt_config import QRNAS_PATH |
296
|
|
|
|
297
|
|
|
# prepare folder get ff folder |
298
|
|
|
to_go = os.path.abspath(os.path.dirname(self.fn)) |
299
|
|
|
curr = os.getcwd() |
300
|
|
|
|
301
|
|
|
# set occupancy to 0 |
302
|
|
|
s = RNAStructure(self.fn) |
303
|
|
|
s.set_occupancy_atoms(0.00) |
304
|
|
|
s.write(self.fn) |
305
|
|
|
|
306
|
|
|
os.chdir(to_go) |
307
|
|
|
try: |
308
|
|
|
shutil.copytree(QRNAS_PATH, to_go + os.sep + "QRNAS") |
309
|
|
|
except OSError: |
310
|
|
|
pass |
311
|
|
|
# prepare config file |
312
|
|
|
with open('qrna_config.txt', 'w') as f: |
313
|
|
|
f.write("WRITEFREQ 1\n") |
314
|
|
|
f.write("NSTEPS 1\n") |
315
|
|
|
# run qrnas |
316
|
|
|
print('QRNAS...') |
317
|
|
|
|
318
|
|
|
if not outfn: |
319
|
|
|
cmd = "QRNAS -c qrna_config.txt -i " + os.path.basename(self.fn) |
320
|
|
|
else: |
321
|
|
|
cmd = "QRNAS -c qrna_config.txt -i " + \ |
322
|
|
|
os.path.basename(self.fn) + " -o " + curr + os.sep + outfn |
323
|
|
|
#o = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
324
|
|
|
os.system(cmd) |
325
|
|
|
if False: |
326
|
|
|
out = o.stdout.read().strip() |
327
|
|
|
err = o.stderr.read().strip() |
328
|
|
|
if verbose: |
329
|
|
|
print(out) |
330
|
|
|
print(err) |
331
|
|
|
shutil.rmtree(to_go + os.sep + "QRNAS") |
332
|
|
|
# post cleaning |
333
|
|
|
if outfn: |
334
|
|
|
print('Cleaning...') |
335
|
|
|
s = RNAStructure(curr + os.sep + outfn) |
336
|
|
|
s.remove_hydrogen() |
337
|
|
|
s.std_resn() |
338
|
|
|
s.write(curr + os.sep + outfn) |
339
|
|
|
os.chdir(curr) |
340
|
|
|
|
341
|
|
|
def mol2toPDB(self, outfn=""): |
342
|
|
|
try: |
343
|
|
|
import pybel |
344
|
|
|
except ImportError: |
345
|
|
|
print ('pybel is needed for mol2 to pdb convertion') |
346
|
|
|
# sys.exit(1) |
347
|
|
|
sys.exit(0) |
348
|
|
|
|
349
|
|
|
if not outfn: |
350
|
|
|
outfn = self.fn.replace('.mol2', '.pdb') |
351
|
|
|
|
352
|
|
|
for mol in pybel.readfile("mol2", self.fn): |
353
|
|
|
mol.write("pdb", outfn, overwrite=True) |
354
|
|
|
|
355
|
|
|
print('outfn: ', outfn) |
356
|
|
|
self.report.append(' Converted from mol2 to PDB') |
357
|
|
|
return outfn |
358
|
|
|
|
359
|
|
|
def get_no_lines(self): |
360
|
|
|
return len(self.lines) |
361
|
|
|
|
362
|
|
|
def check_geometry(self, verbose=False): |
363
|
|
|
"""Check for correct "Polymer linkage, it should be around 1.6Å with a sigma around 0.01. |
364
|
|
|
|
365
|
|
|
Carrascoza, F., Antczak, M., Miao, Z., Westhof, E. & Szachniuk, M. Evaluation of the stereochemical quality of predicted RNA 3D models in the RNA-Puzzles submissions. Rna 28, 250–262 (2022). |
366
|
|
|
|
367
|
|
|
Values for 1xjr.pdb:: |
368
|
|
|
|
369
|
|
|
op.mean(): 1.599316 |
370
|
|
|
op.std(): 0.009274074 |
371
|
|
|
|
372
|
|
|
po.mean(): 1.5984017 |
373
|
|
|
po.std(): 0.0069191623 |
374
|
|
|
|
375
|
|
|
requires biopython |
376
|
|
|
""" |
377
|
|
|
if verbose: |
378
|
|
|
logger.setLevel(logging.DEBUG) |
379
|
|
|
|
380
|
|
|
try: |
381
|
|
|
from Bio import PDB |
382
|
|
|
from Bio.PDB import PDBIO |
383
|
|
|
import warnings |
384
|
|
|
warnings.filterwarnings('ignore', '.*Invalid or missing.*',) |
385
|
|
|
warnings.filterwarnings('ignore', '.*with given element *',) |
386
|
|
|
except: |
387
|
|
|
sys.exit('Error: Install biopython to use this function (pip biopython)') |
388
|
|
|
|
389
|
|
|
pdb = self.fn |
390
|
|
|
struc = PDB.PDBParser().get_structure('struc', pdb) |
391
|
|
|
resi_prev = {} |
392
|
|
|
op = [] |
393
|
|
|
po = [] |
394
|
|
|
angles = [] |
395
|
|
|
angles2 = [] |
396
|
|
|
|
397
|
|
|
p_ps = [] |
398
|
|
|
sigma = 0.009274074 |
399
|
|
|
mean = 1.599316 |
400
|
|
|
import numpy as np |
401
|
|
|
|
402
|
|
|
for s in struc: |
403
|
|
|
for c in s: |
404
|
|
|
# for new chain reset resi_prev |
405
|
|
|
resi_prev = None |
406
|
|
|
for r in c: |
407
|
|
|
if resi_prev: |
408
|
|
|
p_p = resi_prev["P"] - r['P'] |
409
|
|
|
p_ps.append(p_p) |
410
|
|
|
|
411
|
|
|
v = resi_prev["O3'"] - r['P'] |
412
|
|
|
op.append(v) |
413
|
|
|
x = mean - 6 * sigma |
414
|
|
|
y = mean + 6 * sigma |
415
|
|
|
|
416
|
|
|
v2 = r["P"] - r["O5'"] |
417
|
|
|
#ic(v2) |
418
|
|
|
po.append(v2) |
419
|
|
|
|
420
|
|
|
if not (x <= v <= y): |
421
|
|
|
print(f' ! lack of connectivity between {r}, {resi_prev}, distance between residues: {v:.2f}') |
422
|
|
|
|
423
|
|
|
# angle |
424
|
|
|
pc3p = resi_prev["O3'"].get_vector() |
425
|
|
|
p = r['P'].get_vector() |
426
|
|
|
o5p = r["O5'"].get_vector() |
427
|
|
|
bo5p = p - o5p # b as bond |
428
|
|
|
bpc3p = p - pc3p |
429
|
|
|
if 0: |
430
|
|
|
ic(bo5p.angle(bpc3p)) |
431
|
|
|
ic(bpc3p.angle(bo5p)) |
432
|
|
|
angle = np.rad2deg(bpc3p.angle(bo5p)) |
433
|
|
|
angles.append(angle) |
434
|
|
|
|
435
|
|
|
|
436
|
|
|
# angle |
437
|
|
|
po3p = resi_prev["O3'"].get_vector() |
438
|
|
|
p = r['P'].get_vector() |
439
|
|
|
o5p = r["O5'"].get_vector() |
440
|
|
|
bo5p = p - o5p # b as bond |
441
|
|
|
bpc3p = p - po3p |
442
|
|
|
if 0: |
443
|
|
|
ic(bo5p.angle(bpc3p)) |
444
|
|
|
ic(bpc3p.angle(bo5p)) |
445
|
|
|
angle = np.rad2deg(bpc3p.angle(bo5p)) |
446
|
|
|
angles.append(angle) |
447
|
|
|
|
448
|
|
|
pc3p = resi_prev["C3'"].get_vector() |
449
|
|
|
v1 = po3p - pc3p |
450
|
|
|
v2 = po3p - p |
451
|
|
|
angle2 = np.rad2deg(v2.angle(v1)) |
452
|
|
|
angles2.append(angle2) |
453
|
|
|
|
454
|
|
|
resi_prev = r |
455
|
|
|
|
456
|
|
|
p_ps = np.array(p_ps) |
457
|
|
|
#np.set_printoptions(threshold=np.inf) |
458
|
|
|
ic.disable() |
459
|
|
|
ic(p_ps) |
460
|
|
|
ic(p_ps.mean(), p_ps.std()) |
461
|
|
|
if False: |
462
|
|
|
import matplotlib.pyplot as plt |
463
|
|
|
plt.hist(p_ps, bins='auto') |
464
|
|
|
plt.title("p-ps") |
465
|
|
|
plt.show() |
466
|
|
|
|
467
|
|
|
op = np.array(op) |
468
|
|
|
ic(op.mean(), op.std()) |
469
|
|
|
|
470
|
|
|
po = np.array(po) |
471
|
|
|
ic(po.mean(), po.std()) |
472
|
|
|
|
473
|
|
|
angles = np.array(angles) |
474
|
|
|
ic(angles) |
475
|
|
|
ic(angles.mean(), angles.std()) |
476
|
|
|
if False: |
477
|
|
|
import matplotlib.pyplot as plt |
478
|
|
|
plt.hist(angles, bins='auto') |
479
|
|
|
plt.title("Histogram of angles o3'-p-o5'") |
480
|
|
|
plt.xlim(0, 360) |
481
|
|
|
plt.show() |
482
|
|
|
|
483
|
|
|
angles2 = np.array(angles2) |
484
|
|
|
ic(angles2) |
485
|
|
|
ic(angles2.mean(), angles2.std()) |
486
|
|
|
|
487
|
|
|
if False: |
488
|
|
|
import matplotlib.pyplot as plt |
489
|
|
|
plt.hist(angles2, bins='auto') |
490
|
|
|
plt.title("Histogram of angles c3'-o3'-p") |
491
|
|
|
plt.xlim(0, 360) |
492
|
|
|
plt.show() |
493
|
|
|
ic.enable() |
494
|
|
|
|
495
|
|
|
def get_text(self, add_end=True): |
496
|
|
|
"""works on self.lines.""" |
497
|
|
|
txt = '' |
498
|
|
|
for l in self.lines: |
499
|
|
|
if l.startswith('END'): |
500
|
|
|
continue # skip end |
501
|
|
|
txt += l + '\n' # .strip() |
502
|
|
|
if add_end: |
503
|
|
|
if not l.startswith('END'): |
|
|
|
|
504
|
|
|
txt += 'END' |
505
|
|
|
return txt.strip() |
506
|
|
|
|
507
|
|
|
def get_chain(self, chain_id='A'): |
508
|
|
|
txt = '' |
509
|
|
|
for l in self.lines: |
510
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM'): |
511
|
|
|
if l[21] == chain_id: |
512
|
|
|
txt += l.strip() + '\n' |
513
|
|
|
if l.startswith('TER'): |
514
|
|
|
if len(l) < 21: # this is only TER |
515
|
|
|
# ter = 'TER ' + str(self.get_atom_num(txt) + 1).rjust(5) + '\n' |
516
|
|
|
raise Exception('TER line has no chain id') |
517
|
|
|
else: |
518
|
|
|
if l[21] == chain_id: |
519
|
|
|
txt += l.strip() + '\n' |
520
|
|
|
## ll = txt.strip().split('\n')[-1] # last line = ll |
521
|
|
|
## print(ll) |
522
|
|
|
## #if ll.startswith('TER'): # if the last line does not start with ter |
523
|
|
|
## ter = self.set_res_code(ter, self.get_res_code(ll)) # + self.get_chain_id(l).rjust(11) |
524
|
|
|
## print(ter) |
525
|
|
|
return txt.strip() |
526
|
|
|
|
527
|
|
|
def rename_chain(self, chain_id_old, chain_id_new, debug=False): |
528
|
|
|
"""Rename chains |
529
|
|
|
|
530
|
|
|
Args: |
531
|
|
|
chain_id_old (str): e.g., A |
532
|
|
|
chain_id_new (str): e.g., B |
533
|
|
|
debug (bool): show some diagnostics |
534
|
|
|
|
535
|
|
|
Returns: |
536
|
|
|
pdb content (txt) |
537
|
|
|
self.lines is updated with new lines |
538
|
|
|
""" |
539
|
|
|
txt = '' |
540
|
|
|
lines = [] |
541
|
|
|
for l in self.lines: |
542
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM') or l.startswith('TER'): |
543
|
|
|
# TER |
544
|
|
|
try: |
545
|
|
|
l[21] |
546
|
|
|
except IndexError: |
547
|
|
|
continue |
548
|
|
|
if l[21] == chain_id_old: |
549
|
|
|
l = l[:21] + chain_id_new + l[22:] |
550
|
|
|
if debug: print(l) |
551
|
|
|
txt += l.strip() + '\n' # ok, actually keep all lines as it was |
552
|
|
|
lines.append(l) |
553
|
|
|
self.lines = lines |
554
|
|
|
return txt |
555
|
|
|
|
556
|
|
|
|
557
|
|
|
def get_resn_uniq(self): |
558
|
|
|
res = set() |
559
|
|
|
for l in self.lines: |
560
|
|
|
r = l[17:20].strip().upper() |
561
|
|
|
res.add(r) |
562
|
|
|
return res |
563
|
|
|
|
564
|
|
|
def check_res_if_std_na(self): |
565
|
|
|
wrong = [] |
566
|
|
|
|
567
|
|
|
for r in self.res: |
568
|
|
|
if r not in RES: |
569
|
|
|
wrong.append(r) |
570
|
|
|
return wrong |
571
|
|
|
|
572
|
|
|
def get_seq(self, compact=False, chainfirst=False, fasta=False, addfn='', color=False): |
573
|
|
|
"""Get seq (v2) gets segments of chains with correct numbering |
574
|
|
|
|
575
|
|
|
Run:: |
576
|
|
|
|
577
|
|
|
python rna_pdb_seq.py input/1ykq_clx.pdb |
578
|
|
|
> 1ykq_clx A:101-111 |
579
|
|
|
GGAGCUCGCCC |
580
|
|
|
> 1ykq_clx B:201-238 |
581
|
|
|
GGGCGAGGCCGUGCCAGCUCUUCGGAGCAAUACUCGGC |
582
|
|
|
|
583
|
|
|
> 6_solution_0 A:1-19 26-113 117-172 |
584
|
|
|
GGCGGCAGGUGCUCCCGACGUCGGGAGUUAAAAGGGAAG |
585
|
|
|
|
586
|
|
|
Chains is ``{'A': {'header': 'A:1-19 26-113 117-172', 'resi': [1, 2, 3, ..., \ |
587
|
|
|
19, 26, 27, ..., 172], 'seq': ['G', 'G', 'C', 'G', ... C', 'G', 'U', 'C']}}`` |
588
|
|
|
|
589
|
|
|
Chains are in other as the appear in the file. |
590
|
|
|
|
591
|
|
|
.. warning :: take only ATOM and HETATM lines. |
592
|
|
|
""" |
593
|
|
|
if addfn: |
594
|
|
|
addfn += ' ' # add space to file name |
595
|
|
|
seq = self.lines[0][19] |
596
|
|
|
chains = OrderedDict() |
597
|
|
|
resi_prev = None |
598
|
|
|
chain_prev = None |
599
|
|
|
|
600
|
|
|
for l in self.lines: |
601
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM'): |
602
|
|
|
resi = int(l[22:26]) |
603
|
|
|
if resi_prev != resi: |
604
|
|
|
resname = l[17:20].strip() |
605
|
|
|
chain_curr = l[21] |
606
|
|
|
if resname in AMINOACID_CODES: |
607
|
|
|
resname = aa3to1(resname) |
608
|
|
|
if len(resname) == 'GTP': # DG -> g GTP |
609
|
|
|
resname = 'g' |
610
|
|
|
if len(resname) > 1: # DG -> g GTP |
611
|
|
|
resname = resname[-1].lower() |
612
|
|
|
try: |
613
|
|
|
chains[chain_curr]['resi'].append(resi) |
614
|
|
|
chains[chain_curr]['seq'].append(resname) |
615
|
|
|
except KeyError: |
616
|
|
|
chains[chain_curr] = {} |
617
|
|
|
chains[chain_curr]['resi'] = [] |
618
|
|
|
chains[chain_curr]['resi'].append(resi) |
619
|
|
|
chains[chain_curr]['seq'] = [] |
620
|
|
|
chains[chain_curr]['seq'].append(resname) |
621
|
|
|
|
622
|
|
|
resi_prev = resi |
623
|
|
|
chain_prev = chain_curr |
624
|
|
|
|
625
|
|
|
def color_seq(seq, color): |
626
|
|
|
if not color: |
627
|
|
|
return seq |
628
|
|
|
else: |
629
|
|
|
from termcolor import colored |
630
|
|
|
seqc = '' |
631
|
|
|
for s in seq: |
632
|
|
|
if s in ['G']: |
633
|
|
|
seqc += colored(s, 'green') |
634
|
|
|
if s in ['G']: |
635
|
|
|
seqc += colored(s, 'red') |
636
|
|
|
if s in ['T', 'U']: |
637
|
|
|
seqc += colored(s, 'blue') |
638
|
|
|
if s in ['C']: |
639
|
|
|
seqc += colored(s, 'yellow') |
640
|
|
|
return seqc |
641
|
|
|
|
642
|
|
|
|
643
|
|
|
for c in list(chains.keys()): |
644
|
|
|
header = c + ':' + str(chains[c]['resi'][0]) + '-' # add A:1- |
645
|
|
|
for i in range(1, len(chains[c]['resi'])): # start from second element |
646
|
|
|
if chains[c]['resi'][i] - chains[c]['resi'][i - 1] > 1: |
647
|
|
|
header += '' + str(chains[c]['resi'][i - 1]) + ' ' |
648
|
|
|
header += '' + str(chains[c]['resi'][i]) + '-' |
649
|
|
|
header += '' + str(chains[c]['resi'][-1]) |
650
|
|
|
chains[c]['header'] = header # add -163 (last element) |
651
|
|
|
|
652
|
|
|
if compact: |
653
|
|
|
txt = '' |
654
|
|
|
for c in list(chains.keys()): |
655
|
|
|
if chainfirst: |
656
|
|
|
txt += '' + chains[c]['header'].ljust(15) + color_seq(''.join(chains[c]['seq']), color) + ' ' |
657
|
|
|
elif fasta: |
658
|
|
|
txt += color_seq(''.join(chains[c]['seq']), color) + ' ' |
659
|
|
|
else: |
660
|
|
|
txt += color_seq(''.join(chains[c]['seq']), color) + ' # ' + chains[c]['header'] + ' ' |
661
|
|
|
return txt.strip() |
662
|
|
|
else: |
663
|
|
|
txt = '' |
664
|
|
|
for c in list(chains.keys()): |
665
|
|
|
txt += '>' + addfn + chains[c]['header'] + '\n' |
666
|
|
|
txt += color_seq(''.join(chains[c]['seq']), color) + '\n' # color ;-) |
667
|
|
|
return txt.strip() |
668
|
|
|
|
669
|
|
|
def __get_seq(self): |
670
|
|
|
"""get_seq DEPRECATED |
671
|
|
|
|
672
|
|
|
You get `chains` such as: |
673
|
|
|
OrderedDict([('A', {'header': 'A:1-47', 'seq': 'CGUGGUUAGGGCCACGUUAAAUAGUUGCUUAAGCCCUAAGCGUUGAU'}), ('B', {'header': 'B:48-58', 'seq': 'AUCAGGUGCAA'})]) |
674
|
|
|
|
675
|
|
|
.. warning:: take only ATOM and HETATM lines. |
676
|
|
|
""" |
677
|
|
|
seq = '' |
678
|
|
|
curri = int(self.lines[0][22:26]) |
679
|
|
|
seq = self.lines[0][19] |
680
|
|
|
chains = OrderedDict() |
681
|
|
|
curri = -100000000000000000000000000000000 # ugly |
682
|
|
|
chain_prev = None |
683
|
|
|
|
684
|
|
|
for l in self.lines: |
685
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM'): |
686
|
|
|
resi = int(l[22:26]) |
687
|
|
|
if curri != resi: |
688
|
|
|
print(l) |
689
|
|
|
resname = l[17:20].strip() |
690
|
|
|
if len(resname) == 'GTP': # DG -> g GTP |
691
|
|
|
resname = 'g' |
692
|
|
|
if len(resname) > 1: # DG -> g GTP |
693
|
|
|
resname = resname[-1].lower() |
694
|
|
|
|
695
|
|
|
seq += resname |
696
|
|
|
chain_curr = l[21] |
697
|
|
|
|
698
|
|
|
# if distances between curr res and previevs is bigger than 1, then show it as a fragment |
699
|
|
|
if resi - curri > 1 and resi - curri < 100000000000000000000000000000000: # ugly hack |
700
|
|
|
print(resi - curri) |
701
|
|
|
chains[chain_prev]['header'] += '-' + str(resi_prev) |
|
|
|
|
702
|
|
|
if chain_prev != chain_curr and chain_prev: |
703
|
|
|
chains[chain_prev]['header'] += '-' + str(resi_prev) |
704
|
|
|
if chaoin_curr in chains: |
|
|
|
|
705
|
|
|
chains[chain_curr]['seq'] += resname |
706
|
|
|
else: |
707
|
|
|
chains[chain_curr] = dict() |
708
|
|
|
chains[chain_curr]['header'] = chain_curr + ':' + str(resi) # resi_prev) |
709
|
|
|
chains[chain_curr]['seq'] = resname |
710
|
|
|
resi_prev = resi |
711
|
|
|
chain_prev = chain_curr |
712
|
|
|
curri = resi |
713
|
|
|
chains[chain_prev]['header'] += '-' + str(resi_prev) |
714
|
|
|
seq = '' |
715
|
|
|
for c in list(chains.keys()): |
716
|
|
|
seq += '> ' + os.path.basename(self.fn) + ' ' + chains[c]['header'] + '\n' |
717
|
|
|
seq += chains[c]['seq'] + '\n' |
718
|
|
|
return seq.strip() |
719
|
|
|
|
720
|
|
|
def get_info_chains(self): |
721
|
|
|
"""return A:3-21 B:22-32 |
722
|
|
|
""" |
723
|
|
|
seq = '' |
724
|
|
|
curri = int(self.lines[0][22:26]) |
725
|
|
|
seq = self.lines[0][19] |
726
|
|
|
chains = OrderedDict() |
727
|
|
|
curri = -100000000000000 # ugly |
728
|
|
|
chain_prev = None |
729
|
|
|
for l in self.lines: |
730
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM'): |
731
|
|
|
resi = int(l[22:26]) |
732
|
|
|
if curri != resi: |
733
|
|
|
resname = l[17:20].strip() |
734
|
|
|
if len(resname) == 'GTP': # DG -> g GTP |
735
|
|
|
resname = 'g' |
736
|
|
|
if len(resname) > 1: # DG -> g GTP |
737
|
|
|
resname = resname[-1].lower() |
738
|
|
|
seq += resname |
739
|
|
|
chain_curr = l[21] |
740
|
|
|
if chain_prev != chain_curr and chain_prev: |
741
|
|
|
chains[chain_prev]['header'] += '-' + str(resi_prev) |
|
|
|
|
742
|
|
|
if chain_curr in chains: |
743
|
|
|
chains[chain_curr]['seq'] += resname |
744
|
|
|
else: |
745
|
|
|
chains[chain_curr] = dict() |
746
|
|
|
chains[chain_curr]['header'] = chain_curr + ':' + str(resi) # resi_prev) |
747
|
|
|
chains[chain_curr]['seq'] = resname |
748
|
|
|
resi_prev = resi |
749
|
|
|
chain_prev = chain_curr |
750
|
|
|
curri = resi |
751
|
|
|
chains[chain_prev]['header'] += '-' + str(resi_prev) |
752
|
|
|
seq = '' |
753
|
|
|
for c in list(chains.keys()): |
754
|
|
|
seq += chains[c]['header'] + ' ' |
755
|
|
|
return seq.strip() |
756
|
|
|
|
757
|
|
|
def detect_file_format(self): |
758
|
|
|
pass |
759
|
|
|
|
760
|
|
|
def detect_molecule_type(self): |
761
|
|
|
aa = [] |
762
|
|
|
na = [] |
763
|
|
|
for r in self.res: |
764
|
|
|
if r in AMINOACID_CODES: |
765
|
|
|
aa.append(r) |
766
|
|
|
if r in RESS: |
767
|
|
|
na.append(r) |
768
|
|
|
|
769
|
|
|
aa = float(len(aa)) / len(self.res) |
770
|
|
|
na = float(len(na)) / len(self.res) |
771
|
|
|
|
772
|
|
|
if aa == 0 and na == 0: |
773
|
|
|
return 'error' |
774
|
|
|
if aa > na: |
775
|
|
|
return '>protein< vs na', aa, na |
776
|
|
|
else: |
777
|
|
|
return 'protein vs >na<', aa, na |
778
|
|
|
|
779
|
|
|
def get_head(self): |
780
|
|
|
return '\n'.join(self.lines[:5]) |
781
|
|
|
|
782
|
|
|
def get_tail(self): |
783
|
|
|
return '\n'.join(self.lines[-5:]) |
784
|
|
|
|
785
|
|
|
def get_preview(self): |
786
|
|
|
t = '\n'.join(self.lines[:5]) |
787
|
|
|
t += '\n-------------------------------------------------------------------\n' |
788
|
|
|
t += '\n'.join(self.lines[-5:]) |
789
|
|
|
return t |
790
|
|
|
|
791
|
|
|
def remove_hydrogen(self): |
792
|
|
|
lines = [] |
793
|
|
|
for l in self.lines: |
794
|
|
|
if l[77:79].strip() == 'H': |
795
|
|
|
continue |
796
|
|
|
if l[12:16].strip() in HYDROGEN_NAMES: |
797
|
|
|
# if l[12:16].strip().startswith('H'): |
798
|
|
|
continue |
799
|
|
|
else: |
800
|
|
|
# print l[12:16] |
801
|
|
|
lines.append(l) |
802
|
|
|
self.lines = lines |
803
|
|
|
|
804
|
|
|
def remove_water(self): |
805
|
|
|
"""Remove HOH and TIP3""" |
806
|
|
|
lines = [] |
807
|
|
|
for l in self.lines: |
808
|
|
|
if l[17:21].strip() in ['HOH', 'TIP3', 'WAT']: |
809
|
|
|
continue |
810
|
|
|
else: |
811
|
|
|
lines.append(l) |
812
|
|
|
self.lines = lines |
813
|
|
|
|
814
|
|
|
def remove_ion(self): |
815
|
|
|
""" |
816
|
|
|
TER 1025 U A 47 |
817
|
|
|
HETATM 1026 MG MG A 101 42.664 34.395 50.249 1.00 70.99 MG |
818
|
|
|
HETATM 1027 MG MG A 201 47.865 33.919 48.090 1.00 67.09 MG |
819
|
|
|
:rtype: object |
820
|
|
|
""" |
821
|
|
|
lines = [] |
822
|
|
|
for l in self.lines: |
823
|
|
|
element = l[76:78].strip().upper() |
824
|
|
|
element2 = l[17:20].strip().upper() |
825
|
|
|
if element in IONS: |
826
|
|
|
continue |
827
|
|
|
if element2 in IONS: |
828
|
|
|
continue |
829
|
|
|
else: |
830
|
|
|
lines.append(l) |
831
|
|
|
self.lines = lines |
832
|
|
|
|
833
|
|
|
def fixU__to__U(self): |
834
|
|
|
lines = [] |
835
|
|
|
for l in self.lines: |
836
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM'): |
837
|
|
|
rn = l[17:20] |
838
|
|
|
rn = rn.replace('G ', ' G') |
839
|
|
|
rn = rn.replace('U ', ' U') |
840
|
|
|
rn = rn.replace('C ', ' C') |
841
|
|
|
rn = rn.replace('A ', ' A') |
842
|
|
|
l = l[:16] + ' ' + rn + ' ' + l[21:] |
843
|
|
|
# print l.strip() |
844
|
|
|
# print l2 |
845
|
|
|
#l = l.replace(' U ', ' U ') |
846
|
|
|
#l = l.replace(' G ', ' G ') |
847
|
|
|
#l = l.replace(' A ', ' A ') |
848
|
|
|
#l = l.replace(' C ', ' C ') |
849
|
|
|
lines.append(l) |
850
|
|
|
print ('fixU__to__U OK') |
851
|
|
|
self.report.append(' Fix: U__ -> __U') |
852
|
|
|
self.lines = lines |
853
|
|
|
|
854
|
|
|
def resn_as_dna(self): |
855
|
|
|
lines = [] |
856
|
|
|
for l in self.lines: |
857
|
|
|
if l.startswith('ATOM') or l.startswith('HETATM'): |
858
|
|
|
# print l |
859
|
|
|
nl = l.replace('DA5', ' DA') # RA should be the last!!!! |
860
|
|
|
nl = nl.replace('DA3', ' DA') |
861
|
|
|
nl = nl.replace(' DA', ' DA') |
862
|
|
|
nl = nl.replace(' rA', ' DA') |
863
|
|
|
|
864
|
|
|
nl = nl.replace('DC5', ' DC') |
865
|
|
|
nl = nl.replace('DC3', ' DC') |
866
|
|
|
nl = nl.replace(' DC', ' DC') |
867
|
|
|
nl = nl.replace(' rC', ' DC') |
868
|
|
|
|
869
|
|
|
nl = nl.replace('DG5', ' DG') |
870
|
|
|
nl = nl.replace('DG3', ' DG') |
871
|
|
|
nl = nl.replace(' DG', ' DG') |
872
|
|
|
nl = nl.replace(' rG', ' DG') |
873
|
|
|
|
874
|
|
|
nl = nl.replace('DU5', ' DU') |
875
|
|
|
nl = nl.replace('DU3', ' DU') |
876
|
|
|
nl = nl.replace(' DU', ' DU') |
877
|
|
|
nl = nl.replace(' rU', ' DU') |
878
|
|
|
|
879
|
|
|
nl = nl.replace('DT5', ' DT') |
880
|
|
|
nl = nl.replace('DT3', ' DT') |
881
|
|
|
nl = nl.replace(' DT', ' DT') |
882
|
|
|
nl = nl.replace(' rT', ' DT') |
883
|
|
|
|
884
|
|
|
nl = nl.replace('C5M', 'C7 ') |
885
|
|
|
|
886
|
|
|
if l[17:20].strip() == 'G': |
887
|
|
|
nl = nl[:17] + ' DG' + nl[20:] |
888
|
|
|
|
889
|
|
|
if l[17:20].strip() == 'C': |
890
|
|
|
nl = nl[:17] + ' DC' + nl[20:] |
891
|
|
|
|
892
|
|
|
if l[17:20].strip() == 'T': |
893
|
|
|
nl = nl[:17] + ' DT' + nl[20:] |
894
|
|
|
|
895
|
|
|
if l[17:20].strip() == 'U': |
896
|
|
|
nl = nl[:17] + ' DU' + nl[20:] |
897
|
|
|
|
898
|
|
|
if l[17:20].strip() == 'A': |
899
|
|
|
nl = nl[:17] + ' DA' + nl[20:] |
900
|
|
|
|
901
|
|
|
lines.append(nl) |
902
|
|
|
if l.startswith("END") or l.startswith("TER"): |
903
|
|
|
lines.append(l) |
904
|
|
|
|
905
|
|
|
print ('resn_as_dna') |
906
|
|
|
self.report.append(' resn_as_dna') |
907
|
|
|
self.lines = lines |
908
|
|
|
|
909
|
|
|
def fix_O_in_UC(self): |
910
|
|
|
""".. warning: remove RU names before using this function""" |
911
|
|
|
lines = [] |
912
|
|
|
for l in self.lines: |
913
|
|
|
# if l[12:16].strip() in |
914
|
|
|
# if l[12:16].strip().startswith('H'): |
915
|
|
|
nl = l.replace('O U', |
916
|
|
|
'O2 U') |
917
|
|
|
nl = nl.replace('O C', |
918
|
|
|
'O2 C') |
919
|
|
|
lines.append(nl) |
920
|
|
|
self.lines = lines |
921
|
|
|
|
922
|
|
|
def fix_op_atoms(self): |
923
|
|
|
"""Replace OXP' to OPX1, e.g ('O1P' -> 'OP1')""" |
924
|
|
|
lines = [] |
925
|
|
|
for l in self.lines: |
926
|
|
|
nl = l.replace('*', '\'') |
927
|
|
|
nl = nl.replace('O1P', 'OP1') |
928
|
|
|
nl = nl.replace('O2P', 'OP2') |
929
|
|
|
nl = nl.replace('O3P', 'OP3') |
930
|
|
|
lines.append(nl) |
931
|
|
|
self.lines = lines |
932
|
|
|
|
933
|
|
|
def get_report(self): |
934
|
|
|
""" |
935
|
|
|
Returns: |
936
|
|
|
string: report, messages collected on the way of parsing this file |
937
|
|
|
""" |
938
|
|
|
return '\n'.join(self.report) |
939
|
|
|
|
940
|
|
|
def is_rna(self): |
941
|
|
|
wrong = [] |
942
|
|
|
for r in self.res: |
943
|
|
|
if r.upper().strip() in ['RC', 'RU', 'RA', 'RG', 'RT']: |
944
|
|
|
if r not in wrong_res: |
|
|
|
|
945
|
|
|
wrong_res.append(r) |
946
|
|
|
return wrong_res |
947
|
|
|
|
948
|
|
|
def check_res_if_std_dna(self): |
949
|
|
|
wrong_res = [] |
950
|
|
|
for r in self.res: |
951
|
|
|
if r.upper().strip() in ['A', 'T', 'C', 'G']: |
952
|
|
|
if r not in wrong_res: |
953
|
|
|
wrong_res.append(r) |
954
|
|
|
return wrong_res |
955
|
|
|
|
956
|
|
|
def check_res_if_supid_rna(self): |
957
|
|
|
wrong_res = [] |
958
|
|
|
for r in self.res: |
959
|
|
|
if r.upper().strip() in ['RC', 'RU', 'RA', 'RG', 'RT']: |
960
|
|
|
if r not in wrong_res: |
961
|
|
|
wrong_res.append(r) |
962
|
|
|
return wrong_res |
963
|
|
|
|
964
|
|
|
|
965
|
|
|
def is_rna(self): |
966
|
|
|
for r in self.res: |
967
|
|
|
if r.upper().strip() in ['RC', 'RU', 'RA', 'RG', 'RT']: |
968
|
|
|
if r not in wrong_res: |
|
|
|
|
969
|
|
|
wrong_res.append(r) |
970
|
|
|
return wrong_res |
971
|
|
|
|
972
|
|
|
def renum_atoms(self): |
973
|
|
|
"""Renum atoms, from 1 to X for line; ATOM/HETATM""" |
974
|
|
|
lines = [] |
975
|
|
|
c = 1 |
976
|
|
|
for l in self.lines: |
977
|
|
|
l = l[:6] + str(c).rjust(5) + l[11:] |
978
|
|
|
c += 1 |
979
|
|
|
lines.append(l) |
980
|
|
|
self.lines = lines |
981
|
|
|
|
982
|
|
|
def std_resn(self): |
983
|
|
|
"""'Fix' residue names which means to change them to standard, e.g. RA5 -> A |
984
|
|
|
|
985
|
|
|
Works on self.lines, and returns the result to self.lines. |
986
|
|
|
|
987
|
|
|
Will change things like:: |
988
|
|
|
|
989
|
|
|
# URI -> U, URA -> U |
990
|
|
|
1xjr_clx_charmm.pdb:ATOM 101 P URA A 5 58.180 39.153 30.336 1.00 70.94 |
991
|
|
|
rp13_Dokholyan_1_URI_CYT_ADE_GUA_hydrogens.pdb:ATOM 82 P URI A 4 501.633 506.561 506.256 1.00 0.00 P |
992
|
|
|
""" |
993
|
|
|
lines = [] |
994
|
|
|
for l in self.lines: |
995
|
|
|
nl = l.replace('RA5', ' A') # RA should be the last!!!! |
996
|
|
|
nl = nl.replace('RA3', ' A') |
997
|
|
|
nl = nl.replace('ADE', ' A') |
998
|
|
|
nl = nl.replace(' RA', ' A') |
999
|
|
|
nl = nl.replace(' rA', ' A') |
1000
|
|
|
|
1001
|
|
|
nl = nl.replace('RC5', ' C') |
1002
|
|
|
nl = nl.replace('RC3', ' C') |
1003
|
|
|
nl = nl.replace('CYT', ' C') |
1004
|
|
|
nl = nl.replace(' RC', ' C') |
1005
|
|
|
nl = nl.replace(' rC', ' C') |
1006
|
|
|
|
1007
|
|
|
nl = nl.replace('RG5', ' G') |
1008
|
|
|
nl = nl.replace('RG3', ' G') |
1009
|
|
|
nl = nl.replace('GUA', ' G') |
1010
|
|
|
nl = nl.replace(' RG', ' G') |
1011
|
|
|
nl = nl.replace(' rG', ' G') |
1012
|
|
|
|
1013
|
|
|
nl = nl.replace('RU5', ' U') |
1014
|
|
|
nl = nl.replace('RU3', ' U') |
1015
|
|
|
nl = nl.replace('URA', ' U') |
1016
|
|
|
nl = nl.replace('URI', ' U') |
1017
|
|
|
nl = nl.replace('URY', ' U') |
1018
|
|
|
nl = nl.replace(' RU', ' U') |
1019
|
|
|
nl = nl.replace(' rU', ' U') |
1020
|
|
|
|
1021
|
|
|
nl = nl.replace('RT5', ' T') |
1022
|
|
|
nl = nl.replace('RT3', ' T') |
1023
|
|
|
nl = nl.replace('THY', ' T') |
1024
|
|
|
nl = nl.replace(' RT', ' T') |
1025
|
|
|
nl = nl.replace(' rT', ' T') |
1026
|
|
|
|
1027
|
|
|
lines.append(nl) |
1028
|
|
|
|
1029
|
|
|
self.lines = lines |
1030
|
|
|
|
1031
|
|
|
def check_res_if_std_prot(self): |
1032
|
|
|
wrong = [] |
1033
|
|
|
for r in self.res: |
1034
|
|
|
if r not in AMINOACID_CODES: |
1035
|
|
|
wrong.append(r) |
1036
|
|
|
return wrong |
1037
|
|
|
|
1038
|
|
|
def write(self, outfn='', verbose=False): |
1039
|
|
|
"""Write ```self.lines``` to a file (and add END file) |
1040
|
|
|
|
1041
|
|
|
Args: |
1042
|
|
|
outfn (str): file to save, if outfn is '', then simply use self.fn |
1043
|
|
|
verbose (Boolen): be verbose or not |
1044
|
|
|
|
1045
|
|
|
Returns: |
1046
|
|
|
None |
1047
|
|
|
|
1048
|
|
|
""" |
1049
|
|
|
if outfn == '': |
1050
|
|
|
outfn = self.fn |
1051
|
|
|
f = open(outfn, 'w') |
1052
|
|
|
# test if there is anything to write, if not, it's likely that the |
1053
|
|
|
# file is not a PDB file, e.g. .outCR |
1054
|
|
|
if not self.lines: |
1055
|
|
|
raise Exception('Nothing to write. Is the input a PDB file? ', self.fn) |
1056
|
|
|
for l in self.lines: |
1057
|
|
|
f.write(l + '\n') |
1058
|
|
|
if not l.startswith('END'): |
|
|
|
|
1059
|
|
|
f.write('END') |
1060
|
|
|
f.close() |
1061
|
|
|
if verbose: |
1062
|
|
|
print('Write %s' % outfn) |
1063
|
|
|
|
1064
|
|
|
def get_atom_num(self, line): |
1065
|
|
|
"""Extract atom number from a line of PDB file |
1066
|
|
|
Arguments: |
1067
|
|
|
* line = ATOM line from a PDB file |
1068
|
|
|
Output: |
1069
|
|
|
* atom number (int) |
1070
|
|
|
""" |
1071
|
|
|
return int(''.join([x for x in line[6:11] if x.isdigit()])) |
1072
|
|
|
|
1073
|
|
|
def get_res_num(self, line): |
1074
|
|
|
"""Extract residue number from a line of PDB file |
1075
|
|
|
Arguments: |
1076
|
|
|
* line = ATOM line from a PDB file |
1077
|
|
|
Output: |
1078
|
|
|
* residue number as an integer |
1079
|
|
|
""" |
1080
|
|
|
return int(''.join([x for x in line[22:27] if x.isdigit()])) |
1081
|
|
|
|
1082
|
|
|
def get_res_code(self, line): |
1083
|
|
|
"""Get residue code from a line of a PDB file |
1084
|
|
|
""" |
1085
|
|
|
#if not line.startswith('ATOM'): |
1086
|
|
|
# return None |
1087
|
|
|
return line[17:20] |
1088
|
|
|
|
1089
|
|
|
def shift_atom_names(self): |
1090
|
|
|
nlines = [] |
1091
|
|
|
for l in self.lines: |
1092
|
|
|
if l.startswith('ATOM'): |
1093
|
|
|
atom_name = self.get_atom_code(l) |
1094
|
|
|
l = self.set_atom_code(l, atom_name) |
1095
|
|
|
nlines.append(l) |
1096
|
|
|
self.lines = nlines |
1097
|
|
|
|
1098
|
|
|
def prune_elements(self): |
1099
|
|
|
nlines = [] |
1100
|
|
|
for l in self.lines: |
1101
|
|
|
if l.startswith('ATOM'): |
1102
|
|
|
l = l[:76] + ' ' + l[78:] |
1103
|
|
|
nlines.append(l) |
1104
|
|
|
self.lines = nlines |
1105
|
|
|
|
1106
|
|
|
def get_atom_code(self, line): |
1107
|
|
|
"""Get atom code from a line of a PDB file |
1108
|
|
|
""" |
1109
|
|
|
if not line.startswith('ATOM'): |
1110
|
|
|
return None |
1111
|
|
|
return line[12:16].replace(' ', '').strip() |
1112
|
|
|
|
1113
|
|
|
def get_atom_coords(self, line): |
1114
|
|
|
"""Get atom coordinates from a line of a PDB file |
1115
|
|
|
""" |
1116
|
|
|
if not line.startswith('ATOM'): |
1117
|
|
|
return None |
1118
|
|
|
return tuple(map(float, line[31:54].split())) |
1119
|
|
|
|
1120
|
|
|
def set_atom_coords(self, line, x, y, z): |
1121
|
|
|
"""Get atom coordinates from a line of a PDB file |
1122
|
|
|
""" |
1123
|
|
|
line = line[:30] + (f"{x:>8.3f}") + line[38:] |
1124
|
|
|
line = line[:38] + (f"{y:>8.3f}") + line[46:] |
1125
|
|
|
line = line[:46] + (f"{z:>8.3f}") + line[54:] |
1126
|
|
|
return line |
1127
|
|
|
|
1128
|
|
|
def set_line_bfactor(self, line, bfactor): |
1129
|
|
|
if not line.startswith('ATOM'): |
1130
|
|
|
return None |
1131
|
|
|
return line[:60] + (" %5.2f" % bfactor) + line[66:] |
1132
|
|
|
|
1133
|
|
|
def set_atom_occupancy(self, line, occupancy): |
1134
|
|
|
"""set occupancy for line""" |
1135
|
|
|
return line[:54] + (" %5.2f" % occupancy) + line[60:] |
1136
|
|
|
|
1137
|
|
|
def set_atom_code(self, line, code): |
1138
|
|
|
"""Add atom name/code: |
1139
|
|
|
|
1140
|
|
|
ATOM 1 OP2 C A 1 29.615 36.892 42.657 1.00 1.00 O |
1141
|
|
|
^^^ ^ and element |
1142
|
|
|
""" |
1143
|
|
|
return line[:12] + ' ' + code + ' ' * (3 - len(code)) + line[16:77] + code[0] + line[78:] |
1144
|
|
|
|
1145
|
|
|
def set_res_code(self, line, code): |
1146
|
|
|
""" |
1147
|
|
|
Args: |
1148
|
|
|
lines |
1149
|
|
|
code |
1150
|
|
|
path (str): The path of the file to wrap |
1151
|
|
|
field_storage (FileStorage): The :class:Y instance to wrap |
1152
|
|
|
temporary (bool): Whether or not to delete the file when the File |
1153
|
|
|
instance is destructed |
1154
|
|
|
|
1155
|
|
|
Returns: |
1156
|
|
|
BufferedFileStorage: A buffered writable file descriptor |
1157
|
|
|
""" |
1158
|
|
|
return line[:17] + code.rjust(3) + line[20:] |
1159
|
|
|
|
1160
|
|
|
def get_chain_id(self, line): |
1161
|
|
|
return line[21:22] |
1162
|
|
|
|
1163
|
|
|
def get_all_chain_ids(self): |
1164
|
|
|
""" |
1165
|
|
|
Returns: |
1166
|
|
|
set: chain ids, e.g. set(['A', 'B']) |
1167
|
|
|
""" |
1168
|
|
|
chain_ids = [] |
1169
|
|
|
for l in self.lines: |
1170
|
|
|
chain_id = self.get_chain_id(l) |
1171
|
|
|
if chain_id and chain_id not in chain_ids: |
1172
|
|
|
chain_ids.append(chain_id) |
1173
|
|
|
return chain_ids |
1174
|
|
|
|
1175
|
|
|
def get_atom_index(self, line): |
1176
|
|
|
try: |
1177
|
|
|
return int(line[6:11]) |
1178
|
|
|
except: |
1179
|
|
|
return None |
1180
|
|
|
|
1181
|
|
|
def set_atom_index(self, line, index): |
1182
|
|
|
return line[:6] + str(index).rjust(5) + line[11:] |
1183
|
|
|
|
1184
|
|
|
def reindex_atom_index(self): |
1185
|
|
|
self.nlines = [] |
1186
|
|
|
for i, l in enumerate(self.lines): |
1187
|
|
|
nl = self.set_atom_index(l, i + 1) # start from 1 |
1188
|
|
|
self.nlines.append(nl) |
1189
|
|
|
self.lines = self.nlines |
1190
|
|
|
return self.nlines |
1191
|
|
|
|
1192
|
|
|
def get_res_index(self, line): |
1193
|
|
|
return int(line[22:26]) |
1194
|
|
|
|
1195
|
|
|
def set_res_index(self, line, index): |
1196
|
|
|
return line[:23] + str(index).rjust(3) + line[26:] |
1197
|
|
|
|
1198
|
|
|
def set_chain_id(self, line, chain_id): |
1199
|
|
|
return line[:21] + chain_id + line[22:] |
1200
|
|
|
|
1201
|
|
|
def get_rnapuzzle_ready(self, renumber_residues=True, fix_missing_atoms=True, |
1202
|
|
|
rename_chains=True, |
1203
|
|
|
ignore_op3 = False, |
1204
|
|
|
report_missing_atoms=True, keep_hetatm=False, backbone_only=False, |
1205
|
|
|
no_backbone = False, bases_only = False, save_single_res = False, |
1206
|
|
|
ref_frame_only = False, |
1207
|
|
|
check_geometry = False, |
1208
|
|
|
verbose=False): # :, ready_for="RNAPuzzle"): |
1209
|
|
|
"""Get rnapuzzle (SimRNA) ready structure. |
1210
|
|
|
|
1211
|
|
|
Clean up a structure, get current order of atoms. |
1212
|
|
|
|
1213
|
|
|
:param renumber_residues: boolean, from 1 to ..., second chain starts from 1 etc. |
1214
|
|
|
:param fix_missing_atoms: boolean, superimpose motifs from the minilibrary and copy-paste missing atoms, this is super crude, so should be used with caution. |
1215
|
|
|
|
1216
|
|
|
Submission format @http://ahsoka.u-strasbg.fr/rnapuzzles/ |
1217
|
|
|
|
1218
|
|
|
Run :func:`rna_tools.rna_tools.lib.RNAStructure.std_resn` before this function to fix names. |
1219
|
|
|
|
1220
|
|
|
.. image:: ../pngs/rebuild_op1op2_backbone.png |
1221
|
|
|
Figure: (Starting from left) input structure, structure with rebuilded atoms, and reference. The B fragment is observed in the reference used here as a “benchmark”, fragment A is reconstructed atoms (not observed in the reference"). 201122 |
1222
|
|
|
|
1223
|
|
|
- 170305 Merged with get_simrna_ready and fixing OP3 terminal added |
1224
|
|
|
- 170308 Fix missing atoms for bases, and O2' |
1225
|
|
|
|
1226
|
|
|
.. image:: ../pngs/fix_missing_o_before_after.png |
1227
|
|
|
Fig. Add missing O2' atom (before and after). |
1228
|
|
|
|
1229
|
|
|
.. image:: ../pngs/fix_missing_superposition.png |
1230
|
|
|
Fig. The residue to fix is in cyan. The G base from the library in red. Atoms O4', C2', C1' are shared between the sugar (in cyan) and the G base from the library (in red). These atoms are used to superimpose the G base on the sugar, and then all atoms from the base are copied to the residues. |
1231
|
|
|
|
1232
|
|
|
.. image:: ../pngs/fix_missing_bases.png |
1233
|
|
|
**Fig.** Rebuild ACGU base-less. It's not perfect but good enough for some applications. |
1234
|
|
|
|
1235
|
|
|
.. warning:: It was only tested with the whole base missing! |
1236
|
|
|
|
1237
|
|
|
.. warning:: requires: Biopython |
1238
|
|
|
|
1239
|
|
|
Selection of atoms: |
1240
|
|
|
|
1241
|
|
|
- posphate group (3x, OP1 ,P, OP2), |
1242
|
|
|
- connector (2x O5', C5'), /5x |
1243
|
|
|
- sugar (5x, C4', O4', C3', O3', C1', C2'), /10 |
1244
|
|
|
- extra oxygens from sugar (2x, O2' O3'), for now it's /12! |
1245
|
|
|
- A (10x), G (11x), C (8x), U(8x), max 12+11=23 |
1246
|
|
|
|
1247
|
|
|
And 27 unique atoms: {'N9', 'O2', 'OP2', "O2'", "O4'", 'C8', "O3'", "C1'", 'C2', 'C6', "C5'", 'N6', 'C5', "C4'", 'C4', "O5'", "C3'", 'O6', 'N2', 'N7', 'OP1', 'N1', 'N4', 'P', "C2'", 'N3', 'O4'}. |
1248
|
|
|
""" |
1249
|
|
|
|
1250
|
|
|
if verbose: |
1251
|
|
|
logger.setLevel(logging.DEBUG) |
1252
|
|
|
|
1253
|
|
|
try: |
1254
|
|
|
from Bio import PDB |
1255
|
|
|
from Bio.PDB import PDBIO |
1256
|
|
|
import warnings |
1257
|
|
|
warnings.filterwarnings('ignore', '.*Invalid or missing.*',) |
1258
|
|
|
warnings.filterwarnings('ignore', '.*with given element *',) |
1259
|
|
|
except: |
1260
|
|
|
sys.exit('Error: Install biopython to use this function (pip biopython)') |
1261
|
|
|
|
1262
|
|
|
import copy |
1263
|
|
|
# for debugging |
1264
|
|
|
#renumber_residues = True |
1265
|
|
|
# if ready_for == "RNAPuzzle": |
1266
|
|
|
|
1267
|
|
|
if backbone_only: |
1268
|
|
|
G_ATOMS = "P OP1 OP2 O5' C5'".split() |
1269
|
|
|
A_ATOMS = "P OP1 OP2 O5' C5'".split() |
1270
|
|
|
U_ATOMS = "P OP1 OP2 O5' C5'".split() |
1271
|
|
|
C_ATOMS = "P OP1 OP2 O5' C5'".split() |
1272
|
|
|
#G_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split() |
1273
|
|
|
#A_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split() |
1274
|
|
|
#U_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split() |
1275
|
|
|
#C_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1'".split() |
1276
|
|
|
|
1277
|
|
|
elif ref_frame_only: |
1278
|
|
|
G_ATOMS = "P OP1 OP2".split() |
1279
|
|
|
A_ATOMS = "P OP1 OP2".split() |
1280
|
|
|
U_ATOMS = "P OP1 OP2".split() |
1281
|
|
|
C_ATOMS = "P OP1 OP2".split() |
1282
|
|
|
|
1283
|
|
|
elif no_backbone: |
1284
|
|
|
G_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split() |
1285
|
|
|
A_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split() |
1286
|
|
|
U_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 O4 C5 C6".split() |
1287
|
|
|
C_ATOMS = "C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 N4 C5 C6".split() |
1288
|
|
|
|
1289
|
|
|
elif bases_only: |
1290
|
|
|
G_ATOMS = "N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split() |
1291
|
|
|
A_ATOMS = "N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split() |
1292
|
|
|
U_ATOMS = "N1 C2 O2 N3 C4 O4 C5 C6".split() |
1293
|
|
|
C_ATOMS = "N1 C2 O2 N3 C4 N4 C5 C6".split() |
1294
|
|
|
else: |
1295
|
|
|
G_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split() # 23 |
1296
|
|
|
A_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split() # 22 |
1297
|
|
|
U_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 O4 C5 C6".split() # 20 |
1298
|
|
|
C_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 N4 C5 C6".split() # 20 |
1299
|
|
|
|
1300
|
|
|
# hmm.. is it the same as RNApuzzle??? |
1301
|
|
|
# if ready_for == "SimRNA": |
1302
|
|
|
# G_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 O6 N1 C2 N2 N3 C4".split() |
1303
|
|
|
# A_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N9 C8 N7 C5 C6 N6 N1 C2 N3 C4".split() |
1304
|
|
|
# U_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 O4 C5 C6".split() |
1305
|
|
|
# C_ATOMS = "P OP1 OP2 O5' C5' C4' O4' C3' O3' C2' O2' C1' N1 C2 O2 N3 C4 N4 C5 C6".split() |
1306
|
|
|
|
1307
|
|
|
tf = tempfile.NamedTemporaryFile(delete=False) |
1308
|
|
|
ftmp = tf.name |
1309
|
|
|
self.write(ftmp, verbose=False) |
1310
|
|
|
|
1311
|
|
|
parser = PDB.PDBParser() |
1312
|
|
|
#try: |
1313
|
|
|
struct = parser.get_structure('', ftmp) |
1314
|
|
|
#except: |
1315
|
|
|
# print('Error in ' + self.fn) |
1316
|
|
|
# sys.exit(1) |
1317
|
|
|
model = struct[0] |
1318
|
|
|
|
1319
|
|
|
s2 = PDB.Structure.Structure(struct.id) |
1320
|
|
|
m2 = PDB.Model.Model(model.id) |
1321
|
|
|
|
1322
|
|
|
chains2 = [] |
1323
|
|
|
|
1324
|
|
|
missing = [] |
1325
|
|
|
fixed = [] |
1326
|
|
|
protein_chains_remmoved = [] |
1327
|
|
|
|
1328
|
|
|
new_chains = list(string.ascii_uppercase) |
1329
|
|
|
remarks = [] |
1330
|
|
|
if ignore_op3: |
1331
|
|
|
remarks.append("REMARK 250 don't add OP3 at the start of chain") |
1332
|
|
|
# get path to mini db with models to rebuilt structures |
1333
|
|
|
currfn = __file__ |
1334
|
|
|
if currfn == '': |
1335
|
|
|
path = '.' |
1336
|
|
|
else: |
1337
|
|
|
path = os.path.dirname(currfn) |
1338
|
|
|
if os.path.islink(currfn): # path + os.sep + os.path.basename(__file__)): |
1339
|
|
|
path = os.path.dirname(os.readlink( |
1340
|
|
|
path + os.sep + os.path.basename(currfn))) |
1341
|
|
|
|
1342
|
|
|
# for each chain # |
1343
|
|
|
for chain in model.get_list(): |
1344
|
|
|
logger.debug('chain: %s' % chain) |
1345
|
|
|
|
1346
|
|
|
# is it RNA? ############################ |
1347
|
|
|
protein_like = 0 |
1348
|
|
|
for c, r in enumerate(chain, 1): |
1349
|
|
|
if r.resname in AMINOACID_CODES: |
1350
|
|
|
protein_like += 1 |
1351
|
|
|
if (protein_like / float(c + 1)) > .8: # 90% |
|
|
|
|
1352
|
|
|
protein_chains_remmoved.append(chain.get_id()) |
1353
|
|
|
# ###################################### |
1354
|
|
|
|
1355
|
|
|
res = [] |
1356
|
|
|
for r in chain: |
1357
|
|
|
res.append(r) |
1358
|
|
|
|
1359
|
|
|
res = copy.copy(res) |
1360
|
|
|
|
1361
|
|
|
# start chains from A..BCD. etc |
1362
|
|
|
|
1363
|
|
|
if rename_chains: |
1364
|
|
|
try: |
1365
|
|
|
chain.id = new_chains.pop(0) |
1366
|
|
|
except ValueError: |
1367
|
|
|
# ValueError: Cannot change id from `A` to `A`. |
1368
|
|
|
# The id `A` is already used for a sibling of this entity. |
1369
|
|
|
# so keep it as it was |
1370
|
|
|
pass |
1371
|
|
|
|
1372
|
|
|
c2 = PDB.Chain.Chain(chain.id) |
1373
|
|
|
|
1374
|
|
|
c = 1 # new chain, goes from 1 !!! if renumber True |
1375
|
|
|
prev_r = '' # init prev_r |
1376
|
|
|
|
1377
|
|
|
for r in res: |
1378
|
|
|
# |
1379
|
|
|
# deal with heteratm |
1380
|
|
|
# |
1381
|
|
|
if r.id[0].startswith('H_') or r.id[0].startswith('W'): |
1382
|
|
|
if keep_hetatm: |
1383
|
|
|
c2.add(r) |
1384
|
|
|
else: |
1385
|
|
|
# str(r) % r.id[0].replace('H_', '').strip()) |
1386
|
|
|
remarks.append('REMARK 250 Hetatom to be removed from file: ' + str(r)) |
1387
|
|
|
continue |
1388
|
|
|
# hack for amber/qrna |
1389
|
|
|
r.resname = r.resname.strip() |
1390
|
|
|
if r.resname == 'RC3': |
1391
|
|
|
r.resname = 'C' |
1392
|
|
|
if r.resname == 'RU3': |
1393
|
|
|
r.resname = 'U' |
1394
|
|
|
if r.resname == 'RG3': |
1395
|
|
|
r.resname = 'G' |
1396
|
|
|
if r.resname == 'RA3': |
1397
|
|
|
r.resname = 'A' |
1398
|
|
|
|
1399
|
|
|
if r.resname == 'C3': |
1400
|
|
|
r.resname = 'C' |
1401
|
|
|
if r.resname == 'U3': |
1402
|
|
|
r.resname = 'U' |
1403
|
|
|
if r.resname == 'G3': |
1404
|
|
|
r.resname = 'G' |
1405
|
|
|
if r.resname == 'A3': |
1406
|
|
|
r.resname = 'A' |
1407
|
|
|
|
1408
|
|
|
if r.resname == 'RC5': |
1409
|
|
|
r.resname = 'C' |
1410
|
|
|
if r.resname == 'RU5': |
1411
|
|
|
r.resname = 'U' |
1412
|
|
|
if r.resname == 'RG5': |
1413
|
|
|
r.resname = 'G' |
1414
|
|
|
if r.resname == 'RA5': |
1415
|
|
|
r.resname = 'A' |
1416
|
|
|
|
1417
|
|
|
if r.resname == 'C5': |
1418
|
|
|
r.resname = 'C' |
1419
|
|
|
if r.resname == 'U5': |
1420
|
|
|
r.resname = 'U' |
1421
|
|
|
if r.resname == 'G5': |
1422
|
|
|
r.resname = 'G' |
1423
|
|
|
if r.resname == 'A5': |
1424
|
|
|
r.resname = 'A' |
1425
|
|
|
|
1426
|
|
|
if r.resname.strip() == 'RC': |
1427
|
|
|
r.resname = 'C' |
1428
|
|
|
if r.resname.strip() == 'RU': |
1429
|
|
|
r.resname = 'U' |
1430
|
|
|
if r.resname.strip() == 'RG': |
1431
|
|
|
r.resname = 'G' |
1432
|
|
|
if r.resname.strip() == 'RA': |
1433
|
|
|
r.resname = 'A' |
1434
|
|
|
|
1435
|
|
|
# unmodified rna 2MG -> G and take only G atoms |
1436
|
|
|
if (r.resname.strip() not in ['C', 'U', 'G', 'A']) and \ |
1437
|
|
|
(r.resname.strip()[-1] in ['C', 'U', 'G', 'A']): |
1438
|
|
|
r.resname = r.resname.strip()[-1].strip() |
1439
|
|
|
|
1440
|
|
|
r2 = PDB.Residue.Residue(r.id, r.resname.strip(), r.segid) |
1441
|
|
|
if renumber_residues: |
1442
|
|
|
r2.id = (r2.id[0], c, r2.id[2]) # renumber residues |
1443
|
|
|
# |
1444
|
|
|
# experimental: fixing missing OP3. |
1445
|
|
|
# Only for the first residues. |
1446
|
|
|
# |
1447
|
|
|
if c == 1: |
1448
|
|
|
# if p_missing |
1449
|
|
|
p_missing = True |
1450
|
|
|
# if p_missing: |
1451
|
|
|
# try: |
1452
|
|
|
# x = r["O5'"] |
1453
|
|
|
# x.id = ' P' |
1454
|
|
|
# x.name = ' P' |
1455
|
|
|
# x.fullname = ' P' |
1456
|
|
|
# print "REMARK 000 FIX O5' -> P fix in chain ", chain.id |
1457
|
|
|
# except: |
1458
|
|
|
# pass |
1459
|
|
|
for a in r: |
1460
|
|
|
if a.id == 'P': |
1461
|
|
|
p_missing = False |
1462
|
|
|
logger.debug('p_missing %s' % p_missing) |
1463
|
|
|
|
1464
|
|
|
if p_missing and fix_missing_atoms: |
1465
|
|
|
currfn = __file__ |
1466
|
|
|
if currfn == '': |
1467
|
|
|
path = '.' |
1468
|
|
|
else: |
1469
|
|
|
path = os.path.dirname(currfn) |
1470
|
|
|
if os.path.islink(currfn): # path + os.sep + os.path.basename(__file__)): |
1471
|
|
|
path = os.path.dirname(os.readlink( |
1472
|
|
|
path + os.sep + os.path.basename(currfn))) |
1473
|
|
|
|
1474
|
|
|
po3_struc = PDB.PDBParser().get_structure('', path + '/data/PO3_inner.pdb') |
1475
|
|
|
po3 = [po3_atom for po3_atom in po3_struc[0].get_residues()][0] |
1476
|
|
|
|
1477
|
|
|
r_atoms = [r["O4'"], r["C4'"], r["C3'"]] |
1478
|
|
|
po3_atoms = [po3["O4'"], po3["C4'"], po3["C3'"]] |
1479
|
|
|
|
1480
|
|
|
sup = PDB.Superimposer() |
1481
|
|
|
sup.set_atoms(r_atoms, po3_atoms) |
1482
|
|
|
rms = round(sup.rms, 3) |
1483
|
|
|
|
1484
|
|
|
sup.apply(po3_struc.get_atoms()) # to all atoms of po3 |
1485
|
|
|
|
1486
|
|
|
r.add(po3['P']) |
1487
|
|
|
r.add(po3['OP1']) |
1488
|
|
|
r.add(po3['OP2']) |
1489
|
|
|
try: |
1490
|
|
|
r.add(po3["O5'"]) |
1491
|
|
|
except: |
1492
|
|
|
del r["O5'"] |
1493
|
|
|
r.add(po3["O5'"]) |
1494
|
|
|
|
1495
|
|
|
fixed.append(['add OP3 at the beginning of the chain ', chain.id, r, r.id[1]]) |
1496
|
|
|
|
1497
|
|
|
p_missing = False # off this function |
1498
|
|
|
|
1499
|
|
|
# save it |
1500
|
|
|
#io = PDB.PDBIO() |
1501
|
|
|
#io.set_structure( po3_struc ) |
1502
|
|
|
# io.save("po3.pdb") |
1503
|
|
|
|
1504
|
|
|
# |
1505
|
|
|
# fix missing O2' |
1506
|
|
|
# |
1507
|
|
|
o2p_missing = True |
1508
|
|
|
for a in r: |
1509
|
|
|
logger.debug('o2p_missing: %s %s %s' % (r, o2p_missing, a.id)) |
1510
|
|
|
if a.id == "O2'": |
1511
|
|
|
o2p_missing = False |
1512
|
|
|
logger.debug('o2p_missing: %s', o2p_missing) |
1513
|
|
|
|
1514
|
|
|
if o2p_missing and fix_missing_atoms: |
1515
|
|
|
currfn = __file__ |
1516
|
|
|
if currfn == '': |
1517
|
|
|
path = '.' |
1518
|
|
|
else: |
1519
|
|
|
path = os.path.dirname(currfn) |
1520
|
|
|
if os.path.islink(currfn): # path + os.sep + os.path.basename(__file__)): |
1521
|
|
|
path = os.path.dirname(os.readlink( |
1522
|
|
|
path + os.sep + os.path.basename(currfn))) |
1523
|
|
|
|
1524
|
|
|
o2p_struc = PDB.PDBParser().get_structure('', path + '/data/o2prim.pdb') |
1525
|
|
|
o2p = [o2p_atom for o2p_atom in o2p_struc[0].get_residues()][0] |
1526
|
|
|
|
1527
|
|
|
try: |
1528
|
|
|
r_atoms = [r["C3'"], r["C2'"], r["C1'"]] |
1529
|
|
|
except: |
1530
|
|
|
ic(r) |
1531
|
|
|
o2p_atoms = [o2p["C3'"], o2p["C2'"], o2p["C1'"]] |
1532
|
|
|
|
1533
|
|
|
sup = PDB.Superimposer() |
1534
|
|
|
sup.set_atoms(r_atoms, o2p_atoms) |
1535
|
|
|
rms = round(sup.rms, 3) |
1536
|
|
|
|
1537
|
|
|
sup.apply(o2p_struc.get_atoms()) # to all atoms of o2p |
1538
|
|
|
|
1539
|
|
|
r.add(o2p["O2'"]) |
1540
|
|
|
logger.debug('fixing o2p for ' % r) |
1541
|
|
|
fixed.append(['add O2\' ', chain.id, r, c]) |
1542
|
|
|
|
1543
|
|
|
o2p_missing = False # off this function |
1544
|
|
|
######## fixing of missing OP1 and OP2 atoms in backbone ########### |
1545
|
|
|
if fix_missing_atoms: |
1546
|
|
|
if 'OP1' not in r: |
1547
|
|
|
if prev_r: |
1548
|
|
|
op4 = PDB.PDBParser().get_structure('', path + '/data/op4.pdb') |
1549
|
|
|
op4a = [a for a in op4[0].get_residues()][0] |
1550
|
|
|
if 'P' not in r: |
1551
|
|
|
print("Error missing P of residue " + chain.id + ':' + str(r.id[1]) + ". Sorry, I can't rebuild missing atoms!") |
1552
|
|
|
exit() |
1553
|
|
|
r_atoms = [r["O5'"], r["P"], prev_r["O3'"]] # r["C5'"], |
1554
|
|
|
op4_atoms = [op4a["O5'"], op4a["P"], op4a["O3'"]] # op4a["C5'"]] #, |
1555
|
|
|
|
1556
|
|
|
sup = PDB.Superimposer() |
1557
|
|
|
sup.set_atoms(r_atoms, op4_atoms) |
1558
|
|
|
sup.apply(op4.get_atoms()) |
1559
|
|
|
|
1560
|
|
|
r.add(op4a["OP1"]) |
1561
|
|
|
r.add(op4a["OP2"]) |
1562
|
|
|
fixed.append(['add the missing OP1 and OP2', chain.id, r, r.id[1]]) |
1563
|
|
|
|
1564
|
|
|
else: |
1565
|
|
|
remarks.append("REMARK 250 Warning: Can't fix missing OP1 or/and OP2 atoms of " + str(r)) |
1566
|
|
|
# |
1567
|
|
|
# fix missing C (the whole base at the moment) |
1568
|
|
|
# |
1569
|
|
View Code Duplication |
if str(r.get_resname()).strip() == "C" and fix_missing_atoms: |
|
|
|
|
1570
|
|
|
for a in r: |
1571
|
|
|
if a.id == "N1": |
1572
|
|
|
break |
1573
|
|
|
else: |
1574
|
|
|
C_struc = PDB.PDBParser().get_structure('', path + '/data/C.pdb') |
1575
|
|
|
C = [C_atom for C_atom in C_struc[0].get_residues()][0] |
1576
|
|
|
|
1577
|
|
|
r_atoms = [r["O4'"], r["C2'"], r["C1'"]] |
1578
|
|
|
C_atoms = [C["O4'"], C["C2'"], C["C1'"]] |
1579
|
|
|
|
1580
|
|
|
sup = PDB.Superimposer() |
1581
|
|
|
sup.set_atoms(r_atoms, C_atoms) |
1582
|
|
|
rms = round(sup.rms, 3) |
1583
|
|
|
|
1584
|
|
|
sup.apply(C_struc.get_atoms()) # to all atoms of C |
1585
|
|
|
|
1586
|
|
|
r.add(C["N1"]) |
1587
|
|
|
r.add(C["C2"]) |
1588
|
|
|
r.add(C["O2"]) |
1589
|
|
|
r.add(C["N3"]) |
1590
|
|
|
r.add(C["C4"]) |
1591
|
|
|
r.add(C["N4"]) |
1592
|
|
|
r.add(C["C5"]) |
1593
|
|
|
r.add(C["C6"]) |
1594
|
|
|
|
1595
|
|
|
fixed.append(['add the whole base C', chain.id, r, r.id[1]]) |
1596
|
|
|
|
1597
|
|
|
# |
1598
|
|
|
# fix missing U (the whole base at the moment) |
1599
|
|
|
# |
1600
|
|
View Code Duplication |
if str(r.get_resname()).strip() == "U" and fix_missing_atoms: |
|
|
|
|
1601
|
|
|
for a in r: |
1602
|
|
|
if a.id == "N1": |
1603
|
|
|
break |
1604
|
|
|
else: |
1605
|
|
|
U_struc = PDB.PDBParser().get_structure('', path + '/data/U.pdb') |
1606
|
|
|
U = [U_atom for U_atom in U_struc[0].get_residues()][0] |
1607
|
|
|
|
1608
|
|
|
r_atoms = [r["O4'"], r["C2'"], r["C1'"]] |
1609
|
|
|
U_atoms = [U["O4'"], U["C2'"], U["C1'"]] |
1610
|
|
|
|
1611
|
|
|
sup = PDB.Superimposer() |
1612
|
|
|
sup.set_atoms(r_atoms, U_atoms) |
1613
|
|
|
rms = round(sup.rms, 3) |
1614
|
|
|
|
1615
|
|
|
sup.apply(U_struc.get_atoms()) # to all atoms of U |
1616
|
|
|
|
1617
|
|
|
r.add(U["N1"]) |
1618
|
|
|
r.add(U["C2"]) |
1619
|
|
|
r.add(U["O2"]) |
1620
|
|
|
r.add(U["N3"]) |
1621
|
|
|
r.add(U["C4"]) |
1622
|
|
|
r.add(U["O4"]) |
1623
|
|
|
r.add(U["C5"]) |
1624
|
|
|
r.add(U["C6"]) |
1625
|
|
|
|
1626
|
|
|
fixed.append(['add the whole base U', chain.id, r, r.id[1]]) |
1627
|
|
|
# |
1628
|
|
|
# fix missing G (the whole base at the moment) |
1629
|
|
|
# |
1630
|
|
View Code Duplication |
if str(r.get_resname()).strip() == "G" and fix_missing_atoms: |
|
|
|
|
1631
|
|
|
for a in r: |
1632
|
|
|
if a.id == "N1": |
1633
|
|
|
break |
1634
|
|
|
else: |
1635
|
|
|
G_struc = PDB.PDBParser().get_structure('', path + '/data/G.pdb') |
1636
|
|
|
G = [G_atom for G_atom in G_struc[0].get_residues()][0] |
1637
|
|
|
|
1638
|
|
|
r_atoms = [r["O4'"], r["C2'"], r["C1'"]] |
1639
|
|
|
G_atoms = [G["O4'"], G["C2'"], G["C1'"]] |
1640
|
|
|
|
1641
|
|
|
sup = PDB.Superimposer() |
1642
|
|
|
sup.set_atoms(r_atoms, G_atoms) |
1643
|
|
|
rms = round(sup.rms, 3) |
1644
|
|
|
|
1645
|
|
|
sup.apply(G_struc.get_atoms()) # to all atoms of G |
1646
|
|
|
|
1647
|
|
|
r.add(G["N9"]) |
1648
|
|
|
r.add(G["C8"]) |
1649
|
|
|
r.add(G["N7"]) |
1650
|
|
|
r.add(G["C5"]) |
1651
|
|
|
r.add(G["C6"]) |
1652
|
|
|
r.add(G["O6"]) |
1653
|
|
|
r.add(G["N1"]) |
1654
|
|
|
r.add(G["C2"]) |
1655
|
|
|
r.add(G["N2"]) |
1656
|
|
|
r.add(G["N3"]) |
1657
|
|
|
r.add(G["C4"]) |
1658
|
|
|
|
1659
|
|
|
fixed.append(['add the whole base G', chain.id, r, r.id[1]]) |
1660
|
|
|
# |
1661
|
|
|
# fix missing A (the whole base at the moment) |
1662
|
|
|
# |
1663
|
|
View Code Duplication |
if str(r.get_resname()).strip() == "A" and fix_missing_atoms: |
|
|
|
|
1664
|
|
|
for a in r: |
1665
|
|
|
if a.id == "N1": |
1666
|
|
|
break |
1667
|
|
|
else: |
1668
|
|
|
A_struc = PDB.PDBParser().get_structure('', path + '/data/A.pdb') |
1669
|
|
|
A = [A_atom for A_atom in A_struc[0].get_residues()][0] |
1670
|
|
|
|
1671
|
|
|
r_atoms = [r["O4'"], r["C2'"], r["C1'"]] |
1672
|
|
|
A_atoms = [A["O4'"], A["C2'"], A["C1'"]] |
1673
|
|
|
|
1674
|
|
|
sup = PDB.Superimposer() |
1675
|
|
|
sup.set_atoms(r_atoms, A_atoms) |
1676
|
|
|
rms = round(sup.rms, 3) |
1677
|
|
|
|
1678
|
|
|
sup.apply(A_struc.get_atoms()) # to all atoms of A |
1679
|
|
|
|
1680
|
|
|
r.add(A["N9"]) |
1681
|
|
|
r.add(A["C8"]) |
1682
|
|
|
r.add(A["N7"]) |
1683
|
|
|
r.add(A["C5"]) |
1684
|
|
|
r.add(A["C6"]) |
1685
|
|
|
r.add(A["N6"]) |
1686
|
|
|
r.add(A["N1"]) |
1687
|
|
|
r.add(A["C2"]) |
1688
|
|
|
r.add(A["N3"]) |
1689
|
|
|
r.add(A["C4"]) |
1690
|
|
|
|
1691
|
|
|
fixed.append(['add the whole base A', chain.id, r, r.id[1]]) |
1692
|
|
|
|
1693
|
|
|
# |
1694
|
|
|
# strip residues of extra atoms, not in G_ATOMS in this case |
1695
|
|
|
# |
1696
|
|
|
if str(r.get_resname()).strip() == "G": |
1697
|
|
|
for an in G_ATOMS: |
1698
|
|
|
if c == 1 and ignore_op3: |
1699
|
|
|
if an in ['P', 'OP1', 'OP2']: |
1700
|
|
|
continue |
1701
|
|
|
try: |
1702
|
|
|
if c == 1 and an == "O5'" and p_missing: |
|
|
|
|
1703
|
|
|
r2.add(x) |
|
|
|
|
1704
|
|
|
else: |
1705
|
|
|
r2.add(r[an]) |
1706
|
|
|
except KeyError: |
1707
|
|
|
# print 'Missing:', an, r, ' new resi', c |
1708
|
|
|
missing.append([an, chain.id, r, r.id[1]]) |
1709
|
|
|
c2.add(r2) |
1710
|
|
|
|
1711
|
|
|
elif str(r.get_resname()).strip() == "A": |
1712
|
|
|
for an in A_ATOMS: |
1713
|
|
|
if c == 1 and ignore_op3: |
1714
|
|
|
if an in ['P', 'OP1', 'OP2']: |
1715
|
|
|
continue |
1716
|
|
|
try: |
1717
|
|
|
if c == 1 and an == "O5'" and p_missing: |
1718
|
|
|
r2.add(x) |
1719
|
|
|
else: |
1720
|
|
|
r2.add(r[an]) |
1721
|
|
|
except KeyError: |
1722
|
|
|
# print 'Missing:', an, r, ' new resi', c |
1723
|
|
|
missing.append([an, chain.id, r, r.id[1]]) |
1724
|
|
|
c2.add(r2) |
1725
|
|
|
|
1726
|
|
|
elif str(r.get_resname()).strip() == "C": |
1727
|
|
|
for an in C_ATOMS: |
1728
|
|
|
if c == 1 and ignore_op3: |
1729
|
|
|
if an in ['P', 'OP1', 'OP2']: |
1730
|
|
|
continue |
1731
|
|
|
try: |
1732
|
|
|
if c == 1 and an == "O5'" and p_missing: |
1733
|
|
|
r2.add(x) |
1734
|
|
|
else: |
1735
|
|
|
r2.add(r[an]) |
1736
|
|
|
except: |
1737
|
|
|
# print 'Missing:', an, r, ' new resi', c |
1738
|
|
|
missing.append([an, chain.id, r, r.id[1]]) |
1739
|
|
|
c2.add(r2) |
1740
|
|
|
|
1741
|
|
|
elif str(r.get_resname()).strip() == "U": |
1742
|
|
|
for an in U_ATOMS: |
1743
|
|
|
if c == 1 and ignore_op3: |
1744
|
|
|
if an in ['P', 'OP1', 'OP2']: |
1745
|
|
|
continue |
1746
|
|
|
try: |
1747
|
|
|
if c == 1 and an == "O5'" and p_missing: |
1748
|
|
|
r2.add(x) |
1749
|
|
|
else: |
1750
|
|
|
r2.add(r[an]) |
1751
|
|
|
except KeyError: |
1752
|
|
|
# print 'Missing:', an, r,' new resi', c |
1753
|
|
|
missing.append([an, chain.id, r, r.id[1]]) |
1754
|
|
|
c2.add(r2) |
1755
|
|
|
|
1756
|
|
|
|
1757
|
|
|
# save each residue a sa single ife |
1758
|
|
|
if save_single_res: |
1759
|
|
|
from Bio.PDB import Structure |
1760
|
|
|
# Create an empty structure |
1761
|
|
|
structure = Structure.Structure("my_structure") |
1762
|
|
|
model = Model.Model(0) |
|
|
|
|
1763
|
|
|
chain = Chain.Chain('A') |
|
|
|
|
1764
|
|
|
chain.add(r2) |
1765
|
|
|
model.add(chain) |
1766
|
|
|
structure.add(model) |
1767
|
|
|
|
1768
|
|
|
io = PDBIO() |
1769
|
|
|
io.set_structure(structure) |
1770
|
|
|
io.save(f"{self.fn}_{r2.id[1]}.pdb") |
1771
|
|
|
|
1772
|
|
|
prev_r = r # hack to keep preview residue to be used in the function |
1773
|
|
|
# e.g., to get an atom from this residue |
1774
|
|
|
c += 1 |
1775
|
|
|
chains2.append(c2) |
1776
|
|
|
|
1777
|
|
|
io = PDBIO() |
1778
|
|
|
s2.add(m2) |
1779
|
|
|
for chain2 in chains2: |
1780
|
|
|
m2.add(chain2) |
1781
|
|
|
# print c2 |
1782
|
|
|
# print m2 |
1783
|
|
|
io.set_structure(s2) |
1784
|
|
|
|
1785
|
|
|
tf = tempfile.NamedTemporaryFile(delete=False) |
1786
|
|
|
fout = tf.name |
1787
|
|
|
io.save(fout) |
1788
|
|
|
|
1789
|
|
|
if fixed: |
1790
|
|
|
remarks.append('REMARK 250 Fixed atoms/residues:') |
1791
|
|
|
for i in fixed: |
1792
|
|
|
remarks.append( |
1793
|
|
|
' '.join(['REMARK 250 -', str(i[0]), 'in chain:', str(i[1]), str(i[2]), 'residue #', str(i[3])])) |
1794
|
|
|
|
1795
|
|
|
if missing and report_missing_atoms: |
1796
|
|
|
remarks.append('REMARK 250 Missing atoms:') |
1797
|
|
|
for i in missing: |
1798
|
|
|
remarks.append(' '.join(['REMARK 250 +', str(i[0]), |
1799
|
|
|
'chain:', str(i[1]), str(i[2]), 'residue #', str(i[3])])) |
1800
|
|
|
#raise Exception('Missing atoms in %s' % self.fn) |
1801
|
|
|
|
1802
|
|
|
if protein_chains_remmoved: |
1803
|
|
|
remarks.append('REMARK 250 Chains that seem to be proteins removed and : ' + |
1804
|
|
|
' '.join(protein_chains_remmoved)) |
1805
|
|
|
# |
1806
|
|
|
|
1807
|
|
|
# fix ter 'TER' -> TER 1528 G A 71 |
1808
|
|
|
# |
1809
|
|
|
s = RNAStructure(fout) |
1810
|
|
|
if check_geometry: |
1811
|
|
|
s.check_geometry() |
1812
|
|
|
self.lines = s.lines |
1813
|
|
|
c = 0 |
1814
|
|
|
# ATOM 1527 C4 G A 71 0.000 0.000 0.000 1.00 0.00 C |
1815
|
|
|
nlines = [] |
1816
|
|
|
no_ters = 0 |
1817
|
|
|
for l in self.lines: |
1818
|
|
|
## align atoms to the left ####################################################### |
1819
|
|
|
# ATOM 3937 P C B 185 11.596 -7.045 26.165 1.00 0.00 P |
1820
|
|
|
# ATOM 3937 P C B 185 11.596 -7.045 26.165 1.00 0.00 P |
1821
|
|
|
if l.startswith('ATOM'): |
1822
|
|
|
atom_code = self.get_atom_code(l) |
1823
|
|
|
l = self.set_atom_code(l, atom_code) |
1824
|
|
|
################################################################################## |
1825
|
|
|
if l.startswith('TER'): |
1826
|
|
|
# pass # leave it for now this |
1827
|
|
|
atom_l = self.lines[c - 1] |
1828
|
|
|
new_l = 'TER'.ljust(80) # TER 1528 G A 71 <<<' |
1829
|
|
|
new_l = self.set_atom_index(new_l, str(self.get_atom_index(atom_l) + 1 + no_ters)) |
1830
|
|
|
new_l = self.set_res_code(new_l, self.get_res_code(atom_l)) |
1831
|
|
|
new_l = self.set_chain_id(new_l, self.get_chain_id(atom_l)) |
1832
|
|
|
new_l = self.set_res_index(new_l, self.get_res_index(atom_l)) |
1833
|
|
|
nlines.append(new_l) |
1834
|
|
|
#nlines.append(l) |
1835
|
|
|
no_ters += 1 |
1836
|
|
|
else: |
1837
|
|
|
if self.get_atom_index(l): |
1838
|
|
|
l = self.set_atom_index(l, self.get_atom_index( |
1839
|
|
|
l) + no_ters) # 1 ter +1 2 ters +2 etc |
1840
|
|
|
nlines.append(l) |
1841
|
|
|
c += 1 |
1842
|
|
|
self.lines = nlines |
1843
|
|
|
return remarks |
1844
|
|
|
|
1845
|
|
|
def set_occupancy_atoms(self, occupancy): |
1846
|
|
|
""" |
1847
|
|
|
:param occupancy: |
1848
|
|
|
""" |
1849
|
|
|
nlines = [] |
1850
|
|
|
for l in self.lines: |
1851
|
|
|
if l.startswith('ATOM'): |
1852
|
|
|
l = self.set_atom_occupancy(l, 0.00) |
1853
|
|
|
nlines.append(l) |
1854
|
|
|
else: |
1855
|
|
|
nlines.append(l) |
1856
|
|
|
self.lines = nlines |
1857
|
|
|
|
1858
|
|
|
def edit_occupancy_of_pdb(txt, pdb, pdb_out, v=False): |
1859
|
|
|
"""Make all atoms 1 (flexi) and then set occupancy 0 for seletected atoms. |
1860
|
|
|
Return False if error. True if OK |
1861
|
|
|
""" |
1862
|
|
|
struc = PDB.PDBParser().get_structure('struc', pdb) |
|
|
|
|
1863
|
|
|
|
1864
|
|
|
txt = txt.replace(' ', '') |
1865
|
|
|
if v: |
1866
|
|
|
print (txt) |
1867
|
|
|
l = re.split('[,:;]', txt) |
1868
|
|
|
if v: |
1869
|
|
|
print (l) |
1870
|
|
|
|
1871
|
|
|
for s in struc: |
1872
|
|
|
for c in s: |
1873
|
|
|
for r in c: |
1874
|
|
|
for a in r: |
1875
|
|
|
a.set_occupancy(1) # make it flaxi |
1876
|
|
|
|
1877
|
|
|
for i in l: # ['A', '1-10', '15', '25-30', 'B', '1-10'] |
1878
|
|
|
|
1879
|
|
|
if i in string.ascii_letters: |
1880
|
|
|
if v: |
1881
|
|
|
print('chain', i) |
1882
|
|
|
chain_curr = i |
1883
|
|
|
continue |
1884
|
|
|
|
1885
|
|
|
if i.find('-') > -1: |
1886
|
|
|
start, ends = i.split('-') |
1887
|
|
|
if start > ends: |
1888
|
|
|
print('Error: range start > end ' + i) # >>sys.stderr |
1889
|
|
|
return False |
1890
|
|
|
index = list(range(int(start), int(ends) + 1)) |
1891
|
|
|
else: |
1892
|
|
|
index = [int(i)] |
1893
|
|
|
|
1894
|
|
|
for i in index: |
1895
|
|
|
# change b_factor |
1896
|
|
|
try: |
1897
|
|
|
atoms = struc[0][chain_curr][i] |
|
|
|
|
1898
|
|
|
except KeyError: |
1899
|
|
|
if i == chain_curr: |
1900
|
|
|
print('Error: Chain ' + chain_curr + |
1901
|
|
|
' not found in the PDB structure') # >>sys.stderr, |
1902
|
|
|
else: |
1903
|
|
|
print('Error: Residue ' + chain_curr + ':' + str(i) + |
1904
|
|
|
' found in the PDB structure') # >>sys.stderr, |
1905
|
|
|
return False |
1906
|
|
|
for a in atoms: |
1907
|
|
|
a.set_occupancy(0) |
1908
|
|
|
|
1909
|
|
|
io = PDBIO() |
|
|
|
|
1910
|
|
|
io.set_structure(struc) |
1911
|
|
|
io.save(pdb_out) |
1912
|
|
|
print('Saved ', pdb_out) |
1913
|
|
|
return True |
1914
|
|
|
|
1915
|
|
|
def view(self): |
1916
|
|
|
os.system('pymol ' + self.fn) |
1917
|
|
|
|
1918
|
|
|
def remove(self, verbose): |
1919
|
|
|
"""Delete file, self.fn""" |
1920
|
|
|
os.remove(self.fn) |
1921
|
|
|
if verbose: |
1922
|
|
|
'File %s removed' % self.fn |
1923
|
|
|
|
1924
|
|
|
def __repr__(self): |
1925
|
|
|
return 'RNAStructure %s' % self.fn |
1926
|
|
|
|
1927
|
|
|
|
1928
|
|
|
def get_remarks_text(self): |
1929
|
|
|
"""Get remarks as text for given file. |
1930
|
|
|
This function re-open files, as define as self.fn to get remarks. |
1931
|
|
|
|
1932
|
|
|
Example:: |
1933
|
|
|
|
1934
|
|
|
r = RNAStructure(fout) |
1935
|
|
|
remarks = r.get_remarks_txt() |
1936
|
|
|
r1 = r.get_res_txt('A', 1) |
1937
|
|
|
r2 = r.get_res_txt('A', 2) |
1938
|
|
|
r3 = r.get_res_txt('A', 3) |
1939
|
|
|
with open(fout, 'w') as f: |
1940
|
|
|
f.write(remarks) |
1941
|
|
|
f.write(r1) |
1942
|
|
|
f.write(r2) |
1943
|
|
|
f.write(r3) |
1944
|
|
|
|
1945
|
|
|
remarks is |
1946
|
|
|
|
1947
|
|
|
REMARK 250 Model edited with rna-tools |
1948
|
|
|
REMARK 250 ver 3.5.4+63.g4338516.dirty |
1949
|
|
|
REMARK 250 https://github.com/mmagnus/rna-tools |
1950
|
|
|
REMARK 250 Fri Nov 13 10:15:19 2020 |
1951
|
|
|
|
1952
|
|
|
""" |
1953
|
|
|
txt = '' |
1954
|
|
|
for l in open(self.fn): |
1955
|
|
|
if l.startswith("REMARK"): |
1956
|
|
|
txt += l |
1957
|
|
|
return txt |
1958
|
|
|
|
1959
|
|
|
def get_res_text(self, chain_id, resi): |
1960
|
|
|
"""Get a residue of given resi of chain_id and return as a text |
1961
|
|
|
|
1962
|
|
|
Args: |
1963
|
|
|
chain_id (str): e.g., 'A' |
1964
|
|
|
resi (int): e.g., 1 |
1965
|
|
|
|
1966
|
|
|
Returns: |
1967
|
|
|
txt: |
1968
|
|
|
|
1969
|
|
|
Example:: |
1970
|
|
|
|
1971
|
|
|
r = RNAStructure(fn) |
1972
|
|
|
print(r.get_res_txt('A', 1)) |
1973
|
|
|
|
1974
|
|
|
ATOM 1 O5' G A 1 78.080 -14.909 -0.104 1.00 9.24 O |
1975
|
|
|
ATOM 2 C5' G A 1 79.070 -15.499 -0.956 1.00 9.70 C |
1976
|
|
|
ATOM 3 C4' G A 1 78.597 -16.765 -1.648 1.00 9.64 C |
1977
|
|
|
ATOM 4 O4' G A 1 78.180 -17.761 -0.672 1.00 9.88 O |
1978
|
|
|
(...) |
1979
|
|
|
|
1980
|
|
|
""" |
1981
|
|
|
txt = '' |
1982
|
|
|
for l in self.lines: |
1983
|
|
|
if l.startswith("ATOM"): |
1984
|
|
|
if self.get_res_num(l) == resi and self.get_chain_id(l) == chain_id: |
1985
|
|
|
txt += l + '\n' |
1986
|
|
|
return txt |
1987
|
|
|
|
1988
|
|
|
def mq(self, method="RASP", verbose=False): |
1989
|
|
|
|
1990
|
|
|
if method.lower() == "rasp": |
1991
|
|
|
from rna_tools.tools.mq.RASP import RASP |
1992
|
|
|
r = RASP.RASP() |
1993
|
|
|
return [float(i) for i in r.run(self.fn, potentials=['all'])] |
1994
|
|
|
# ['-42058.4', '466223', '-0.0902109', '0', '0', '0'] |
1995
|
|
|
|
1996
|
|
|
if method.lower() == "dfire": |
1997
|
|
|
from rna_tools.tools.mq.Dfire import Dfire |
1998
|
|
|
r = Dfire.Dfire() |
1999
|
|
|
return r.run(self.fn, verbose) |
2000
|
|
|
|
2001
|
|
|
if method.lower() == "rna3dcnn": |
2002
|
|
|
from rna_tools.tools.mq.RNA3DCNN import RNA3DCNN |
2003
|
|
|
r = RNA3DCNN.RNA3DCNN() |
2004
|
|
|
return r.run(self.fn, verbose) |
2005
|
|
|
|
2006
|
|
|
if method.lower() == "qrna": |
2007
|
|
|
from rna_tools.tools.mq.QRNA import QRNA |
2008
|
|
|
r = QRNA.QRNA() |
2009
|
|
|
return r.run(self.fn, verbose) |
2010
|
|
|
|
2011
|
|
|
if method.lower() == "clashscore": |
2012
|
|
|
from rna_tools.tools.mq.ClashScore import ClashScore |
2013
|
|
|
r = ClashScore.ClashScore() |
2014
|
|
|
return r.run(self.fn, verbose) |
2015
|
|
|
|
2016
|
|
|
if method.lower() == "analyzegeometry": |
2017
|
|
|
from rna_tools.tools.mq.AnalyzeGeometry import AnalyzeGeometry |
2018
|
|
|
r = AnalyzeGeometry.AnalyzeGeometry() |
2019
|
|
|
return r.run(self.fn, verbose) |
2020
|
|
|
|
2021
|
|
|
raise MethodUnknown('Define corrent method') |
2022
|
|
|
|
2023
|
|
|
def get_empty_line(self): |
2024
|
|
|
l = "ATOM 1 C5' A A 3 25.674 19.091 3.459 1.00 1.00 X" |
2025
|
|
|
return l |
2026
|
|
|
|
2027
|
|
|
def add_line(self, l): |
2028
|
|
|
self.lines.append(l) |
2029
|
|
|
|
2030
|
|
|
def add_header(version=None): |
2031
|
|
|
now = time.strftime("%c") |
2032
|
|
|
txt = 'REMARK 250 Model edited with rna-tools\n' |
2033
|
|
|
txt += 'REMARK 250 ver %s \nREMARK 250 https://github.com/mmagnus/rna-tools \nREMARK 250 %s' % ( |
2034
|
|
|
version, now) |
2035
|
|
|
return txt |
2036
|
|
|
|
2037
|
|
|
|
2038
|
|
|
def edit_pdb(f, args): |
2039
|
|
|
"""Edit your structure. |
2040
|
|
|
|
2041
|
|
|
The function can take ``A:3-21>A:1-19`` or even syntax like this |
2042
|
|
|
``A:3-21>A:1-19,B:22-32>B:20-30`` and will do an editing. |
2043
|
|
|
|
2044
|
|
|
The output is printed, line by line. Only ATOM lines are edited! |
2045
|
|
|
|
2046
|
|
|
Examples:: |
2047
|
|
|
|
2048
|
|
|
$ rna_pdb_tools.py --edit 'A:3-21>A:1-19' 1f27_clean.pdb > 1f27_clean_A1-19.pdb |
2049
|
|
|
|
2050
|
|
|
or even:: |
2051
|
|
|
|
2052
|
|
|
$ rna_pdb_tools.py --edit 'A:3-21>A:1-19,B:22-32>B:20-30' 1f27_clean.pdb > 1f27_clean_renumb.pdb |
2053
|
|
|
|
2054
|
|
|
""" |
2055
|
|
|
# open a new file |
2056
|
|
|
s = RNAStructure(f) |
2057
|
|
|
output = '' |
2058
|
|
|
if not args.no_hr: |
2059
|
|
|
output += add_header() + '\n' |
2060
|
|
|
output += 'REMARK 250 HEADER --edit ' + args.edit + '\n' |
2061
|
|
|
|
2062
|
|
|
# --edit 'A:3-21>A:1-19,B:22-32>B:20-30' |
2063
|
|
|
if args.edit.find(',') > -1: |
2064
|
|
|
# more than one edits |
2065
|
|
|
edits = args.edit.split(',') # ['A:3-21>A:1-19', 'B:22-32>B:20-30'] |
2066
|
|
|
selects = [] |
2067
|
|
|
for e in edits: |
2068
|
|
|
selection_from, selection_to = select_pdb_fragment( |
2069
|
|
|
e.split('>')[0]), select_pdb_fragment(e.split('>')[1]) |
2070
|
|
|
if len(selection_to) != len(selection_from): |
2071
|
|
|
raise Exception('len(selection_to) != len(selection_from)') |
2072
|
|
|
selects.append([selection_from, selection_to]) |
2073
|
|
|
else: |
2074
|
|
|
# one edit |
2075
|
|
|
e = args.edit |
2076
|
|
|
selection_from, selection_to = select_pdb_fragment( |
2077
|
|
|
e.split('>')[0]), select_pdb_fragment(e.split('>')[1]) |
2078
|
|
|
if len(selection_to) != len(selection_from): |
2079
|
|
|
raise Exception('len(selection_to) != len(selection_from)') |
2080
|
|
|
selects = [[selection_from, selection_to]] |
2081
|
|
|
|
2082
|
|
|
# go ever all edits: ['A:3-21>A:1-19','B:22-32>B:20-30'] |
2083
|
|
|
for l in s.lines: |
2084
|
|
|
if l.startswith('ATOM'): |
2085
|
|
|
# get chain and resi |
2086
|
|
|
chain = l[21:22].strip() |
2087
|
|
|
resi = int(l[22:26].strip()) |
2088
|
|
|
|
2089
|
|
|
if_selected_dont_print = False |
2090
|
|
|
# for selections |
2091
|
|
|
for select in selects: |
2092
|
|
|
selection_from, selection_to = select |
2093
|
|
|
if chain in selection_from: |
2094
|
|
|
if resi in selection_from[chain]: |
2095
|
|
|
# [1,2,3] mapping from [4,5,10], you want to know how to map 1 |
2096
|
|
|
# 1 is [0] element of first list, so you have to index first list |
2097
|
|
|
# to get 0, with this 0 you can get 4 out of second list [4,5,10][0] -> 4 |
2098
|
|
|
nl = list(l) |
2099
|
|
|
chain_new = list(selection_to.keys())[0] # chain form second list |
2100
|
|
|
nl[21] = chain_new # new chain |
2101
|
|
|
index = selection_from[chain].index(int(resi)) # get index of 1 |
2102
|
|
|
resi_new = str(selection_to[chain_new][index]).rjust( |
2103
|
|
|
4) # 'A' [1,2,3] -> ' 1' |
2104
|
|
|
nl[22:26] = resi_new |
2105
|
|
|
nl = ''.join(nl) |
2106
|
|
|
if_selected_dont_print = True |
2107
|
|
|
output += nl + '\n' |
2108
|
|
|
if not if_selected_dont_print: |
2109
|
|
|
output += l + '\n' |
2110
|
|
|
else: # if not atom |
2111
|
|
|
output += l + '\n' |
2112
|
|
|
return output |
2113
|
|
|
|
2114
|
|
|
|
2115
|
|
|
def collapsed_view(args): |
2116
|
|
|
"""Collapsed view of pdb file. Only lines with C5' atoms are shown and TER, MODEL, END. |
2117
|
|
|
|
2118
|
|
|
example:: |
2119
|
|
|
|
2120
|
|
|
[mm] rna_tools git:(master) $ python rna-pdb-tools.py --cv input/1f27.pdb |
2121
|
|
|
ATOM 1 C5' A A 3 25.674 19.091 3.459 1.00 16.99 C |
2122
|
|
|
ATOM 23 C5' C A 4 19.700 19.206 5.034 1.00 12.65 C |
2123
|
|
|
ATOM 43 C5' C A 5 14.537 16.130 6.444 1.00 8.74 C |
2124
|
|
|
ATOM 63 C5' G A 6 11.726 11.579 9.544 1.00 9.81 C |
2125
|
|
|
ATOM 86 C5' U A 7 12.007 7.281 13.726 1.00 11.35 C |
2126
|
|
|
ATOM 106 C5' C A 8 12.087 6.601 18.999 1.00 12.74 C |
2127
|
|
|
TER""" |
2128
|
|
|
r = RNAStructure(args.file) |
2129
|
|
|
for l in r.lines: |
2130
|
|
|
at = r.get_atom_code(l) |
2131
|
|
|
if at == "P": # C5'": |
2132
|
|
|
print(l) |
2133
|
|
|
if l.startswith('TER') or l.startswith('MODEL') or l.startswith('END'): |
2134
|
|
|
print(l) |
2135
|
|
|
|
2136
|
|
|
|
2137
|
|
|
def fetch(pdb_id, path="."): |
2138
|
|
|
"""fetch pdb file from RCSB.org |
2139
|
|
|
https://files.rcsb.org/download/1Y26.pdb |
2140
|
|
|
|
2141
|
|
|
Args: |
2142
|
|
|
- pdb_id, but also a chain can be specified, 1jj2:A+B+C |
2143
|
|
|
|
2144
|
|
|
Returns: |
2145
|
|
|
- a path to a file |
2146
|
|
|
|
2147
|
|
|
TODO: na_pdb_tools.py --extract A:1-25+B:30-57 1jj2.pdb""" |
2148
|
|
|
|
2149
|
|
|
chains = '' |
2150
|
|
|
pdb_id = pdb_id.replace('_', ':') # to accept also 1jj2_A |
2151
|
|
|
if ':' in pdb_id: |
2152
|
|
|
pdb_id, chains = pdb_id.split(':') # >>> 'X:A+B+C'.split(':') ['X', 'A+B+C'] |
2153
|
|
|
|
2154
|
|
|
if pdb_id == 'rp': |
2155
|
|
|
os.system('wget https://github.com/RNA-Puzzles/standardized_dataset/archive/master.tar.gz -O - | tar -xz') |
2156
|
|
|
return |
2157
|
|
|
|
2158
|
|
|
import urllib3 |
2159
|
|
|
http = urllib3.PoolManager() |
2160
|
|
|
|
2161
|
|
|
# try: |
2162
|
|
|
pdb_id = pdb_id.replace('.pdb', '') |
2163
|
|
|
response = http.request('GET', 'https://files.rcsb.org/download/' + pdb_id + '.pdb') |
2164
|
|
|
if not response.status == 200: |
2165
|
|
|
raise PDBFetchError() |
2166
|
|
|
|
2167
|
|
|
# except urllib3.HTTPError: |
2168
|
|
|
# raise Exception('The PDB does not exists: ' + pdb_id) |
2169
|
|
|
txt = response.data |
2170
|
|
|
|
2171
|
|
|
if path != '.': |
2172
|
|
|
npath = path + os.sep + pdb_id + '.pdb' |
2173
|
|
|
else: |
2174
|
|
|
npath = pdb_id + '.pdb' |
2175
|
|
|
print('downloading... ' + npath) |
2176
|
|
|
with open(npath, 'wb') as f: |
2177
|
|
|
f.write(txt) |
2178
|
|
|
if chains: |
2179
|
|
|
for chain in chains.split('+'): |
2180
|
|
|
cmd = f'rna_pdb_tools.py --extract-chain {chain} {pdb_id}.pdb > {pdb_id}_{chain}.pdb' |
2181
|
|
|
print(cmd) |
2182
|
|
|
exe(cmd) |
2183
|
|
|
# print('ok') |
2184
|
|
|
return npath |
2185
|
|
|
|
2186
|
|
|
|
2187
|
|
View Code Duplication |
def fetch_ba(pdb_id, path="."): |
|
|
|
|
2188
|
|
|
"""fetch biological assembly pdb file from RCSB.org |
2189
|
|
|
|
2190
|
|
|
>>> fetch_ba('1xjr') |
2191
|
|
|
... |
2192
|
|
|
""" |
2193
|
|
|
try: |
2194
|
|
|
import urllib3 |
2195
|
|
|
except ImportError: |
2196
|
|
|
print('urllib3 is required') |
2197
|
|
|
return |
2198
|
|
|
http = urllib3.PoolManager() |
2199
|
|
|
# try: |
2200
|
|
|
response = http.request('GET', url='https://files.rcsb.org/download/' + |
2201
|
|
|
pdb_id.lower() + '.pdb1') |
2202
|
|
|
if not response.status == 200: |
2203
|
|
|
raise PDBFetchError() |
2204
|
|
|
txt = response.data |
2205
|
|
|
|
2206
|
|
|
npath = path + os.sep + pdb_id + '_ba.pdb' |
2207
|
|
|
print('downloading...' + npath) |
2208
|
|
|
with open(npath, 'wb') as f: |
2209
|
|
|
f.write(txt) |
2210
|
|
|
print('ok') |
2211
|
|
|
return pdb_id + '_ba.pdb' |
2212
|
|
|
|
2213
|
|
|
|
2214
|
|
View Code Duplication |
def fetch_cif_ba(cif_id, path="."): |
|
|
|
|
2215
|
|
|
"""fetch biological assembly cif file from RCSB.org""" |
2216
|
|
|
import urrlib3 |
2217
|
|
|
http = urllib3.PoolManager() |
|
|
|
|
2218
|
|
|
# try: |
2219
|
|
|
response = http.request('GET', url='https://files.rcsb.org/download/' + |
2220
|
|
|
cif_id.lower() + '-assembly1.cif') |
2221
|
|
|
if not response.status == 200: |
2222
|
|
|
raise PDBFetchError() |
2223
|
|
|
txt = response.data |
2224
|
|
|
|
2225
|
|
|
npath = path + os.sep + cif_id + '_ba.cif' |
2226
|
|
|
print('downloading...' + npath) |
2227
|
|
|
with open(npath, 'wb') as f: |
2228
|
|
|
f.write(txt) |
2229
|
|
|
print('ok') |
2230
|
|
|
return cif_id + '_ba.cif' |
2231
|
|
|
|
2232
|
|
|
|
2233
|
|
|
def replace_chain(struc_fn, insert_fn, chain_id): |
2234
|
|
|
"""Replace chain of the main file (struc_fn) with some new chain (insert_fn) of given chain id. |
2235
|
|
|
|
2236
|
|
|
Args: |
2237
|
|
|
struc_fn (str): path to the main PDB file |
2238
|
|
|
insert_fn (str): path to the file that will be injected in into the main PDB file |
2239
|
|
|
chain_id (str): chain that will be inserted into the main PDB file |
2240
|
|
|
|
2241
|
|
|
Returns: |
2242
|
|
|
string: text in the PDB format |
2243
|
|
|
""" |
2244
|
|
|
struc = RNAStructure(struc_fn) |
2245
|
|
|
insert = RNAStructure(insert_fn) |
2246
|
|
|
|
2247
|
|
|
output = '' |
2248
|
|
|
inserted = False |
2249
|
|
|
for l in struc.lines: |
2250
|
|
|
if l.startswith('ATOM'): |
2251
|
|
|
chain = l[21] |
2252
|
|
|
if chain == chain_id: |
2253
|
|
|
if not inserted: |
2254
|
|
|
for insertl in insert.lines: |
2255
|
|
|
if not insertl.startswith('HEADER') and not insertl.startswith('END'): |
2256
|
|
|
output += insertl + '\n' |
2257
|
|
|
inserted = True |
2258
|
|
|
continue |
2259
|
|
|
# insert pdb |
2260
|
|
|
output += l + '\n' |
2261
|
|
|
return output.strip() |
2262
|
|
|
|
2263
|
|
|
|
2264
|
|
|
def replace_atoms(struc_fn, insert_fn, verbose=False): |
2265
|
|
|
"""Replace XYZ coordinate of the file (struc_fn) with XYZ from another file (insert_fn). |
2266
|
|
|
|
2267
|
|
|
This can be useful if you want to replace positions of atoms, for example, one base only. The lines are muted based on atom name, residue name, chain, residue index (marked with XXXX below).:: |
2268
|
|
|
|
2269
|
|
|
ATOM 11 N1 A 2 27 303.441 273.472 301.457 1.00 0.00 N # file |
2270
|
|
|
ATOM 1 N1 A 2 27 300.402 273.627 303.188 1.00 99.99 N # insert |
2271
|
|
|
ATOM 11 N1 A 2 27 300.402 273.627 303.188 1.00 0.00 N # inserted |
2272
|
|
|
XXXXXXXXXXXXXXXX # part used to find lines to be replaced |
2273
|
|
|
|
2274
|
|
|
ATOM 1 P A 2 27 295.653 270.783 300.135 1.00119.29 P # next line |
2275
|
|
|
|
2276
|
|
|
Args: |
2277
|
|
|
struc_fn (str): path to the main PDB file |
2278
|
|
|
insert_fn (str): path to the file that will be injected in into the main PDB file |
2279
|
|
|
|
2280
|
|
|
Returns: |
2281
|
|
|
string: text in the PDB format |
2282
|
|
|
""" |
2283
|
|
|
struc = RNAStructure(struc_fn) |
2284
|
|
|
insert = RNAStructure(insert_fn) |
2285
|
|
|
|
2286
|
|
|
output = '' |
2287
|
|
|
for l in struc.lines: |
2288
|
|
|
inserted = False |
2289
|
|
|
if l.startswith('ATOM'): |
2290
|
|
|
idx = l[13:26]#.strip() # P A 2 27 |
2291
|
|
|
for li in insert.lines: |
2292
|
|
|
idxi = li[13:26]#.strip() |
2293
|
|
|
if idx == idxi: |
2294
|
|
|
ln = l[:30] + li[30:54] + l[54:] + '\n' # only coords replace (!) |
2295
|
|
|
if verbose: |
2296
|
|
|
print(l) |
2297
|
|
|
print(li) |
2298
|
|
|
print(ln) |
2299
|
|
|
print() |
2300
|
|
|
inserted = True |
2301
|
|
|
output += ln |
2302
|
|
|
if not inserted: |
2303
|
|
|
output += l + '\n' |
2304
|
|
|
inserted = False |
2305
|
|
|
return output.strip() |
2306
|
|
|
|
2307
|
|
|
def set_chain_for_struc(struc_fn, chain_id, save_file_inplace=False, skip_ter=True): |
2308
|
|
|
"""Quick & dirty function to set open a fn PDB format, set chain_id |
2309
|
|
|
and save it to a file. Takes only lines with ATOM and TER.""" |
2310
|
|
|
txt = '' |
2311
|
|
|
for line in open(struc_fn): |
2312
|
|
|
if skip_ter: |
2313
|
|
|
if line.startswith('ATOM') or line.startswith('TER'): |
2314
|
|
|
# if TER line is only TER, not like TER atom chain etc |
2315
|
|
|
# then just keep TER there and don't add any chain to id |
2316
|
|
|
if line == 'TER': |
2317
|
|
|
pass |
2318
|
|
|
else: |
2319
|
|
|
l = line[:21] + chain_id + line[22:] |
2320
|
|
|
txt += l |
|
|
|
|
2321
|
|
|
else: |
2322
|
|
|
txt += line |
2323
|
|
|
else: |
2324
|
|
|
if line.startswith('ATOM'): |
2325
|
|
|
l = line[:21] + chain_id + line[22:] |
2326
|
|
|
txt += l |
2327
|
|
|
elif line.startswith('TER'): |
2328
|
|
|
continue |
2329
|
|
|
else: |
2330
|
|
|
txt += line |
2331
|
|
|
if save_file_inplace: |
2332
|
|
|
with open(struc_fn, 'w') as f: |
2333
|
|
|
f.write(txt) |
2334
|
|
|
return txt.strip() |
2335
|
|
|
|
2336
|
|
|
def load_rnas(path, verbose=True): |
2337
|
|
|
"""Load structural files (via glob) and return a list of RNAStructure objects. |
2338
|
|
|
|
2339
|
|
|
Examples:: |
2340
|
|
|
|
2341
|
|
|
rnas = rtl.load_rnas('../rna_tools/input/mq/*.pdb') |
2342
|
|
|
|
2343
|
|
|
""" |
2344
|
|
|
rs = [] |
2345
|
|
|
import glob |
2346
|
|
|
for f in glob.glob(path): |
2347
|
|
|
r = RNAStructure(f) |
2348
|
|
|
rs.append(r) |
2349
|
|
|
return rs |
2350
|
|
|
|
2351
|
|
|
# main |
2352
|
|
|
if '__main__' == __name__: |
2353
|
|
|
f1 = "input/t2-4-ACA.pdb" |
2354
|
|
|
f2 = "input/to-replace.pdb" |
2355
|
|
|
r1 = RNAStructure(f1) |
2356
|
|
|
r2 = RNAStructure(f2) |
2357
|
|
|
t = replace_atoms(f1, f2) |
2358
|
|
|
with open('output/replaced.pdb', 'w') as f: f.write(t) |
2359
|
|
|
|
2360
|
|
|
fn = "input/1a9l_NMR_1_2_models.pdb" |
2361
|
|
|
r = RNAStructure(fn) |
2362
|
|
|
t = r.get_res_text('A', 1) |
2363
|
|
|
with open('output/1a9l_NMR_1_2_models_R1.pdb', 'w') as f: f.write(t) |
2364
|
|
|
|
2365
|
|
|
fn = "input/remarks.pdb" |
2366
|
|
|
r = RNAStructure(fn) |
2367
|
|
|
t = r.get_remarks_text() |
2368
|
|
|
with open('output/remarks_only.pdb', 'w') as f: f.write(t) |
2369
|
|
|
|
2370
|
|
|
fn = 'input/image' |
2371
|
|
|
print('fn:', fn) |
2372
|
|
|
struc = RNAStructure(fn) |
2373
|
|
|
print(' pdb?:', struc.is_pdb()) |
2374
|
|
|
# print( atoms:', struc.get_no_lines()) |
2375
|
|
|
|
2376
|
|
|
fn = 'input/na.pdb' |
2377
|
|
|
s = RNAStructure(fn) |
2378
|
|
|
print(s.detect_molecule_type()) |
2379
|
|
|
#res = get_all_res(na) |
2380
|
|
|
# print 'what is?', what_is(res) |
2381
|
|
|
# print res |
2382
|
|
|
print('non standard:', s.check_res_if_std_na()) |
2383
|
|
|
print('is protein:', s.detect_molecule_type()) |
2384
|
|
|
|
2385
|
|
|
fn = 'input/prot.pdb' |
2386
|
|
|
s = RNAStructure(fn) |
2387
|
|
|
print('non standard:', s.check_res_if_std_prot()) |
2388
|
|
|
print('is protein:', s.detect_molecule_type()) |
2389
|
|
|
|
2390
|
|
|
fn = 'input/rna-ru.pdb' |
2391
|
|
|
s = RNAStructure(fn) |
2392
|
|
|
print('non standard:', s.check_res_if_supid_rna()) |
2393
|
|
|
print('is protein:', s.detect_molecule_type()) |
2394
|
|
|
|
2395
|
|
|
fn = 'input/na_highAtomNum.pdb' |
2396
|
|
|
print(fn) |
2397
|
|
|
s = RNAStructure(fn) |
2398
|
|
|
s.renum_atoms() |
2399
|
|
|
s.write('output/na_highAtomNum.pdb') |
2400
|
|
|
|
2401
|
|
|
fn = 'input/na_solvet_old_format.pdb' |
2402
|
|
|
print(fn) |
2403
|
|
|
s = RNAStructure(fn) |
2404
|
|
|
s.fix_op_atoms() |
2405
|
|
|
s.remove_hydrogen() |
2406
|
|
|
s.remove_ion() |
2407
|
|
|
s.remove_water() |
2408
|
|
|
s.write('output/na_solvet_old_format.pdb') |
2409
|
|
|
|
2410
|
|
|
fn = 'input/na_solvet_old_format.pdb' |
2411
|
|
|
print(fn) |
2412
|
|
|
s = RNAStructure(fn) |
2413
|
|
|
s.std_resn() |
2414
|
|
|
s.remove_hydrogen() |
2415
|
|
|
s.remove_ion() |
2416
|
|
|
s.remove_water() |
2417
|
|
|
s.write('output/na_solvet_old_format.pdb') |
2418
|
|
|
|
2419
|
|
|
#fn = 'input/na_solvet_old_format__.pdb' |
2420
|
|
|
#s = RNAStructure(fn) |
2421
|
|
|
# s.std_resn() |
2422
|
|
|
# s.remove_hydrogen() |
2423
|
|
|
# s.remove_ion() |
2424
|
|
|
# s.remove_water() |
2425
|
|
|
# s.renum_atoms() |
2426
|
|
|
# s.fix_op_atoms() |
2427
|
|
|
# s.write('output/na_solvet_old_format__.pdb') |
2428
|
|
|
|
2429
|
|
|
fn = 'input/1xjr.pdb' |
2430
|
|
|
s.std_resn() |
2431
|
|
|
s.remove_hydrogen() |
2432
|
|
|
s.remove_ion() |
2433
|
|
|
s.remove_water() |
2434
|
|
|
s.renum_atoms() |
2435
|
|
|
s.fix_op_atoms() |
2436
|
|
|
s.write('output/1xjr.pdb') |
2437
|
|
|
|
2438
|
|
|
fn = 'input/decoy0165_amb.pdb' |
2439
|
|
|
print(fn) |
2440
|
|
|
s = RNAStructure(fn) |
2441
|
|
|
s.std_resn() |
2442
|
|
|
s.remove_hydrogen() |
2443
|
|
|
s.remove_ion() |
2444
|
|
|
s.remove_water() |
2445
|
|
|
s.renum_atoms() |
2446
|
|
|
s.fix_O_in_UC() |
2447
|
|
|
s.fix_op_atoms() |
2448
|
|
|
s.write('output/decoy0165_amb_clx.pdb') |
2449
|
|
|
|
2450
|
|
|
fn = 'input/farna.pdb' |
2451
|
|
|
print(fn) |
2452
|
|
|
s = RNAStructure(fn) |
2453
|
|
|
s.std_resn() |
2454
|
|
|
s.remove_hydrogen() |
2455
|
|
|
s.remove_ion() |
2456
|
|
|
s.remove_water() |
2457
|
|
|
s.fix_op_atoms() |
2458
|
|
|
s.renum_atoms() |
2459
|
|
|
s.write('output/farna.pdb') |
2460
|
|
|
|
2461
|
|
|
fn = 'input/farna.pdb' |
2462
|
|
|
print(fn) |
2463
|
|
|
|
2464
|
|
|
r = RNAStructure(fn) |
2465
|
|
|
print(r.is_mol2()) |
2466
|
|
|
|
2467
|
|
|
if True: |
2468
|
|
|
print('================================================') |
2469
|
|
|
print ("input/1xjr_clx_fChimera_noIncludeNumbers.mol2") |
2470
|
|
|
r = RNAStructure("input/1xjr_clx_fChimera_noIncludeNumbers.mol2") |
2471
|
|
|
print(r.is_mol2()) |
2472
|
|
|
r.mol2toPDB('/tmp/x.pdb') |
2473
|
|
|
|
2474
|
|
|
r = RNAStructure('/tmp/x.pdb') |
2475
|
|
|
print(r.get_report) |
2476
|
|
|
r.std_resn() |
2477
|
|
|
r.remove_hydrogen() |
2478
|
|
|
r.remove_ion() |
2479
|
|
|
r.remove_water() |
2480
|
|
|
r.fix_op_atoms() |
2481
|
|
|
r.renum_atoms() |
2482
|
|
|
r.fixU__to__U() |
2483
|
|
|
r.write("output/1xjr_clx_fChimera_noIncludeNumbers.mol2") |
2484
|
|
|
|
2485
|
|
|
if True: |
2486
|
|
|
r = RNAStructure("input/2du3_prot_bound.mol2") |
2487
|
|
|
print(r.is_mol2()) |
2488
|
|
|
outfn = r.mol2toPDB() |
2489
|
|
|
print(r.get_report) |
2490
|
|
|
|
2491
|
|
|
print('================================================') |
2492
|
|
|
fn = "input/3e5fA-nogtp_processed_zephyr.pdb" |
2493
|
|
|
r = RNAStructure(fn) |
2494
|
|
|
print(r.is_mol2()) |
2495
|
|
|
#outfn = r.mol2toPDB() |
2496
|
|
|
print(r.is_amber_like()) |
2497
|
|
|
print(r.get_report) |
2498
|
|
|
|
2499
|
|
|
print(r.get_preview()) |
2500
|
|
|
|
2501
|
|
|
r.std_resn() |
2502
|
|
|
|
2503
|
|
|
print(r.get_preview()) |
2504
|
|
|
|
2505
|
|
|
r.remove_hydrogen() |
2506
|
|
|
r.remove_ion() |
2507
|
|
|
r.remove_water() |
2508
|
|
|
#renum_atoms(t, t) |
2509
|
|
|
#fix_O_in_UC(t, t) |
2510
|
|
|
#fix_op_atoms(t, t) |
2511
|
|
|
r.write('output/3e5fA-nogtp_processed_zephyr.pdb') |
2512
|
|
|
|
2513
|
|
|
print() |
2514
|
|
|
fn = "input/1xjr_clx_charmm.pdb" |
2515
|
|
|
print(fn) |
2516
|
|
|
s = RNAStructure(fn) |
2517
|
|
|
s.std_resn() |
2518
|
|
|
s.remove_hydrogen() |
2519
|
|
|
s.remove_ion() |
2520
|
|
|
s.remove_water() |
2521
|
|
|
s.write('output/1xjr_clx_charmm.pdb') |
2522
|
|
|
|
2523
|
|
|
#renum_atoms(t, t) |
2524
|
|
|
#fix_O_in_UC(t, t) |
2525
|
|
|
#fix_op_atoms(t, t) |
2526
|
|
|
|
2527
|
|
|
print() |
2528
|
|
|
fn = "input/dna_fconvpdb_charmm22.pdb" |
2529
|
|
|
print(fn) |
2530
|
|
|
r = RNAStructure(fn) |
2531
|
|
|
r.get_preview() |
2532
|
|
|
r.resn_as_dna() |
2533
|
|
|
r.remove_hydrogen() |
2534
|
|
|
r.remove_ion() |
2535
|
|
|
r.remove_water() |
2536
|
|
|
r.std_resn() |
2537
|
|
|
print(r.get_head()) |
2538
|
|
|
print(r.get_tail()) |
2539
|
|
|
print(r.get_preview()) |
2540
|
|
|
r.write("output/dna_fconvpdb_charmm22.pdb") |
2541
|
|
|
|
2542
|
|
|
print() |
2543
|
|
|
fn = "input/1a9l_NMR_1_2_models.pdb" |
2544
|
|
|
print(fn) |
2545
|
|
|
r = RNAStructure(fn) |
2546
|
|
|
r.write("output/1a9l_NMR_1_2_models_lib.pdb") |
2547
|
|
|
# r.get_text() # get #1 model |
2548
|
|
|
|
2549
|
|
|
import doctest |
2550
|
|
|
doctest.testmod() |
2551
|
|
|
|