|
1
|
|
|
#!/usr/bin/env python |
|
2
|
|
|
# -*- coding: utf-8 -*- |
|
3
|
|
|
|
|
4
|
|
|
""" |
|
5
|
|
|
v2 is full atom! |
|
6
|
|
|
|
|
7
|
|
|
Examples:: |
|
8
|
|
|
|
|
9
|
|
|
triplexibility.py -t 2nd_triplex_FB_ABC.pdb triples-all-v2-rpr/*.pdb --save --result abc.csv |
|
10
|
|
|
|
|
11
|
|
|
""" |
|
12
|
|
|
from __future__ import print_function |
|
13
|
|
|
|
|
14
|
|
|
import Bio.PDB.PDBParser |
|
15
|
|
|
import Bio.PDB.Superimposer |
|
16
|
|
|
import copy |
|
17
|
|
|
import warnings |
|
18
|
|
|
warnings.filterwarnings('ignore', '.*Invalid or missing.*',) |
|
19
|
|
|
warnings.filterwarnings('ignore', '.*with given element *',) |
|
20
|
|
|
warnings.filterwarnings('ignore', '.*is discontinuous*',) |
|
21
|
|
|
warnings.filterwarnings('ignore', '.*Ignoring unrecognized record*',) |
|
22
|
|
|
|
|
23
|
|
|
import sys |
|
24
|
|
|
from icecream import ic |
|
25
|
|
|
ic.configureOutput(outputFunction=lambda *a: print(*a, file=sys.stderr)) |
|
26
|
|
|
ic.configureOutput(prefix='> ') |
|
27
|
|
|
|
|
28
|
|
|
from rna_tools.rna_tools_lib import set_chain_for_struc, RNAStructure |
|
29
|
|
|
from rna_tools.rna_tools_config import RT |
|
30
|
|
|
|
|
31
|
|
|
import argparse |
|
32
|
|
|
import sys |
|
33
|
|
|
import glob |
|
34
|
|
|
import re |
|
35
|
|
|
import os |
|
36
|
|
|
|
|
37
|
|
|
class RNAmodel: |
|
38
|
|
|
def __init__(self, fpath): |
|
39
|
|
|
# parser 1-5 -> 1 2 3 4 5 |
|
40
|
|
|
self.struc = Bio.PDB.PDBParser().get_structure('', fpath) |
|
41
|
|
|
self.__get_atoms() |
|
42
|
|
|
self.fpath = fpath |
|
43
|
|
|
self.fn = os.path.basename(fpath) |
|
44
|
|
|
# self.atoms = [] |
|
45
|
|
|
# if save: |
|
46
|
|
|
# self.save() # @save |
|
47
|
|
|
|
|
48
|
|
|
def __get_atoms(self): |
|
49
|
|
|
self.atoms = [[], [], []] # this will be only 3 resides |
|
50
|
|
|
# ugly but should work |
|
51
|
|
|
tseq = '' |
|
52
|
|
|
for index, res in enumerate(self.struc.get_residues()): |
|
53
|
|
|
self.atoms[index] = [] |
|
54
|
|
|
for at in res: |
|
55
|
|
|
self.atoms[index].append(at) |
|
56
|
|
|
tseq += res.get_resname() |
|
57
|
|
|
## # calc rmsd # |
|
58
|
|
|
## if not tseq: |
|
59
|
|
|
## tseq = '' |
|
60
|
|
|
## rt = None |
|
61
|
|
|
## for a in self.atoms_for_rmsd: |
|
62
|
|
|
## r = a.get_parent() |
|
63
|
|
|
## if r != rt: |
|
64
|
|
|
## rt = r |
|
65
|
|
|
## tseq += r.get_resname().strip() |
|
66
|
|
|
self.tseq = tseq |
|
67
|
|
|
return self.atoms |
|
68
|
|
|
|
|
69
|
|
|
def __str__(self): |
|
70
|
|
|
return self.fn #+ ' # beads' + str(len(self.residues)) |
|
71
|
|
|
|
|
72
|
|
|
def __repr__(self): |
|
73
|
|
|
return self.fn #+ ' # beads' + str(len(self.residues)) |
|
74
|
|
|
|
|
75
|
|
|
def get_report(self): |
|
76
|
|
|
"""Str a short report about rna model""" |
|
77
|
|
|
t = ' '.join(['File: ', self.fn, ' # of atoms:', str(len(self.atoms)), '\n']) |
|
78
|
|
|
for r,a in zip(self.residues, self.atoms ): |
|
79
|
|
|
t += ' '.join(['resi: ', str(r) ,' atom: ', str(a) , '\n' ]) |
|
80
|
|
|
return t |
|
81
|
|
|
|
|
82
|
|
|
def get_rmsd_to(self, other_rnamodel, way="", triple_mode=False, save=True, tseq=''): |
|
83
|
|
|
"""Calc rmsd P-atom based rmsd to other rna model |
|
84
|
|
|
|
|
85
|
|
|
sugar now 10 atoms ;-) """ |
|
86
|
|
|
sup = Bio.PDB.Superimposer() |
|
87
|
|
|
|
|
88
|
|
|
other_atoms_for_rmsd = other_rnamodel.atoms |
|
89
|
|
|
|
|
90
|
|
|
if triple_mode: |
|
91
|
|
|
def chunks(lst, n): |
|
92
|
|
|
"""Yield successive n-sized chunks from lst. |
|
93
|
|
|
https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks |
|
94
|
|
|
""" |
|
95
|
|
|
for i in range(0, len(lst), n): |
|
96
|
|
|
yield lst[i:i + n] |
|
97
|
|
|
|
|
98
|
|
|
rmsd_min = 10000 # ugly |
|
99
|
|
|
import itertools |
|
100
|
|
|
# ok, for different residues now it's a problem |
|
101
|
|
|
per = list(itertools.permutations([0, 1, 2])) # [(0, 1, 2), (0, 2, 1), (1, 0, 2), (1, 2, 0), (2, 0, 1), (2, 1, 0)] |
|
102
|
|
|
#lst = list(chunks(other_atoms_for_rmsd, |
|
103
|
|
|
# int(len(other_atoms_for_rmsd)/3))) # for 1 atom, this will be 1 x 3 [3 residues] |
|
104
|
|
|
# # so so len is 3 atoms so / by 3 to get how many atoms per residue |
|
105
|
|
|
|
|
106
|
|
|
self.atoms_for_rmsd = [] |
|
107
|
|
|
for i in self.atoms: |
|
108
|
|
|
self.atoms_for_rmsd.extend(i) |
|
109
|
|
|
|
|
110
|
|
|
sup_min = None |
|
111
|
|
|
seq_min = 'not yet obtained, rmsd rejected!' |
|
112
|
|
|
p_min = None |
|
113
|
|
|
|
|
114
|
|
|
rms = -1 |
|
115
|
|
|
|
|
116
|
|
|
for p in per: # for each combo make a list of atoms |
|
117
|
|
|
|
|
118
|
|
|
patoms = [] |
|
119
|
|
|
for i in p: # p=(1, 2, 3) |
|
120
|
|
|
#patoms.extend(lst[i]) |
|
121
|
|
|
# list is not needed, just other_atoms !!!! it's a list of [1, 2, 3] residues |
|
122
|
|
|
patoms.extend(other_atoms_for_rmsd[i]) |
|
123
|
|
|
|
|
124
|
|
|
#print(self.atoms_for_rmsd) |
|
125
|
|
|
## print('patoms') |
|
126
|
|
|
## for a in patoms: |
|
127
|
|
|
## print(a, a.get_parent().get_id()) |
|
128
|
|
|
## print('self.atoms_for_rmsd') |
|
129
|
|
|
## for a in self.atoms_for_rmsd: |
|
130
|
|
|
## print(a, a.get_parent().get_id()) |
|
131
|
|
|
#sup.set_atoms(patoms, self.atoms_for_rmsd) |
|
132
|
|
|
|
|
133
|
|
|
rt = None |
|
134
|
|
|
seq = '' |
|
135
|
|
|
for a in patoms: |
|
136
|
|
|
r = a.get_parent() |
|
137
|
|
|
if r != rt: |
|
138
|
|
|
rt = r |
|
139
|
|
|
seq += r.get_resname().strip() |
|
140
|
|
|
|
|
141
|
|
|
ic(self.tseq.lower(), seq.lower(), other_rnamodel.fpath) |
|
142
|
|
|
# dont' even calc rmsd if the curr seq and tseq are not the same |
|
143
|
|
|
if self.tseq.lower() == seq.lower(): # only if seq is the same |
|
144
|
|
|
|
|
145
|
|
|
sup.set_atoms(self.atoms_for_rmsd, patoms) |
|
146
|
|
|
rms = round(sup.rms, 2) |
|
147
|
|
|
if rms < rmsd_min: |
|
148
|
|
|
rmsd_min = rms |
|
149
|
|
|
sup_min = copy.copy(sup) |
|
150
|
|
|
suffix = seq |
|
151
|
|
|
p_min = p |
|
152
|
|
|
seq_min = seq |
|
153
|
|
|
if args.debug: 'set new rmsd_min', rmsd_min |
|
|
|
|
|
|
154
|
|
|
|
|
155
|
|
|
if args.debug: |
|
156
|
|
|
print(p, '', [i + 1 for i in p], end=' ') |
|
157
|
|
|
print(seq, 'seq_min: ' + seq_min, rms) |
|
158
|
|
|
|
|
159
|
|
View Code Duplication |
if p_min: # found target sequence |
|
|
|
|
|
|
160
|
|
|
# what is this? ;-) |
|
161
|
|
|
index = [0 ,0 ,0] |
|
162
|
|
|
index[0] = p_min.index(0) |
|
163
|
|
|
index[1] = p_min.index(1) |
|
164
|
|
|
index[2] = p_min.index(2) |
|
165
|
|
|
|
|
166
|
|
|
# ugly re-set 123 to crazy id ! + 100, so this will |
|
167
|
|
|
# fill up 1 2 3 for the second for |
|
168
|
|
|
rs = other_rnamodel.struc[0].get_residues() |
|
169
|
|
|
for i, r in enumerate(rs): |
|
170
|
|
|
r.id = (' ', index[i] + 253, ' ') # ugly, some random offset |
|
171
|
|
|
|
|
172
|
|
|
for i, r in enumerate(other_rnamodel.struc[0].get_residues()): |
|
173
|
|
|
r.id = (' ', index[i] + 1, ' ') |
|
174
|
|
|
if args.debug: print('r', r) |
|
175
|
|
|
|
|
176
|
|
|
io = Bio.PDB.PDBIO() |
|
177
|
|
|
sup_min.apply(other_rnamodel.struc.get_atoms()) |
|
178
|
|
|
# if args.debug: print(p_min, [i + 1 for i in p_min]) |
|
179
|
|
|
io.set_structure(other_rnamodel.struc) |
|
180
|
|
|
|
|
181
|
|
|
args.save_here = True |
|
182
|
|
|
if args.save_here and save: |
|
183
|
|
|
folder = os.path.basename(self.fpath.replace('.pdb', '_' + args.folder_prefix + '_aligned')) |
|
184
|
|
|
# print(f) |
|
185
|
|
|
try: |
|
186
|
|
|
os.mkdir(folder) |
|
187
|
|
|
except: |
|
188
|
|
|
pass |
|
189
|
|
|
fout = folder + os.sep + "{:1.2f}".format(rmsd_min) + '-' + os.path.basename(other_rnamodel.fpath)#.replace('.pdb', '-' + str(rms) + '.pdb')) |
|
190
|
|
|
#_s' + suffix + '.pdb')) |
|
191
|
|
|
else: |
|
192
|
|
|
fout = other_rnamodel.fpath.replace('.pdb', '_aligned.pdb')#_s' + suffix + '.pdb') |
|
193
|
|
|
|
|
194
|
|
|
if args.debug: print(fout) |
|
195
|
|
|
|
|
196
|
|
|
if save: |
|
197
|
|
|
io.save(fout) |
|
198
|
|
|
# ugly set chain to A |
|
199
|
|
|
set_chain_for_struc(fout, 'A', save_file_inplace=True) |
|
200
|
|
|
# and now run this to sort into 1 2 3 |
|
201
|
|
|
|
|
202
|
|
|
r = RNAStructure(fout) |
|
203
|
|
|
remarks = r.get_remarks_text() |
|
204
|
|
|
r1 = r.get_res_text('A', 1) |
|
205
|
|
|
r2 = r.get_res_text('A', 2) |
|
206
|
|
|
r3 = r.get_res_text('A', 3) |
|
207
|
|
|
with open(fout, 'w') as f: |
|
208
|
|
|
f.write(remarks) |
|
209
|
|
|
f.write(r1) |
|
210
|
|
|
f.write(r2) |
|
211
|
|
|
f.write(r3) |
|
212
|
|
|
r.reload() |
|
213
|
|
|
r.get_rnapuzzle_ready() |
|
214
|
|
|
if rmsd_min < 1: # !!!!!!!!!!!! ugly |
|
215
|
|
|
r.write() |
|
216
|
|
|
return str(rmsd_min)# + ',s' + seq_min + ',' + os.path.basename(fout) |
|
217
|
|
|
else: |
|
218
|
|
|
# check if number of the same atoms # |
|
219
|
|
|
# if not the same then return -1 # |
|
220
|
|
|
# print(len(self.atoms_for_rmsd), len(other_atoms_for_rmsd)) |
|
221
|
|
|
if len(self.atoms_for_rmsd) != len(other_atoms_for_rmsd): |
|
222
|
|
|
return -1 |
|
223
|
|
|
sup.set_atoms(self.atoms_for_rmsd, other_atoms_for_rmsd) |
|
224
|
|
|
rms = round(sup.rms, 2) |
|
225
|
|
|
|
|
226
|
|
|
if save: |
|
227
|
|
|
## io = Bio.PDB.PDBIO() |
|
228
|
|
|
## sup.apply(self.struc.get_atoms()) |
|
229
|
|
|
## io.set_structure(self.struc) |
|
230
|
|
|
## io.save("aligned.pdb") |
|
231
|
|
|
|
|
232
|
|
|
io = Bio.PDB.PDBIO() |
|
233
|
|
|
sup.apply(other_rnamodel.struc.get_atoms()) |
|
234
|
|
|
io.set_structure(other_rnamodel.struc) |
|
235
|
|
|
|
|
236
|
|
|
args.save_here = True |
|
237
|
|
View Code Duplication |
if args.save_here and save: |
|
|
|
|
|
|
238
|
|
|
f = os.path.basename(self.fpath.replace('.pdb', '_aligned')) |
|
239
|
|
|
# print(f) |
|
240
|
|
|
try: |
|
241
|
|
|
os.mkdir(f) |
|
242
|
|
|
except: |
|
243
|
|
|
pass |
|
244
|
|
|
# fout = f + os.sep + os.path.basename(other_rnamodel.fpath.replace('.pdb', '_aligned_s' + suffix + '.pdb')) |
|
245
|
|
|
fout = f + os.sep + os.path.basename(other_rnamodel.fpath)#.replace('.pdb', '_aligned')) |
|
246
|
|
|
else: |
|
247
|
|
|
fout = other_rnamodel.fpath.replace('.pdb', '_aligned.pdb') |
|
248
|
|
|
if save: |
|
249
|
|
|
io.save(fout) |
|
|
|
|
|
|
250
|
|
|
return rms |
|
251
|
|
|
|
|
252
|
|
|
|
|
253
|
|
View Code Duplication |
def get_rna_models_from_dir(directory): |
|
|
|
|
|
|
254
|
|
|
models = [] |
|
255
|
|
|
if not os.path.exists(directory): |
|
256
|
|
|
raise Exception('Dir does not exist! ', directory) |
|
257
|
|
|
files = glob.glob(directory + "/*.pdb") |
|
258
|
|
|
files_sorted = sort_nicely(files) |
|
259
|
|
|
for f in files_sorted: |
|
260
|
|
|
#print f |
|
261
|
|
|
models.append(RNAmodel(f)) |
|
262
|
|
|
return models |
|
263
|
|
|
|
|
264
|
|
|
def sort_nicely( l ): |
|
265
|
|
|
""" Sort the given list in the way that humans expect. |
|
266
|
|
|
|
|
267
|
|
|
http://blog.codinghorror.com/sorting-for-humans-natural-sort-order/ |
|
268
|
|
|
""" |
|
269
|
|
|
convert = lambda text: int(text) if text.isdigit() else text |
|
270
|
|
|
alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] |
|
271
|
|
|
l.sort( key=alphanum_key ) |
|
272
|
|
|
return l |
|
273
|
|
|
|
|
274
|
|
|
|
|
275
|
|
View Code Duplication |
def get_parser(): |
|
|
|
|
|
|
276
|
|
|
parser = argparse.ArgumentParser( |
|
277
|
|
|
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) |
|
278
|
|
|
|
|
279
|
|
|
parser.add_argument('-t',"--target", |
|
280
|
|
|
help="target file") |
|
281
|
|
|
parser.add_argument('--result', #default='out.rmsd', |
|
282
|
|
|
help="result file") |
|
283
|
|
|
parser.add_argument('--ignore-files', default='aligned', help="use to ignore files, .e.g. with 'aligned'") |
|
284
|
|
|
parser.add_argument('--suffix', default='aligned', help="used with --saved, by default: aligned") |
|
285
|
|
|
parser.add_argument('--way', help="e.g., backbone+sugar, c1, c1+Nx") |
|
286
|
|
|
parser.add_argument('--triple-mode', help="same crazy strategy to align triples", default=True, action="store_true") |
|
287
|
|
|
parser.add_argument('--column-name', help="name column for rmsd, by default 'rmsd', but can also be a name of the target file", default="rmsd") |
|
288
|
|
|
parser.add_argument("-s", "--save", action="store_true", help="set suffix with --suffix, by default: aligned") |
|
289
|
|
|
parser.add_argument("--folder-prefix", default = '', help="folder name, t2-3-UAU_NAME_aligned") |
|
290
|
|
|
|
|
291
|
|
|
parser.add_argument("--debug", action="store_true") |
|
292
|
|
|
parser.add_argument("--show", action="store_true", help="show the results in the terminal") |
|
293
|
|
|
parser.add_argument("--sort", action="store_true", help='sort results based on rmsd (ascending), --result must be set up') |
|
294
|
|
|
parser.add_argument("--tseq", help='target sequence, e.g. acu, find only triples of this sequence [use the order for the seq taken from the input PDB file, literally order of residues in a pdb file]') |
|
295
|
|
|
parser.add_argument('--files', help='files', nargs='+', default=RT + '/rna_tools/tools/triplexibility/db/triples-all-v2-rpr/*.pdb') |
|
296
|
|
|
return parser |
|
297
|
|
|
|
|
298
|
|
|
# main |
|
299
|
|
View Code Duplication |
if __name__ == '__main__': |
|
|
|
|
|
|
300
|
|
|
parser = get_parser() |
|
301
|
|
|
args = parser.parse_args() |
|
302
|
|
|
if not args.debug: |
|
303
|
|
|
ic.disable() |
|
304
|
|
|
|
|
305
|
|
|
target = RNAmodel(args.target) |
|
306
|
|
|
|
|
307
|
|
|
# a trick to get files be default if there is only a path (so a string ;-)) |
|
308
|
|
|
if not isinstance(args.files, str): |
|
309
|
|
|
models = args.files |
|
310
|
|
|
else: |
|
311
|
|
|
import glob |
|
312
|
|
|
models = glob.glob(args.files) # opts.input_dir |
|
313
|
|
|
|
|
314
|
|
|
## tmp = [] |
|
315
|
|
|
## if args.ignore_files: |
|
316
|
|
|
## for f in args.files: |
|
317
|
|
|
## if args.debug: print(f) |
|
318
|
|
|
## if args.ignore_files in f: |
|
319
|
|
|
## continue |
|
320
|
|
|
## tmp.append(f) |
|
321
|
|
|
## models = tmp |
|
322
|
|
|
|
|
323
|
|
|
print('# of models:', len(models)) |
|
324
|
|
|
|
|
325
|
|
|
c = 1 |
|
326
|
|
|
t = 'fn,' + args.column_name + '\n' # ,aligned_seq, aligned_fn\n' |
|
327
|
|
|
for m in models: |
|
328
|
|
|
mrna = RNAmodel(m) |
|
329
|
|
|
#print r1.fn, r2.fn, r1.get_rmsd_to(r2)#, 'tmp.pdb') |
|
330
|
|
|
# print(m) |
|
331
|
|
|
rmsd = target.get_rmsd_to(mrna, args.way, args.triple_mode, args.save, args.tseq) #, 'tmp.pdb') |
|
332
|
|
|
#except: |
|
333
|
|
|
if 0: |
|
334
|
|
|
print(m) |
|
335
|
|
|
sys.exit(1) |
|
336
|
|
|
#print rmsd |
|
337
|
|
|
t += mrna.fn + ',' + str(rmsd) + '\n' |
|
338
|
|
|
#break |
|
339
|
|
|
|
|
340
|
|
|
if args.show: |
|
341
|
|
|
print(t.strip()) # all the data |
|
342
|
|
|
|
|
343
|
|
|
if args.result: |
|
344
|
|
|
with open(args.result, 'w') as f: |
|
345
|
|
|
f.write(t) |
|
346
|
|
|
|
|
347
|
|
|
if args.sort: |
|
348
|
|
|
import pandas as pd |
|
349
|
|
|
df = pd.read_csv(args.result) |
|
350
|
|
|
df = df[df.rmsd != -1] |
|
351
|
|
|
df = df.sort_values('rmsd') |
|
352
|
|
|
df.to_csv(args.result, index=False) |
|
353
|
|
|
print('saved: %s' % args.result) |
|
354
|
|
|
# print(df.to_string(index=False)) |
|
355
|
|
|
|