|
1
|
|
|
#!/usr/bin/env python |
|
2
|
|
|
# -*- coding: utf-8 -*- |
|
3
|
|
|
""" |
|
4
|
|
|
|
|
5
|
|
|
""" |
|
6
|
|
|
from __future__ import print_function |
|
7
|
|
|
import argparse |
|
8
|
|
|
from icecream import ic |
|
9
|
|
|
import sys |
|
10
|
|
|
ic.configureOutput(outputFunction=lambda *a: print(*a, file=sys.stderr)) |
|
11
|
|
|
ic.configureOutput(prefix='> ') |
|
12
|
|
|
|
|
13
|
|
|
from rna_tools.rna_tools_lib import edit_pdb, add_header, get_version |
|
14
|
|
|
import os |
|
15
|
|
|
|
|
16
|
|
|
|
|
17
|
|
|
def convert_cif_to_pdb(cif_file, add_header_to_output=True, version='', verbose=True, |
|
18
|
|
|
split_conflicting_chains=True, output_path=None): |
|
19
|
|
|
"""Convert an mmCIF file into one or more PDB files. |
|
20
|
|
|
|
|
21
|
|
|
Returns a list of generated PDB filenames. When a direct conversion fails |
|
22
|
|
|
(e.g. multi-character chain ids) the chains are split into separate models |
|
23
|
|
|
following the legacy CLI behavior. |
|
24
|
|
|
""" |
|
25
|
|
|
from Bio.PDB import MMCIFParser, PDBIO |
|
26
|
|
|
|
|
27
|
|
|
def _prepend_header(pdb_file, remarks=None): |
|
28
|
|
|
if not add_header_to_output and not remarks: |
|
29
|
|
|
return |
|
30
|
|
|
new_content = '' |
|
31
|
|
|
if add_header_to_output: |
|
32
|
|
|
new_content += add_header(version) + '\n' |
|
33
|
|
|
if remarks: |
|
34
|
|
|
new_content += '\n'.join(remarks) + '\n' |
|
35
|
|
|
with open(pdb_file, 'r') as fh: |
|
36
|
|
|
new_content += fh.read() |
|
37
|
|
|
with open(pdb_file, 'w') as fh: |
|
38
|
|
|
fh.write(new_content) |
|
39
|
|
|
|
|
40
|
|
|
if output_path and split_conflicting_chains: |
|
41
|
|
|
raise ValueError('output_path is only supported when split_conflicting_chains is False') |
|
42
|
|
|
|
|
43
|
|
|
outputs = [] |
|
44
|
|
|
parser = MMCIFParser() |
|
45
|
|
|
structure = parser.get_structure("structure_id", cif_file) |
|
46
|
|
|
pdb_file = output_path or cif_file.replace('.cif', '_fCIF.pdb') |
|
47
|
|
|
|
|
48
|
|
|
io = PDBIO() |
|
49
|
|
|
io.set_structure(structure) |
|
50
|
|
|
try: |
|
51
|
|
|
io.save(pdb_file) |
|
52
|
|
|
_prepend_header(pdb_file) |
|
53
|
|
|
outputs.append(pdb_file) |
|
54
|
|
|
if verbose: |
|
55
|
|
|
print(f'saved: {pdb_file}') |
|
56
|
|
|
return outputs |
|
57
|
|
|
except Exception: |
|
58
|
|
|
if verbose: |
|
59
|
|
|
print('Warning: some of the chains in this mmCIF file have multi-character ids.') |
|
60
|
|
|
|
|
61
|
|
|
from Bio.PDB.MMCIFParser import MMCIFParser |
|
62
|
|
|
from Bio.PDB import Structure, Model, PDBIO |
|
63
|
|
|
|
|
64
|
|
|
def has_high_rna_content(chain, threshold=0.8): |
|
65
|
|
|
rna_nucleotides = ['A', 'C', 'G', 'U', 'X'] |
|
66
|
|
|
total_residues = 0 |
|
67
|
|
|
rna_residues = 0 |
|
68
|
|
|
for residue in chain: |
|
69
|
|
|
total_residues += 1 |
|
70
|
|
|
if residue.get_resname().strip() in rna_nucleotides: |
|
71
|
|
|
rna_residues += 1 |
|
72
|
|
|
if total_residues == 0: |
|
73
|
|
|
return False |
|
74
|
|
|
return (rna_residues / total_residues) >= threshold |
|
75
|
|
|
|
|
76
|
|
|
parser = MMCIFParser() |
|
77
|
|
|
structure = parser.get_structure("structure", cif_file) |
|
78
|
|
|
|
|
79
|
|
|
if not split_conflicting_chains: |
|
80
|
|
|
import string |
|
81
|
|
|
letters = list(string.ascii_uppercase + string.ascii_lowercase + string.digits) |
|
82
|
|
|
letter_iter = iter(letters) |
|
83
|
|
|
for model in structure: |
|
84
|
|
|
for chain in model: |
|
85
|
|
|
try: |
|
86
|
|
|
chain.id = next(letter_iter) |
|
87
|
|
|
except StopIteration: |
|
88
|
|
|
raise RuntimeError('Too many chains to assign unique single-letter IDs in %s' % cif_file) |
|
89
|
|
|
io = PDBIO() |
|
90
|
|
|
io.set_structure(structure) |
|
91
|
|
|
io.save(pdb_file) |
|
92
|
|
|
_prepend_header(pdb_file) |
|
93
|
|
|
outputs.append(pdb_file) |
|
94
|
|
|
if verbose: |
|
95
|
|
|
print(f'saved: {pdb_file} (renamed chain ids)') |
|
96
|
|
|
return outputs |
|
97
|
|
|
|
|
98
|
|
|
import string |
|
99
|
|
|
letters = list(string.ascii_uppercase) |
|
100
|
|
|
|
|
101
|
|
|
for model in structure: |
|
102
|
|
|
for chain in model: |
|
103
|
|
|
if has_high_rna_content(chain): |
|
104
|
|
|
new_structure = Structure.Structure("new_structure") |
|
105
|
|
|
new_model = Model.Model(0) |
|
106
|
|
|
new_structure.add(new_model) |
|
107
|
|
|
|
|
108
|
|
|
chain_id_new = letters.pop(0) |
|
109
|
|
|
chain_id = chain.get_id() |
|
110
|
|
|
|
|
111
|
|
|
atom_count = 0 |
|
112
|
|
|
for residue in chain: |
|
113
|
|
|
for atom in residue: |
|
114
|
|
|
atom_count += 1 |
|
115
|
|
|
|
|
116
|
|
|
remarks = [f'REMARK rna chain {chain.id} -> {chain_id_new}'] |
|
117
|
|
|
pdb_file = cif_file.replace('.cif', f'_{chain_id}_n{chain_id_new}_fCIF.pdb') |
|
118
|
|
|
if verbose: |
|
119
|
|
|
print(f'rna chain {chain.id} -> {chain_id_new} {pdb_file} # of atoms: {atom_count}') |
|
120
|
|
|
|
|
121
|
|
|
chain.id = chain_id_new |
|
122
|
|
|
new_model.add(chain) |
|
123
|
|
|
|
|
124
|
|
|
io = PDBIO() |
|
125
|
|
|
io.set_structure(new_structure) |
|
126
|
|
|
io.save(pdb_file) |
|
127
|
|
|
_prepend_header(pdb_file, remarks) |
|
128
|
|
|
outputs.append(pdb_file) |
|
129
|
|
|
|
|
130
|
|
|
return outputs |
|
131
|
|
|
|
|
132
|
|
View Code Duplication |
def get_parser(): |
|
|
|
|
|
|
133
|
|
|
parser = argparse.ArgumentParser( |
|
134
|
|
|
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) |
|
135
|
|
|
|
|
136
|
|
|
version = os.path.basename(os.path.dirname(os.path.abspath(__file__))), get_version(__file__) |
|
137
|
|
|
version = version[1].strip() |
|
138
|
|
|
parser = argparse.ArgumentParser( |
|
139
|
|
|
description=__doc__, formatter_class=argparse.RawTextHelpFormatter) |
|
140
|
|
|
|
|
141
|
|
|
parser.add_argument('--version', help='', action='version', version=version) |
|
142
|
|
|
parser.add_argument("-v", "--verbose", |
|
143
|
|
|
action="store_true", help="be verbose") |
|
144
|
|
|
parser.add_argument('--no-hr', help='do not insert the header into files', |
|
145
|
|
|
action='store_true') |
|
146
|
|
|
parser.add_argument("file", help="", default="", nargs='+') |
|
147
|
|
|
return parser, version |
|
148
|
|
|
|
|
149
|
|
|
|
|
150
|
|
|
if __name__ == '__main__': |
|
151
|
|
|
parser, version = get_parser() |
|
152
|
|
|
args = parser.parse_args() |
|
153
|
|
|
|
|
154
|
|
|
if list != type(args.file): |
|
155
|
|
|
args.file = [args.file] |
|
156
|
|
|
|
|
157
|
|
|
for cif_file in args.file: |
|
158
|
|
|
convert_cif_to_pdb( |
|
159
|
|
|
cif_file, |
|
160
|
|
|
add_header_to_output=not args.no_hr, |
|
161
|
|
|
version=version, |
|
162
|
|
|
verbose=True, |
|
163
|
|
|
) |
|
164
|
|
|
|