| Total Complexity | 45 |
| Total Lines | 354 |
| Duplicated Lines | 44.07 % |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like triplexibility2 often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | #!/usr/bin/env python |
||
| 2 | # -*- coding: utf-8 -*- |
||
| 3 | |||
| 4 | """ |
||
| 5 | v2 is full atom! |
||
| 6 | |||
| 7 | Examples:: |
||
| 8 | |||
| 9 | triplexibility.py -t 2nd_triplex_FB_ABC.pdb triples-all-v2-rpr/*.pdb --save --result abc.csv |
||
| 10 | |||
| 11 | """ |
||
| 12 | from __future__ import print_function |
||
| 13 | |||
| 14 | import Bio.PDB.PDBParser |
||
| 15 | import Bio.PDB.Superimposer |
||
| 16 | import copy |
||
| 17 | import warnings |
||
| 18 | warnings.filterwarnings('ignore', '.*Invalid or missing.*',) |
||
| 19 | warnings.filterwarnings('ignore', '.*with given element *',) |
||
| 20 | warnings.filterwarnings('ignore', '.*is discontinuous*',) |
||
| 21 | warnings.filterwarnings('ignore', '.*Ignoring unrecognized record*',) |
||
| 22 | |||
| 23 | import sys |
||
| 24 | from icecream import ic |
||
| 25 | ic.configureOutput(outputFunction=lambda *a: print(*a, file=sys.stderr)) |
||
| 26 | ic.configureOutput(prefix='> ') |
||
| 27 | |||
| 28 | from rna_tools.rna_tools_lib import set_chain_for_struc, RNAStructure |
||
| 29 | from rna_tools.rna_tools_config import RT |
||
| 30 | |||
| 31 | import argparse |
||
| 32 | import sys |
||
| 33 | import glob |
||
| 34 | import re |
||
| 35 | import os |
||
| 36 | |||
| 37 | class RNAmodel: |
||
| 38 | def __init__(self, fpath): |
||
| 39 | # parser 1-5 -> 1 2 3 4 5 |
||
| 40 | self.struc = Bio.PDB.PDBParser().get_structure('', fpath) |
||
| 41 | self.__get_atoms() |
||
| 42 | self.fpath = fpath |
||
| 43 | self.fn = os.path.basename(fpath) |
||
| 44 | # self.atoms = [] |
||
| 45 | # if save: |
||
| 46 | # self.save() # @save |
||
| 47 | |||
| 48 | def __get_atoms(self): |
||
| 49 | self.atoms = [[], [], []] # this will be only 3 resides |
||
| 50 | # ugly but should work |
||
| 51 | tseq = '' |
||
| 52 | for index, res in enumerate(self.struc.get_residues()): |
||
| 53 | self.atoms[index] = [] |
||
| 54 | for at in res: |
||
| 55 | self.atoms[index].append(at) |
||
| 56 | tseq += res.get_resname() |
||
| 57 | ## # calc rmsd # |
||
| 58 | ## if not tseq: |
||
| 59 | ## tseq = '' |
||
| 60 | ## rt = None |
||
| 61 | ## for a in self.atoms_for_rmsd: |
||
| 62 | ## r = a.get_parent() |
||
| 63 | ## if r != rt: |
||
| 64 | ## rt = r |
||
| 65 | ## tseq += r.get_resname().strip() |
||
| 66 | self.tseq = tseq |
||
| 67 | return self.atoms |
||
| 68 | |||
| 69 | def __str__(self): |
||
| 70 | return self.fn #+ ' # beads' + str(len(self.residues)) |
||
| 71 | |||
| 72 | def __repr__(self): |
||
| 73 | return self.fn #+ ' # beads' + str(len(self.residues)) |
||
| 74 | |||
| 75 | def get_report(self): |
||
| 76 | """Str a short report about rna model""" |
||
| 77 | t = ' '.join(['File: ', self.fn, ' # of atoms:', str(len(self.atoms)), '\n']) |
||
| 78 | for r,a in zip(self.residues, self.atoms ): |
||
| 79 | t += ' '.join(['resi: ', str(r) ,' atom: ', str(a) , '\n' ]) |
||
| 80 | return t |
||
| 81 | |||
| 82 | def get_rmsd_to(self, other_rnamodel, way="", triple_mode=False, save=True, tseq=''): |
||
| 83 | """Calc rmsd P-atom based rmsd to other rna model |
||
| 84 | |||
| 85 | sugar now 10 atoms ;-) """ |
||
| 86 | sup = Bio.PDB.Superimposer() |
||
| 87 | |||
| 88 | other_atoms_for_rmsd = other_rnamodel.atoms |
||
| 89 | |||
| 90 | if triple_mode: |
||
| 91 | def chunks(lst, n): |
||
| 92 | """Yield successive n-sized chunks from lst. |
||
| 93 | https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks |
||
| 94 | """ |
||
| 95 | for i in range(0, len(lst), n): |
||
| 96 | yield lst[i:i + n] |
||
| 97 | |||
| 98 | rmsd_min = 10000 # ugly |
||
| 99 | import itertools |
||
| 100 | # ok, for different residues now it's a problem |
||
| 101 | per = list(itertools.permutations([0, 1, 2])) # [(0, 1, 2), (0, 2, 1), (1, 0, 2), (1, 2, 0), (2, 0, 1), (2, 1, 0)] |
||
| 102 | #lst = list(chunks(other_atoms_for_rmsd, |
||
| 103 | # int(len(other_atoms_for_rmsd)/3))) # for 1 atom, this will be 1 x 3 [3 residues] |
||
| 104 | # # so so len is 3 atoms so / by 3 to get how many atoms per residue |
||
| 105 | |||
| 106 | self.atoms_for_rmsd = [] |
||
| 107 | for i in self.atoms: |
||
| 108 | self.atoms_for_rmsd.extend(i) |
||
| 109 | |||
| 110 | sup_min = None |
||
| 111 | seq_min = 'not yet obtained, rmsd rejected!' |
||
| 112 | p_min = None |
||
| 113 | |||
| 114 | rms = -1 |
||
| 115 | |||
| 116 | for p in per: # for each combo make a list of atoms |
||
| 117 | |||
| 118 | patoms = [] |
||
| 119 | for i in p: # p=(1, 2, 3) |
||
| 120 | #patoms.extend(lst[i]) |
||
| 121 | # list is not needed, just other_atoms !!!! it's a list of [1, 2, 3] residues |
||
| 122 | patoms.extend(other_atoms_for_rmsd[i]) |
||
| 123 | |||
| 124 | #print(self.atoms_for_rmsd) |
||
| 125 | ## print('patoms') |
||
| 126 | ## for a in patoms: |
||
| 127 | ## print(a, a.get_parent().get_id()) |
||
| 128 | ## print('self.atoms_for_rmsd') |
||
| 129 | ## for a in self.atoms_for_rmsd: |
||
| 130 | ## print(a, a.get_parent().get_id()) |
||
| 131 | #sup.set_atoms(patoms, self.atoms_for_rmsd) |
||
| 132 | |||
| 133 | rt = None |
||
| 134 | seq = '' |
||
| 135 | for a in patoms: |
||
| 136 | r = a.get_parent() |
||
| 137 | if r != rt: |
||
| 138 | rt = r |
||
| 139 | seq += r.get_resname().strip() |
||
| 140 | |||
| 141 | ic(self.tseq.lower(), seq.lower(), other_rnamodel.fpath) |
||
| 142 | # dont' even calc rmsd if the curr seq and tseq are not the same |
||
| 143 | if self.tseq.lower() == seq.lower(): # only if seq is the same |
||
| 144 | |||
| 145 | sup.set_atoms(self.atoms_for_rmsd, patoms) |
||
| 146 | rms = round(sup.rms, 2) |
||
| 147 | if rms < rmsd_min: |
||
| 148 | rmsd_min = rms |
||
| 149 | sup_min = copy.copy(sup) |
||
| 150 | suffix = seq |
||
| 151 | p_min = p |
||
| 152 | seq_min = seq |
||
| 153 | if args.debug: 'set new rmsd_min', rmsd_min |
||
|
|
|||
| 154 | |||
| 155 | if args.debug: |
||
| 156 | print(p, '', [i + 1 for i in p], end=' ') |
||
| 157 | print(seq, 'seq_min: ' + seq_min, rms) |
||
| 158 | |||
| 159 | View Code Duplication | if p_min: # found target sequence |
|
| 160 | # what is this? ;-) |
||
| 161 | index = [0 ,0 ,0] |
||
| 162 | index[0] = p_min.index(0) |
||
| 163 | index[1] = p_min.index(1) |
||
| 164 | index[2] = p_min.index(2) |
||
| 165 | |||
| 166 | # ugly re-set 123 to crazy id ! + 100, so this will |
||
| 167 | # fill up 1 2 3 for the second for |
||
| 168 | rs = other_rnamodel.struc[0].get_residues() |
||
| 169 | for i, r in enumerate(rs): |
||
| 170 | r.id = (' ', index[i] + 253, ' ') # ugly, some random offset |
||
| 171 | |||
| 172 | for i, r in enumerate(other_rnamodel.struc[0].get_residues()): |
||
| 173 | r.id = (' ', index[i] + 1, ' ') |
||
| 174 | if args.debug: print('r', r) |
||
| 175 | |||
| 176 | io = Bio.PDB.PDBIO() |
||
| 177 | sup_min.apply(other_rnamodel.struc.get_atoms()) |
||
| 178 | # if args.debug: print(p_min, [i + 1 for i in p_min]) |
||
| 179 | io.set_structure(other_rnamodel.struc) |
||
| 180 | |||
| 181 | args.save_here = True |
||
| 182 | if args.save_here and save: |
||
| 183 | folder = os.path.basename(self.fpath.replace('.pdb', '_' + args.folder_prefix + '_aligned')) |
||
| 184 | # print(f) |
||
| 185 | try: |
||
| 186 | os.mkdir(folder) |
||
| 187 | except: |
||
| 188 | pass |
||
| 189 | fout = folder + os.sep + "{:1.2f}".format(rmsd_min) + '-' + os.path.basename(other_rnamodel.fpath)#.replace('.pdb', '-' + str(rms) + '.pdb')) |
||
| 190 | #_s' + suffix + '.pdb')) |
||
| 191 | else: |
||
| 192 | fout = other_rnamodel.fpath.replace('.pdb', '_aligned.pdb')#_s' + suffix + '.pdb') |
||
| 193 | |||
| 194 | if args.debug: print(fout) |
||
| 195 | |||
| 196 | if save: |
||
| 197 | io.save(fout) |
||
| 198 | # ugly set chain to A |
||
| 199 | set_chain_for_struc(fout, 'A', save_file_inplace=True) |
||
| 200 | # and now run this to sort into 1 2 3 |
||
| 201 | |||
| 202 | r = RNAStructure(fout) |
||
| 203 | remarks = r.get_remarks_text() |
||
| 204 | r1 = r.get_res_text('A', 1) |
||
| 205 | r2 = r.get_res_text('A', 2) |
||
| 206 | r3 = r.get_res_text('A', 3) |
||
| 207 | with open(fout, 'w') as f: |
||
| 208 | f.write(remarks) |
||
| 209 | f.write(r1) |
||
| 210 | f.write(r2) |
||
| 211 | f.write(r3) |
||
| 212 | r.reload() |
||
| 213 | r.get_rnapuzzle_ready() |
||
| 214 | if rmsd_min < 1: # !!!!!!!!!!!! ugly |
||
| 215 | r.write() |
||
| 216 | return str(rmsd_min)# + ',s' + seq_min + ',' + os.path.basename(fout) |
||
| 217 | else: |
||
| 218 | # check if number of the same atoms # |
||
| 219 | # if not the same then return -1 # |
||
| 220 | # print(len(self.atoms_for_rmsd), len(other_atoms_for_rmsd)) |
||
| 221 | if len(self.atoms_for_rmsd) != len(other_atoms_for_rmsd): |
||
| 222 | return -1 |
||
| 223 | sup.set_atoms(self.atoms_for_rmsd, other_atoms_for_rmsd) |
||
| 224 | rms = round(sup.rms, 2) |
||
| 225 | |||
| 226 | if save: |
||
| 227 | ## io = Bio.PDB.PDBIO() |
||
| 228 | ## sup.apply(self.struc.get_atoms()) |
||
| 229 | ## io.set_structure(self.struc) |
||
| 230 | ## io.save("aligned.pdb") |
||
| 231 | |||
| 232 | io = Bio.PDB.PDBIO() |
||
| 233 | sup.apply(other_rnamodel.struc.get_atoms()) |
||
| 234 | io.set_structure(other_rnamodel.struc) |
||
| 235 | |||
| 236 | args.save_here = True |
||
| 237 | View Code Duplication | if args.save_here and save: |
|
| 238 | f = os.path.basename(self.fpath.replace('.pdb', '_aligned')) |
||
| 239 | # print(f) |
||
| 240 | try: |
||
| 241 | os.mkdir(f) |
||
| 242 | except: |
||
| 243 | pass |
||
| 244 | # fout = f + os.sep + os.path.basename(other_rnamodel.fpath.replace('.pdb', '_aligned_s' + suffix + '.pdb')) |
||
| 245 | fout = f + os.sep + os.path.basename(other_rnamodel.fpath)#.replace('.pdb', '_aligned')) |
||
| 246 | else: |
||
| 247 | fout = other_rnamodel.fpath.replace('.pdb', '_aligned.pdb') |
||
| 248 | if save: |
||
| 249 | io.save(fout) |
||
| 250 | return rms |
||
| 251 | |||
| 252 | |||
| 253 | View Code Duplication | def get_rna_models_from_dir(directory): |
|
| 254 | models = [] |
||
| 255 | if not os.path.exists(directory): |
||
| 256 | raise Exception('Dir does not exist! ', directory) |
||
| 257 | files = glob.glob(directory + "/*.pdb") |
||
| 258 | files_sorted = sort_nicely(files) |
||
| 259 | for f in files_sorted: |
||
| 260 | #print f |
||
| 261 | models.append(RNAmodel(f)) |
||
| 262 | return models |
||
| 263 | |||
| 264 | def sort_nicely( l ): |
||
| 265 | """ Sort the given list in the way that humans expect. |
||
| 266 | |||
| 267 | http://blog.codinghorror.com/sorting-for-humans-natural-sort-order/ |
||
| 268 | """ |
||
| 269 | convert = lambda text: int(text) if text.isdigit() else text |
||
| 270 | alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] |
||
| 271 | l.sort( key=alphanum_key ) |
||
| 272 | return l |
||
| 273 | |||
| 274 | |||
| 275 | View Code Duplication | def get_parser(): |
|
| 276 | parser = argparse.ArgumentParser( |
||
| 277 | description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) |
||
| 278 | |||
| 279 | parser.add_argument('-t',"--target", |
||
| 280 | help="target file") |
||
| 281 | parser.add_argument('--result', #default='out.rmsd', |
||
| 282 | help="result file") |
||
| 283 | parser.add_argument('--ignore-files', default='aligned', help="use to ignore files, .e.g. with 'aligned'") |
||
| 284 | parser.add_argument('--suffix', default='aligned', help="used with --saved, by default: aligned") |
||
| 285 | parser.add_argument('--way', help="e.g., backbone+sugar, c1, c1+Nx") |
||
| 286 | parser.add_argument('--triple-mode', help="same crazy strategy to align triples", default=True, action="store_true") |
||
| 287 | parser.add_argument('--column-name', help="name column for rmsd, by default 'rmsd', but can also be a name of the target file", default="rmsd") |
||
| 288 | parser.add_argument("-s", "--save", action="store_true", help="set suffix with --suffix, by default: aligned") |
||
| 289 | parser.add_argument("--folder-prefix", default = '', help="folder name, t2-3-UAU_NAME_aligned") |
||
| 290 | |||
| 291 | parser.add_argument("--debug", action="store_true") |
||
| 292 | parser.add_argument("--show", action="store_true", help="show the results in the terminal") |
||
| 293 | parser.add_argument("--sort", action="store_true", help='sort results based on rmsd (ascending), --result must be set up') |
||
| 294 | parser.add_argument("--tseq", help='target sequence, e.g. acu, find only triples of this sequence [use the order for the seq taken from the input PDB file, literally order of residues in a pdb file]') |
||
| 295 | parser.add_argument('--files', help='files', nargs='+', default=RT + '/rna_tools/tools/triplexibility/db/triples-all-v2-rpr/*.pdb') |
||
| 296 | return parser |
||
| 297 | |||
| 298 | # main |
||
| 299 | View Code Duplication | if __name__ == '__main__': |
|
| 300 | parser = get_parser() |
||
| 301 | args = parser.parse_args() |
||
| 302 | if not args.debug: |
||
| 303 | ic.disable() |
||
| 304 | |||
| 305 | target = RNAmodel(args.target) |
||
| 306 | |||
| 307 | # a trick to get files be default if there is only a path (so a string ;-)) |
||
| 308 | if not isinstance(args.files, str): |
||
| 309 | models = args.files |
||
| 310 | else: |
||
| 311 | import glob |
||
| 312 | models = glob.glob(args.files) # opts.input_dir |
||
| 313 | |||
| 314 | ## tmp = [] |
||
| 315 | ## if args.ignore_files: |
||
| 316 | ## for f in args.files: |
||
| 317 | ## if args.debug: print(f) |
||
| 318 | ## if args.ignore_files in f: |
||
| 319 | ## continue |
||
| 320 | ## tmp.append(f) |
||
| 321 | ## models = tmp |
||
| 322 | |||
| 323 | print('# of models:', len(models)) |
||
| 324 | |||
| 325 | c = 1 |
||
| 326 | t = 'fn,' + args.column_name + '\n' # ,aligned_seq, aligned_fn\n' |
||
| 327 | for m in models: |
||
| 328 | mrna = RNAmodel(m) |
||
| 329 | #print r1.fn, r2.fn, r1.get_rmsd_to(r2)#, 'tmp.pdb') |
||
| 330 | # print(m) |
||
| 331 | rmsd = target.get_rmsd_to(mrna, args.way, args.triple_mode, args.save, args.tseq) #, 'tmp.pdb') |
||
| 332 | #except: |
||
| 333 | if 0: |
||
| 334 | print(m) |
||
| 335 | sys.exit(1) |
||
| 336 | #print rmsd |
||
| 337 | t += mrna.fn + ',' + str(rmsd) + '\n' |
||
| 338 | #break |
||
| 339 | |||
| 340 | if args.show: |
||
| 341 | print(t.strip()) # all the data |
||
| 342 | |||
| 343 | if args.result: |
||
| 344 | with open(args.result, 'w') as f: |
||
| 345 | f.write(t) |
||
| 346 | |||
| 347 | if args.sort: |
||
| 348 | import pandas as pd |
||
| 349 | df = pd.read_csv(args.result) |
||
| 350 | df = df[df.rmsd != -1] |
||
| 351 | df = df.sort_values('rmsd') |
||
| 352 | df.to_csv(args.result, index=False) |
||
| 353 | print('saved: %s' % args.result) |
||
| 354 | # print(df.to_string(index=False)) |
||
| 355 |