1
|
|
|
#!/usr/bin/env python |
2
|
|
|
# -*- coding: UTF-8 -*- |
3
|
|
|
|
4
|
|
|
""" |
5
|
|
|
Process Hi-C output into AGP for chromosomal-scale scaffolding. |
6
|
|
|
""" |
7
|
|
|
|
8
|
|
|
import array |
9
|
|
|
import json |
10
|
|
|
import logging |
11
|
|
|
import sys |
12
|
|
|
import os |
13
|
|
|
import os.path as op |
14
|
|
|
import numpy as np |
15
|
|
|
import math |
16
|
|
|
|
17
|
|
|
from collections import defaultdict |
18
|
|
|
from functools import partial |
19
|
|
|
from multiprocessing import Pool |
20
|
|
|
|
21
|
|
|
from jcvi.algorithms.formula import outlier_cutoff |
22
|
|
|
from jcvi.algorithms.ec import GA_setup, GA_run |
23
|
|
|
from jcvi.algorithms.matrix import get_signs |
24
|
|
|
from jcvi.apps.base import OptionParser, ActionDispatcher, backup, iglob, \ |
25
|
|
|
mkdir, symlink |
26
|
|
|
from jcvi.apps.console import green, red |
27
|
|
|
from jcvi.apps.grid import Jobs |
28
|
|
|
from jcvi.assembly.allmaps import make_movie |
29
|
|
|
from jcvi.compara.synteny import check_beds, get_bed_filenames |
30
|
|
|
from jcvi.formats.agp import order_to_agp |
31
|
|
|
from jcvi.formats.base import LineFile, must_open |
32
|
|
|
from jcvi.formats.bed import Bed |
33
|
|
|
from jcvi.formats.sizes import Sizes |
34
|
|
|
from jcvi.formats.blast import Blast |
35
|
|
|
from jcvi.graphics.base import normalize_axes, plt, savefig |
36
|
|
|
from jcvi.graphics.dotplot import dotplot |
37
|
|
|
from jcvi.utils.cbook import gene_name, human_size |
38
|
|
|
from jcvi.utils.natsort import natsorted |
39
|
|
|
|
40
|
|
|
|
41
|
|
|
# Map orientations to ints |
42
|
|
|
FF = {'+': 1, '-': -1, '?': 1} |
43
|
|
|
RR = {'+': -1, '-': 1, '?': -1} |
44
|
|
|
LB = 18 # Lower bound for golden_array() |
45
|
|
|
UB = 29 # Upper bound for golden_array() |
46
|
|
|
BB = UB - LB + 1 # Span for golden_array() |
47
|
|
|
ACCEPT = green("ACCEPT") |
48
|
|
|
REJECT = red("REJECT") |
49
|
|
|
BINSIZE = 50000 |
50
|
|
|
|
51
|
|
|
|
52
|
|
|
class ContigOrderingLine(object): |
53
|
|
|
'''Stores one line in the ContigOrdering file |
54
|
|
|
''' |
55
|
|
|
def __init__(self, line, sep="|"): |
56
|
|
|
args = line.split() |
57
|
|
|
self.contig_id = args[0] |
58
|
|
|
self.contig_name = args[1].split(sep)[0] |
59
|
|
|
contig_rc = args[2] |
60
|
|
|
assert contig_rc in ('0', '1') |
61
|
|
|
self.strand = '+' if contig_rc == '0' else '-' |
62
|
|
|
self.orientation_score = args[3] |
63
|
|
|
self.gap_size_after_contig = args[4] |
64
|
|
|
|
65
|
|
|
|
66
|
|
|
class ContigOrdering(LineFile): |
67
|
|
|
'''ContigOrdering file as created by LACHESIS, one per chromosome group. |
68
|
|
|
Header contains summary information per group, followed by list of contigs |
69
|
|
|
with given ordering. |
70
|
|
|
''' |
71
|
|
|
def __init__(self, filename): |
72
|
|
|
super(ContigOrdering, self).__init__(filename) |
73
|
|
|
fp = open(filename) |
74
|
|
|
for row in fp: |
75
|
|
|
if row[0] == '#': |
76
|
|
|
continue |
77
|
|
|
orderline = ContigOrderingLine(row) |
78
|
|
|
self.append(orderline) |
79
|
|
|
|
80
|
|
|
def write_agp(self, obj, sizes, fw=sys.stdout, gapsize=100, |
81
|
|
|
gaptype="contig", evidence="map"): |
82
|
|
|
'''Converts the ContigOrdering file into AGP format |
83
|
|
|
''' |
84
|
|
|
contigorder = [(x.contig_name, x.strand) for x in self] |
85
|
|
|
order_to_agp(obj, contigorder, sizes, fw, |
86
|
|
|
gapsize=gapsize, gaptype=gaptype, evidence=evidence) |
87
|
|
|
|
88
|
|
|
|
89
|
|
|
class CLMFile: |
90
|
|
|
'''CLM file (modified) has the following format: |
91
|
|
|
|
92
|
|
|
tig00046211+ tig00063795+ 1 53173 |
93
|
|
|
tig00046211+ tig00063795- 1 116050 |
94
|
|
|
tig00046211- tig00063795+ 1 71155 |
95
|
|
|
tig00046211- tig00063795- 1 134032 |
96
|
|
|
tig00030676+ tig00077819+ 5 136407 87625 87625 106905 102218 |
97
|
|
|
tig00030676+ tig00077819- 5 126178 152952 152952 35680 118923 |
98
|
|
|
tig00030676- tig00077819+ 5 118651 91877 91877 209149 125906 |
99
|
|
|
tig00030676- tig00077819- 5 108422 157204 157204 137924 142611 |
100
|
|
|
''' |
101
|
|
|
def __init__(self, clmfile, skiprecover=False): |
102
|
|
|
self.name = op.basename(clmfile).rsplit(".", 1)[0] |
103
|
|
|
self.clmfile = clmfile |
104
|
|
|
self.idsfile = clmfile.rsplit(".", 1)[0] + ".ids" |
105
|
|
|
self.parse_ids(skiprecover) |
106
|
|
|
self.parse_clm() |
107
|
|
|
self.signs = None |
108
|
|
|
|
109
|
|
|
def parse_ids(self, skiprecover): |
110
|
|
|
'''IDS file has a list of contigs that need to be ordered. 'recover', |
111
|
|
|
keyword, if available in the third column, is less confident. |
112
|
|
|
|
113
|
|
|
tig00015093 46912 |
114
|
|
|
tig00035238 46779 recover |
115
|
|
|
tig00030900 119291 |
116
|
|
|
''' |
117
|
|
|
idsfile = self.idsfile |
118
|
|
|
logging.debug("Parse idsfile `{}`".format(idsfile)) |
119
|
|
|
fp = open(idsfile) |
120
|
|
|
tigs = [] |
121
|
|
|
for row in fp: |
122
|
|
|
atoms = row.split() |
123
|
|
|
tig, size = atoms[:2] |
124
|
|
|
size = int(size) |
125
|
|
|
if skiprecover and len(atoms) == 3 and atoms[2] == 'recover': |
126
|
|
|
continue |
127
|
|
|
tigs.append((tig, size)) |
128
|
|
|
|
129
|
|
|
# Arrange contig names and sizes |
130
|
|
|
_tigs, _sizes = zip(*tigs) |
131
|
|
|
self.contigs = set(_tigs) |
132
|
|
|
self.sizes = np.array(_sizes) |
133
|
|
|
self.tig_to_size = dict(tigs) |
134
|
|
|
|
135
|
|
|
# Initially all contigs are considered active |
136
|
|
|
self.active = set(_tigs) |
137
|
|
|
|
138
|
|
|
def parse_clm(self): |
139
|
|
|
clmfile = self.clmfile |
140
|
|
|
logging.debug("Parse clmfile `{}`".format(clmfile)) |
141
|
|
|
fp = open(clmfile) |
142
|
|
|
contacts = {} |
143
|
|
|
contacts_oriented = defaultdict(dict) |
144
|
|
|
orientations = defaultdict(list) |
145
|
|
|
for row in fp: |
146
|
|
|
atoms = row.strip().split('\t') |
147
|
|
|
assert len(atoms) == 3, "Malformed line `{}`".format(atoms) |
148
|
|
|
abtig, links, dists = atoms |
149
|
|
|
atig, btig = abtig.split() |
150
|
|
|
at, ao = atig[:-1], atig[-1] |
151
|
|
|
bt, bo = btig[:-1], btig[-1] |
152
|
|
|
if at not in self.tig_to_size: |
153
|
|
|
continue |
154
|
|
|
if bt not in self.tig_to_size: |
155
|
|
|
continue |
156
|
|
|
dists = [int(x) for x in dists.split()] |
157
|
|
|
contacts[(at, bt)] = len(dists) |
158
|
|
|
gdists = golden_array(dists) |
159
|
|
|
contacts_oriented[(at, bt)][(FF[ao], FF[bo])] = gdists |
160
|
|
|
contacts_oriented[(bt, at)][(RR[bo], RR[ao])] = gdists |
161
|
|
|
strandedness = 1 if ao == bo else -1 |
162
|
|
|
orientations[(at, bt)].append((strandedness, dists)) |
163
|
|
|
|
164
|
|
|
self.contacts = contacts |
165
|
|
|
self.contacts_oriented = contacts_oriented |
166
|
|
|
# Preprocess the orientations dict |
167
|
|
|
for (at, bt), dists in orientations.items(): |
168
|
|
|
dists = [(s, d, hmean_int(d)) for (s, d) in dists] |
169
|
|
|
strandedness, md, mh = min(dists, key=lambda x: x[-1]) |
170
|
|
|
orientations[(at, bt)] = (strandedness, len(md), mh) |
171
|
|
|
self.orientations = orientations |
172
|
|
|
|
173
|
|
|
def calculate_densities(self): |
174
|
|
|
""" |
175
|
|
|
Calculate the density of inter-contig links per base. Strong contigs |
176
|
|
|
considered to have high level of inter-contig links in the current |
177
|
|
|
partition. |
178
|
|
|
""" |
179
|
|
|
active = self.active |
180
|
|
|
densities = defaultdict(int) |
181
|
|
|
for (at, bt), links in self.contacts.items(): |
182
|
|
|
if not (at in active and bt in active): |
183
|
|
|
continue |
184
|
|
|
densities[at] += links |
185
|
|
|
densities[bt] += links |
186
|
|
|
|
187
|
|
|
logdensities = {} |
188
|
|
|
for x, d in densities.items(): |
189
|
|
|
s = self.tig_to_size[x] |
190
|
|
|
logd = np.log10(d * 1. / min(s, 500000)) |
191
|
|
|
logdensities[x] = logd |
192
|
|
|
|
193
|
|
|
return logdensities |
194
|
|
|
|
195
|
|
|
def report_active(self): |
196
|
|
|
logging.debug("Active contigs: {} (length={})" |
197
|
|
|
.format(self.N, self.active_sizes.sum())) |
198
|
|
|
|
199
|
|
|
def activate(self, tourfile=None, minsize=10000, backuptour=True): |
200
|
|
|
""" |
201
|
|
|
Select contigs in the current partition. This is the setup phase of the |
202
|
|
|
algorithm, and supports two modes: |
203
|
|
|
|
204
|
|
|
- "de novo": This is useful at the start of a new run where no tours |
205
|
|
|
available. We select the strong contigs that have significant number |
206
|
|
|
of links to other contigs in the partition. We build a histogram of |
207
|
|
|
link density (# links per bp) and remove the contigs that appear as |
208
|
|
|
outliers. The orientations are derived from the matrix decomposition |
209
|
|
|
of the pairwise strandedness matrix O. |
210
|
|
|
|
211
|
|
|
- "hotstart": This is useful when there was a past run, with a given |
212
|
|
|
tourfile. In this case, the active contig list and orientations are |
213
|
|
|
derived from the last tour in the file. |
214
|
|
|
""" |
215
|
|
|
if tourfile and (not op.exists(tourfile)): |
216
|
|
|
logging.debug("Tourfile `{}` not found".format(tourfile)) |
217
|
|
|
tourfile = None |
218
|
|
|
|
219
|
|
|
if tourfile: |
220
|
|
|
logging.debug("Importing tourfile `{}`".format(tourfile)) |
221
|
|
|
tour, tour_o = iter_last_tour(tourfile, self) |
222
|
|
|
self.active = set(tour) |
223
|
|
|
tig_to_idx = self.tig_to_idx |
224
|
|
|
tour = [tig_to_idx[x] for x in tour] |
225
|
|
|
signs = sorted([(x, FF[o]) for (x, o) in zip(tour, tour_o)]) |
226
|
|
|
_, signs = zip(*signs) |
227
|
|
|
self.signs = np.array(signs, dtype=int) |
228
|
|
|
if backuptour: |
229
|
|
|
backup(tourfile) |
230
|
|
|
tour = array.array('i', tour) |
231
|
|
|
else: |
232
|
|
|
self.report_active() |
233
|
|
|
while True: |
234
|
|
|
logdensities = self.calculate_densities() |
235
|
|
|
lb, ub = outlier_cutoff(logdensities.values()) |
236
|
|
|
logging.debug("Log10(link_densities) ~ [{}, {}]" |
237
|
|
|
.format(lb, ub)) |
238
|
|
|
remove = set(x for x, d in logdensities.items() if |
239
|
|
|
(d < lb and self.tig_to_size[x] < minsize * 10)) |
240
|
|
|
if remove: |
241
|
|
|
self.active -= remove |
242
|
|
|
self.report_active() |
243
|
|
|
else: |
244
|
|
|
break |
245
|
|
|
|
246
|
|
|
logging.debug("Remove contigs with size < {}".format(minsize)) |
247
|
|
|
self.active = set(x for x in self.active if |
248
|
|
|
self.tig_to_size[x] >= minsize) |
249
|
|
|
tour = range(self.N) # Use starting (random) order otherwise |
250
|
|
|
tour = array.array('i', tour) |
251
|
|
|
|
252
|
|
|
# Determine orientations |
253
|
|
|
self.flip_all(tour) |
254
|
|
|
|
255
|
|
|
self.report_active() |
256
|
|
|
self.tour = tour |
257
|
|
|
|
258
|
|
|
return tour |
259
|
|
|
|
260
|
|
|
def evaluate_tour_M(self, tour): |
261
|
|
|
""" Use Cythonized version to evaluate the score of a current tour |
262
|
|
|
""" |
263
|
|
|
from .chic import score_evaluate_M |
264
|
|
|
return score_evaluate_M(tour, self.active_sizes, self.M) |
265
|
|
|
|
266
|
|
|
def evaluate_tour_P(self, tour): |
267
|
|
|
""" Use Cythonized version to evaluate the score of a current tour, |
268
|
|
|
with better precision on the distance of the contigs. |
269
|
|
|
""" |
270
|
|
|
from .chic import score_evaluate_P |
271
|
|
|
return score_evaluate_P(tour, self.active_sizes, self.P) |
272
|
|
|
|
273
|
|
|
def evaluate_tour_Q(self, tour): |
274
|
|
|
""" Use Cythonized version to evaluate the score of a current tour, |
275
|
|
|
taking orientation into consideration. This may be the most accurate |
276
|
|
|
evaluation under the right condition. |
277
|
|
|
""" |
278
|
|
|
from .chic import score_evaluate_Q |
279
|
|
|
return score_evaluate_Q(tour, self.active_sizes, self.Q) |
280
|
|
|
|
281
|
|
|
def flip_log(self, method, score, score_flipped, tag): |
282
|
|
|
logging.debug("{}: {} => {} {}" |
283
|
|
|
.format(method, score, score_flipped, tag)) |
284
|
|
|
|
285
|
|
|
def flip_all(self, tour): |
286
|
|
|
""" Initialize the orientations based on pairwise O matrix. |
287
|
|
|
""" |
288
|
|
|
if self.signs is None: # First run |
289
|
|
|
score = 0 |
290
|
|
|
else: |
291
|
|
|
old_signs = self.signs[:self.N] |
292
|
|
|
score, = self.evaluate_tour_Q(tour) |
293
|
|
|
|
294
|
|
|
# Remember we cannot have ambiguous orientation code (0 or '?') here |
295
|
|
|
self.signs = get_signs(self.O, validate=False, ambiguous=False) |
296
|
|
|
score_flipped, = self.evaluate_tour_Q(tour) |
297
|
|
|
if score_flipped >= score: |
298
|
|
|
tag = ACCEPT |
299
|
|
|
else: |
300
|
|
|
self.signs = old_signs[:] |
301
|
|
|
tag = REJECT |
302
|
|
|
self.flip_log("FLIPALL", score, score_flipped, tag) |
303
|
|
|
return tag |
304
|
|
|
|
305
|
|
|
def flip_whole(self, tour): |
306
|
|
|
""" Test flipping all contigs at the same time to see if score improves. |
307
|
|
|
""" |
308
|
|
|
score, = self.evaluate_tour_Q(tour) |
309
|
|
|
self.signs = -self.signs |
310
|
|
|
score_flipped, = self.evaluate_tour_Q(tour) |
311
|
|
|
if score_flipped > score: |
312
|
|
|
tag = ACCEPT |
313
|
|
|
else: |
314
|
|
|
self.signs = -self.signs |
315
|
|
|
tag = REJECT |
316
|
|
|
self.flip_log("FLIPWHOLE", score, score_flipped, tag) |
317
|
|
|
return tag |
318
|
|
|
|
319
|
|
|
def flip_one(self, tour): |
320
|
|
|
""" Test flipping every single contig sequentially to see if score |
321
|
|
|
improves. |
322
|
|
|
""" |
323
|
|
|
n_accepts = n_rejects = 0 |
324
|
|
|
any_tag_ACCEPT = False |
325
|
|
|
for i, t in enumerate(tour): |
326
|
|
|
if i == 0: |
327
|
|
|
score, = self.evaluate_tour_Q(tour) |
328
|
|
|
self.signs[t] = -self.signs[t] |
329
|
|
|
score_flipped, = self.evaluate_tour_Q(tour) |
330
|
|
|
if score_flipped > score: |
331
|
|
|
n_accepts += 1 |
332
|
|
|
tag = ACCEPT |
333
|
|
|
else: |
334
|
|
|
self.signs[t] = -self.signs[t] |
335
|
|
|
n_rejects += 1 |
336
|
|
|
tag = REJECT |
337
|
|
|
self.flip_log("FLIPONE ({}/{})".format(i + 1, len(self.signs)), |
338
|
|
|
score, score_flipped, tag) |
339
|
|
|
if tag == ACCEPT: |
340
|
|
|
any_tag_ACCEPT = True |
341
|
|
|
score = score_flipped |
342
|
|
|
logging.debug("FLIPONE: N_accepts={} N_rejects={}" |
343
|
|
|
.format(n_accepts, n_rejects)) |
344
|
|
|
return ACCEPT if any_tag_ACCEPT else REJECT |
345
|
|
|
|
346
|
|
|
def prune_tour(self, tour, cpus): |
347
|
|
|
""" Test deleting each contig and check the delta_score; tour here must |
348
|
|
|
be an array of ints. |
349
|
|
|
""" |
350
|
|
|
while True: |
351
|
|
|
tour_score, = self.evaluate_tour_M(tour) |
352
|
|
|
logging.debug("Starting score: {}".format(tour_score)) |
353
|
|
|
active_sizes = self.active_sizes |
354
|
|
|
M = self.M |
355
|
|
|
args = [] |
356
|
|
|
for i, t in enumerate(tour): |
357
|
|
|
stour = tour[:i] + tour[i + 1:] |
358
|
|
|
args.append((t, stour, tour_score, active_sizes, M)) |
359
|
|
|
|
360
|
|
|
# Parallel run |
361
|
|
|
p = Pool(processes=cpus) |
362
|
|
|
results = list(p.imap(prune_tour_worker, args)) |
363
|
|
|
assert len(tour) == len(results), \ |
364
|
|
|
"Array size mismatch, tour({}) != results({})"\ |
365
|
|
|
.format(len(tour), len(results)) |
366
|
|
|
|
367
|
|
|
# Identify outliers |
368
|
|
|
active_contigs = self.active_contigs |
369
|
|
|
idx, log10deltas = zip(*results) |
370
|
|
|
lb, ub = outlier_cutoff(log10deltas) |
371
|
|
|
logging.debug("Log10(delta_score) ~ [{}, {}]".format(lb, ub)) |
372
|
|
|
|
373
|
|
|
remove = set(active_contigs[x] for (x, d) in results if d < lb) |
374
|
|
|
self.active -= remove |
375
|
|
|
self.report_active() |
376
|
|
|
|
377
|
|
|
tig_to_idx = self.tig_to_idx |
378
|
|
|
tour = [active_contigs[x] for x in tour] |
379
|
|
|
tour = array.array('i', [tig_to_idx[x] for x in tour |
380
|
|
|
if x not in remove]) |
381
|
|
|
if not remove: |
382
|
|
|
break |
383
|
|
|
|
384
|
|
|
self.tour = tour |
385
|
|
|
self.flip_all(tour) |
386
|
|
|
|
387
|
|
|
return tour |
388
|
|
|
|
389
|
|
|
@property |
390
|
|
|
def active_contigs(self): |
391
|
|
|
return list(self.active) |
392
|
|
|
|
393
|
|
|
@property |
394
|
|
|
def active_sizes(self): |
395
|
|
|
return np.array([self.tig_to_size[x] for x in self.active]) |
396
|
|
|
|
397
|
|
|
@property |
398
|
|
|
def N(self): |
399
|
|
|
return len(self.active) |
400
|
|
|
|
401
|
|
View Code Duplication |
@property |
|
|
|
|
402
|
|
|
def oo(self): |
403
|
|
|
return range(self.N) |
404
|
|
|
|
405
|
|
|
@property |
406
|
|
|
def tig_to_idx(self): |
407
|
|
|
return dict((x, i) for (i, x) in enumerate(self.active)) |
408
|
|
|
|
409
|
|
|
@property |
410
|
|
|
def M(self): |
411
|
|
|
""" |
412
|
|
|
Contact frequency matrix. Each cell contains how many inter-contig |
413
|
|
|
links between i-th and j-th contigs. |
414
|
|
|
""" |
415
|
|
|
N = self.N |
416
|
|
|
tig_to_idx = self.tig_to_idx |
417
|
|
|
M = np.zeros((N, N), dtype=int) |
418
|
|
View Code Duplication |
for (at, bt), links in self.contacts.items(): |
|
|
|
|
419
|
|
|
if not (at in tig_to_idx and bt in tig_to_idx): |
420
|
|
|
continue |
421
|
|
|
ai = tig_to_idx[at] |
422
|
|
|
bi = tig_to_idx[bt] |
423
|
|
|
M[ai, bi] = M[bi, ai] = links |
424
|
|
|
return M |
425
|
|
|
|
426
|
|
|
@property |
427
|
|
|
def O(self): |
428
|
|
|
""" |
429
|
|
|
Pairwise strandedness matrix. Each cell contains whether i-th and j-th |
430
|
|
|
contig are the same orientation +1, or opposite orientation -1. |
431
|
|
|
""" |
432
|
|
|
N = self.N |
433
|
|
|
tig_to_idx = self.tig_to_idx |
434
|
|
|
O = np.zeros((N, N), dtype=int) |
435
|
|
|
for (at, bt), (strandedness, md, mh) in self.orientations.items(): |
436
|
|
|
if not (at in tig_to_idx and bt in tig_to_idx): |
437
|
|
|
continue |
438
|
|
|
ai = tig_to_idx[at] |
439
|
|
|
bi = tig_to_idx[bt] |
440
|
|
|
score = strandedness * md |
441
|
|
|
O[ai, bi] = O[bi, ai] = score |
442
|
|
|
return O |
443
|
|
|
|
444
|
|
|
@property |
445
|
|
|
def P(self): |
446
|
|
|
""" |
447
|
|
|
Contact frequency matrix with better precision on distance between |
448
|
|
|
contigs. In the matrix M, the distance is assumed to be the distance |
449
|
|
|
between mid-points of two contigs. In matrix Q, however, we compute |
450
|
|
|
harmonic mean of the links for the orientation configuration that is |
451
|
|
|
shortest. This offers better precision for the distance between big |
452
|
|
|
contigs. |
453
|
|
|
""" |
454
|
|
|
N = self.N |
455
|
|
|
tig_to_idx = self.tig_to_idx |
456
|
|
|
P = np.zeros((N, N, 2), dtype=int) |
457
|
|
|
for (at, bt), (strandedness, md, mh) in self.orientations.items(): |
458
|
|
|
if not (at in tig_to_idx and bt in tig_to_idx): |
459
|
|
|
continue |
460
|
|
|
ai = tig_to_idx[at] |
461
|
|
|
bi = tig_to_idx[bt] |
462
|
|
|
P[ai, bi, 0] = P[bi, ai, 0] = md |
463
|
|
|
P[ai, bi, 1] = P[bi, ai, 1] = mh |
464
|
|
|
return P |
465
|
|
|
|
466
|
|
|
@property |
467
|
|
|
def Q(self): |
468
|
|
|
""" |
469
|
|
|
Contact frequency matrix when contigs are already oriented. This is s a |
470
|
|
|
similar matrix as M, but rather than having the number of links in the |
471
|
|
|
cell, it points to an array that has the actual distances. |
472
|
|
|
""" |
473
|
|
|
N = self.N |
474
|
|
|
tig_to_idx = self.tig_to_idx |
475
|
|
|
signs = self.signs |
476
|
|
|
Q = np.ones((N, N, BB), dtype=int) * -1 # Use -1 as the sentinel |
477
|
|
|
for (at, bt), k in self.contacts_oriented.items(): |
478
|
|
|
if not (at in tig_to_idx and bt in tig_to_idx): |
479
|
|
|
continue |
480
|
|
|
ai = tig_to_idx[at] |
481
|
|
|
bi = tig_to_idx[bt] |
482
|
|
|
ao = signs[ai] |
483
|
|
|
bo = signs[bi] |
484
|
|
|
Q[ai, bi] = k[(ao, bo)] |
485
|
|
|
return Q |
486
|
|
|
|
487
|
|
|
|
488
|
|
|
def hmean_int(a, a_min=5778, a_max=1149851): |
489
|
|
|
""" Harmonic mean of an array, returns the closest int |
490
|
|
|
""" |
491
|
|
|
from scipy.stats import hmean |
492
|
|
|
return int(round(hmean(np.clip(a, a_min, a_max)))) |
493
|
|
|
|
494
|
|
|
|
495
|
|
|
def golden_array(a, phi=1.61803398875, lb=LB, ub=UB): |
496
|
|
|
""" Given list of ints, we aggregate similar values so that it becomes an |
497
|
|
|
array of multiples of phi, where phi is the golden ratio. |
498
|
|
|
|
499
|
|
|
phi ^ 14 = 843 |
500
|
|
|
phi ^ 33 = 7881196 |
501
|
|
|
|
502
|
|
|
So the array of counts go between 843 to 788196. One triva is that the |
503
|
|
|
exponents of phi gets closer to integers as N grows. See interesting |
504
|
|
|
discussion here: |
505
|
|
|
<https://www.johndcook.com/blog/2017/03/22/golden-powers-are-nearly-integers/> |
506
|
|
|
""" |
507
|
|
|
counts = np.zeros(BB, dtype=int) |
508
|
|
|
for x in a: |
509
|
|
|
c = int(round(math.log(x, phi))) |
510
|
|
|
if c < lb: |
511
|
|
|
c = lb |
512
|
|
|
if c > ub: |
513
|
|
|
c = ub |
514
|
|
|
counts[c - lb] += 1 |
515
|
|
|
return counts |
516
|
|
|
|
517
|
|
|
|
518
|
|
|
def prune_tour_worker(arg): |
519
|
|
|
""" Worker thread for CLMFile.prune_tour() |
520
|
|
|
""" |
521
|
|
|
from .chic import score_evaluate_M |
522
|
|
|
|
523
|
|
|
t, stour, tour_score, active_sizes, M = arg |
524
|
|
|
stour_score, = score_evaluate_M(stour, active_sizes, M) |
525
|
|
|
delta_score = tour_score - stour_score |
526
|
|
|
log10d = np.log10(delta_score) if delta_score > 1e-9 else -9 |
527
|
|
|
return t, log10d |
528
|
|
|
|
529
|
|
|
|
530
|
|
|
def main(): |
531
|
|
|
|
532
|
|
|
actions = ( |
533
|
|
|
# LACHESIS output processing |
534
|
|
|
('agp', 'generate AGP file based on LACHESIS output'), |
535
|
|
|
('score', 'score the current LACHESIS CLM'), |
536
|
|
|
# Simulation |
537
|
|
|
('simulate', 'simulate CLM data'), |
538
|
|
|
# Scaffolding |
539
|
|
|
('optimize', 'optimize the contig order and orientation'), |
540
|
|
|
('density', 'estimate link density of contigs'), |
541
|
|
|
# Plotting |
542
|
|
|
('movieframe', 'plot heatmap and synteny for a particular tour'), |
543
|
|
|
('movie', 'plot heatmap optimization history in a tourfile'), |
544
|
|
|
# Reference-based analytics |
545
|
|
|
('bam2mat', 'convert bam file to .npy format used in plotting'), |
546
|
|
|
('mergemat', 'combine counts from multiple .npy data files'), |
547
|
|
|
('heatmap', 'plot heatmap based on .npy format'), |
548
|
|
|
) |
549
|
|
|
p = ActionDispatcher(actions) |
550
|
|
|
p.dispatch(globals()) |
551
|
|
|
|
552
|
|
|
|
553
|
|
|
def mergemat(args): |
554
|
|
|
""" |
555
|
|
|
%prog mergemat *.npy |
556
|
|
|
|
557
|
|
|
Combine counts from multiple .npy data files. |
558
|
|
|
""" |
559
|
|
|
p = OptionParser(mergemat.__doc__) |
560
|
|
|
p.set_outfile(outfile="out") |
561
|
|
|
opts, args = p.parse_args(args) |
562
|
|
|
|
563
|
|
|
if len(args) < 1: |
564
|
|
|
sys.exit(not p.print_help()) |
565
|
|
|
|
566
|
|
|
npyfiles = args |
567
|
|
|
A = np.load(npyfiles[0]) |
568
|
|
|
logging.debug("Load `{}`: matrix of shape {}:; sum={}" |
569
|
|
|
.format(npyfiles[0], A.shape, A.sum())) |
570
|
|
|
for npyfile in npyfiles[1:]: |
571
|
|
|
B = np.load(npyfile) |
572
|
|
|
A += B |
573
|
|
|
logging.debug("Load `{}`: sum={}" |
574
|
|
|
.format(npyfiles[0], A.sum())) |
575
|
|
|
|
576
|
|
|
pf = opts.outfile |
577
|
|
|
np.save(pf, A) |
578
|
|
|
logging.debug("Combined {} files into `{}.npy`".format(len(npyfiles), pf)) |
579
|
|
|
|
580
|
|
|
|
581
|
|
|
def get_seqstarts(bamfile, N): |
582
|
|
|
""" Go through the SQ headers and pull out all sequences with size |
583
|
|
|
greater than the resolution settings, i.e. contains at least a few cells |
584
|
|
|
""" |
585
|
|
|
import pysam |
586
|
|
|
bamfile = pysam.AlignmentFile(bamfile, "rb") |
587
|
|
|
seqsize = {} |
588
|
|
|
for kv in bamfile.header["SQ"]: |
589
|
|
|
if kv["LN"] < 10 * N: |
590
|
|
|
continue |
591
|
|
|
seqsize[kv["SN"]] = kv["LN"] / N + 1 |
592
|
|
|
|
593
|
|
|
allseqs = natsorted(seqsize.keys()) |
594
|
|
|
allseqsizes = np.array([seqsize[x] for x in allseqs]) |
595
|
|
|
seqstarts = np.cumsum(allseqsizes) |
596
|
|
|
seqstarts = np.roll(seqstarts, 1) |
597
|
|
|
total_bins = seqstarts[0] |
598
|
|
|
seqstarts[0] = 0 |
599
|
|
|
seqstarts = dict(zip(allseqs, seqstarts)) |
600
|
|
|
|
601
|
|
|
return seqstarts, seqsize, total_bins |
602
|
|
|
|
603
|
|
|
|
604
|
|
|
def bam2mat(args): |
605
|
|
|
""" |
606
|
|
|
%prog bam2mat input.bam |
607
|
|
|
|
608
|
|
|
Convert bam file to .mat format, which is simply numpy 2D array. Important |
609
|
|
|
parameter is the resolution, which is the cell size. Small cell size lead |
610
|
|
|
to more fine-grained heatmap, but leads to large .mat size and slower |
611
|
|
|
plotting. |
612
|
|
|
""" |
613
|
|
|
import pysam |
614
|
|
|
from jcvi.utils.cbook import percentage |
615
|
|
|
|
616
|
|
|
p = OptionParser(bam2mat.__doc__) |
617
|
|
|
p.add_option("--resolution", default=500000, type="int", |
618
|
|
|
help="Resolution when counting the links") |
619
|
|
|
opts, args = p.parse_args(args) |
620
|
|
|
|
621
|
|
|
if len(args) != 1: |
622
|
|
|
sys.exit(not p.print_help()) |
623
|
|
|
|
624
|
|
|
bamfilename, = args |
625
|
|
|
pf = bamfilename.rsplit(".", 1)[0] |
626
|
|
|
N = opts.resolution |
627
|
|
|
|
628
|
|
|
seqstarts, seqsize, total_bins = get_seqstarts(bamfilename, N) |
629
|
|
|
|
630
|
|
|
# Store the starts and sizes into a JSON file |
631
|
|
|
jsonfile = pf + ".json" |
632
|
|
|
fwjson = open(jsonfile, "w") |
633
|
|
|
header = {"starts": seqstarts, "sizes": seqsize, "total_bins": total_bins} |
634
|
|
|
json.dump(header, fwjson, sort_keys=True, indent=4) |
635
|
|
|
fwjson.close() |
636
|
|
|
logging.debug("Contig bin starts written to `{}`".format(jsonfile)) |
637
|
|
|
|
638
|
|
|
print(sorted(seqstarts.items(), key=lambda x: x[-1])) |
639
|
|
|
logging.debug("Initialize matrix of size {}x{}" |
640
|
|
|
.format(total_bins, total_bins)) |
641
|
|
|
A = np.zeros((total_bins, total_bins), dtype="int") |
642
|
|
|
|
643
|
|
|
# Find the bin ID of each read |
644
|
|
|
def bin_number(chr, pos): |
645
|
|
|
return seqstarts[chr] + pos / N |
646
|
|
|
|
647
|
|
|
bamfile = pysam.AlignmentFile(bamfilename, "rb") |
648
|
|
|
# Check all reads, rules borrowed from LACHESIS |
649
|
|
|
# https://github.com/shendurelab/LACHESIS/blob/master/src/GenomeLinkMatrix.cc#L1476 |
650
|
|
|
j = k = 0 |
651
|
|
|
for c in bamfile: |
652
|
|
|
j += 1 |
653
|
|
|
if j % 100000 == 0: |
654
|
|
|
print >> sys.stderr, "{} reads counted".format(j) |
655
|
|
|
# if j >= 10000: break |
656
|
|
|
|
657
|
|
|
if c.is_qcfail and c.is_duplicate: |
658
|
|
|
continue |
659
|
|
|
if c.is_secondary and c.is_supplementary: |
660
|
|
|
continue |
661
|
|
|
if c.mapping_quality == 0: |
662
|
|
|
continue |
663
|
|
|
if not c.is_paired: |
664
|
|
|
continue |
665
|
|
|
if c.is_read2: # Take only one read |
666
|
|
|
continue |
667
|
|
|
|
668
|
|
|
# pysam v0.8.3 does not support keyword reference_name |
669
|
|
|
achr = bamfile.getrname(c.reference_id) |
670
|
|
|
apos = c.reference_start |
671
|
|
|
bchr = bamfile.getrname(c.next_reference_id) |
672
|
|
|
bpos = c.next_reference_start |
673
|
|
|
if achr not in seqstarts or bchr not in seqstarts: |
674
|
|
|
continue |
675
|
|
|
abin, bbin = bin_number(achr, apos), bin_number(bchr, bpos) |
676
|
|
|
A[abin, bbin] += 1 |
677
|
|
|
if abin != bbin: |
678
|
|
|
A[bbin, abin] += 1 |
679
|
|
|
k += 1 |
680
|
|
|
|
681
|
|
|
logging.debug("Total reads counted: {}".format(percentage(2 * k, j))) |
682
|
|
|
bamfile.close() |
683
|
|
|
np.save(pf, A) |
684
|
|
|
logging.debug("Link counts written to `{}.npy`".format(pf)) |
685
|
|
|
|
686
|
|
|
|
687
|
|
|
def simulate(args): |
688
|
|
|
""" |
689
|
|
|
%prog simulate test |
690
|
|
|
|
691
|
|
|
Simulate CLM and IDS files with given names. |
692
|
|
|
|
693
|
|
|
The simulator assumes several distributions: |
694
|
|
|
- Links are distributed uniformly across genome |
695
|
|
|
- Log10(link_size) are distributed normally |
696
|
|
|
- Genes are distributed uniformly |
697
|
|
|
""" |
698
|
|
|
p = OptionParser(simulate.__doc__) |
699
|
|
|
p.add_option("--genomesize", default=10000000, type="int", |
700
|
|
|
help="Genome size") |
701
|
|
|
p.add_option("--genes", default=1000, type="int", |
702
|
|
|
help="Number of genes") |
703
|
|
|
p.add_option("--contigs", default=100, type="int", |
704
|
|
|
help="Number of contigs") |
705
|
|
|
p.add_option("--coverage", default=10, type="int", |
706
|
|
|
help="Link coverage") |
707
|
|
|
opts, args = p.parse_args(args) |
708
|
|
|
|
709
|
|
|
if len(args) != 1: |
710
|
|
|
sys.exit(not p.print_help()) |
711
|
|
|
|
712
|
|
|
pf, = args |
713
|
|
|
GenomeSize = opts.genomesize |
714
|
|
|
Genes = opts.genes |
715
|
|
|
Contigs = opts.contigs |
716
|
|
|
Coverage = opts.coverage |
717
|
|
|
PE = 500 |
718
|
|
|
Links = int(GenomeSize * Coverage / PE) |
719
|
|
|
|
720
|
|
|
# Simulate the contig sizes that sum to GenomeSize |
721
|
|
|
# See also: |
722
|
|
|
# <https://en.wikipedia.org/wiki/User:Skinnerd/Simplex_Point_Picking> |
723
|
|
|
ContigSizes, = np.random.dirichlet([1] * Contigs, 1) * GenomeSize |
724
|
|
|
ContigSizes = np.array(np.round_(ContigSizes, decimals=0), dtype=int) |
725
|
|
|
ContigStarts = np.zeros(Contigs, dtype=int) |
726
|
|
|
ContigStarts[1:] = np.cumsum(ContigSizes)[:-1] |
727
|
|
|
|
728
|
|
|
# Write IDS file |
729
|
|
|
idsfile = pf + ".ids" |
730
|
|
|
fw = open(idsfile, "w") |
731
|
|
|
for i, s in enumerate(ContigSizes): |
732
|
|
|
print >> fw, "tig{:04d}\t{}".format(i, s) |
733
|
|
|
fw.close() |
734
|
|
|
|
735
|
|
|
# Simulate the gene positions |
736
|
|
|
GenePositions = np.sort(np.random.random_integers(0, |
737
|
|
|
GenomeSize - 1, size=Genes)) |
738
|
|
|
write_last_and_beds(pf, GenePositions, ContigStarts) |
739
|
|
|
|
740
|
|
|
# Simulate links, uniform start, with link distances following 1/x, where x |
741
|
|
|
# is the distance between the links. As an approximation, we have links |
742
|
|
|
# between [1e3, 1e7], so we map from uniform [1e-7, 1e-3] |
743
|
|
|
LinkStarts = np.sort(np.random.random_integers(0, GenomeSize - 1, |
744
|
|
|
size=Links)) |
745
|
|
|
a, b = 1e-7, 1e-3 |
746
|
|
|
LinkSizes = np.array(np.round_(1 / ((b - a) * np.random.rand(Links) + a), |
747
|
|
|
decimals=0), dtype="int") |
748
|
|
|
LinkEnds = LinkStarts + LinkSizes |
749
|
|
|
|
750
|
|
|
# Find link to contig membership |
751
|
|
|
LinkStartContigs = np.searchsorted(ContigStarts, LinkStarts) - 1 |
752
|
|
|
LinkEndContigs = np.searchsorted(ContigStarts, LinkEnds) - 1 |
753
|
|
|
|
754
|
|
|
# Extract inter-contig links |
755
|
|
|
InterContigLinks = (LinkStartContigs != LinkEndContigs) & \ |
756
|
|
|
(LinkEndContigs != Contigs) |
757
|
|
|
ICLinkStartContigs = LinkStartContigs[InterContigLinks] |
758
|
|
|
ICLinkEndContigs = LinkEndContigs[InterContigLinks] |
759
|
|
|
ICLinkStarts = LinkStarts[InterContigLinks] |
760
|
|
|
ICLinkEnds = LinkEnds[InterContigLinks] |
761
|
|
|
|
762
|
|
|
# Write CLM file |
763
|
|
|
write_clm(pf, ICLinkStartContigs, ICLinkEndContigs, |
764
|
|
|
ICLinkStarts, ICLinkEnds, |
765
|
|
|
ContigStarts, ContigSizes) |
766
|
|
|
|
767
|
|
|
|
768
|
|
|
def write_last_and_beds(pf, GenePositions, ContigStarts): |
769
|
|
|
""" |
770
|
|
|
Write LAST file, query and subject BED files. |
771
|
|
|
""" |
772
|
|
|
qbedfile = pf + "tigs.bed" |
773
|
|
|
sbedfile = pf + "chr.bed" |
774
|
|
|
lastfile = "{}tigs.{}chr.last".format(pf, pf) |
775
|
|
|
qbedfw = open(qbedfile, "w") |
776
|
|
|
sbedfw = open(sbedfile, "w") |
777
|
|
|
lastfw = open(lastfile, "w") |
778
|
|
|
|
779
|
|
|
GeneContigs = np.searchsorted(ContigStarts, GenePositions) - 1 |
780
|
|
|
for i, (c, gstart) in enumerate(zip(GeneContigs, GenePositions)): |
781
|
|
|
gene = "gene{:05d}".format(i) |
782
|
|
|
tig = "tig{:04d}".format(c) |
783
|
|
|
start = ContigStarts[c] |
784
|
|
|
cstart = gstart - start |
785
|
|
|
print >> qbedfw, "\t".join(str(x) for x in |
786
|
|
|
(tig, cstart, cstart + 1, gene)) |
787
|
|
|
print >> sbedfw, "\t".join(str(x) for x in |
788
|
|
|
("chr1", gstart, gstart + 1, gene)) |
789
|
|
|
lastatoms = [gene, gene, 100] + [0] * 8 + [100] |
790
|
|
|
print >> lastfw, "\t".join(str(x) for x in lastatoms) |
791
|
|
|
|
792
|
|
|
qbedfw.close() |
793
|
|
|
sbedfw.close() |
794
|
|
|
lastfw.close() |
795
|
|
|
|
796
|
|
|
|
797
|
|
|
def write_clm(pf, ICLinkStartContigs, ICLinkEndContigs, |
798
|
|
|
ICLinkStarts, ICLinkEnds, |
799
|
|
|
ContigStarts, ContigSizes): |
800
|
|
|
""" |
801
|
|
|
Write CLM file from simulated data. |
802
|
|
|
""" |
803
|
|
|
clm = defaultdict(list) |
804
|
|
|
for start, end, linkstart, linkend in \ |
805
|
|
|
zip(ICLinkStartContigs, ICLinkEndContigs, |
806
|
|
|
ICLinkStarts, ICLinkEnds): |
807
|
|
|
start_a = ContigStarts[start] |
808
|
|
|
start_b = start_a + ContigSizes[start] |
809
|
|
|
end_a = ContigStarts[end] |
810
|
|
|
end_b = end_a + ContigSizes[end] |
811
|
|
|
if linkend >= end_b: |
812
|
|
|
continue |
813
|
|
|
clm[(start, end)].append((linkstart - start_a, start_b - linkstart, |
814
|
|
|
linkend - end_a, end_b - linkend)) |
815
|
|
|
|
816
|
|
|
clmfile = pf + ".clm" |
817
|
|
|
fw = open(clmfile, "w") |
818
|
|
|
|
819
|
|
|
def format_array(a): |
820
|
|
|
return [str(x) for x in sorted(a) if x > 0] |
821
|
|
|
|
822
|
|
|
for (start, end), links in sorted(clm.items()): |
823
|
|
|
start = "tig{:04d}".format(start) |
824
|
|
|
end = "tig{:04d}".format(end) |
825
|
|
|
nlinks = len(links) |
826
|
|
|
if not nlinks: |
827
|
|
|
continue |
828
|
|
|
ff = format_array([(b + c) for a, b, c, d in links]) |
829
|
|
|
fr = format_array([(b + d) for a, b, c, d in links]) |
830
|
|
|
rf = format_array([(a + c) for a, b, c, d in links]) |
831
|
|
|
rr = format_array([(a + d) for a, b, c, d in links]) |
832
|
|
|
print >> fw, "{}+ {}+\t{}\t{}".format(start, end, nlinks, " ".join(ff)) |
833
|
|
|
print >> fw, "{}+ {}-\t{}\t{}".format(start, end, nlinks, " ".join(fr)) |
834
|
|
|
print >> fw, "{}- {}+\t{}\t{}".format(start, end, nlinks, " ".join(rf)) |
835
|
|
|
print >> fw, "{}- {}-\t{}\t{}".format(start, end, nlinks, " ".join(rr)) |
836
|
|
|
fw.close() |
837
|
|
|
|
838
|
|
|
|
839
|
|
|
def density(args): |
840
|
|
|
""" |
841
|
|
|
%prog density test.clm |
842
|
|
|
|
843
|
|
|
Estimate link density of contigs. |
844
|
|
|
""" |
845
|
|
|
p = OptionParser(density.__doc__) |
846
|
|
|
p.add_option("--save", default=False, action="store_true", |
847
|
|
|
help="Write log densitites of contigs to file") |
848
|
|
|
p.set_cpus() |
849
|
|
|
opts, args = p.parse_args(args) |
850
|
|
|
|
851
|
|
|
if len(args) != 1: |
852
|
|
|
sys.exit(not p.print_help()) |
853
|
|
|
|
854
|
|
|
clmfile, = args |
855
|
|
|
clm = CLMFile(clmfile) |
856
|
|
|
pf = clmfile.rsplit(".", 1)[0] |
857
|
|
|
|
858
|
|
|
if opts.save: |
859
|
|
|
logdensities = clm.calculate_densities() |
860
|
|
|
densityfile = pf + ".density" |
861
|
|
|
fw = open(densityfile, "w") |
862
|
|
|
for name, logd in logdensities.items(): |
863
|
|
|
s = clm.tig_to_size[name] |
864
|
|
|
print >> fw, "\t".join(str(x) for x in (name, s, logd)) |
865
|
|
|
fw.close() |
866
|
|
|
logging.debug("Density written to `{}`".format(densityfile)) |
867
|
|
|
|
868
|
|
|
tourfile = clmfile.rsplit(".", 1)[0] + ".tour" |
869
|
|
|
tour = clm.activate(tourfile=tourfile, backuptour=False) |
870
|
|
|
clm.flip_all(tour) |
871
|
|
|
clm.flip_whole(tour) |
872
|
|
|
clm.flip_one(tour) |
873
|
|
|
|
874
|
|
|
|
875
|
|
|
def optimize(args): |
876
|
|
|
""" |
877
|
|
|
%prog optimize test.clm |
878
|
|
|
|
879
|
|
|
Optimize the contig order and orientation, based on CLM file. |
880
|
|
|
""" |
881
|
|
|
p = OptionParser(optimize.__doc__) |
882
|
|
|
p.add_option("--skiprecover", default=False, action="store_true", |
883
|
|
|
help="Do not import 'recover' contigs") |
884
|
|
|
p.add_option("--startover", default=False, action="store_true", |
885
|
|
|
help="Do not resume from existing tour file") |
886
|
|
|
p.add_option("--skipGA", default=False, action="store_true", |
887
|
|
|
help="Skip GA step") |
888
|
|
|
p.set_outfile(outfile=None) |
889
|
|
|
p.set_cpus() |
890
|
|
|
opts, args = p.parse_args(args) |
891
|
|
|
|
892
|
|
|
if len(args) != 1: |
893
|
|
|
sys.exit(not p.print_help()) |
894
|
|
|
|
895
|
|
|
clmfile, = args |
896
|
|
|
startover = opts.startover |
897
|
|
|
runGA = not opts.skipGA |
898
|
|
|
cpus = opts.cpus |
899
|
|
|
|
900
|
|
|
# Load contact map |
901
|
|
|
clm = CLMFile(clmfile, skiprecover=opts.skiprecover) |
902
|
|
|
|
903
|
|
|
tourfile = opts.outfile or clmfile.rsplit(".", 1)[0] + ".tour" |
904
|
|
|
if startover: |
905
|
|
|
tourfile = None |
906
|
|
|
tour = clm.activate(tourfile=tourfile) |
907
|
|
|
|
908
|
|
|
fwtour = open(tourfile, "w") |
909
|
|
|
# Store INIT tour |
910
|
|
|
print_tour(fwtour, clm.tour, "INIT", |
911
|
|
|
clm.active_contigs, clm.oo, signs=clm.signs) |
912
|
|
|
|
913
|
|
|
if runGA: |
914
|
|
|
for phase in range(1, 3): |
915
|
|
|
tour = optimize_ordering(fwtour, clm, phase, cpus) |
916
|
|
|
tour = clm.prune_tour(tour, cpus) |
917
|
|
|
|
918
|
|
|
# Flip orientations |
919
|
|
|
phase = 1 |
920
|
|
|
while True: |
921
|
|
|
tag1, tag2 = optimize_orientations(fwtour, clm, phase, cpus) |
922
|
|
|
if tag1 == REJECT and tag2 == REJECT: |
923
|
|
|
logging.debug("Terminating ... no more {}".format(ACCEPT)) |
924
|
|
|
break |
925
|
|
|
phase += 1 |
926
|
|
|
|
927
|
|
|
fwtour.close() |
928
|
|
|
|
929
|
|
|
|
930
|
|
|
def optimize_ordering(fwtour, clm, phase, cpus): |
931
|
|
|
""" |
932
|
|
|
Optimize the ordering of contigs by Genetic Algorithm (GA). |
933
|
|
|
""" |
934
|
|
|
from .chic import score_evaluate_M |
935
|
|
|
|
936
|
|
|
# Prepare input files |
937
|
|
|
tour_contigs = clm.active_contigs |
938
|
|
|
tour_sizes = clm.active_sizes |
939
|
|
|
tour_M = clm.M |
940
|
|
|
tour = clm.tour |
941
|
|
|
signs = clm.signs |
942
|
|
|
oo = clm.oo |
943
|
|
|
|
944
|
|
|
def callback(tour, gen, phase, oo): |
945
|
|
|
fitness = tour.fitness if hasattr(tour, "fitness") else None |
946
|
|
|
label = "GA{}-{}".format(phase, gen) |
947
|
|
|
if fitness: |
948
|
|
|
fitness = "{0}".format(fitness).split(",")[0].replace("(", "") |
949
|
|
|
label += "-" + fitness |
950
|
|
|
if gen % 20 == 0: |
951
|
|
|
print_tour(fwtour, tour, label, tour_contigs, oo, signs=signs) |
952
|
|
|
return tour |
953
|
|
|
|
954
|
|
|
callbacki = partial(callback, phase=phase, oo=oo) |
955
|
|
|
toolbox = GA_setup(tour) |
956
|
|
|
toolbox.register("evaluate", score_evaluate_M, |
957
|
|
|
tour_sizes=tour_sizes, tour_M=tour_M) |
958
|
|
|
tour, tour_fitness = GA_run(toolbox, ngen=1000, npop=100, cpus=cpus, |
959
|
|
|
callback=callbacki) |
960
|
|
|
clm.tour = tour |
961
|
|
|
|
962
|
|
|
return tour |
963
|
|
|
|
964
|
|
|
|
965
|
|
|
def optimize_orientations(fwtour, clm, phase, cpus): |
966
|
|
|
""" |
967
|
|
|
Optimize the orientations of contigs by using heuristic flipping. |
968
|
|
|
""" |
969
|
|
|
# Prepare input files |
970
|
|
|
tour_contigs = clm.active_contigs |
971
|
|
|
tour = clm.tour |
972
|
|
|
oo = clm.oo |
973
|
|
|
|
974
|
|
|
print_tour(fwtour, tour, "FLIPALL{}".format(phase), |
975
|
|
|
tour_contigs, oo, signs=clm.signs) |
976
|
|
|
tag1 = clm.flip_whole(tour) |
977
|
|
|
print_tour(fwtour, tour, "FLIPWHOLE{}".format(phase), |
978
|
|
|
tour_contigs, oo, signs=clm.signs) |
979
|
|
|
tag2 = clm.flip_one(tour) |
980
|
|
|
print_tour(fwtour, tour, "FLIPONE{}".format(phase), |
981
|
|
|
tour_contigs, oo, signs=clm.signs) |
982
|
|
|
|
983
|
|
|
return tag1, tag2 |
984
|
|
|
|
985
|
|
|
|
986
|
|
|
def prepare_synteny(tourfile, lastfile, odir, p, opts): |
987
|
|
|
""" |
988
|
|
|
Prepare synteny plots for movie(). |
989
|
|
|
""" |
990
|
|
|
qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts) |
991
|
|
|
qbedfile = op.abspath(qbedfile) |
992
|
|
|
sbedfile = op.abspath(sbedfile) |
993
|
|
|
|
994
|
|
|
qbed = Bed(qbedfile, sorted=False) |
995
|
|
|
contig_to_beds = dict(qbed.sub_beds()) |
996
|
|
|
|
997
|
|
|
# Create a separate directory for the subplots and movie |
998
|
|
|
mkdir(odir, overwrite=True) |
999
|
|
|
os.chdir(odir) |
1000
|
|
|
logging.debug("Change into subdir `{}`".format(odir)) |
1001
|
|
|
|
1002
|
|
|
# Make anchorsfile |
1003
|
|
|
anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \ |
1004
|
|
|
+ ".anchors" |
1005
|
|
|
fw = open(anchorsfile, "w") |
1006
|
|
|
for b in Blast(lastfile): |
1007
|
|
|
print >> fw, "\t".join((gene_name(b.query), gene_name(b.subject), |
1008
|
|
|
str(int(b.score)))) |
1009
|
|
|
fw.close() |
1010
|
|
|
|
1011
|
|
|
# Symlink sbed |
1012
|
|
|
symlink(sbedfile, op.basename(sbedfile)) |
1013
|
|
|
|
1014
|
|
|
return anchorsfile, qbedfile, contig_to_beds |
1015
|
|
|
|
1016
|
|
|
|
1017
|
|
|
def separate_tour_and_o(row): |
1018
|
|
|
""" |
1019
|
|
|
The tour line typically contains contig list like: |
1020
|
|
|
tig00044568+ tig00045748- tig00071055- tig00015093- tig00030900- |
1021
|
|
|
|
1022
|
|
|
This function separates the names from the orientations. |
1023
|
|
|
""" |
1024
|
|
|
tour = [] |
1025
|
|
|
tour_o = [] |
1026
|
|
|
for contig in row.split(): |
1027
|
|
|
if contig[-1] in ('+', '-', '?'): |
1028
|
|
|
tour.append(contig[:-1]) |
1029
|
|
|
tour_o.append(contig[-1]) |
1030
|
|
|
else: # Unoriented |
1031
|
|
|
tour.append(contig) |
1032
|
|
|
tour_o.append('?') |
1033
|
|
|
return tour, tour_o |
1034
|
|
|
|
1035
|
|
|
|
1036
|
|
|
def iter_last_tour(tourfile, clm): |
1037
|
|
|
""" |
1038
|
|
|
Extract last tour from tourfile. The clm instance is also passed in to see |
1039
|
|
|
if any contig is covered in the clm. |
1040
|
|
|
""" |
1041
|
|
|
row = open(tourfile).readlines()[-1] |
1042
|
|
|
_tour, _tour_o = separate_tour_and_o(row) |
1043
|
|
|
tour = [] |
1044
|
|
|
tour_o = [] |
1045
|
|
|
for tc, to in zip(_tour, _tour_o): |
1046
|
|
|
if tc not in clm.contigs: |
1047
|
|
|
logging.debug("Contig `{}` in file `{}` not found in `{}`" |
1048
|
|
|
.format(tc, tourfile, clm.idsfile)) |
1049
|
|
|
continue |
1050
|
|
|
tour.append(tc) |
1051
|
|
|
tour_o.append(to) |
1052
|
|
|
return tour, tour_o |
1053
|
|
|
|
1054
|
|
|
|
1055
|
|
|
def iter_tours(tourfile, frames=1): |
1056
|
|
|
""" |
1057
|
|
|
Extract tours from tourfile. Tourfile contains a set of contig |
1058
|
|
|
configurations, generated at each iteration of the genetic algorithm. Each |
1059
|
|
|
configuration has two rows, first row contains iteration id and score, |
1060
|
|
|
second row contains list of contigs, separated by comma. |
1061
|
|
|
""" |
1062
|
|
|
fp = open(tourfile) |
1063
|
|
|
|
1064
|
|
|
i = 0 |
1065
|
|
|
for row in fp: |
1066
|
|
|
if row[0] == '>': |
1067
|
|
|
label = row[1:].strip() |
1068
|
|
|
if label.startswith("GA"): |
1069
|
|
|
pf, j, score = label.split("-") |
1070
|
|
|
j = int(j) |
1071
|
|
|
else: |
1072
|
|
|
j = 0 |
1073
|
|
|
i += 1 |
1074
|
|
|
else: |
1075
|
|
|
if j % frames != 0: |
1076
|
|
|
continue |
1077
|
|
|
tour, tour_o = separate_tour_and_o(row) |
1078
|
|
|
yield i, label, tour, tour_o |
1079
|
|
|
|
1080
|
|
|
fp.close() |
1081
|
|
|
|
1082
|
|
|
|
1083
|
|
|
def movie(args): |
1084
|
|
|
""" |
1085
|
|
|
%prog movie test.tour test.clm ref.contigs.last |
1086
|
|
|
|
1087
|
|
|
Plot optimization history. |
1088
|
|
|
""" |
1089
|
|
|
p = OptionParser(movie.__doc__) |
1090
|
|
|
p.add_option("--frames", default=500, type="int", |
1091
|
|
|
help="Only plot every N frames") |
1092
|
|
|
p.add_option("--engine", default="ffmpeg", choices=("ffmpeg", "gifsicle"), |
1093
|
|
|
help="Movie engine, output MP4 or GIF") |
1094
|
|
|
p.set_beds() |
1095
|
|
|
opts, args, iopts = p.set_image_options(args, figsize="16x8", |
1096
|
|
|
style="white", cmap="coolwarm", |
1097
|
|
|
format="png", dpi=300) |
1098
|
|
|
|
1099
|
|
|
if len(args) != 3: |
1100
|
|
|
sys.exit(not p.print_help()) |
1101
|
|
|
|
1102
|
|
|
tourfile, clmfile, lastfile = args |
1103
|
|
|
tourfile = op.abspath(tourfile) |
1104
|
|
|
clmfile = op.abspath(clmfile) |
1105
|
|
|
lastfile = op.abspath(lastfile) |
1106
|
|
|
cwd = os.getcwd() |
1107
|
|
|
odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie" |
1108
|
|
|
anchorsfile, qbedfile, contig_to_beds = \ |
1109
|
|
|
prepare_synteny(tourfile, lastfile, odir, p, opts) |
1110
|
|
|
|
1111
|
|
|
args = [] |
1112
|
|
|
for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames): |
1113
|
|
|
padi = "{:06d}".format(i) |
1114
|
|
|
# Make sure the anchorsfile and bedfile has the serial number in, |
1115
|
|
|
# otherwise parallelization may fail |
1116
|
|
|
a, b = op.basename(anchorsfile).split(".", 1) |
1117
|
|
|
ianchorsfile = a + "_" + padi + "." + b |
1118
|
|
|
symlink(anchorsfile, ianchorsfile) |
1119
|
|
|
|
1120
|
|
|
# Make BED file with new order |
1121
|
|
|
qb = Bed() |
1122
|
|
|
for contig, o in zip(tour, tour_o): |
1123
|
|
|
if contig not in contig_to_beds: |
1124
|
|
|
continue |
1125
|
|
|
bedlines = contig_to_beds[contig][:] |
1126
|
|
|
if o == '-': |
1127
|
|
|
bedlines.reverse() |
1128
|
|
|
for x in bedlines: |
1129
|
|
|
qb.append(x) |
1130
|
|
|
|
1131
|
|
|
a, b = op.basename(qbedfile).split(".", 1) |
1132
|
|
|
ibedfile = a + "_" + padi + "." + b |
1133
|
|
|
qb.print_to_file(ibedfile) |
1134
|
|
|
# Plot dot plot, but do not sort contigs by name (otherwise losing |
1135
|
|
|
# order) |
1136
|
|
|
image_name = padi + "." + iopts.format |
1137
|
|
|
|
1138
|
|
|
tour = ",".join(tour) |
1139
|
|
|
args.append([[tour, clmfile, ianchorsfile, |
1140
|
|
|
"--outfile", image_name, "--label", label]]) |
1141
|
|
|
|
1142
|
|
|
Jobs(movieframe, args).run() |
1143
|
|
|
|
1144
|
|
|
os.chdir(cwd) |
1145
|
|
|
make_movie(odir, odir, engine=opts.engine, format=iopts.format) |
1146
|
|
|
|
1147
|
|
|
|
1148
|
|
|
def score(args): |
1149
|
|
|
""" |
1150
|
|
|
%prog score main_results/ cached_data/ contigsfasta |
1151
|
|
|
|
1152
|
|
|
Score the current LACHESIS CLM. |
1153
|
|
|
""" |
1154
|
|
|
p = OptionParser(score.__doc__) |
1155
|
|
|
p.set_cpus() |
1156
|
|
|
opts, args = p.parse_args(args) |
1157
|
|
|
|
1158
|
|
|
if len(args) != 3: |
1159
|
|
|
sys.exit(not p.print_help()) |
1160
|
|
|
|
1161
|
|
|
mdir, cdir, contigsfasta = args |
1162
|
|
|
orderingfiles = natsorted(iglob(mdir, "*.ordering")) |
1163
|
|
|
sizes = Sizes(contigsfasta) |
1164
|
|
|
contig_names = list(sizes.iter_names()) |
1165
|
|
|
contig_ids = dict((name, i) for (i, name) in enumerate(contig_names)) |
1166
|
|
|
|
1167
|
|
|
oo = [] |
1168
|
|
|
# Load contact matrix |
1169
|
|
|
glm = op.join(cdir, "all.GLM") |
1170
|
|
|
N = len(contig_ids) |
1171
|
|
|
M = np.zeros((N, N), dtype=int) |
1172
|
|
|
fp = open(glm) |
1173
|
|
|
for row in fp: |
1174
|
|
|
if row[0] == '#': |
1175
|
|
|
continue |
1176
|
|
|
x, y, z = row.split() |
1177
|
|
|
if x == 'X': |
1178
|
|
|
continue |
1179
|
|
|
M[int(x), int(y)] = int(z) |
1180
|
|
|
|
1181
|
|
|
fwtour = open("tour", "w") |
1182
|
|
|
|
1183
|
|
|
def callback(tour, gen, oo): |
1184
|
|
|
fitness = tour.fitness if hasattr(tour, "fitness") else None |
1185
|
|
|
label = "GA-{0}".format(gen) |
1186
|
|
|
if fitness: |
1187
|
|
|
fitness = "{0}".format(fitness).split(",")[0].replace("(", "") |
1188
|
|
|
label += "-" + fitness |
1189
|
|
|
print_tour(fwtour, tour, label, contig_names, oo) |
1190
|
|
|
return tour |
1191
|
|
|
|
1192
|
|
|
for ofile in orderingfiles: |
1193
|
|
|
co = ContigOrdering(ofile) |
1194
|
|
|
for x in co: |
1195
|
|
|
contig_id = contig_ids[x.contig_name] |
1196
|
|
|
oo.append(contig_id) |
1197
|
|
|
pf = op.basename(ofile).split(".")[0] |
1198
|
|
|
print pf |
1199
|
|
|
print oo |
1200
|
|
|
|
1201
|
|
|
tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M) |
1202
|
|
|
# Store INIT tour |
1203
|
|
|
print_tour(fwtour, tour, "INIT", contig_names, oo) |
1204
|
|
|
|
1205
|
|
|
# Faster Cython version for evaluation |
1206
|
|
|
from .chic import score_evaluate_M |
1207
|
|
|
callbacki = partial(callback, oo=oo) |
1208
|
|
|
toolbox = GA_setup(tour) |
1209
|
|
|
toolbox.register("evaluate", score_evaluate_M, |
1210
|
|
|
tour_sizes=tour_sizes, tour_M=tour_M) |
1211
|
|
|
tour, tour.fitness = GA_run(toolbox, npop=100, cpus=opts.cpus, |
1212
|
|
|
callback=callbacki) |
1213
|
|
|
print tour, tour.fitness |
1214
|
|
|
break |
1215
|
|
|
|
1216
|
|
|
fwtour.close() |
1217
|
|
|
|
1218
|
|
|
|
1219
|
|
|
def print_tour(fwtour, tour, label, contig_names, oo, signs=None): |
1220
|
|
|
print >> fwtour, ">" + label |
1221
|
|
|
if signs is not None: |
1222
|
|
|
contig_o = [] |
1223
|
|
|
for x in tour: |
1224
|
|
|
idx = oo[x] |
1225
|
|
|
sign = {1: '+', 0: '?', -1: '-'}[signs[idx]] |
1226
|
|
|
contig_o.append(contig_names[idx] + sign) |
1227
|
|
|
print >> fwtour, " ".join(contig_o) |
1228
|
|
|
else: |
1229
|
|
|
print >> fwtour, " ".join(contig_names[oo[x]] for x in tour) |
1230
|
|
|
|
1231
|
|
|
|
1232
|
|
|
def prepare_ec(oo, sizes, M): |
1233
|
|
|
""" |
1234
|
|
|
This prepares EC and converts from contig_id to an index. |
1235
|
|
|
""" |
1236
|
|
|
tour = range(len(oo)) |
1237
|
|
|
tour_sizes = np.array([sizes.sizes[x] for x in oo]) |
1238
|
|
|
tour_M = M[oo, :][:, oo] |
1239
|
|
|
return tour, tour_sizes, tour_M |
1240
|
|
|
|
1241
|
|
|
|
1242
|
|
|
def score_evaluate(tour, tour_sizes=None, tour_M=None): |
1243
|
|
|
""" SLOW python version of the evaluation function. For benchmarking |
1244
|
|
|
purposes only. Do not use in production. |
1245
|
|
|
""" |
1246
|
|
|
sizes_oo = np.array([tour_sizes[x] for x in tour]) |
1247
|
|
|
sizes_cum = np.cumsum(sizes_oo) - sizes_oo / 2 |
1248
|
|
|
s = 0 |
1249
|
|
|
size = len(tour) |
1250
|
|
|
for ia in xrange(size): |
1251
|
|
|
a = tour[ia] |
1252
|
|
|
for ib in xrange(ia + 1, size): |
1253
|
|
|
b = tour[ib] |
1254
|
|
|
links = tour_M[a, b] |
1255
|
|
|
dist = sizes_cum[ib] - sizes_cum[ia] |
1256
|
|
|
if dist > 1e7: |
1257
|
|
|
break |
1258
|
|
|
s += links * 1. / dist |
1259
|
|
|
return s, |
1260
|
|
|
|
1261
|
|
|
|
1262
|
|
|
def movieframe(args): |
1263
|
|
|
""" |
1264
|
|
|
%prog movieframe tour test.clm contigs.ref.anchors |
1265
|
|
|
|
1266
|
|
|
Draw heatmap and synteny in the same plot. |
1267
|
|
|
""" |
1268
|
|
|
p = OptionParser(movieframe.__doc__) |
1269
|
|
|
p.add_option("--label", help="Figure title") |
1270
|
|
|
p.set_beds() |
1271
|
|
|
p.set_outfile(outfile=None) |
1272
|
|
|
opts, args, iopts = p.set_image_options(args, figsize="16x8", |
1273
|
|
|
style="white", cmap="coolwarm", |
1274
|
|
|
format="png", dpi=120) |
1275
|
|
|
|
1276
|
|
|
if len(args) != 3: |
1277
|
|
|
sys.exit(not p.print_help()) |
1278
|
|
|
|
1279
|
|
|
tour, clmfile, anchorsfile = args |
1280
|
|
|
tour = tour.split(",") |
1281
|
|
|
image_name = opts.outfile or ("movieframe." + iopts.format) |
1282
|
|
|
label = opts.label or op.basename(image_name).rsplit(".", 1)[0] |
1283
|
|
|
|
1284
|
|
|
clm = CLMFile(clmfile) |
1285
|
|
|
totalbins, bins, breaks = make_bins(tour, clm.tig_to_size) |
1286
|
|
|
M = read_clm(clm, totalbins, bins) |
1287
|
|
|
|
1288
|
|
|
fig = plt.figure(1, (iopts.w, iopts.h)) |
1289
|
|
|
root = fig.add_axes([0, 0, 1, 1]) # whole canvas |
1290
|
|
|
ax1 = fig.add_axes([.05, .1, .4, .8]) # heatmap |
1291
|
|
|
ax2 = fig.add_axes([.55, .1, .4, .8]) # dot plot |
1292
|
|
|
ax2_root = fig.add_axes([.5, 0, .5, 1]) # dot plot canvas |
1293
|
|
|
|
1294
|
|
|
# Left axis: heatmap |
1295
|
|
|
plot_heatmap(ax1, M, breaks, iopts) |
1296
|
|
|
|
1297
|
|
|
# Right axis: synteny |
1298
|
|
|
qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts, |
1299
|
|
|
sorted=False) |
1300
|
|
|
dotplot(anchorsfile, qbed, sbed, fig, ax2_root, ax2, sep=False, title="") |
1301
|
|
|
|
1302
|
|
|
root.text(.5, .98, clm.name, color="g", ha="center", va="center") |
1303
|
|
|
root.text(.5, .95, label, color="darkslategray", ha="center", va="center") |
1304
|
|
|
normalize_axes(root) |
1305
|
|
|
savefig(image_name, dpi=iopts.dpi, iopts=iopts) |
1306
|
|
|
|
1307
|
|
|
|
1308
|
|
|
def make_bins(tour, sizes): |
1309
|
|
|
breaks = [] |
1310
|
|
|
start = 0 |
1311
|
|
|
bins = {} |
1312
|
|
|
for x in tour: |
1313
|
|
|
size = sizes[x] |
1314
|
|
|
end = start + int(round(size * 1. / BINSIZE)) |
1315
|
|
|
bins[x] = (start, end) |
1316
|
|
|
start = end |
1317
|
|
|
breaks.append(start) |
1318
|
|
|
|
1319
|
|
|
totalbins = start |
1320
|
|
|
return totalbins, bins, breaks |
1321
|
|
|
|
1322
|
|
|
|
1323
|
|
|
def read_clm(clm, totalbins, bins): |
1324
|
|
|
M = np.zeros((totalbins, totalbins)) |
1325
|
|
|
for (x, y), z in clm.contacts.items(): |
1326
|
|
|
if x not in bins or y not in bins: |
1327
|
|
|
continue |
1328
|
|
|
xstart, xend = bins[x] |
1329
|
|
|
ystart, yend = bins[y] |
1330
|
|
|
M[xstart:xend, ystart:yend] = z |
1331
|
|
|
M[ystart:yend, xstart:xend] = z |
1332
|
|
|
|
1333
|
|
|
M = np.log10(M + 1) |
1334
|
|
|
return M |
1335
|
|
|
|
1336
|
|
|
|
1337
|
|
|
def plot_heatmap(ax, M, breaks, iopts): |
1338
|
|
|
ax.imshow(M, cmap=iopts.cmap, origin="lower", interpolation='none') |
1339
|
|
|
xlim = ax.get_xlim() |
1340
|
|
|
for b in breaks[:-1]: |
1341
|
|
|
ax.plot([b, b], xlim, 'w-') |
1342
|
|
|
ax.plot(xlim, [b, b], 'w-') |
1343
|
|
|
ax.set_xlim(xlim) |
1344
|
|
|
ax.set_ylim(xlim) |
1345
|
|
|
ax.set_xticklabels([int(x) for x in ax.get_xticks()], |
1346
|
|
|
family='Helvetica', color="gray") |
1347
|
|
|
ax.set_yticklabels([int(x) for x in ax.get_yticks()], |
1348
|
|
|
family='Helvetica', color="gray") |
1349
|
|
|
binlabel = "Bins ({} per bin)".format(human_size(BINSIZE, precision=0)) |
1350
|
|
|
ax.set_xlabel(binlabel) |
1351
|
|
|
ax.set_ylabel(binlabel) |
1352
|
|
|
|
1353
|
|
|
|
1354
|
|
|
def agp(args): |
1355
|
|
|
""" |
1356
|
|
|
%prog agp main_results/ contigs.fasta |
1357
|
|
|
|
1358
|
|
|
Generate AGP file based on LACHESIS output. |
1359
|
|
|
""" |
1360
|
|
|
p = OptionParser(agp.__doc__) |
1361
|
|
|
p.set_outfile() |
1362
|
|
|
opts, args = p.parse_args(args) |
1363
|
|
|
|
1364
|
|
|
if len(args) != 2: |
1365
|
|
|
sys.exit(not p.print_help()) |
1366
|
|
|
|
1367
|
|
|
odir, contigsfasta = args |
1368
|
|
|
fwagp = must_open(opts.outfile, 'w') |
1369
|
|
|
orderingfiles = natsorted(iglob(odir, "*.ordering")) |
1370
|
|
|
sizes = Sizes(contigsfasta).mapping |
1371
|
|
|
contigs = set(sizes.keys()) |
1372
|
|
|
anchored = set() |
1373
|
|
|
|
1374
|
|
|
for ofile in orderingfiles: |
1375
|
|
|
co = ContigOrdering(ofile) |
1376
|
|
|
anchored |= set([x.contig_name for x in co]) |
1377
|
|
|
obj = op.basename(ofile).split('.')[0] |
1378
|
|
|
co.write_agp(obj, sizes, fwagp) |
1379
|
|
|
|
1380
|
|
|
singletons = contigs - anchored |
1381
|
|
|
logging.debug('Anchored: {}, Singletons: {}'. |
1382
|
|
|
format(len(anchored), len(singletons))) |
1383
|
|
|
|
1384
|
|
|
for s in natsorted(singletons): |
1385
|
|
|
order_to_agp(s, [(s, "?")], sizes, fwagp) |
1386
|
|
|
|
1387
|
|
|
|
1388
|
|
|
if __name__ == '__main__': |
1389
|
|
|
main() |
1390
|
|
|
|