Completed
Push — master ( a7cc0a...9e2190 )
by Haibao
58s
created

mergemat()   B

Complexity

Conditions 3

Size

Total Lines 26

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
c 1
b 0
f 0
dl 0
loc 26
rs 8.8571
1
#!/usr/bin/env python
2
# -*- coding: UTF-8 -*-
3
4
"""
5
Process Hi-C output into AGP for chromosomal-scale scaffolding.
6
"""
7
8
import array
9
import json
10
import logging
11
import sys
12
import os
13
import os.path as op
14
import numpy as np
15
import math
16
17
from collections import defaultdict
18
from functools import partial
19
from multiprocessing import Pool
20
21
from jcvi.algorithms.formula import outlier_cutoff
22
from jcvi.algorithms.ec import GA_setup, GA_run
23
from jcvi.algorithms.matrix import get_signs
24
from jcvi.apps.base import OptionParser, ActionDispatcher, backup, iglob, \
25
    mkdir, symlink
26
from jcvi.apps.console import green, red
27
from jcvi.apps.grid import Jobs
28
from jcvi.assembly.allmaps import make_movie
29
from jcvi.compara.synteny import check_beds, get_bed_filenames
30
from jcvi.formats.agp import order_to_agp
31
from jcvi.formats.base import LineFile, must_open
32
from jcvi.formats.bed import Bed
33
from jcvi.formats.sizes import Sizes
34
from jcvi.formats.blast import Blast
35
from jcvi.graphics.base import normalize_axes, plt, savefig
36
from jcvi.graphics.dotplot import dotplot
37
from jcvi.utils.cbook import gene_name, human_size
38
from jcvi.utils.natsort import natsorted
39
40
41
# Map orientations to ints
42
FF = {'+': 1, '-': -1, '?': 1}
43
RR = {'+': -1, '-': 1, '?': -1}
44
LB = 18             # Lower bound for golden_array()
45
UB = 29             # Upper bound for golden_array()
46
BB = UB - LB + 1    # Span for golden_array()
47
ACCEPT = green("ACCEPT")
48
REJECT = red("REJECT")
49
BINSIZE = 50000
50
51
52
class ContigOrderingLine(object):
53
    '''Stores one line in the ContigOrdering file
54
    '''
55
    def __init__(self, line, sep="|"):
56
        args = line.split()
57
        self.contig_id = args[0]
58
        self.contig_name = args[1].split(sep)[0]
59
        contig_rc = args[2]
60
        assert contig_rc in ('0', '1')
61
        self.strand = '+' if contig_rc == '0' else '-'
62
        self.orientation_score = args[3]
63
        self.gap_size_after_contig = args[4]
64
65
66
class ContigOrdering(LineFile):
67
    '''ContigOrdering file as created by LACHESIS, one per chromosome group.
68
    Header contains summary information per group, followed by list of contigs
69
    with given ordering.
70
    '''
71
    def __init__(self, filename):
72
        super(ContigOrdering, self).__init__(filename)
73
        fp = open(filename)
74
        for row in fp:
75
            if row[0] == '#':
76
                continue
77
            orderline = ContigOrderingLine(row)
78
            self.append(orderline)
79
80
    def write_agp(self, obj, sizes, fw=sys.stdout, gapsize=100,
81
                  gaptype="contig", evidence="map"):
82
        '''Converts the ContigOrdering file into AGP format
83
        '''
84
        contigorder = [(x.contig_name, x.strand) for x in self]
85
        order_to_agp(obj, contigorder, sizes, fw,
86
                     gapsize=gapsize, gaptype=gaptype, evidence=evidence)
87
88
89
class CLMFile:
90
    '''CLM file (modified) has the following format:
91
92
    tig00046211+ tig00063795+       1       53173
93
    tig00046211+ tig00063795-       1       116050
94
    tig00046211- tig00063795+       1       71155
95
    tig00046211- tig00063795-       1       134032
96
    tig00030676+ tig00077819+       5       136407 87625 87625 106905 102218
97
    tig00030676+ tig00077819-       5       126178 152952 152952 35680 118923
98
    tig00030676- tig00077819+       5       118651 91877 91877 209149 125906
99
    tig00030676- tig00077819-       5       108422 157204 157204 137924 142611
100
    '''
101
    def __init__(self, clmfile, skiprecover=False):
102
        self.name = op.basename(clmfile).rsplit(".", 1)[0]
103
        self.clmfile = clmfile
104
        self.idsfile = clmfile.rsplit(".", 1)[0] + ".ids"
105
        self.parse_ids(skiprecover)
106
        self.parse_clm()
107
        self.signs = None
108
109
    def parse_ids(self, skiprecover):
110
        '''IDS file has a list of contigs that need to be ordered. 'recover',
111
        keyword, if available in the third column, is less confident.
112
113
        tig00015093     46912
114
        tig00035238     46779   recover
115
        tig00030900     119291
116
        '''
117
        idsfile = self.idsfile
118
        logging.debug("Parse idsfile `{}`".format(idsfile))
119
        fp = open(idsfile)
120
        tigs = []
121
        for row in fp:
122
            atoms = row.split()
123
            tig, size = atoms[:2]
124
            size = int(size)
125
            if skiprecover and len(atoms) == 3 and atoms[2] == 'recover':
126
                continue
127
            tigs.append((tig, size))
128
129
        # Arrange contig names and sizes
130
        _tigs, _sizes = zip(*tigs)
131
        self.contigs = set(_tigs)
132
        self.sizes = np.array(_sizes)
133
        self.tig_to_size = dict(tigs)
134
135
        # Initially all contigs are considered active
136
        self.active = set(_tigs)
137
138
    def parse_clm(self):
139
        clmfile = self.clmfile
140
        logging.debug("Parse clmfile `{}`".format(clmfile))
141
        fp = open(clmfile)
142
        contacts = {}
143
        contacts_oriented = defaultdict(dict)
144
        orientations = defaultdict(list)
145
        for row in fp:
146
            atoms = row.strip().split('\t')
147
            assert len(atoms) == 3, "Malformed line `{}`".format(atoms)
148
            abtig, links, dists = atoms
149
            atig, btig = abtig.split()
150
            at, ao = atig[:-1], atig[-1]
151
            bt, bo = btig[:-1], btig[-1]
152
            if at not in self.tig_to_size:
153
                continue
154
            if bt not in self.tig_to_size:
155
                continue
156
            dists = [int(x) for x in dists.split()]
157
            contacts[(at, bt)] = len(dists)
158
            gdists = golden_array(dists)
159
            contacts_oriented[(at, bt)][(FF[ao], FF[bo])] = gdists
160
            contacts_oriented[(bt, at)][(RR[bo], RR[ao])] = gdists
161
            strandedness = 1 if ao == bo else -1
162
            orientations[(at, bt)].append((strandedness, dists))
163
164
        self.contacts = contacts
165
        self.contacts_oriented = contacts_oriented
166
        # Preprocess the orientations dict
167
        for (at, bt), dists in orientations.items():
168
            dists = [(s, d, hmean_int(d)) for (s, d) in dists]
169
            strandedness, md, mh = min(dists, key=lambda x: x[-1])
170
            orientations[(at, bt)] = (strandedness, len(md), mh)
171
        self.orientations = orientations
172
173
    def calculate_densities(self):
174
        """
175
        Calculate the density of inter-contig links per base. Strong contigs
176
        considered to have high level of inter-contig links in the current
177
        partition.
178
        """
179
        active = self.active
180
        densities = defaultdict(int)
181
        for (at, bt), links in self.contacts.items():
182
            if not (at in active and bt in active):
183
                continue
184
            densities[at] += links
185
            densities[bt] += links
186
187
        logdensities = {}
188
        for x, d in densities.items():
189
            s = self.tig_to_size[x]
190
            logd = np.log10(d * 1. / min(s, 500000))
191
            logdensities[x] = logd
192
193
        return logdensities
194
195
    def report_active(self):
196
        logging.debug("Active contigs: {} (length={})"
197
                      .format(self.N, self.active_sizes.sum()))
198
199
    def activate(self, tourfile=None, minsize=10000, backuptour=True):
200
        """
201
        Select contigs in the current partition. This is the setup phase of the
202
        algorithm, and supports two modes:
203
204
        - "de novo": This is useful at the start of a new run where no tours
205
          available. We select the strong contigs that have significant number
206
          of links to other contigs in the partition. We build a histogram of
207
          link density (# links per bp) and remove the contigs that appear as
208
          outliers. The orientations are derived from the matrix decomposition
209
          of the pairwise strandedness matrix O.
210
211
        - "hotstart": This is useful when there was a past run, with a given
212
          tourfile. In this case, the active contig list and orientations are
213
          derived from the last tour in the file.
214
        """
215
        if tourfile and (not op.exists(tourfile)):
216
            logging.debug("Tourfile `{}` not found".format(tourfile))
217
            tourfile = None
218
219
        if tourfile:
220
            logging.debug("Importing tourfile `{}`".format(tourfile))
221
            tour, tour_o = iter_last_tour(tourfile, self)
222
            self.active = set(tour)
223
            tig_to_idx = self.tig_to_idx
224
            tour = [tig_to_idx[x] for x in tour]
225
            signs = sorted([(x, FF[o]) for (x, o) in zip(tour, tour_o)])
226
            _, signs = zip(*signs)
227
            self.signs = np.array(signs, dtype=int)
228
            if backuptour:
229
                backup(tourfile)
230
            tour = array.array('i', tour)
231
        else:
232
            self.report_active()
233
            while True:
234
                logdensities = self.calculate_densities()
235
                lb, ub = outlier_cutoff(logdensities.values())
236
                logging.debug("Log10(link_densities) ~ [{}, {}]"
237
                              .format(lb, ub))
238
                remove = set(x for x, d in logdensities.items() if
239
                             (d < lb and self.tig_to_size[x] < minsize * 10))
240
                if remove:
241
                    self.active -= remove
242
                    self.report_active()
243
                else:
244
                    break
245
246
            logging.debug("Remove contigs with size < {}".format(minsize))
247
            self.active = set(x for x in self.active if
248
                              self.tig_to_size[x] >= minsize)
249
            tour = range(self.N)  # Use starting (random) order otherwise
250
            tour = array.array('i', tour)
251
252
            # Determine orientations
253
            self.flip_all(tour)
254
255
        self.report_active()
256
        self.tour = tour
257
258
        return tour
259
260
    def evaluate_tour_M(self, tour):
261
        """ Use Cythonized version to evaluate the score of a current tour
262
        """
263
        from .chic import score_evaluate_M
264
        return score_evaluate_M(tour, self.active_sizes, self.M)
265
266
    def evaluate_tour_P(self, tour):
267
        """ Use Cythonized version to evaluate the score of a current tour,
268
        with better precision on the distance of the contigs.
269
        """
270
        from .chic import score_evaluate_P
271
        return score_evaluate_P(tour, self.active_sizes, self.P)
272
273
    def evaluate_tour_Q(self, tour):
274
        """ Use Cythonized version to evaluate the score of a current tour,
275
        taking orientation into consideration. This may be the most accurate
276
        evaluation under the right condition.
277
        """
278
        from .chic import score_evaluate_Q
279
        return score_evaluate_Q(tour, self.active_sizes, self.Q)
280
281
    def flip_log(self, method, score, score_flipped, tag):
282
        logging.debug("{}: {} => {} {}"
283
                      .format(method, score, score_flipped, tag))
284
285
    def flip_all(self, tour):
286
        """ Initialize the orientations based on pairwise O matrix.
287
        """
288
        if self.signs is None:  # First run
289
            score = 0
290
        else:
291
            old_signs = self.signs[:self.N]
292
            score, = self.evaluate_tour_Q(tour)
293
294
        # Remember we cannot have ambiguous orientation code (0 or '?') here
295
        self.signs = get_signs(self.O, validate=False, ambiguous=False)
296
        score_flipped, = self.evaluate_tour_Q(tour)
297
        if score_flipped >= score:
298
            tag = ACCEPT
299
        else:
300
            self.signs = old_signs[:]
301
            tag = REJECT
302
        self.flip_log("FLIPALL", score, score_flipped, tag)
303
        return tag
304
305
    def flip_whole(self, tour):
306
        """ Test flipping all contigs at the same time to see if score improves.
307
        """
308
        score, = self.evaluate_tour_Q(tour)
309
        self.signs = -self.signs
310
        score_flipped, = self.evaluate_tour_Q(tour)
311
        if score_flipped > score:
312
            tag = ACCEPT
313
        else:
314
            self.signs = -self.signs
315
            tag = REJECT
316
        self.flip_log("FLIPWHOLE", score, score_flipped, tag)
317
        return tag
318
319
    def flip_one(self, tour):
320
        """ Test flipping every single contig sequentially to see if score
321
        improves.
322
        """
323
        n_accepts = n_rejects = 0
324
        any_tag_ACCEPT = False
325
        for i, t in enumerate(tour):
326
            if i == 0:
327
                score, = self.evaluate_tour_Q(tour)
328
            self.signs[t] = -self.signs[t]
329
            score_flipped, = self.evaluate_tour_Q(tour)
330
            if score_flipped > score:
331
                n_accepts += 1
332
                tag = ACCEPT
333
            else:
334
                self.signs[t] = -self.signs[t]
335
                n_rejects += 1
336
                tag = REJECT
337
            self.flip_log("FLIPONE ({}/{})".format(i + 1, len(self.signs)),
338
                          score, score_flipped, tag)
339
            if tag == ACCEPT:
340
                any_tag_ACCEPT = True
341
                score = score_flipped
342
        logging.debug("FLIPONE: N_accepts={} N_rejects={}"
343
                      .format(n_accepts, n_rejects))
344
        return ACCEPT if any_tag_ACCEPT else REJECT
345
346
    def prune_tour(self, tour, cpus):
347
        """ Test deleting each contig and check the delta_score; tour here must
348
        be an array of ints.
349
        """
350
        while True:
351
            tour_score, = self.evaluate_tour_M(tour)
352
            logging.debug("Starting score: {}".format(tour_score))
353
            active_sizes = self.active_sizes
354
            M = self.M
355
            args = []
356
            for i, t in enumerate(tour):
357
                stour = tour[:i] + tour[i + 1:]
358
                args.append((t, stour, tour_score, active_sizes, M))
359
360
            # Parallel run
361
            p = Pool(processes=cpus)
362
            results = list(p.imap(prune_tour_worker, args))
363
            assert len(tour) == len(results), \
364
                "Array size mismatch, tour({}) != results({})"\
365
                .format(len(tour), len(results))
366
367
            # Identify outliers
368
            active_contigs = self.active_contigs
369
            idx, log10deltas = zip(*results)
370
            lb, ub = outlier_cutoff(log10deltas)
371
            logging.debug("Log10(delta_score) ~ [{}, {}]".format(lb, ub))
372
373
            remove = set(active_contigs[x] for (x, d) in results if d < lb)
374
            self.active -= remove
375
            self.report_active()
376
377
            tig_to_idx = self.tig_to_idx
378
            tour = [active_contigs[x] for x in tour]
379
            tour = array.array('i', [tig_to_idx[x] for x in tour
380
                                     if x not in remove])
381
            if not remove:
382
                break
383
384
        self.tour = tour
385
        self.flip_all(tour)
386
387
        return tour
388
389
    @property
390
    def active_contigs(self):
391
        return list(self.active)
392
393
    @property
394
    def active_sizes(self):
395
        return np.array([self.tig_to_size[x] for x in self.active])
396
397
    @property
398
    def N(self):
399
        return len(self.active)
400
401 View Code Duplication
    @property
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
402
    def oo(self):
403
        return range(self.N)
404
405
    @property
406
    def tig_to_idx(self):
407
        return dict((x, i) for (i, x) in enumerate(self.active))
408
409
    @property
410
    def M(self):
411
        """
412
        Contact frequency matrix. Each cell contains how many inter-contig
413
        links between i-th and j-th contigs.
414
        """
415
        N = self.N
416
        tig_to_idx = self.tig_to_idx
417
        M = np.zeros((N, N), dtype=int)
418 View Code Duplication
        for (at, bt), links in self.contacts.items():
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
419
            if not (at in tig_to_idx and bt in tig_to_idx):
420
                continue
421
            ai = tig_to_idx[at]
422
            bi = tig_to_idx[bt]
423
            M[ai, bi] = M[bi, ai] = links
424
        return M
425
426
    @property
427
    def O(self):
428
        """
429
        Pairwise strandedness matrix. Each cell contains whether i-th and j-th
430
        contig are the same orientation +1, or opposite orientation -1.
431
        """
432
        N = self.N
433
        tig_to_idx = self.tig_to_idx
434
        O = np.zeros((N, N), dtype=int)
435
        for (at, bt), (strandedness, md, mh) in self.orientations.items():
436
            if not (at in tig_to_idx and bt in tig_to_idx):
437
                continue
438
            ai = tig_to_idx[at]
439
            bi = tig_to_idx[bt]
440
            score = strandedness * md
441
            O[ai, bi] = O[bi, ai] = score
442
        return O
443
444
    @property
445
    def P(self):
446
        """
447
        Contact frequency matrix with better precision on distance between
448
        contigs. In the matrix M, the distance is assumed to be the distance
449
        between mid-points of two contigs. In matrix Q, however, we compute
450
        harmonic mean of the links for the orientation configuration that is
451
        shortest. This offers better precision for the distance between big
452
        contigs.
453
        """
454
        N = self.N
455
        tig_to_idx = self.tig_to_idx
456
        P = np.zeros((N, N, 2), dtype=int)
457
        for (at, bt), (strandedness, md, mh) in self.orientations.items():
458
            if not (at in tig_to_idx and bt in tig_to_idx):
459
                continue
460
            ai = tig_to_idx[at]
461
            bi = tig_to_idx[bt]
462
            P[ai, bi, 0] = P[bi, ai, 0] = md
463
            P[ai, bi, 1] = P[bi, ai, 1] = mh
464
        return P
465
466
    @property
467
    def Q(self):
468
        """
469
        Contact frequency matrix when contigs are already oriented. This is s a
470
        similar matrix as M, but rather than having the number of links in the
471
        cell, it points to an array that has the actual distances.
472
        """
473
        N = self.N
474
        tig_to_idx = self.tig_to_idx
475
        signs = self.signs
476
        Q = np.ones((N, N, BB), dtype=int) * -1  # Use -1 as the sentinel
477
        for (at, bt), k in self.contacts_oriented.items():
478
            if not (at in tig_to_idx and bt in tig_to_idx):
479
                continue
480
            ai = tig_to_idx[at]
481
            bi = tig_to_idx[bt]
482
            ao = signs[ai]
483
            bo = signs[bi]
484
            Q[ai, bi] = k[(ao, bo)]
485
        return Q
486
487
488
def hmean_int(a, a_min=5778, a_max=1149851):
489
    """ Harmonic mean of an array, returns the closest int
490
    """
491
    from scipy.stats import hmean
492
    return int(round(hmean(np.clip(a, a_min, a_max))))
493
494
495
def golden_array(a, phi=1.61803398875, lb=LB, ub=UB):
496
    """ Given list of ints, we aggregate similar values so that it becomes an
497
    array of multiples of phi, where phi is the golden ratio.
498
499
    phi ^ 14 = 843
500
    phi ^ 33 = 7881196
501
502
    So the array of counts go between 843 to 788196. One triva is that the
503
    exponents of phi gets closer to integers as N grows. See interesting
504
    discussion here:
505
    <https://www.johndcook.com/blog/2017/03/22/golden-powers-are-nearly-integers/>
506
    """
507
    counts = np.zeros(BB, dtype=int)
508
    for x in a:
509
        c = int(round(math.log(x, phi)))
510
        if c < lb:
511
            c = lb
512
        if c > ub:
513
            c = ub
514
        counts[c - lb] += 1
515
    return counts
516
517
518
def prune_tour_worker(arg):
519
    """ Worker thread for CLMFile.prune_tour()
520
    """
521
    from .chic import score_evaluate_M
522
523
    t, stour, tour_score, active_sizes, M = arg
524
    stour_score, = score_evaluate_M(stour, active_sizes, M)
525
    delta_score = tour_score - stour_score
526
    log10d = np.log10(delta_score) if delta_score > 1e-9 else -9
527
    return t, log10d
528
529
530
def main():
531
532
    actions = (
533
        # LACHESIS output processing
534
        ('agp', 'generate AGP file based on LACHESIS output'),
535
        ('score', 'score the current LACHESIS CLM'),
536
        # Simulation
537
        ('simulate', 'simulate CLM data'),
538
        # Scaffolding
539
        ('optimize', 'optimize the contig order and orientation'),
540
        ('density', 'estimate link density of contigs'),
541
        # Plotting
542
        ('movieframe', 'plot heatmap and synteny for a particular tour'),
543
        ('movie', 'plot heatmap optimization history in a tourfile'),
544
        # Reference-based analytics
545
        ('bam2mat', 'convert bam file to .npy format used in plotting'),
546
        ('mergemat', 'combine counts from multiple .npy data files'),
547
        ('heatmap', 'plot heatmap based on .npy format'),
548
            )
549
    p = ActionDispatcher(actions)
550
    p.dispatch(globals())
551
552
553
def mergemat(args):
554
    """
555
    %prog mergemat *.npy
556
557
    Combine counts from multiple .npy data files.
558
    """
559
    p = OptionParser(mergemat.__doc__)
560
    p.set_outfile(outfile="out")
561
    opts, args = p.parse_args(args)
562
563
    if len(args) < 1:
564
        sys.exit(not p.print_help())
565
566
    npyfiles = args
567
    A = np.load(npyfiles[0])
568
    logging.debug("Load `{}`: matrix of shape {}:; sum={}"
569
                  .format(npyfiles[0], A.shape, A.sum()))
570
    for npyfile in npyfiles[1:]:
571
        B = np.load(npyfile)
572
        A += B
573
        logging.debug("Load `{}`: sum={}"
574
                      .format(npyfiles[0], A.sum()))
575
576
    pf = opts.outfile
577
    np.save(pf, A)
578
    logging.debug("Combined {} files into `{}.npy`".format(len(npyfiles), pf))
579
580
581
def get_seqstarts(bamfile, N):
582
    """ Go through the SQ headers and pull out all sequences with size
583
    greater than the resolution settings, i.e. contains at least a few cells
584
    """
585
    import pysam
586
    bamfile = pysam.AlignmentFile(bamfile, "rb")
587
    seqsize = {}
588
    for kv in bamfile.header["SQ"]:
589
        if kv["LN"] < 10 * N:
590
            continue
591
        seqsize[kv["SN"]] = kv["LN"] / N + 1
592
593
    allseqs = natsorted(seqsize.keys())
594
    allseqsizes = np.array([seqsize[x] for x in allseqs])
595
    seqstarts = np.cumsum(allseqsizes)
596
    seqstarts = np.roll(seqstarts, 1)
597
    total_bins = seqstarts[0]
598
    seqstarts[0] = 0
599
    seqstarts = dict(zip(allseqs, seqstarts))
600
601
    return seqstarts, seqsize, total_bins
602
603
604
def bam2mat(args):
605
    """
606
    %prog bam2mat input.bam
607
608
    Convert bam file to .mat format, which is simply numpy 2D array. Important
609
    parameter is the resolution, which is the cell size. Small cell size lead
610
    to more fine-grained heatmap, but leads to large .mat size and slower
611
    plotting.
612
    """
613
    import pysam
614
    from jcvi.utils.cbook import percentage
615
616
    p = OptionParser(bam2mat.__doc__)
617
    p.add_option("--resolution", default=500000, type="int",
618
                 help="Resolution when counting the links")
619
    opts, args = p.parse_args(args)
620
621
    if len(args) != 1:
622
        sys.exit(not p.print_help())
623
624
    bamfilename, = args
625
    pf = bamfilename.rsplit(".", 1)[0]
626
    N = opts.resolution
627
628
    seqstarts, seqsize, total_bins = get_seqstarts(bamfilename, N)
629
630
    # Store the starts and sizes into a JSON file
631
    jsonfile = pf + ".json"
632
    fwjson = open(jsonfile, "w")
633
    header = {"starts": seqstarts, "sizes": seqsize, "total_bins": total_bins}
634
    json.dump(header, fwjson, sort_keys=True, indent=4)
635
    fwjson.close()
636
    logging.debug("Contig bin starts written to `{}`".format(jsonfile))
637
638
    print(sorted(seqstarts.items(), key=lambda x: x[-1]))
639
    logging.debug("Initialize matrix of size {}x{}"
640
                  .format(total_bins, total_bins))
641
    A = np.zeros((total_bins, total_bins), dtype="int")
642
643
    # Find the bin ID of each read
644
    def bin_number(chr, pos):
645
        return seqstarts[chr] + pos / N
646
647
    bamfile = pysam.AlignmentFile(bamfilename, "rb")
648
    # Check all reads, rules borrowed from LACHESIS
649
    # https://github.com/shendurelab/LACHESIS/blob/master/src/GenomeLinkMatrix.cc#L1476
650
    j = k = 0
651
    for c in bamfile:
652
        j += 1
653
        if j % 100000 == 0:
654
            print >> sys.stderr, "{} reads counted".format(j)
655
        # if j >= 10000: break
656
657
        if c.is_qcfail and c.is_duplicate:
658
            continue
659
        if c.is_secondary and c.is_supplementary:
660
            continue
661
        if c.mapping_quality == 0:
662
            continue
663
        if not c.is_paired:
664
            continue
665
        if c.is_read2:  # Take only one read
666
            continue
667
668
        # pysam v0.8.3 does not support keyword reference_name
669
        achr = bamfile.getrname(c.reference_id)
670
        apos = c.reference_start
671
        bchr = bamfile.getrname(c.next_reference_id)
672
        bpos = c.next_reference_start
673
        if achr not in seqstarts or bchr not in seqstarts:
674
            continue
675
        abin, bbin = bin_number(achr, apos), bin_number(bchr, bpos)
676
        A[abin, bbin] += 1
677
        if abin != bbin:
678
            A[bbin, abin] += 1
679
        k += 1
680
681
    logging.debug("Total reads counted: {}".format(percentage(2 * k, j)))
682
    bamfile.close()
683
    np.save(pf, A)
684
    logging.debug("Link counts written to `{}.npy`".format(pf))
685
686
687
def simulate(args):
688
    """
689
    %prog simulate test
690
691
    Simulate CLM and IDS files with given names.
692
693
    The simulator assumes several distributions:
694
    - Links are distributed uniformly across genome
695
    - Log10(link_size) are distributed normally
696
    - Genes are distributed uniformly
697
    """
698
    p = OptionParser(simulate.__doc__)
699
    p.add_option("--genomesize", default=10000000, type="int",
700
                 help="Genome size")
701
    p.add_option("--genes", default=1000, type="int",
702
                 help="Number of genes")
703
    p.add_option("--contigs", default=100, type="int",
704
                 help="Number of contigs")
705
    p.add_option("--coverage", default=10, type="int",
706
                 help="Link coverage")
707
    opts, args = p.parse_args(args)
708
709
    if len(args) != 1:
710
        sys.exit(not p.print_help())
711
712
    pf, = args
713
    GenomeSize = opts.genomesize
714
    Genes = opts.genes
715
    Contigs = opts.contigs
716
    Coverage = opts.coverage
717
    PE = 500
718
    Links = int(GenomeSize * Coverage / PE)
719
720
    # Simulate the contig sizes that sum to GenomeSize
721
    # See also:
722
    # <https://en.wikipedia.org/wiki/User:Skinnerd/Simplex_Point_Picking>
723
    ContigSizes, = np.random.dirichlet([1] * Contigs, 1) * GenomeSize
724
    ContigSizes = np.array(np.round_(ContigSizes, decimals=0), dtype=int)
725
    ContigStarts = np.zeros(Contigs, dtype=int)
726
    ContigStarts[1:] = np.cumsum(ContigSizes)[:-1]
727
728
    # Write IDS file
729
    idsfile = pf + ".ids"
730
    fw = open(idsfile, "w")
731
    for i, s in enumerate(ContigSizes):
732
        print >> fw, "tig{:04d}\t{}".format(i, s)
733
    fw.close()
734
735
    # Simulate the gene positions
736
    GenePositions = np.sort(np.random.random_integers(0,
737
                            GenomeSize - 1, size=Genes))
738
    write_last_and_beds(pf, GenePositions, ContigStarts)
739
740
    # Simulate links, uniform start, with link distances following 1/x, where x
741
    # is the distance between the links. As an approximation, we have links
742
    # between [1e3, 1e7], so we map from uniform [1e-7, 1e-3]
743
    LinkStarts = np.sort(np.random.random_integers(0, GenomeSize - 1,
744
                                                   size=Links))
745
    a, b = 1e-7, 1e-3
746
    LinkSizes = np.array(np.round_(1 / ((b - a) * np.random.rand(Links) + a),
747
                         decimals=0), dtype="int")
748
    LinkEnds = LinkStarts + LinkSizes
749
750
    # Find link to contig membership
751
    LinkStartContigs = np.searchsorted(ContigStarts, LinkStarts) - 1
752
    LinkEndContigs = np.searchsorted(ContigStarts, LinkEnds) - 1
753
754
    # Extract inter-contig links
755
    InterContigLinks = (LinkStartContigs != LinkEndContigs) & \
756
                       (LinkEndContigs != Contigs)
757
    ICLinkStartContigs = LinkStartContigs[InterContigLinks]
758
    ICLinkEndContigs = LinkEndContigs[InterContigLinks]
759
    ICLinkStarts = LinkStarts[InterContigLinks]
760
    ICLinkEnds = LinkEnds[InterContigLinks]
761
762
    # Write CLM file
763
    write_clm(pf, ICLinkStartContigs, ICLinkEndContigs,
764
              ICLinkStarts, ICLinkEnds,
765
              ContigStarts, ContigSizes)
766
767
768
def write_last_and_beds(pf, GenePositions, ContigStarts):
769
    """
770
    Write LAST file, query and subject BED files.
771
    """
772
    qbedfile = pf + "tigs.bed"
773
    sbedfile = pf + "chr.bed"
774
    lastfile = "{}tigs.{}chr.last".format(pf, pf)
775
    qbedfw = open(qbedfile, "w")
776
    sbedfw = open(sbedfile, "w")
777
    lastfw = open(lastfile, "w")
778
779
    GeneContigs = np.searchsorted(ContigStarts, GenePositions) - 1
780
    for i, (c, gstart) in enumerate(zip(GeneContigs, GenePositions)):
781
        gene = "gene{:05d}".format(i)
782
        tig = "tig{:04d}".format(c)
783
        start = ContigStarts[c]
784
        cstart = gstart - start
785
        print >> qbedfw, "\t".join(str(x) for x in
786
                                   (tig, cstart, cstart + 1, gene))
787
        print >> sbedfw, "\t".join(str(x) for x in
788
                                   ("chr1", gstart, gstart + 1, gene))
789
        lastatoms = [gene, gene, 100] + [0] * 8 + [100]
790
        print >> lastfw, "\t".join(str(x) for x in lastatoms)
791
792
    qbedfw.close()
793
    sbedfw.close()
794
    lastfw.close()
795
796
797
def write_clm(pf, ICLinkStartContigs, ICLinkEndContigs,
798
              ICLinkStarts, ICLinkEnds,
799
              ContigStarts, ContigSizes):
800
    """
801
    Write CLM file from simulated data.
802
    """
803
    clm = defaultdict(list)
804
    for start, end, linkstart, linkend in \
805
            zip(ICLinkStartContigs, ICLinkEndContigs,
806
                ICLinkStarts, ICLinkEnds):
807
        start_a = ContigStarts[start]
808
        start_b = start_a + ContigSizes[start]
809
        end_a = ContigStarts[end]
810
        end_b = end_a + ContigSizes[end]
811
        if linkend >= end_b:
812
            continue
813
        clm[(start, end)].append((linkstart - start_a, start_b - linkstart,
814
                                  linkend - end_a, end_b - linkend))
815
816
    clmfile = pf + ".clm"
817
    fw = open(clmfile, "w")
818
819
    def format_array(a):
820
        return [str(x) for x in sorted(a) if x > 0]
821
822
    for (start, end), links in sorted(clm.items()):
823
        start = "tig{:04d}".format(start)
824
        end = "tig{:04d}".format(end)
825
        nlinks = len(links)
826
        if not nlinks:
827
            continue
828
        ff = format_array([(b + c) for a, b, c, d in links])
829
        fr = format_array([(b + d) for a, b, c, d in links])
830
        rf = format_array([(a + c) for a, b, c, d in links])
831
        rr = format_array([(a + d) for a, b, c, d in links])
832
        print >> fw, "{}+ {}+\t{}\t{}".format(start, end, nlinks, " ".join(ff))
833
        print >> fw, "{}+ {}-\t{}\t{}".format(start, end, nlinks, " ".join(fr))
834
        print >> fw, "{}- {}+\t{}\t{}".format(start, end, nlinks, " ".join(rf))
835
        print >> fw, "{}- {}-\t{}\t{}".format(start, end, nlinks, " ".join(rr))
836
    fw.close()
837
838
839
def density(args):
840
    """
841
    %prog density test.clm
842
843
    Estimate link density of contigs.
844
    """
845
    p = OptionParser(density.__doc__)
846
    p.add_option("--save", default=False, action="store_true",
847
                 help="Write log densitites of contigs to file")
848
    p.set_cpus()
849
    opts, args = p.parse_args(args)
850
851
    if len(args) != 1:
852
        sys.exit(not p.print_help())
853
854
    clmfile, = args
855
    clm = CLMFile(clmfile)
856
    pf = clmfile.rsplit(".", 1)[0]
857
858
    if opts.save:
859
        logdensities = clm.calculate_densities()
860
        densityfile = pf + ".density"
861
        fw = open(densityfile, "w")
862
        for name, logd in logdensities.items():
863
            s = clm.tig_to_size[name]
864
            print >> fw, "\t".join(str(x) for x in (name, s, logd))
865
        fw.close()
866
        logging.debug("Density written to `{}`".format(densityfile))
867
868
    tourfile = clmfile.rsplit(".", 1)[0] + ".tour"
869
    tour = clm.activate(tourfile=tourfile, backuptour=False)
870
    clm.flip_all(tour)
871
    clm.flip_whole(tour)
872
    clm.flip_one(tour)
873
874
875
def optimize(args):
876
    """
877
    %prog optimize test.clm
878
879
    Optimize the contig order and orientation, based on CLM file.
880
    """
881
    p = OptionParser(optimize.__doc__)
882
    p.add_option("--skiprecover", default=False, action="store_true",
883
                 help="Do not import 'recover' contigs")
884
    p.add_option("--startover", default=False, action="store_true",
885
                 help="Do not resume from existing tour file")
886
    p.add_option("--skipGA", default=False, action="store_true",
887
                 help="Skip GA step")
888
    p.set_outfile(outfile=None)
889
    p.set_cpus()
890
    opts, args = p.parse_args(args)
891
892
    if len(args) != 1:
893
        sys.exit(not p.print_help())
894
895
    clmfile, = args
896
    startover = opts.startover
897
    runGA = not opts.skipGA
898
    cpus = opts.cpus
899
900
    # Load contact map
901
    clm = CLMFile(clmfile, skiprecover=opts.skiprecover)
902
903
    tourfile = opts.outfile or clmfile.rsplit(".", 1)[0] + ".tour"
904
    if startover:
905
        tourfile = None
906
    tour = clm.activate(tourfile=tourfile)
907
908
    fwtour = open(tourfile, "w")
909
    # Store INIT tour
910
    print_tour(fwtour, clm.tour, "INIT",
911
               clm.active_contigs, clm.oo, signs=clm.signs)
912
913
    if runGA:
914
        for phase in range(1, 3):
915
            tour = optimize_ordering(fwtour, clm, phase, cpus)
916
            tour = clm.prune_tour(tour, cpus)
917
918
    # Flip orientations
919
    phase = 1
920
    while True:
921
        tag1, tag2 = optimize_orientations(fwtour, clm, phase, cpus)
922
        if tag1 == REJECT and tag2 == REJECT:
923
            logging.debug("Terminating ... no more {}".format(ACCEPT))
924
            break
925
        phase += 1
926
927
    fwtour.close()
928
929
930
def optimize_ordering(fwtour, clm, phase, cpus):
931
    """
932
    Optimize the ordering of contigs by Genetic Algorithm (GA).
933
    """
934
    from .chic import score_evaluate_M
935
936
    # Prepare input files
937
    tour_contigs = clm.active_contigs
938
    tour_sizes = clm.active_sizes
939
    tour_M = clm.M
940
    tour = clm.tour
941
    signs = clm.signs
942
    oo = clm.oo
943
944
    def callback(tour, gen, phase, oo):
945
        fitness = tour.fitness if hasattr(tour, "fitness") else None
946
        label = "GA{}-{}".format(phase, gen)
947
        if fitness:
948
            fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
949
            label += "-" + fitness
950
        if gen % 20 == 0:
951
            print_tour(fwtour, tour, label, tour_contigs, oo, signs=signs)
952
        return tour
953
954
    callbacki = partial(callback, phase=phase, oo=oo)
955
    toolbox = GA_setup(tour)
956
    toolbox.register("evaluate", score_evaluate_M,
957
                     tour_sizes=tour_sizes, tour_M=tour_M)
958
    tour, tour_fitness = GA_run(toolbox, ngen=1000, npop=100, cpus=cpus,
959
                                callback=callbacki)
960
    clm.tour = tour
961
962
    return tour
963
964
965
def optimize_orientations(fwtour, clm, phase, cpus):
966
    """
967
    Optimize the orientations of contigs by using heuristic flipping.
968
    """
969
    # Prepare input files
970
    tour_contigs = clm.active_contigs
971
    tour = clm.tour
972
    oo = clm.oo
973
974
    print_tour(fwtour, tour, "FLIPALL{}".format(phase),
975
               tour_contigs, oo, signs=clm.signs)
976
    tag1 = clm.flip_whole(tour)
977
    print_tour(fwtour, tour, "FLIPWHOLE{}".format(phase),
978
               tour_contigs, oo, signs=clm.signs)
979
    tag2 = clm.flip_one(tour)
980
    print_tour(fwtour, tour, "FLIPONE{}".format(phase),
981
               tour_contigs, oo, signs=clm.signs)
982
983
    return tag1, tag2
984
985
986
def prepare_synteny(tourfile, lastfile, odir, p, opts):
987
    """
988
    Prepare synteny plots for movie().
989
    """
990
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
991
    qbedfile = op.abspath(qbedfile)
992
    sbedfile = op.abspath(sbedfile)
993
994
    qbed = Bed(qbedfile, sorted=False)
995
    contig_to_beds = dict(qbed.sub_beds())
996
997
    # Create a separate directory for the subplots and movie
998
    mkdir(odir, overwrite=True)
999
    os.chdir(odir)
1000
    logging.debug("Change into subdir `{}`".format(odir))
1001
1002
    # Make anchorsfile
1003
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \
1004
                  + ".anchors"
1005
    fw = open(anchorsfile, "w")
1006
    for b in Blast(lastfile):
1007
        print >> fw, "\t".join((gene_name(b.query), gene_name(b.subject),
1008
                                str(int(b.score))))
1009
    fw.close()
1010
1011
    # Symlink sbed
1012
    symlink(sbedfile, op.basename(sbedfile))
1013
1014
    return anchorsfile, qbedfile, contig_to_beds
1015
1016
1017
def separate_tour_and_o(row):
1018
    """
1019
    The tour line typically contains contig list like:
1020
    tig00044568+ tig00045748- tig00071055- tig00015093- tig00030900-
1021
1022
    This function separates the names from the orientations.
1023
    """
1024
    tour = []
1025
    tour_o = []
1026
    for contig in row.split():
1027
        if contig[-1] in ('+', '-', '?'):
1028
            tour.append(contig[:-1])
1029
            tour_o.append(contig[-1])
1030
        else:  # Unoriented
1031
            tour.append(contig)
1032
            tour_o.append('?')
1033
    return tour, tour_o
1034
1035
1036
def iter_last_tour(tourfile, clm):
1037
    """
1038
    Extract last tour from tourfile. The clm instance is also passed in to see
1039
    if any contig is covered in the clm.
1040
    """
1041
    row = open(tourfile).readlines()[-1]
1042
    _tour, _tour_o = separate_tour_and_o(row)
1043
    tour = []
1044
    tour_o = []
1045
    for tc, to in zip(_tour, _tour_o):
1046
        if tc not in clm.contigs:
1047
            logging.debug("Contig `{}` in file `{}` not found in `{}`"
1048
                          .format(tc, tourfile, clm.idsfile))
1049
            continue
1050
        tour.append(tc)
1051
        tour_o.append(to)
1052
    return tour, tour_o
1053
1054
1055
def iter_tours(tourfile, frames=1):
1056
    """
1057
    Extract tours from tourfile. Tourfile contains a set of contig
1058
    configurations, generated at each iteration of the genetic algorithm. Each
1059
    configuration has two rows, first row contains iteration id and score,
1060
    second row contains list of contigs, separated by comma.
1061
    """
1062
    fp = open(tourfile)
1063
1064
    i = 0
1065
    for row in fp:
1066
        if row[0] == '>':
1067
            label = row[1:].strip()
1068
            if label.startswith("GA"):
1069
                pf, j, score = label.split("-")
1070
                j = int(j)
1071
            else:
1072
                j = 0
1073
            i += 1
1074
        else:
1075
            if j % frames != 0:
1076
                continue
1077
            tour, tour_o = separate_tour_and_o(row)
1078
            yield i, label, tour, tour_o
1079
1080
    fp.close()
1081
1082
1083
def movie(args):
1084
    """
1085
    %prog movie test.tour test.clm ref.contigs.last
1086
1087
    Plot optimization history.
1088
    """
1089
    p = OptionParser(movie.__doc__)
1090
    p.add_option("--frames", default=500, type="int",
1091
                 help="Only plot every N frames")
1092
    p.add_option("--engine", default="ffmpeg", choices=("ffmpeg", "gifsicle"),
1093
                 help="Movie engine, output MP4 or GIF")
1094
    p.set_beds()
1095
    opts, args, iopts = p.set_image_options(args, figsize="16x8",
1096
                                            style="white", cmap="coolwarm",
1097
                                            format="png", dpi=300)
1098
1099
    if len(args) != 3:
1100
        sys.exit(not p.print_help())
1101
1102
    tourfile, clmfile, lastfile = args
1103
    tourfile = op.abspath(tourfile)
1104
    clmfile = op.abspath(clmfile)
1105
    lastfile = op.abspath(lastfile)
1106
    cwd = os.getcwd()
1107
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
1108
    anchorsfile, qbedfile, contig_to_beds = \
1109
        prepare_synteny(tourfile, lastfile, odir, p, opts)
1110
1111
    args = []
1112
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
1113
        padi = "{:06d}".format(i)
1114
        # Make sure the anchorsfile and bedfile has the serial number in,
1115
        # otherwise parallelization may fail
1116
        a, b = op.basename(anchorsfile).split(".", 1)
1117
        ianchorsfile = a + "_" + padi + "." + b
1118
        symlink(anchorsfile, ianchorsfile)
1119
1120
        # Make BED file with new order
1121
        qb = Bed()
1122
        for contig, o in zip(tour, tour_o):
1123
            if contig not in contig_to_beds:
1124
                continue
1125
            bedlines = contig_to_beds[contig][:]
1126
            if o == '-':
1127
                bedlines.reverse()
1128
            for x in bedlines:
1129
                qb.append(x)
1130
1131
        a, b = op.basename(qbedfile).split(".", 1)
1132
        ibedfile = a + "_" + padi + "." + b
1133
        qb.print_to_file(ibedfile)
1134
        # Plot dot plot, but do not sort contigs by name (otherwise losing
1135
        # order)
1136
        image_name = padi + "." + iopts.format
1137
1138
        tour = ",".join(tour)
1139
        args.append([[tour, clmfile, ianchorsfile,
1140
                    "--outfile", image_name, "--label", label]])
1141
1142
    Jobs(movieframe, args).run()
1143
1144
    os.chdir(cwd)
1145
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)
1146
1147
1148
def score(args):
1149
    """
1150
    %prog score main_results/ cached_data/ contigsfasta
1151
1152
    Score the current LACHESIS CLM.
1153
    """
1154
    p = OptionParser(score.__doc__)
1155
    p.set_cpus()
1156
    opts, args = p.parse_args(args)
1157
1158
    if len(args) != 3:
1159
        sys.exit(not p.print_help())
1160
1161
    mdir, cdir, contigsfasta = args
1162
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
1163
    sizes = Sizes(contigsfasta)
1164
    contig_names = list(sizes.iter_names())
1165
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))
1166
1167
    oo = []
1168
    # Load contact matrix
1169
    glm = op.join(cdir, "all.GLM")
1170
    N = len(contig_ids)
1171
    M = np.zeros((N, N), dtype=int)
1172
    fp = open(glm)
1173
    for row in fp:
1174
        if row[0] == '#':
1175
            continue
1176
        x, y, z = row.split()
1177
        if x == 'X':
1178
            continue
1179
        M[int(x), int(y)] = int(z)
1180
1181
    fwtour = open("tour", "w")
1182
1183
    def callback(tour, gen, oo):
1184
        fitness = tour.fitness if hasattr(tour, "fitness") else None
1185
        label = "GA-{0}".format(gen)
1186
        if fitness:
1187
            fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
1188
            label += "-" + fitness
1189
        print_tour(fwtour, tour, label, contig_names, oo)
1190
        return tour
1191
1192
    for ofile in orderingfiles:
1193
        co = ContigOrdering(ofile)
1194
        for x in co:
1195
            contig_id = contig_ids[x.contig_name]
1196
            oo.append(contig_id)
1197
        pf = op.basename(ofile).split(".")[0]
1198
        print pf
1199
        print oo
1200
1201
        tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
1202
        # Store INIT tour
1203
        print_tour(fwtour, tour, "INIT", contig_names, oo)
1204
1205
        # Faster Cython version for evaluation
1206
        from .chic import score_evaluate_M
1207
        callbacki = partial(callback, oo=oo)
1208
        toolbox = GA_setup(tour)
1209
        toolbox.register("evaluate", score_evaluate_M,
1210
                         tour_sizes=tour_sizes, tour_M=tour_M)
1211
        tour, tour.fitness = GA_run(toolbox, npop=100, cpus=opts.cpus,
1212
                                    callback=callbacki)
1213
        print tour, tour.fitness
1214
        break
1215
1216
    fwtour.close()
1217
1218
1219
def print_tour(fwtour, tour, label, contig_names, oo, signs=None):
1220
    print >> fwtour, ">" + label
1221
    if signs is not None:
1222
        contig_o = []
1223
        for x in tour:
1224
            idx = oo[x]
1225
            sign = {1: '+', 0: '?', -1: '-'}[signs[idx]]
1226
            contig_o.append(contig_names[idx] + sign)
1227
        print >> fwtour, " ".join(contig_o)
1228
    else:
1229
        print >> fwtour, " ".join(contig_names[oo[x]] for x in tour)
1230
1231
1232
def prepare_ec(oo, sizes, M):
1233
    """
1234
    This prepares EC and converts from contig_id to an index.
1235
    """
1236
    tour = range(len(oo))
1237
    tour_sizes = np.array([sizes.sizes[x] for x in oo])
1238
    tour_M = M[oo, :][:, oo]
1239
    return tour, tour_sizes, tour_M
1240
1241
1242
def score_evaluate(tour, tour_sizes=None, tour_M=None):
1243
    """ SLOW python version of the evaluation function. For benchmarking
1244
    purposes only. Do not use in production.
1245
    """
1246
    sizes_oo = np.array([tour_sizes[x] for x in tour])
1247
    sizes_cum = np.cumsum(sizes_oo) - sizes_oo / 2
1248
    s = 0
1249
    size = len(tour)
1250
    for ia in xrange(size):
1251
        a = tour[ia]
1252
        for ib in xrange(ia + 1, size):
1253
            b = tour[ib]
1254
            links = tour_M[a, b]
1255
            dist = sizes_cum[ib] - sizes_cum[ia]
1256
            if dist > 1e7:
1257
                break
1258
            s += links * 1. / dist
1259
    return s,
1260
1261
1262
def movieframe(args):
1263
    """
1264
    %prog movieframe tour test.clm contigs.ref.anchors
1265
1266
    Draw heatmap and synteny in the same plot.
1267
    """
1268
    p = OptionParser(movieframe.__doc__)
1269
    p.add_option("--label", help="Figure title")
1270
    p.set_beds()
1271
    p.set_outfile(outfile=None)
1272
    opts, args, iopts = p.set_image_options(args, figsize="16x8",
1273
                                            style="white", cmap="coolwarm",
1274
                                            format="png", dpi=120)
1275
1276
    if len(args) != 3:
1277
        sys.exit(not p.print_help())
1278
1279
    tour, clmfile, anchorsfile = args
1280
    tour = tour.split(",")
1281
    image_name = opts.outfile or ("movieframe." + iopts.format)
1282
    label = opts.label or op.basename(image_name).rsplit(".", 1)[0]
1283
1284
    clm = CLMFile(clmfile)
1285
    totalbins, bins, breaks = make_bins(tour, clm.tig_to_size)
1286
    M = read_clm(clm, totalbins, bins)
1287
1288
    fig = plt.figure(1, (iopts.w, iopts.h))
1289
    root = fig.add_axes([0, 0, 1, 1])        # whole canvas
1290
    ax1 = fig.add_axes([.05, .1, .4, .8])    # heatmap
1291
    ax2 = fig.add_axes([.55, .1, .4, .8])    # dot plot
1292
    ax2_root = fig.add_axes([.5, 0, .5, 1])  # dot plot canvas
1293
1294
    # Left axis: heatmap
1295
    plot_heatmap(ax1, M, breaks, iopts)
1296
1297
    # Right axis: synteny
1298
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts,
1299
                                                     sorted=False)
1300
    dotplot(anchorsfile, qbed, sbed, fig, ax2_root, ax2, sep=False, title="")
1301
1302
    root.text(.5, .98, clm.name, color="g", ha="center", va="center")
1303
    root.text(.5, .95, label, color="darkslategray", ha="center", va="center")
1304
    normalize_axes(root)
1305
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
1306
1307
1308
def make_bins(tour, sizes):
1309
    breaks = []
1310
    start = 0
1311
    bins = {}
1312
    for x in tour:
1313
        size = sizes[x]
1314
        end = start + int(round(size * 1. / BINSIZE))
1315
        bins[x] = (start, end)
1316
        start = end
1317
    breaks.append(start)
1318
1319
    totalbins = start
1320
    return totalbins, bins, breaks
1321
1322
1323
def read_clm(clm, totalbins, bins):
1324
    M = np.zeros((totalbins, totalbins))
1325
    for (x, y), z in clm.contacts.items():
1326
        if x not in bins or y not in bins:
1327
            continue
1328
        xstart, xend = bins[x]
1329
        ystart, yend = bins[y]
1330
        M[xstart:xend, ystart:yend] = z
1331
        M[ystart:yend, xstart:xend] = z
1332
1333
    M = np.log10(M + 1)
1334
    return M
1335
1336
1337
def plot_heatmap(ax, M, breaks, iopts):
1338
    ax.imshow(M, cmap=iopts.cmap, origin="lower", interpolation='none')
1339
    xlim = ax.get_xlim()
1340
    for b in breaks[:-1]:
1341
        ax.plot([b, b], xlim, 'w-')
1342
        ax.plot(xlim, [b, b], 'w-')
1343
    ax.set_xlim(xlim)
1344
    ax.set_ylim(xlim)
1345
    ax.set_xticklabels([int(x) for x in ax.get_xticks()],
1346
                       family='Helvetica', color="gray")
1347
    ax.set_yticklabels([int(x) for x in ax.get_yticks()],
1348
                       family='Helvetica', color="gray")
1349
    binlabel = "Bins ({} per bin)".format(human_size(BINSIZE, precision=0))
1350
    ax.set_xlabel(binlabel)
1351
    ax.set_ylabel(binlabel)
1352
1353
1354
def agp(args):
1355
    """
1356
    %prog agp main_results/ contigs.fasta
1357
1358
    Generate AGP file based on LACHESIS output.
1359
    """
1360
    p = OptionParser(agp.__doc__)
1361
    p.set_outfile()
1362
    opts, args = p.parse_args(args)
1363
1364
    if len(args) != 2:
1365
        sys.exit(not p.print_help())
1366
1367
    odir, contigsfasta = args
1368
    fwagp = must_open(opts.outfile, 'w')
1369
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
1370
    sizes = Sizes(contigsfasta).mapping
1371
    contigs = set(sizes.keys())
1372
    anchored = set()
1373
1374
    for ofile in orderingfiles:
1375
        co = ContigOrdering(ofile)
1376
        anchored |= set([x.contig_name for x in co])
1377
        obj = op.basename(ofile).split('.')[0]
1378
        co.write_agp(obj, sizes, fwagp)
1379
1380
    singletons = contigs - anchored
1381
    logging.debug('Anchored: {}, Singletons: {}'.
1382
                  format(len(anchored), len(singletons)))
1383
1384
    for s in natsorted(singletons):
1385
        order_to_agp(s, [(s, "?")], sizes, fwagp)
1386
1387
1388
if __name__ == '__main__':
1389
    main()
1390