Completed
Push — master ( 7b2afc...c13f26 )
by Haibao
04:24
created

distbin_number()   A

Complexity

Conditions 1

Size

Total Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
c 1
b 0
f 0
dl 0
loc 2
rs 10
1
#!/usr/bin/env python
2
# -*- coding: UTF-8 -*-
3
4
"""
5
Process Hi-C output into AGP for chromosomal-scale scaffolding.
6
"""
7
8
import array
9
import json
10
import logging
11
import sys
12
import os
13
import os.path as op
14
import numpy as np
15
import math
16
17
from collections import defaultdict
18
from functools import partial
19
from multiprocessing import Pool
20
21
from jcvi.algorithms.formula import outlier_cutoff
22
from jcvi.algorithms.ec import GA_setup, GA_run
23
from jcvi.algorithms.matrix import get_signs
24
from jcvi.apps.base import OptionParser, ActionDispatcher, backup, iglob, \
25
    mkdir, symlink
26
from jcvi.apps.console import green, red
27
from jcvi.apps.grid import Jobs
28
from jcvi.assembly.allmaps import make_movie
29
from jcvi.compara.synteny import check_beds, get_bed_filenames
30
from jcvi.formats.agp import order_to_agp
31
from jcvi.formats.base import LineFile, must_open
32
from jcvi.formats.bed import Bed
33
from jcvi.formats.sizes import Sizes
34
from jcvi.formats.blast import Blast
35
from jcvi.graphics.base import normalize_axes, plt, savefig
36
from jcvi.graphics.dotplot import dotplot
37
from jcvi.utils.cbook import gene_name, human_size
38
from jcvi.utils.natsort import natsorted
39
40
41
# Map orientations to ints
42
FF = {'+': 1, '-': -1, '?': 1}
43
RR = {'+': -1, '-': 1, '?': -1}
44
LB = 18             # Lower bound for golden_array()
45
UB = 29             # Upper bound for golden_array()
46
BB = UB - LB + 1    # Span for golden_array()
47
ACCEPT = green("ACCEPT")
48
REJECT = red("REJECT")
49
BINSIZE = 50000
50
51
52
class ContigOrderingLine(object):
53
    '''Stores one line in the ContigOrdering file
54
    '''
55
    def __init__(self, line, sep="|"):
56
        args = line.split()
57
        self.contig_id = args[0]
58
        self.contig_name = args[1].split(sep)[0]
59
        contig_rc = args[2]
60
        assert contig_rc in ('0', '1')
61
        self.strand = '+' if contig_rc == '0' else '-'
62
        self.orientation_score = args[3]
63
        self.gap_size_after_contig = args[4]
64
65
66
class ContigOrdering(LineFile):
67
    '''ContigOrdering file as created by LACHESIS, one per chromosome group.
68
    Header contains summary information per group, followed by list of contigs
69
    with given ordering.
70
    '''
71
    def __init__(self, filename):
72
        super(ContigOrdering, self).__init__(filename)
73
        fp = open(filename)
74
        for row in fp:
75
            if row[0] == '#':
76
                continue
77
            orderline = ContigOrderingLine(row)
78
            self.append(orderline)
79
80
    def write_agp(self, obj, sizes, fw=sys.stdout, gapsize=100,
81
                  gaptype="contig", evidence="map"):
82
        '''Converts the ContigOrdering file into AGP format
83
        '''
84
        contigorder = [(x.contig_name, x.strand) for x in self]
85
        order_to_agp(obj, contigorder, sizes, fw,
86
                     gapsize=gapsize, gaptype=gaptype, evidence=evidence)
87
88
89
class CLMFile:
90
    '''CLM file (modified) has the following format:
91
92
    tig00046211+ tig00063795+       1       53173
93
    tig00046211+ tig00063795-       1       116050
94
    tig00046211- tig00063795+       1       71155
95
    tig00046211- tig00063795-       1       134032
96
    tig00030676+ tig00077819+       5       136407 87625 87625 106905 102218
97
    tig00030676+ tig00077819-       5       126178 152952 152952 35680 118923
98
    tig00030676- tig00077819+       5       118651 91877 91877 209149 125906
99
    tig00030676- tig00077819-       5       108422 157204 157204 137924 142611
100
    '''
101
    def __init__(self, clmfile, skiprecover=False):
102
        self.name = op.basename(clmfile).rsplit(".", 1)[0]
103
        self.clmfile = clmfile
104
        self.idsfile = clmfile.rsplit(".", 1)[0] + ".ids"
105
        self.parse_ids(skiprecover)
106
        self.parse_clm()
107
        self.signs = None
108
109
    def parse_ids(self, skiprecover):
110
        '''IDS file has a list of contigs that need to be ordered. 'recover',
111
        keyword, if available in the third column, is less confident.
112
113
        tig00015093     46912
114
        tig00035238     46779   recover
115
        tig00030900     119291
116
        '''
117
        idsfile = self.idsfile
118
        logging.debug("Parse idsfile `{}`".format(idsfile))
119
        fp = open(idsfile)
120
        tigs = []
121
        for row in fp:
122
            atoms = row.split()
123
            tig, size = atoms[:2]
124
            size = int(size)
125
            if skiprecover and len(atoms) == 3 and atoms[2] == 'recover':
126
                continue
127
            tigs.append((tig, size))
128
129
        # Arrange contig names and sizes
130
        _tigs, _sizes = zip(*tigs)
131
        self.contigs = set(_tigs)
132
        self.sizes = np.array(_sizes)
133
        self.tig_to_size = dict(tigs)
134
135
        # Initially all contigs are considered active
136
        self.active = set(_tigs)
137
138
    def parse_clm(self):
139
        clmfile = self.clmfile
140
        logging.debug("Parse clmfile `{}`".format(clmfile))
141
        fp = open(clmfile)
142
        contacts = {}
143
        contacts_oriented = defaultdict(dict)
144
        orientations = defaultdict(list)
145
        for row in fp:
146
            atoms = row.strip().split('\t')
147
            assert len(atoms) == 3, "Malformed line `{}`".format(atoms)
148
            abtig, links, dists = atoms
149
            atig, btig = abtig.split()
150
            at, ao = atig[:-1], atig[-1]
151
            bt, bo = btig[:-1], btig[-1]
152
            if at not in self.tig_to_size:
153
                continue
154
            if bt not in self.tig_to_size:
155
                continue
156
            dists = [int(x) for x in dists.split()]
157
            contacts[(at, bt)] = len(dists)
158
            gdists = golden_array(dists)
159
            contacts_oriented[(at, bt)][(FF[ao], FF[bo])] = gdists
160
            contacts_oriented[(bt, at)][(RR[bo], RR[ao])] = gdists
161
            strandedness = 1 if ao == bo else -1
162
            orientations[(at, bt)].append((strandedness, dists))
163
164
        self.contacts = contacts
165
        self.contacts_oriented = contacts_oriented
166
        # Preprocess the orientations dict
167
        for (at, bt), dists in orientations.items():
168
            dists = [(s, d, hmean_int(d)) for (s, d) in dists]
169
            strandedness, md, mh = min(dists, key=lambda x: x[-1])
170
            orientations[(at, bt)] = (strandedness, len(md), mh)
171
        self.orientations = orientations
172
173
    def calculate_densities(self):
174
        """
175
        Calculate the density of inter-contig links per base. Strong contigs
176
        considered to have high level of inter-contig links in the current
177
        partition.
178
        """
179
        active = self.active
180
        densities = defaultdict(int)
181
        for (at, bt), links in self.contacts.items():
182
            if not (at in active and bt in active):
183
                continue
184
            densities[at] += links
185
            densities[bt] += links
186
187
        logdensities = {}
188
        for x, d in densities.items():
189
            s = self.tig_to_size[x]
190
            logd = np.log10(d * 1. / min(s, 500000))
191
            logdensities[x] = logd
192
193
        return logdensities
194
195
    def report_active(self):
196
        logging.debug("Active contigs: {} (length={})"
197
                      .format(self.N, self.active_sizes.sum()))
198
199
    def activate(self, tourfile=None, minsize=10000, backuptour=True):
200
        """
201
        Select contigs in the current partition. This is the setup phase of the
202
        algorithm, and supports two modes:
203
204
        - "de novo": This is useful at the start of a new run where no tours
205
          available. We select the strong contigs that have significant number
206
          of links to other contigs in the partition. We build a histogram of
207
          link density (# links per bp) and remove the contigs that appear as
208
          outliers. The orientations are derived from the matrix decomposition
209
          of the pairwise strandedness matrix O.
210
211
        - "hotstart": This is useful when there was a past run, with a given
212
          tourfile. In this case, the active contig list and orientations are
213
          derived from the last tour in the file.
214
        """
215
        if tourfile and (not op.exists(tourfile)):
216
            logging.debug("Tourfile `{}` not found".format(tourfile))
217
            tourfile = None
218
219
        if tourfile:
220
            logging.debug("Importing tourfile `{}`".format(tourfile))
221
            tour, tour_o = iter_last_tour(tourfile, self)
222
            self.active = set(tour)
223
            tig_to_idx = self.tig_to_idx
224
            tour = [tig_to_idx[x] for x in tour]
225
            signs = sorted([(x, FF[o]) for (x, o) in zip(tour, tour_o)])
226
            _, signs = zip(*signs)
227
            self.signs = np.array(signs, dtype=int)
228
            if backuptour:
229
                backup(tourfile)
230
            tour = array.array('i', tour)
231
        else:
232
            self.report_active()
233
            while True:
234
                logdensities = self.calculate_densities()
235
                lb, ub = outlier_cutoff(logdensities.values())
236
                logging.debug("Log10(link_densities) ~ [{}, {}]"
237
                              .format(lb, ub))
238
                remove = set(x for x, d in logdensities.items() if
239
                             (d < lb and self.tig_to_size[x] < minsize * 10))
240
                if remove:
241
                    self.active -= remove
242
                    self.report_active()
243
                else:
244
                    break
245
246
            logging.debug("Remove contigs with size < {}".format(minsize))
247
            self.active = set(x for x in self.active if
248
                              self.tig_to_size[x] >= minsize)
249
            tour = range(self.N)  # Use starting (random) order otherwise
250
            tour = array.array('i', tour)
251
252
            # Determine orientations
253
            self.flip_all(tour)
254
255
        self.report_active()
256
        self.tour = tour
257
258
        return tour
259
260
    def evaluate_tour_M(self, tour):
261
        """ Use Cythonized version to evaluate the score of a current tour
262
        """
263
        from .chic import score_evaluate_M
264
        return score_evaluate_M(tour, self.active_sizes, self.M)
265
266
    def evaluate_tour_P(self, tour):
267
        """ Use Cythonized version to evaluate the score of a current tour,
268
        with better precision on the distance of the contigs.
269
        """
270
        from .chic import score_evaluate_P
271
        return score_evaluate_P(tour, self.active_sizes, self.P)
272
273
    def evaluate_tour_Q(self, tour):
274
        """ Use Cythonized version to evaluate the score of a current tour,
275
        taking orientation into consideration. This may be the most accurate
276
        evaluation under the right condition.
277
        """
278
        from .chic import score_evaluate_Q
279
        return score_evaluate_Q(tour, self.active_sizes, self.Q)
280
281
    def flip_log(self, method, score, score_flipped, tag):
282
        logging.debug("{}: {} => {} {}"
283
                      .format(method, score, score_flipped, tag))
284
285
    def flip_all(self, tour):
286
        """ Initialize the orientations based on pairwise O matrix.
287
        """
288
        if self.signs is None:  # First run
289
            score = 0
290
        else:
291
            old_signs = self.signs[:self.N]
292
            score, = self.evaluate_tour_Q(tour)
293
294
        # Remember we cannot have ambiguous orientation code (0 or '?') here
295
        self.signs = get_signs(self.O, validate=False, ambiguous=False)
296
        score_flipped, = self.evaluate_tour_Q(tour)
297
        if score_flipped >= score:
298
            tag = ACCEPT
299
        else:
300
            self.signs = old_signs[:]
301
            tag = REJECT
302
        self.flip_log("FLIPALL", score, score_flipped, tag)
303
        return tag
304
305
    def flip_whole(self, tour):
306
        """ Test flipping all contigs at the same time to see if score improves.
307
        """
308
        score, = self.evaluate_tour_Q(tour)
309
        self.signs = -self.signs
310
        score_flipped, = self.evaluate_tour_Q(tour)
311
        if score_flipped > score:
312
            tag = ACCEPT
313
        else:
314
            self.signs = -self.signs
315
            tag = REJECT
316
        self.flip_log("FLIPWHOLE", score, score_flipped, tag)
317
        return tag
318
319
    def flip_one(self, tour):
320
        """ Test flipping every single contig sequentially to see if score
321
        improves.
322
        """
323
        n_accepts = n_rejects = 0
324
        any_tag_ACCEPT = False
325
        for i, t in enumerate(tour):
326
            if i == 0:
327
                score, = self.evaluate_tour_Q(tour)
328
            self.signs[t] = -self.signs[t]
329
            score_flipped, = self.evaluate_tour_Q(tour)
330
            if score_flipped > score:
331
                n_accepts += 1
332
                tag = ACCEPT
333
            else:
334
                self.signs[t] = -self.signs[t]
335
                n_rejects += 1
336
                tag = REJECT
337
            self.flip_log("FLIPONE ({}/{})".format(i + 1, len(self.signs)),
338
                          score, score_flipped, tag)
339
            if tag == ACCEPT:
340
                any_tag_ACCEPT = True
341
                score = score_flipped
342
        logging.debug("FLIPONE: N_accepts={} N_rejects={}"
343
                      .format(n_accepts, n_rejects))
344
        return ACCEPT if any_tag_ACCEPT else REJECT
345
346
    def prune_tour(self, tour, cpus):
347
        """ Test deleting each contig and check the delta_score; tour here must
348
        be an array of ints.
349
        """
350
        while True:
351
            tour_score, = self.evaluate_tour_M(tour)
352
            logging.debug("Starting score: {}".format(tour_score))
353
            active_sizes = self.active_sizes
354
            M = self.M
355
            args = []
356
            for i, t in enumerate(tour):
357
                stour = tour[:i] + tour[i + 1:]
358
                args.append((t, stour, tour_score, active_sizes, M))
359
360
            # Parallel run
361
            p = Pool(processes=cpus)
362
            results = list(p.imap(prune_tour_worker, args))
363
            assert len(tour) == len(results), \
364
                "Array size mismatch, tour({}) != results({})"\
365
                .format(len(tour), len(results))
366
367
            # Identify outliers
368
            active_contigs = self.active_contigs
369
            idx, log10deltas = zip(*results)
370
            lb, ub = outlier_cutoff(log10deltas)
371
            logging.debug("Log10(delta_score) ~ [{}, {}]".format(lb, ub))
372
373
            remove = set(active_contigs[x] for (x, d) in results if d < lb)
374
            self.active -= remove
375
            self.report_active()
376
377
            tig_to_idx = self.tig_to_idx
378
            tour = [active_contigs[x] for x in tour]
379
            tour = array.array('i', [tig_to_idx[x] for x in tour
380
                                     if x not in remove])
381
            if not remove:
382
                break
383
384
        self.tour = tour
385
        self.flip_all(tour)
386
387
        return tour
388
389
    @property
390
    def active_contigs(self):
391
        return list(self.active)
392
393
    @property
394
    def active_sizes(self):
395
        return np.array([self.tig_to_size[x] for x in self.active])
396
397
    @property
398
    def N(self):
399
        return len(self.active)
400
401 View Code Duplication
    @property
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
402
    def oo(self):
403
        return range(self.N)
404
405
    @property
406
    def tig_to_idx(self):
407
        return dict((x, i) for (i, x) in enumerate(self.active))
408
409
    @property
410
    def M(self):
411
        """
412
        Contact frequency matrix. Each cell contains how many inter-contig
413
        links between i-th and j-th contigs.
414
        """
415
        N = self.N
416
        tig_to_idx = self.tig_to_idx
417
        M = np.zeros((N, N), dtype=int)
418 View Code Duplication
        for (at, bt), links in self.contacts.items():
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
419
            if not (at in tig_to_idx and bt in tig_to_idx):
420
                continue
421
            ai = tig_to_idx[at]
422
            bi = tig_to_idx[bt]
423
            M[ai, bi] = M[bi, ai] = links
424
        return M
425
426
    @property
427
    def O(self):
428
        """
429
        Pairwise strandedness matrix. Each cell contains whether i-th and j-th
430
        contig are the same orientation +1, or opposite orientation -1.
431
        """
432
        N = self.N
433
        tig_to_idx = self.tig_to_idx
434
        O = np.zeros((N, N), dtype=int)
435
        for (at, bt), (strandedness, md, mh) in self.orientations.items():
436
            if not (at in tig_to_idx and bt in tig_to_idx):
437
                continue
438
            ai = tig_to_idx[at]
439
            bi = tig_to_idx[bt]
440
            score = strandedness * md
441
            O[ai, bi] = O[bi, ai] = score
442
        return O
443
444
    @property
445
    def P(self):
446
        """
447
        Contact frequency matrix with better precision on distance between
448
        contigs. In the matrix M, the distance is assumed to be the distance
449
        between mid-points of two contigs. In matrix Q, however, we compute
450
        harmonic mean of the links for the orientation configuration that is
451
        shortest. This offers better precision for the distance between big
452
        contigs.
453
        """
454
        N = self.N
455
        tig_to_idx = self.tig_to_idx
456
        P = np.zeros((N, N, 2), dtype=int)
457
        for (at, bt), (strandedness, md, mh) in self.orientations.items():
458
            if not (at in tig_to_idx and bt in tig_to_idx):
459
                continue
460
            ai = tig_to_idx[at]
461
            bi = tig_to_idx[bt]
462
            P[ai, bi, 0] = P[bi, ai, 0] = md
463
            P[ai, bi, 1] = P[bi, ai, 1] = mh
464
        return P
465
466
    @property
467
    def Q(self):
468
        """
469
        Contact frequency matrix when contigs are already oriented. This is s a
470
        similar matrix as M, but rather than having the number of links in the
471
        cell, it points to an array that has the actual distances.
472
        """
473
        N = self.N
474
        tig_to_idx = self.tig_to_idx
475
        signs = self.signs
476
        Q = np.ones((N, N, BB), dtype=int) * -1  # Use -1 as the sentinel
477
        for (at, bt), k in self.contacts_oriented.items():
478
            if not (at in tig_to_idx and bt in tig_to_idx):
479
                continue
480
            ai = tig_to_idx[at]
481
            bi = tig_to_idx[bt]
482
            ao = signs[ai]
483
            bo = signs[bi]
484
            Q[ai, bi] = k[(ao, bo)]
485
        return Q
486
487
488
def hmean_int(a, a_min=5778, a_max=1149851):
489
    """ Harmonic mean of an array, returns the closest int
490
    """
491
    from scipy.stats import hmean
492
    return int(round(hmean(np.clip(a, a_min, a_max))))
493
494
495
def golden_array(a, phi=1.61803398875, lb=LB, ub=UB):
496
    """ Given list of ints, we aggregate similar values so that it becomes an
497
    array of multiples of phi, where phi is the golden ratio.
498
499
    phi ^ 14 = 843
500
    phi ^ 33 = 7881196
501
502
    So the array of counts go between 843 to 788196. One triva is that the
503
    exponents of phi gets closer to integers as N grows. See interesting
504
    discussion here:
505
    <https://www.johndcook.com/blog/2017/03/22/golden-powers-are-nearly-integers/>
506
    """
507
    counts = np.zeros(BB, dtype=int)
508
    for x in a:
509
        c = int(round(math.log(x, phi)))
510
        if c < lb:
511
            c = lb
512
        if c > ub:
513
            c = ub
514
        counts[c - lb] += 1
515
    return counts
516
517
518
def prune_tour_worker(arg):
519
    """ Worker thread for CLMFile.prune_tour()
520
    """
521
    from .chic import score_evaluate_M
522
523
    t, stour, tour_score, active_sizes, M = arg
524
    stour_score, = score_evaluate_M(stour, active_sizes, M)
525
    delta_score = tour_score - stour_score
526
    log10d = np.log10(delta_score) if delta_score > 1e-9 else -9
527
    return t, log10d
528
529
530
def main():
531
532
    actions = (
533
        # LACHESIS output processing
534
        ('agp', 'generate AGP file based on LACHESIS output'),
535
        ('score', 'score the current LACHESIS CLM'),
536
        # Simulation
537
        ('simulate', 'simulate CLM data'),
538
        # Scaffolding
539
        ('optimize', 'optimize the contig order and orientation'),
540
        ('density', 'estimate link density of contigs'),
541
        # Plotting
542
        ('movieframe', 'plot heatmap and synteny for a particular tour'),
543
        ('movie', 'plot heatmap optimization history in a tourfile'),
544
        # Reference-based analytics
545
        ('bam2mat', 'convert bam file to .npy format used in plotting'),
546
        ('mergemat', 'combine counts from multiple .npy data files'),
547
        ('heatmap', 'plot heatmap based on .npy file'),
548
            )
549
    p = ActionDispatcher(actions)
550
    p.dispatch(globals())
551
552
553
def heatmap(args):
554
    """
555
    %prog heatmap input.npy genome.json
556
557
    Plot heatmap based on .npy data file. The .npy stores a square matrix with
558
    bins of genome, and cells inside the matrix represent number of links
559
    between bin i and bin j. The `genome.json` contains the offsets of each
560
    contig/chr so that we know where to draw boundary lines, or extract per
561
    contig/chromosome heatmap.
562
    """
563
    p = OptionParser(heatmap.__doc__)
564
    p.add_option("--resolution", default=500000, type="int",
565
                 help="Resolution when counting the links")
566
    opts, args, iopts = p.set_image_options(args, figsize="10x10",
567
                                            style="white", cmap="coolwarm",
568
                                            format="png", dpi=120)
569
570
    if len(args) != 2:
571
        sys.exit(not p.print_help())
572
573
    npyfile, jsonfile = args
574
    # Load contig/chromosome starts and sizes
575
    header = json.loads(open(jsonfile).read())
576
    # Load the matrix
577
    A = np.load(npyfile)
578
    # Several concerns in practice:
579
    # The diagonal counts may be too strong, this can either be resolved by
580
    # masking them. Or perform a log transform on the entire heatmap.
581
    B = A.astype("float64")
582
    B += 1.0
583
    B = np.log(B)
584
    vmin, vmax = 1, 7
585
    B[B < vmin] = vmin
586
    B[B > vmax] = vmax
587
    print B
588
    logging.debug("Matrix log-transformation and thresholding ({}-{}) done"
589
                  .format(vmin, vmax))
590
591
    # Canvas
592
    fig = plt.figure(1, (iopts.w, iopts.h))
593
    root = fig.add_axes([0, 0, 1, 1])       # whole canvas
594
    ax = fig.add_axes([.05, .05, .9, .9])   # just the heatmap
595
596
    breaks = header["starts"].values()
597
    breaks += [header["total_bins"]]   # This is actually discarded
598
    breaks = sorted(breaks)[1:]
599
    plot_heatmap(ax, B, breaks, iopts, binsize=opts.resolution)
600
601
    # Title
602
    pf = npyfile.rsplit(".", 1)[0]
603
    root.text(.5, .98, pf, color="darkslategray", size=18,
604
              ha="center", va="center")
605
606
    normalize_axes(root)
607
    image_name = pf + "." + iopts.format
608
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
609
610
611
def mergemat(args):
612
    """
613
    %prog mergemat *.npy
614
615
    Combine counts from multiple .npy data files.
616
    """
617
    p = OptionParser(mergemat.__doc__)
618
    p.set_outfile(outfile="out")
619
    opts, args = p.parse_args(args)
620
621
    if len(args) < 1:
622
        sys.exit(not p.print_help())
623
624
    npyfiles = args
625
    A = np.load(npyfiles[0])
626
    logging.debug("Load `{}`: matrix of shape {}:; sum={}"
627
                  .format(npyfiles[0], A.shape, A.sum()))
628
    for npyfile in npyfiles[1:]:
629
        B = np.load(npyfile)
630
        A += B
631
        logging.debug("Load `{}`: sum={}"
632
                      .format(npyfiles[0], A.sum()))
633
634
    pf = opts.outfile
635
    np.save(pf, A)
636
    logging.debug("Combined {} files into `{}.npy`".format(len(npyfiles), pf))
637
638
639
def get_seqstarts(bamfile, N):
640
    """ Go through the SQ headers and pull out all sequences with size
641
    greater than the resolution settings, i.e. contains at least a few cells
642
    """
643
    import pysam
644
    bamfile = pysam.AlignmentFile(bamfile, "rb")
645
    seqsize = {}
646
    for kv in bamfile.header["SQ"]:
647
        if kv["LN"] < 10 * N:
648
            continue
649
        seqsize[kv["SN"]] = kv["LN"] / N + 1
650
651
    allseqs = natsorted(seqsize.keys())
652
    allseqsizes = np.array([seqsize[x] for x in allseqs])
653
    seqstarts = np.cumsum(allseqsizes)
654
    seqstarts = np.roll(seqstarts, 1)
655
    total_bins = seqstarts[0]
656
    seqstarts[0] = 0
657
    seqstarts = dict(zip(allseqs, seqstarts))
658
659
    return seqstarts, seqsize, total_bins
660
661
662
def get_distbins(start=100, bins=2500, ratio=1.01):
663
    """ Get exponentially sized
664
    """
665
    b = np.ones(bins, dtype="float64")
666
    b[0] = 100
667
    for i in range(1, bins):
668
        b[i] = b[i - 1] * ratio
669
    bins = np.around(b).astype(dtype="int")
670
    binsizes = np.diff(bins)
671
    return bins, binsizes
672
673
674
def bam2mat(args):
675
    """
676
    %prog bam2mat input.bam
677
678
    Convert bam file to .mat format, which is simply numpy 2D array. Important
679
    parameter is the resolution, which is the cell size. Small cell size lead
680
    to more fine-grained heatmap, but leads to large .mat size and slower
681
    plotting.
682
    """
683
    import pysam
684
    from jcvi.utils.cbook import percentage
685
686
    p = OptionParser(bam2mat.__doc__)
687
    p.add_option("--resolution", default=500000, type="int",
688
                 help="Resolution when counting the links")
689
    opts, args = p.parse_args(args)
690
691
    if len(args) != 1:
692
        sys.exit(not p.print_help())
693
694
    bamfilename, = args
695
    pf = bamfilename.rsplit(".", 1)[0]
696
    N = opts.resolution
697
    bins = 2500
698
    minsize = 100
699
700
    seqstarts, seqsize, total_bins = get_seqstarts(bamfilename, N)
701
    distbinstarts, distbinsizes = get_distbins(start=minsize, bins=bins)
702
703
    # Store the starts and sizes into a JSON file
704
    jsonfile = pf + ".json"
705
    fwjson = open(jsonfile, "w")
706
    header = {"starts": seqstarts, "sizes": seqsize, "total_bins": total_bins,
707
              "distbinstarts": list(distbinstarts),
708
              "distbinsizes": list(distbinsizes)}
709
    json.dump(header, fwjson, sort_keys=True, indent=4)
710
    fwjson.close()
711
    logging.debug("Contig bin starts written to `{}`".format(jsonfile))
712
713
    print(sorted(seqstarts.items(), key=lambda x: x[-1]))
714
    logging.debug("Initialize matrix of size {}x{}"
715
                  .format(total_bins, total_bins))
716
    A = np.zeros((total_bins, total_bins), dtype="int")
717
    B = np.zeros(bins, dtype="int")
718
719
    # Find the bin ID of each read
720
    def bin_number(chr, pos):
721
        return seqstarts[chr] + pos / N
722
723
    def distbin_number(dist, start=minsize, ratio=1.01):
724
        return int(round(math.log(dist * 1.0 / start, ratio)))
725
726
    bamfile = pysam.AlignmentFile(bamfilename, "rb")
727
    # Check all reads, rules borrowed from LACHESIS
728
    # https://github.com/shendurelab/LACHESIS/blob/master/src/GenomeLinkMatrix.cc#L1476
729
    j = k = 0
730
    for c in bamfile:
731
        j += 1
732
        if j % 100000 == 0:
733
            print >> sys.stderr, "{} reads counted".format(j)
734
        if j >= 10000: break
735
736
        if c.is_qcfail and c.is_duplicate:
737
            continue
738
        if c.is_secondary and c.is_supplementary:
739
            continue
740
        if c.mapping_quality == 0:
741
            continue
742
        if not c.is_paired:
743
            continue
744
        if c.is_read2:  # Take only one read
745
            continue
746
747
        # pysam v0.8.3 does not support keyword reference_name
748
        achr = bamfile.getrname(c.reference_id)
749
        apos = c.reference_start
750
        bchr = bamfile.getrname(c.next_reference_id)
751
        bpos = c.next_reference_start
752
        if achr not in seqstarts or bchr not in seqstarts:
753
            continue
754
        if achr == bchr:
755
            dist = abs(apos - bpos)
756
	    if dist < minsize:
757
	        continue
758
            db = distbin_number(dist)
759
            B[db] += 1
760
761
        abin, bbin = bin_number(achr, apos), bin_number(bchr, bpos)
762
        A[abin, bbin] += 1
763
        if abin != bbin:
764
            A[bbin, abin] += 1
765
766
        k += 1
767
768
    logging.debug("Total reads counted: {}".format(percentage(2 * k, j)))
769
    bamfile.close()
770
    np.save(pf, A)
771
    logging.debug("Link counts written to `{}.npy`".format(pf))
772
    np.save(pf + ".dist", B)
773
    logging.debug("Link dists written to `{}.dist.npy`".format(pf))
774
775
776
def simulate(args):
777
    """
778
    %prog simulate test
779
780
    Simulate CLM and IDS files with given names.
781
782
    The simulator assumes several distributions:
783
    - Links are distributed uniformly across genome
784
    - Log10(link_size) are distributed normally
785
    - Genes are distributed uniformly
786
    """
787
    p = OptionParser(simulate.__doc__)
788
    p.add_option("--genomesize", default=10000000, type="int",
789
                 help="Genome size")
790
    p.add_option("--genes", default=1000, type="int",
791
                 help="Number of genes")
792
    p.add_option("--contigs", default=100, type="int",
793
                 help="Number of contigs")
794
    p.add_option("--coverage", default=10, type="int",
795
                 help="Link coverage")
796
    opts, args = p.parse_args(args)
797
798
    if len(args) != 1:
799
        sys.exit(not p.print_help())
800
801
    pf, = args
802
    GenomeSize = opts.genomesize
803
    Genes = opts.genes
804
    Contigs = opts.contigs
805
    Coverage = opts.coverage
806
    PE = 500
807
    Links = int(GenomeSize * Coverage / PE)
808
809
    # Simulate the contig sizes that sum to GenomeSize
810
    # See also:
811
    # <https://en.wikipedia.org/wiki/User:Skinnerd/Simplex_Point_Picking>
812
    ContigSizes, = np.random.dirichlet([1] * Contigs, 1) * GenomeSize
813
    ContigSizes = np.array(np.round_(ContigSizes, decimals=0), dtype=int)
814
    ContigStarts = np.zeros(Contigs, dtype=int)
815
    ContigStarts[1:] = np.cumsum(ContigSizes)[:-1]
816
817
    # Write IDS file
818
    idsfile = pf + ".ids"
819
    fw = open(idsfile, "w")
820
    for i, s in enumerate(ContigSizes):
821
        print >> fw, "tig{:04d}\t{}".format(i, s)
822
    fw.close()
823
824
    # Simulate the gene positions
825
    GenePositions = np.sort(np.random.random_integers(0,
826
                            GenomeSize - 1, size=Genes))
827
    write_last_and_beds(pf, GenePositions, ContigStarts)
828
829
    # Simulate links, uniform start, with link distances following 1/x, where x
830
    # is the distance between the links. As an approximation, we have links
831
    # between [1e3, 1e7], so we map from uniform [1e-7, 1e-3]
832
    LinkStarts = np.sort(np.random.random_integers(0, GenomeSize - 1,
833
                                                   size=Links))
834
    a, b = 1e-7, 1e-3
835
    LinkSizes = np.array(np.round_(1 / ((b - a) * np.random.rand(Links) + a),
836
                         decimals=0), dtype="int")
837
    LinkEnds = LinkStarts + LinkSizes
838
839
    # Find link to contig membership
840
    LinkStartContigs = np.searchsorted(ContigStarts, LinkStarts) - 1
841
    LinkEndContigs = np.searchsorted(ContigStarts, LinkEnds) - 1
842
843
    # Extract inter-contig links
844
    InterContigLinks = (LinkStartContigs != LinkEndContigs) & \
845
                       (LinkEndContigs != Contigs)
846
    ICLinkStartContigs = LinkStartContigs[InterContigLinks]
847
    ICLinkEndContigs = LinkEndContigs[InterContigLinks]
848
    ICLinkStarts = LinkStarts[InterContigLinks]
849
    ICLinkEnds = LinkEnds[InterContigLinks]
850
851
    # Write CLM file
852
    write_clm(pf, ICLinkStartContigs, ICLinkEndContigs,
853
              ICLinkStarts, ICLinkEnds,
854
              ContigStarts, ContigSizes)
855
856
857
def write_last_and_beds(pf, GenePositions, ContigStarts):
858
    """
859
    Write LAST file, query and subject BED files.
860
    """
861
    qbedfile = pf + "tigs.bed"
862
    sbedfile = pf + "chr.bed"
863
    lastfile = "{}tigs.{}chr.last".format(pf, pf)
864
    qbedfw = open(qbedfile, "w")
865
    sbedfw = open(sbedfile, "w")
866
    lastfw = open(lastfile, "w")
867
868
    GeneContigs = np.searchsorted(ContigStarts, GenePositions) - 1
869
    for i, (c, gstart) in enumerate(zip(GeneContigs, GenePositions)):
870
        gene = "gene{:05d}".format(i)
871
        tig = "tig{:04d}".format(c)
872
        start = ContigStarts[c]
873
        cstart = gstart - start
874
        print >> qbedfw, "\t".join(str(x) for x in
875
                                   (tig, cstart, cstart + 1, gene))
876
        print >> sbedfw, "\t".join(str(x) for x in
877
                                   ("chr1", gstart, gstart + 1, gene))
878
        lastatoms = [gene, gene, 100] + [0] * 8 + [100]
879
        print >> lastfw, "\t".join(str(x) for x in lastatoms)
880
881
    qbedfw.close()
882
    sbedfw.close()
883
    lastfw.close()
884
885
886
def write_clm(pf, ICLinkStartContigs, ICLinkEndContigs,
887
              ICLinkStarts, ICLinkEnds,
888
              ContigStarts, ContigSizes):
889
    """
890
    Write CLM file from simulated data.
891
    """
892
    clm = defaultdict(list)
893
    for start, end, linkstart, linkend in \
894
            zip(ICLinkStartContigs, ICLinkEndContigs,
895
                ICLinkStarts, ICLinkEnds):
896
        start_a = ContigStarts[start]
897
        start_b = start_a + ContigSizes[start]
898
        end_a = ContigStarts[end]
899
        end_b = end_a + ContigSizes[end]
900
        if linkend >= end_b:
901
            continue
902
        clm[(start, end)].append((linkstart - start_a, start_b - linkstart,
903
                                  linkend - end_a, end_b - linkend))
904
905
    clmfile = pf + ".clm"
906
    fw = open(clmfile, "w")
907
908
    def format_array(a):
909
        return [str(x) for x in sorted(a) if x > 0]
910
911
    for (start, end), links in sorted(clm.items()):
912
        start = "tig{:04d}".format(start)
913
        end = "tig{:04d}".format(end)
914
        nlinks = len(links)
915
        if not nlinks:
916
            continue
917
        ff = format_array([(b + c) for a, b, c, d in links])
918
        fr = format_array([(b + d) for a, b, c, d in links])
919
        rf = format_array([(a + c) for a, b, c, d in links])
920
        rr = format_array([(a + d) for a, b, c, d in links])
921
        print >> fw, "{}+ {}+\t{}\t{}".format(start, end, nlinks, " ".join(ff))
922
        print >> fw, "{}+ {}-\t{}\t{}".format(start, end, nlinks, " ".join(fr))
923
        print >> fw, "{}- {}+\t{}\t{}".format(start, end, nlinks, " ".join(rf))
924
        print >> fw, "{}- {}-\t{}\t{}".format(start, end, nlinks, " ".join(rr))
925
    fw.close()
926
927
928
def density(args):
929
    """
930
    %prog density test.clm
931
932
    Estimate link density of contigs.
933
    """
934
    p = OptionParser(density.__doc__)
935
    p.add_option("--save", default=False, action="store_true",
936
                 help="Write log densitites of contigs to file")
937
    p.set_cpus()
938
    opts, args = p.parse_args(args)
939
940
    if len(args) != 1:
941
        sys.exit(not p.print_help())
942
943
    clmfile, = args
944
    clm = CLMFile(clmfile)
945
    pf = clmfile.rsplit(".", 1)[0]
946
947
    if opts.save:
948
        logdensities = clm.calculate_densities()
949
        densityfile = pf + ".density"
950
        fw = open(densityfile, "w")
951
        for name, logd in logdensities.items():
952
            s = clm.tig_to_size[name]
953
            print >> fw, "\t".join(str(x) for x in (name, s, logd))
954
        fw.close()
955
        logging.debug("Density written to `{}`".format(densityfile))
956
957
    tourfile = clmfile.rsplit(".", 1)[0] + ".tour"
958
    tour = clm.activate(tourfile=tourfile, backuptour=False)
959
    clm.flip_all(tour)
960
    clm.flip_whole(tour)
961
    clm.flip_one(tour)
962
963
964
def optimize(args):
965
    """
966
    %prog optimize test.clm
967
968
    Optimize the contig order and orientation, based on CLM file.
969
    """
970
    p = OptionParser(optimize.__doc__)
971
    p.add_option("--skiprecover", default=False, action="store_true",
972
                 help="Do not import 'recover' contigs")
973
    p.add_option("--startover", default=False, action="store_true",
974
                 help="Do not resume from existing tour file")
975
    p.add_option("--skipGA", default=False, action="store_true",
976
                 help="Skip GA step")
977
    p.set_outfile(outfile=None)
978
    p.set_cpus()
979
    opts, args = p.parse_args(args)
980
981
    if len(args) != 1:
982
        sys.exit(not p.print_help())
983
984
    clmfile, = args
985
    startover = opts.startover
986
    runGA = not opts.skipGA
987
    cpus = opts.cpus
988
989
    # Load contact map
990
    clm = CLMFile(clmfile, skiprecover=opts.skiprecover)
991
992
    tourfile = opts.outfile or clmfile.rsplit(".", 1)[0] + ".tour"
993
    if startover:
994
        tourfile = None
995
    tour = clm.activate(tourfile=tourfile)
996
997
    fwtour = open(tourfile, "w")
998
    # Store INIT tour
999
    print_tour(fwtour, clm.tour, "INIT",
1000
               clm.active_contigs, clm.oo, signs=clm.signs)
1001
1002
    if runGA:
1003
        for phase in range(1, 3):
1004
            tour = optimize_ordering(fwtour, clm, phase, cpus)
1005
            tour = clm.prune_tour(tour, cpus)
1006
1007
    # Flip orientations
1008
    phase = 1
1009
    while True:
1010
        tag1, tag2 = optimize_orientations(fwtour, clm, phase, cpus)
1011
        if tag1 == REJECT and tag2 == REJECT:
1012
            logging.debug("Terminating ... no more {}".format(ACCEPT))
1013
            break
1014
        phase += 1
1015
1016
    fwtour.close()
1017
1018
1019
def optimize_ordering(fwtour, clm, phase, cpus):
1020
    """
1021
    Optimize the ordering of contigs by Genetic Algorithm (GA).
1022
    """
1023
    from .chic import score_evaluate_M
1024
1025
    # Prepare input files
1026
    tour_contigs = clm.active_contigs
1027
    tour_sizes = clm.active_sizes
1028
    tour_M = clm.M
1029
    tour = clm.tour
1030
    signs = clm.signs
1031
    oo = clm.oo
1032
1033
    def callback(tour, gen, phase, oo):
1034
        fitness = tour.fitness if hasattr(tour, "fitness") else None
1035
        label = "GA{}-{}".format(phase, gen)
1036
        if fitness:
1037
            fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
1038
            label += "-" + fitness
1039
        if gen % 20 == 0:
1040
            print_tour(fwtour, tour, label, tour_contigs, oo, signs=signs)
1041
        return tour
1042
1043
    callbacki = partial(callback, phase=phase, oo=oo)
1044
    toolbox = GA_setup(tour)
1045
    toolbox.register("evaluate", score_evaluate_M,
1046
                     tour_sizes=tour_sizes, tour_M=tour_M)
1047
    tour, tour_fitness = GA_run(toolbox, ngen=1000, npop=100, cpus=cpus,
1048
                                callback=callbacki)
1049
    clm.tour = tour
1050
1051
    return tour
1052
1053
1054
def optimize_orientations(fwtour, clm, phase, cpus):
1055
    """
1056
    Optimize the orientations of contigs by using heuristic flipping.
1057
    """
1058
    # Prepare input files
1059
    tour_contigs = clm.active_contigs
1060
    tour = clm.tour
1061
    oo = clm.oo
1062
1063
    print_tour(fwtour, tour, "FLIPALL{}".format(phase),
1064
               tour_contigs, oo, signs=clm.signs)
1065
    tag1 = clm.flip_whole(tour)
1066
    print_tour(fwtour, tour, "FLIPWHOLE{}".format(phase),
1067
               tour_contigs, oo, signs=clm.signs)
1068
    tag2 = clm.flip_one(tour)
1069
    print_tour(fwtour, tour, "FLIPONE{}".format(phase),
1070
               tour_contigs, oo, signs=clm.signs)
1071
1072
    return tag1, tag2
1073
1074
1075
def prepare_synteny(tourfile, lastfile, odir, p, opts):
1076
    """
1077
    Prepare synteny plots for movie().
1078
    """
1079
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
1080
    qbedfile = op.abspath(qbedfile)
1081
    sbedfile = op.abspath(sbedfile)
1082
1083
    qbed = Bed(qbedfile, sorted=False)
1084
    contig_to_beds = dict(qbed.sub_beds())
1085
1086
    # Create a separate directory for the subplots and movie
1087
    mkdir(odir, overwrite=True)
1088
    os.chdir(odir)
1089
    logging.debug("Change into subdir `{}`".format(odir))
1090
1091
    # Make anchorsfile
1092
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \
1093
                  + ".anchors"
1094
    fw = open(anchorsfile, "w")
1095
    for b in Blast(lastfile):
1096
        print >> fw, "\t".join((gene_name(b.query), gene_name(b.subject),
1097
                                str(int(b.score))))
1098
    fw.close()
1099
1100
    # Symlink sbed
1101
    symlink(sbedfile, op.basename(sbedfile))
1102
1103
    return anchorsfile, qbedfile, contig_to_beds
1104
1105
1106
def separate_tour_and_o(row):
1107
    """
1108
    The tour line typically contains contig list like:
1109
    tig00044568+ tig00045748- tig00071055- tig00015093- tig00030900-
1110
1111
    This function separates the names from the orientations.
1112
    """
1113
    tour = []
1114
    tour_o = []
1115
    for contig in row.split():
1116
        if contig[-1] in ('+', '-', '?'):
1117
            tour.append(contig[:-1])
1118
            tour_o.append(contig[-1])
1119
        else:  # Unoriented
1120
            tour.append(contig)
1121
            tour_o.append('?')
1122
    return tour, tour_o
1123
1124
1125
def iter_last_tour(tourfile, clm):
1126
    """
1127
    Extract last tour from tourfile. The clm instance is also passed in to see
1128
    if any contig is covered in the clm.
1129
    """
1130
    row = open(tourfile).readlines()[-1]
1131
    _tour, _tour_o = separate_tour_and_o(row)
1132
    tour = []
1133
    tour_o = []
1134
    for tc, to in zip(_tour, _tour_o):
1135
        if tc not in clm.contigs:
1136
            logging.debug("Contig `{}` in file `{}` not found in `{}`"
1137
                          .format(tc, tourfile, clm.idsfile))
1138
            continue
1139
        tour.append(tc)
1140
        tour_o.append(to)
1141
    return tour, tour_o
1142
1143
1144
def iter_tours(tourfile, frames=1):
1145
    """
1146
    Extract tours from tourfile. Tourfile contains a set of contig
1147
    configurations, generated at each iteration of the genetic algorithm. Each
1148
    configuration has two rows, first row contains iteration id and score,
1149
    second row contains list of contigs, separated by comma.
1150
    """
1151
    fp = open(tourfile)
1152
1153
    i = 0
1154
    for row in fp:
1155
        if row[0] == '>':
1156
            label = row[1:].strip()
1157
            if label.startswith("GA"):
1158
                pf, j, score = label.split("-")
1159
                j = int(j)
1160
            else:
1161
                j = 0
1162
            i += 1
1163
        else:
1164
            if j % frames != 0:
1165
                continue
1166
            tour, tour_o = separate_tour_and_o(row)
1167
            yield i, label, tour, tour_o
1168
1169
    fp.close()
1170
1171
1172
def movie(args):
1173
    """
1174
    %prog movie test.tour test.clm ref.contigs.last
1175
1176
    Plot optimization history.
1177
    """
1178
    p = OptionParser(movie.__doc__)
1179
    p.add_option("--frames", default=500, type="int",
1180
                 help="Only plot every N frames")
1181
    p.add_option("--engine", default="ffmpeg", choices=("ffmpeg", "gifsicle"),
1182
                 help="Movie engine, output MP4 or GIF")
1183
    p.set_beds()
1184
    opts, args, iopts = p.set_image_options(args, figsize="16x8",
1185
                                            style="white", cmap="coolwarm",
1186
                                            format="png", dpi=300)
1187
1188
    if len(args) != 3:
1189
        sys.exit(not p.print_help())
1190
1191
    tourfile, clmfile, lastfile = args
1192
    tourfile = op.abspath(tourfile)
1193
    clmfile = op.abspath(clmfile)
1194
    lastfile = op.abspath(lastfile)
1195
    cwd = os.getcwd()
1196
    odir = op.basename(tourfile).rsplit(".", 1)[0] + "-movie"
1197
    anchorsfile, qbedfile, contig_to_beds = \
1198
        prepare_synteny(tourfile, lastfile, odir, p, opts)
1199
1200
    args = []
1201
    for i, label, tour, tour_o in iter_tours(tourfile, frames=opts.frames):
1202
        padi = "{:06d}".format(i)
1203
        # Make sure the anchorsfile and bedfile has the serial number in,
1204
        # otherwise parallelization may fail
1205
        a, b = op.basename(anchorsfile).split(".", 1)
1206
        ianchorsfile = a + "_" + padi + "." + b
1207
        symlink(anchorsfile, ianchorsfile)
1208
1209
        # Make BED file with new order
1210
        qb = Bed()
1211
        for contig, o in zip(tour, tour_o):
1212
            if contig not in contig_to_beds:
1213
                continue
1214
            bedlines = contig_to_beds[contig][:]
1215
            if o == '-':
1216
                bedlines.reverse()
1217
            for x in bedlines:
1218
                qb.append(x)
1219
1220
        a, b = op.basename(qbedfile).split(".", 1)
1221
        ibedfile = a + "_" + padi + "." + b
1222
        qb.print_to_file(ibedfile)
1223
        # Plot dot plot, but do not sort contigs by name (otherwise losing
1224
        # order)
1225
        image_name = padi + "." + iopts.format
1226
1227
        tour = ",".join(tour)
1228
        args.append([[tour, clmfile, ianchorsfile,
1229
                    "--outfile", image_name, "--label", label]])
1230
1231
    Jobs(movieframe, args).run()
1232
1233
    os.chdir(cwd)
1234
    make_movie(odir, odir, engine=opts.engine, format=iopts.format)
1235
1236
1237
def score(args):
1238
    """
1239
    %prog score main_results/ cached_data/ contigsfasta
1240
1241
    Score the current LACHESIS CLM.
1242
    """
1243
    p = OptionParser(score.__doc__)
1244
    p.set_cpus()
1245
    opts, args = p.parse_args(args)
1246
1247
    if len(args) != 3:
1248
        sys.exit(not p.print_help())
1249
1250
    mdir, cdir, contigsfasta = args
1251
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
1252
    sizes = Sizes(contigsfasta)
1253
    contig_names = list(sizes.iter_names())
1254
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))
1255
1256
    oo = []
1257
    # Load contact matrix
1258
    glm = op.join(cdir, "all.GLM")
1259
    N = len(contig_ids)
1260
    M = np.zeros((N, N), dtype=int)
1261
    fp = open(glm)
1262
    for row in fp:
1263
        if row[0] == '#':
1264
            continue
1265
        x, y, z = row.split()
1266
        if x == 'X':
1267
            continue
1268
        M[int(x), int(y)] = int(z)
1269
1270
    fwtour = open("tour", "w")
1271
1272
    def callback(tour, gen, oo):
1273
        fitness = tour.fitness if hasattr(tour, "fitness") else None
1274
        label = "GA-{0}".format(gen)
1275
        if fitness:
1276
            fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
1277
            label += "-" + fitness
1278
        print_tour(fwtour, tour, label, contig_names, oo)
1279
        return tour
1280
1281
    for ofile in orderingfiles:
1282
        co = ContigOrdering(ofile)
1283
        for x in co:
1284
            contig_id = contig_ids[x.contig_name]
1285
            oo.append(contig_id)
1286
        pf = op.basename(ofile).split(".")[0]
1287
        print pf
1288
        print oo
1289
1290
        tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
1291
        # Store INIT tour
1292
        print_tour(fwtour, tour, "INIT", contig_names, oo)
1293
1294
        # Faster Cython version for evaluation
1295
        from .chic import score_evaluate_M
1296
        callbacki = partial(callback, oo=oo)
1297
        toolbox = GA_setup(tour)
1298
        toolbox.register("evaluate", score_evaluate_M,
1299
                         tour_sizes=tour_sizes, tour_M=tour_M)
1300
        tour, tour.fitness = GA_run(toolbox, npop=100, cpus=opts.cpus,
1301
                                    callback=callbacki)
1302
        print tour, tour.fitness
1303
        break
1304
1305
    fwtour.close()
1306
1307
1308
def print_tour(fwtour, tour, label, contig_names, oo, signs=None):
1309
    print >> fwtour, ">" + label
1310
    if signs is not None:
1311
        contig_o = []
1312
        for x in tour:
1313
            idx = oo[x]
1314
            sign = {1: '+', 0: '?', -1: '-'}[signs[idx]]
1315
            contig_o.append(contig_names[idx] + sign)
1316
        print >> fwtour, " ".join(contig_o)
1317
    else:
1318
        print >> fwtour, " ".join(contig_names[oo[x]] for x in tour)
1319
1320
1321
def prepare_ec(oo, sizes, M):
1322
    """
1323
    This prepares EC and converts from contig_id to an index.
1324
    """
1325
    tour = range(len(oo))
1326
    tour_sizes = np.array([sizes.sizes[x] for x in oo])
1327
    tour_M = M[oo, :][:, oo]
1328
    return tour, tour_sizes, tour_M
1329
1330
1331
def score_evaluate(tour, tour_sizes=None, tour_M=None):
1332
    """ SLOW python version of the evaluation function. For benchmarking
1333
    purposes only. Do not use in production.
1334
    """
1335
    sizes_oo = np.array([tour_sizes[x] for x in tour])
1336
    sizes_cum = np.cumsum(sizes_oo) - sizes_oo / 2
1337
    s = 0
1338
    size = len(tour)
1339
    for ia in xrange(size):
1340
        a = tour[ia]
1341
        for ib in xrange(ia + 1, size):
1342
            b = tour[ib]
1343
            links = tour_M[a, b]
1344
            dist = sizes_cum[ib] - sizes_cum[ia]
1345
            if dist > 1e7:
1346
                break
1347
            s += links * 1. / dist
1348
    return s,
1349
1350
1351
def movieframe(args):
1352
    """
1353
    %prog movieframe tour test.clm contigs.ref.anchors
1354
1355
    Draw heatmap and synteny in the same plot.
1356
    """
1357
    p = OptionParser(movieframe.__doc__)
1358
    p.add_option("--label", help="Figure title")
1359
    p.set_beds()
1360
    p.set_outfile(outfile=None)
1361
    opts, args, iopts = p.set_image_options(args, figsize="16x8",
1362
                                            style="white", cmap="coolwarm",
1363
                                            format="png", dpi=120)
1364
1365
    if len(args) != 3:
1366
        sys.exit(not p.print_help())
1367
1368
    tour, clmfile, anchorsfile = args
1369
    tour = tour.split(",")
1370
    image_name = opts.outfile or ("movieframe." + iopts.format)
1371
    label = opts.label or op.basename(image_name).rsplit(".", 1)[0]
1372
1373
    clm = CLMFile(clmfile)
1374
    totalbins, bins, breaks = make_bins(tour, clm.tig_to_size)
1375
    M = read_clm(clm, totalbins, bins)
1376
1377
    fig = plt.figure(1, (iopts.w, iopts.h))
1378
    root = fig.add_axes([0, 0, 1, 1])        # whole canvas
1379
    ax1 = fig.add_axes([.05, .1, .4, .8])    # heatmap
1380
    ax2 = fig.add_axes([.55, .1, .4, .8])    # dot plot
1381
    ax2_root = fig.add_axes([.5, 0, .5, 1])  # dot plot canvas
1382
1383
    # Left axis: heatmap
1384
    plot_heatmap(ax1, M, breaks, iopts)
1385
1386
    # Right axis: synteny
1387
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts,
1388
                                                     sorted=False)
1389
    dotplot(anchorsfile, qbed, sbed, fig, ax2_root, ax2, sep=False, title="")
1390
1391
    root.text(.5, .98, clm.name, color="g", ha="center", va="center")
1392
    root.text(.5, .95, label, color="darkslategray", ha="center", va="center")
1393
    normalize_axes(root)
1394
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
1395
1396
1397
def make_bins(tour, sizes):
1398
    breaks = []
1399
    start = 0
1400
    bins = {}
1401
    for x in tour:
1402
        size = sizes[x]
1403
        end = start + int(round(size * 1. / BINSIZE))
1404
        bins[x] = (start, end)
1405
        start = end
1406
    breaks.append(start)
1407
1408
    totalbins = start
1409
    return totalbins, bins, breaks
1410
1411
1412
def read_clm(clm, totalbins, bins):
1413
    M = np.zeros((totalbins, totalbins))
1414
    for (x, y), z in clm.contacts.items():
1415
        if x not in bins or y not in bins:
1416
            continue
1417
        xstart, xend = bins[x]
1418
        ystart, yend = bins[y]
1419
        M[xstart:xend, ystart:yend] = z
1420
        M[ystart:yend, xstart:xend] = z
1421
1422
    M = np.log10(M + 1)
1423
    return M
1424
1425
1426
def plot_heatmap(ax, M, breaks, iopts, binsize=BINSIZE):
1427
    ax.imshow(M, cmap=iopts.cmap, origin="lower", interpolation='none')
1428
    xlim = ax.get_xlim()
1429
    for b in breaks[:-1]:
1430
        ax.plot([b, b], xlim, 'w-')
1431
        ax.plot(xlim, [b, b], 'w-')
1432
    ax.set_xlim(xlim)
1433
    ax.set_ylim(xlim)
1434
    ax.set_xticklabels([int(x) for x in ax.get_xticks()],
1435
                       family='Helvetica', color="gray")
1436
    ax.set_yticklabels([int(x) for x in ax.get_yticks()],
1437
                       family='Helvetica', color="gray")
1438
    binlabel = "Bins ({} per bin)".format(human_size(binsize, precision=0))
1439
    ax.set_xlabel(binlabel)
1440
1441
1442
def agp(args):
1443
    """
1444
    %prog agp main_results/ contigs.fasta
1445
1446
    Generate AGP file based on LACHESIS output.
1447
    """
1448
    p = OptionParser(agp.__doc__)
1449
    p.set_outfile()
1450
    opts, args = p.parse_args(args)
1451
1452
    if len(args) != 2:
1453
        sys.exit(not p.print_help())
1454
1455
    odir, contigsfasta = args
1456
    fwagp = must_open(opts.outfile, 'w')
1457
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
1458
    sizes = Sizes(contigsfasta).mapping
1459
    contigs = set(sizes.keys())
1460
    anchored = set()
1461
1462
    for ofile in orderingfiles:
1463
        co = ContigOrdering(ofile)
1464
        anchored |= set([x.contig_name for x in co])
1465
        obj = op.basename(ofile).split('.')[0]
1466
        co.write_agp(obj, sizes, fwagp)
1467
1468
    singletons = contigs - anchored
1469
    logging.debug('Anchored: {}, Singletons: {}'.
1470
                  format(len(anchored), len(singletons)))
1471
1472
    for s in natsorted(singletons):
1473
        order_to_agp(s, [(s, "?")], sizes, fwagp)
1474
1475
1476
if __name__ == '__main__':
1477
    main()
1478