Completed
Push — master ( 1e2f69...852146 )
by Haibao
41s
created

set_align_options()   A

Complexity

Conditions 1

Size

Total Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 9
rs 9.6666
c 0
b 0
f 0
1
#!/usr/bin/env python
2
# -*- coding: UTF-8 -*-
3
4
"""
5
Run bwa command and skips the manual run of naming intermediate output files
6
The whole pipeline is following bwa documentation at
7
<http://bio-bwa.sf.net/bwa.shtml>
8
"""
9
10
import os.path as op
11
import sys
12
import logging
13
14
from jcvi.formats.sam import output_bam, get_samfile, mapped
15
from jcvi.formats.base import FileShredder
16
from jcvi.assembly.automaton import iter_project
17
from jcvi.apps.grid import MakeManager
18
from jcvi.apps.base import OptionParser, ActionDispatcher, need_update, sh, \
19
            get_abs_path, mkdir
20
21
22
def main():
23
24
    actions = (
25
        ('index', 'wraps bwa index'),
26
        ('align', 'wraps bwa aln|mem|bwasw'),
27
        ('batch', 'run bwa in batch mode'),
28
            )
29
    p = ActionDispatcher(actions)
30
    p.dispatch(globals())
31
32
33
def batch(args):
34
    """
35
    %proj batch database.fasta project_dir output_dir
36
37
    Run bwa in batch mode.
38
    """
39
    p = OptionParser(batch.__doc__)
40
    set_align_options(p)
41
    p.set_sam_options()
42
    opts, args = p.parse_args(args)
43
44
    if len(args) != 3:
45
        sys.exit(not p.print_help())
46
47
    ref_fasta, proj_dir, outdir = args
48
    outdir = outdir.rstrip("/")
49
    s3dir = None
50
    if outdir.startswith("s3://"):
51
        s3dir = outdir
52
        outdir = op.basename(outdir)
53
        mkdir(outdir)
54
55
    mm = MakeManager()
56
    for p, pf in iter_project(proj_dir):
57
        targs = [ref_fasta] + p
58
        cmd1, bamfile = mem(targs, opts)
59
        if cmd1:
60
            cmd1 = output_bam(cmd1, bamfile)
61
        nbamfile = op.join(outdir, bamfile)
62
        cmd2 = "mv {} {}".format(bamfile, nbamfile)
63
        cmds = [cmd1, cmd2]
64
65
        if s3dir:
66
            cmd = "aws cp {} {} --sse".format(nbamfile,
67
                                              op.join(s3dir, bamfile))
68
            cmds.append(cmd)
69
70
        mm.add(p, nbamfile, cmds)
71
72
    mm.write()
73
74
75
def check_index(dbfile):
76
    dbfile = get_abs_path(dbfile)
77
    safile = dbfile + ".sa"
78
    if not op.exists(safile):
79
        cmd = "bwa index {0}".format(dbfile)
80
        sh(cmd)
81
    else:
82
        logging.error("`{0}` exists. `bwa index` already run.".format(safile))
83
84
    return dbfile
85
86
87
def check_aln(dbfile, readfile, cpus=32):
88
    from jcvi.formats.fastq import guessoffset
89
90
    saifile = readfile.rsplit(".", 1)[0] + ".sai"
91
    if need_update((dbfile, readfile), saifile):
92
        offset = guessoffset([readfile])
93
        cmd = "bwa aln " + " ".join((dbfile, readfile))
94
        cmd += " -t {0}".format(cpus)
95
        if offset == 64:
96
            cmd += " -I"
97
        sh(cmd, outfile=saifile)
98
    else:
99
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))
100
101
    return saifile
102
103
104
def index(args):
105
    """
106
    %prog index database.fasta
107
108
    Wrapper for `bwa index`. Same interface.
109
    """
110
    p = OptionParser(index.__doc__)
111
    opts, args = p.parse_args(args)
112
113
    if len(args) != 1:
114
        sys.exit(not p.print_help())
115
116
    dbfile, = args
117
    check_index(dbfile)
118
119
120
def set_align_options(p):
121
    """ Used in align() and batch()
122
    """
123
    p.add_option("--bwa", default="bwa", help="Run bwa at this path")
124
    p.add_option("--rg", help="Read group")
125
    p.add_option("--readtype",
126
                 choices=("pacbio", "pbread", "ont2d", "intractg"),
127
                 help="Read type in bwa-mem")
128
    p.set_cutoff(cutoff=800)
129
130
131 View Code Duplication
def align(args):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
132
    """
133
    %prog align database.fasta read1.fq [read2.fq]
134
135
    Wrapper for three modes of BWA - mem (default), aln, bwasw (long reads).
136
    """
137
    valid_modes = ("bwasw", "aln", "mem")
138
    p = OptionParser(align.__doc__)
139
    p.add_option("--mode", default="mem", choices=valid_modes, help="BWA mode")
140
    set_align_options(p)
141
    p.set_sam_options()
142
143
    opts, args = p.parse_args(args)
144
    mode = opts.mode
145
    nargs = len(args)
146
147
    if nargs not in (2, 3):
148
        sys.exit(not p.print_help())
149
150
    tag = "bwa-{0}: ".format(mode)
151
    c = mem
152
    if nargs == 2:
153
        tag += "Single-end alignment"
154
        if mode == "bwasw":
155 View Code Duplication
            c = bwasw
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
156
        elif mode == "aln":
157
            c = samse
158
    else:
159
        assert mode != "bwasw", "Cannot use --bwasw with paired-end mode"
160
        tag += "Paired-end alignment"
161
        if mode == "aln":
162
            c = sampe
163
164
    logging.debug(tag)
165
    cmd, samfile = c(args, opts)
166
    if cmd:
167
        cmd = output_bam(cmd, samfile)
168
169
    bam = opts.bam
170
    unmapped = opts.unmapped
171
172
    sh(cmd)
173
    if unmapped:
174
        dbfile, readfile = args[:2]
175
        mopts = [samfile, "--unmapped"]
176
        if not bam:
177
            mopts += ["--sam"]
178
        mapped(mopts)
179
        FileShredder([samfile])
180
181
    return samfile, None
182
183
184
def samse(args, opts):
185
    """
186
    %prog samse database.fasta short_read.fastq
187
188
    Wrapper for `bwa samse`. Output will be short_read.sam.
189
    """
190
    dbfile, readfile = args
191
    dbfile = check_index(dbfile)
192
    saifile = check_aln(dbfile, readfile, cpus=opts.cpus)
193
194
    samfile, _, unmapped = get_samfile(readfile, dbfile,
195
                                       bam=opts.bam, unmapped=opts.unmapped)
196
    if not need_update((dbfile, saifile), samfile):
197
        logging.error("`{0}` exists. `bwa samse` already run.".format(samfile))
198
        return "", samfile
199
200
    cmd = "bwa samse {0} {1} {2}".format(dbfile, saifile, readfile)
201
    cmd += " " + opts.extra
202
    if opts.uniq:
203
        cmd += " -n 1"
204
205
    return cmd, samfile
206
207
208
def sampe(args, opts):
209
    """
210 View Code Duplication
    %prog sampe database.fasta read1.fq read2.fq
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
211
212
    Wrapper for `bwa sampe`. Output will be read1.sam.
213
    """
214
    dbfile, read1file, read2file = args
215
    dbfile = check_index(dbfile)
216
    sai1file = check_aln(dbfile, read1file, cpus=opts.cpus)
217
    sai2file = check_aln(dbfile, read2file, cpus=opts.cpus)
218
219
    samfile, _, unmapped = get_samfile(read1file, dbfile,
220
                                       bam=opts.bam, unmapped=opts.unmapped)
221
    if not need_update((dbfile, sai1file, sai2file), samfile):
222
        logging.error("`{0}` exists. `bwa samse` already run.".format(samfile))
223
        return "", samfile
224
225
    cmd = "bwa sampe " + " ".join((dbfile, sai1file, sai2file,
226
                                   read1file, read2file))
227
    cmd += " " + opts.extra
228
    if opts.cutoff:
229
        cmd += " -a {0}".format(opts.cutoff)
230
    if opts.uniq:
231
        cmd += " -n 1"
232
233
    return cmd, samfile
234
235
236
def mem(args, opts):
237
    """
238
    %prog mem database.fasta read1.fq [read2.fq]
239
240
    Wrapper for `bwa mem`. Output will be read1.sam.
241
    """
242
    dbfile, read1file = args[:2]
243
    readtype = opts.readtype
244
    pl = readtype or "illumina"
245
246
    pf = op.basename(read1file).split(".")[0]
247
    rg = opts.rg or r"@RG\tID:{0}\tSM:sm\tLB:lb\tPL:{1}".format(pf, pl)
248
    dbfile = check_index(dbfile)
249
    args[0] = dbfile
250
    samfile, _, unmapped = get_samfile(read1file, dbfile,
251
                                       bam=opts.bam, unmapped=opts.unmapped)
252
    if not need_update(read1file, samfile):
253
        logging.error("`{0}` exists. `bwa mem` already run.".format(samfile))
254
        return "", samfile
255
256
    cmd = "{} mem".format(opts.bwa)
257
    '''
258
    -M Mark shorter split hits as secondary (for Picard compatibility).
259
    '''
260
    cmd += " -M -t {0}".format(opts.cpus)
261
    cmd += ' -R "{0}"'.format(rg)
262
    if readtype:
263
        cmd += " -x {0}".format(readtype)
264
    cmd += " " + opts.extra
265
    cmd += " ".join(args)
266
267
    return cmd, samfile
268
269
270
def bwasw(args, opts):
271
    """
272
    %prog bwasw database.fasta long_read.fastq
273
274
    Wrapper for `bwa bwasw`. Output will be long_read.sam.
275
    """
276
    dbfile, readfile = args
277
    dbfile = check_index(dbfile)
278
279
    samfile, _, unmapped = get_samfile(readfile, dbfile,
280
                                       bam=opts.bam, unmapped=opts.unmapped)
281
    if not need_update(dbfile, samfile):
282
        logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile))
283
        return "", samfile
284
285
    cmd = "bwa bwasw " + " ".join(args)
286
    cmd += " -t {0}".format(opts.cpus)
287
    cmd += " " + opts.extra
288
    return cmd, samfile
289
290
291
if __name__ == '__main__':
292
    main()
293