1
|
|
|
#!/usr/bin/env python |
2
|
|
|
# -*- coding: UTF-8 -*- |
3
|
|
|
|
4
|
|
|
""" |
5
|
|
|
Run bwa command and skips the manual run of naming intermediate output files |
6
|
|
|
The whole pipeline is following bwa documentation at |
7
|
|
|
<http://bio-bwa.sf.net/bwa.shtml> |
8
|
|
|
""" |
9
|
|
|
|
10
|
|
|
import os.path as op |
11
|
|
|
import sys |
12
|
|
|
import logging |
13
|
|
|
|
14
|
|
|
from jcvi.formats.sam import output_bam, get_samfile, mapped |
15
|
|
|
from jcvi.formats.base import FileShredder |
16
|
|
|
from jcvi.assembly.automaton import iter_project |
17
|
|
|
from jcvi.apps.grid import MakeManager |
18
|
|
|
from jcvi.apps.base import OptionParser, ActionDispatcher, need_update, sh, \ |
19
|
|
|
get_abs_path, mkdir |
20
|
|
|
|
21
|
|
|
|
22
|
|
|
def main(): |
23
|
|
|
|
24
|
|
|
actions = ( |
25
|
|
|
('index', 'wraps bwa index'), |
26
|
|
|
('align', 'wraps bwa aln|mem|bwasw'), |
27
|
|
|
('batch', 'run bwa in batch mode'), |
28
|
|
|
) |
29
|
|
|
p = ActionDispatcher(actions) |
30
|
|
|
p.dispatch(globals()) |
31
|
|
|
|
32
|
|
|
|
33
|
|
|
def batch(args): |
34
|
|
|
""" |
35
|
|
|
%proj batch database.fasta project_dir output_dir |
36
|
|
|
|
37
|
|
|
Run bwa in batch mode. |
38
|
|
|
""" |
39
|
|
|
p = OptionParser(batch.__doc__) |
40
|
|
|
set_align_options(p) |
41
|
|
|
p.set_sam_options() |
42
|
|
|
opts, args = p.parse_args(args) |
43
|
|
|
|
44
|
|
|
if len(args) != 3: |
45
|
|
|
sys.exit(not p.print_help()) |
46
|
|
|
|
47
|
|
|
ref_fasta, proj_dir, outdir = args |
48
|
|
|
outdir = outdir.rstrip("/") |
49
|
|
|
s3dir = None |
50
|
|
|
if outdir.startswith("s3://"): |
51
|
|
|
s3dir = outdir |
52
|
|
|
outdir = op.basename(outdir) |
53
|
|
|
mkdir(outdir) |
54
|
|
|
|
55
|
|
|
mm = MakeManager() |
56
|
|
|
for p, pf in iter_project(proj_dir): |
57
|
|
|
targs = [ref_fasta] + p |
58
|
|
|
cmd1, bamfile = mem(targs, opts) |
59
|
|
|
if cmd1: |
60
|
|
|
cmd1 = output_bam(cmd1, bamfile) |
61
|
|
|
nbamfile = op.join(outdir, bamfile) |
62
|
|
|
cmd2 = "mv {} {}".format(bamfile, nbamfile) |
63
|
|
|
cmds = [cmd1, cmd2] |
64
|
|
|
|
65
|
|
|
if s3dir: |
66
|
|
|
cmd = "aws cp {} {} --sse".format(nbamfile, |
67
|
|
|
op.join(s3dir, bamfile)) |
68
|
|
|
cmds.append(cmd) |
69
|
|
|
|
70
|
|
|
mm.add(p, nbamfile, cmds) |
71
|
|
|
|
72
|
|
|
mm.write() |
73
|
|
|
|
74
|
|
|
|
75
|
|
|
def check_index(dbfile): |
76
|
|
|
dbfile = get_abs_path(dbfile) |
77
|
|
|
safile = dbfile + ".sa" |
78
|
|
|
if not op.exists(safile): |
79
|
|
|
cmd = "bwa index {0}".format(dbfile) |
80
|
|
|
sh(cmd) |
81
|
|
|
else: |
82
|
|
|
logging.error("`{0}` exists. `bwa index` already run.".format(safile)) |
83
|
|
|
|
84
|
|
|
return dbfile |
85
|
|
|
|
86
|
|
|
|
87
|
|
|
def check_aln(dbfile, readfile, cpus=32): |
88
|
|
|
from jcvi.formats.fastq import guessoffset |
89
|
|
|
|
90
|
|
|
saifile = readfile.rsplit(".", 1)[0] + ".sai" |
91
|
|
|
if need_update((dbfile, readfile), saifile): |
92
|
|
|
offset = guessoffset([readfile]) |
93
|
|
|
cmd = "bwa aln " + " ".join((dbfile, readfile)) |
94
|
|
|
cmd += " -t {0}".format(cpus) |
95
|
|
|
if offset == 64: |
96
|
|
|
cmd += " -I" |
97
|
|
|
sh(cmd, outfile=saifile) |
98
|
|
|
else: |
99
|
|
|
logging.error("`{0}` exists. `bwa aln` already run.".format(saifile)) |
100
|
|
|
|
101
|
|
|
return saifile |
102
|
|
|
|
103
|
|
|
|
104
|
|
|
def index(args): |
105
|
|
|
""" |
106
|
|
|
%prog index database.fasta |
107
|
|
|
|
108
|
|
|
Wrapper for `bwa index`. Same interface. |
109
|
|
|
""" |
110
|
|
|
p = OptionParser(index.__doc__) |
111
|
|
|
opts, args = p.parse_args(args) |
112
|
|
|
|
113
|
|
|
if len(args) != 1: |
114
|
|
|
sys.exit(not p.print_help()) |
115
|
|
|
|
116
|
|
|
dbfile, = args |
117
|
|
|
check_index(dbfile) |
118
|
|
|
|
119
|
|
|
|
120
|
|
|
def set_align_options(p): |
121
|
|
|
""" Used in align() and batch() |
122
|
|
|
""" |
123
|
|
|
p.add_option("--bwa", default="bwa", help="Run bwa at this path") |
124
|
|
|
p.add_option("--rg", help="Read group") |
125
|
|
|
p.add_option("--readtype", |
126
|
|
|
choices=("pacbio", "pbread", "ont2d", "intractg"), |
127
|
|
|
help="Read type in bwa-mem") |
128
|
|
|
p.set_cutoff(cutoff=800) |
129
|
|
|
|
130
|
|
|
|
131
|
|
View Code Duplication |
def align(args): |
|
|
|
|
132
|
|
|
""" |
133
|
|
|
%prog align database.fasta read1.fq [read2.fq] |
134
|
|
|
|
135
|
|
|
Wrapper for three modes of BWA - mem (default), aln, bwasw (long reads). |
136
|
|
|
""" |
137
|
|
|
valid_modes = ("bwasw", "aln", "mem") |
138
|
|
|
p = OptionParser(align.__doc__) |
139
|
|
|
p.add_option("--mode", default="mem", choices=valid_modes, help="BWA mode") |
140
|
|
|
set_align_options(p) |
141
|
|
|
p.set_sam_options() |
142
|
|
|
|
143
|
|
|
opts, args = p.parse_args(args) |
144
|
|
|
mode = opts.mode |
145
|
|
|
nargs = len(args) |
146
|
|
|
|
147
|
|
|
if nargs not in (2, 3): |
148
|
|
|
sys.exit(not p.print_help()) |
149
|
|
|
|
150
|
|
|
tag = "bwa-{0}: ".format(mode) |
151
|
|
|
c = mem |
152
|
|
|
if nargs == 2: |
153
|
|
|
tag += "Single-end alignment" |
154
|
|
|
if mode == "bwasw": |
155
|
|
View Code Duplication |
c = bwasw |
|
|
|
|
156
|
|
|
elif mode == "aln": |
157
|
|
|
c = samse |
158
|
|
|
else: |
159
|
|
|
assert mode != "bwasw", "Cannot use --bwasw with paired-end mode" |
160
|
|
|
tag += "Paired-end alignment" |
161
|
|
|
if mode == "aln": |
162
|
|
|
c = sampe |
163
|
|
|
|
164
|
|
|
logging.debug(tag) |
165
|
|
|
cmd, samfile = c(args, opts) |
166
|
|
|
if cmd: |
167
|
|
|
cmd = output_bam(cmd, samfile) |
168
|
|
|
|
169
|
|
|
bam = opts.bam |
170
|
|
|
unmapped = opts.unmapped |
171
|
|
|
|
172
|
|
|
sh(cmd) |
173
|
|
|
if unmapped: |
174
|
|
|
dbfile, readfile = args[:2] |
175
|
|
|
mopts = [samfile, "--unmapped"] |
176
|
|
|
if not bam: |
177
|
|
|
mopts += ["--sam"] |
178
|
|
|
mapped(mopts) |
179
|
|
|
FileShredder([samfile]) |
180
|
|
|
|
181
|
|
|
return samfile, None |
182
|
|
|
|
183
|
|
|
|
184
|
|
|
def samse(args, opts): |
185
|
|
|
""" |
186
|
|
|
%prog samse database.fasta short_read.fastq |
187
|
|
|
|
188
|
|
|
Wrapper for `bwa samse`. Output will be short_read.sam. |
189
|
|
|
""" |
190
|
|
|
dbfile, readfile = args |
191
|
|
|
dbfile = check_index(dbfile) |
192
|
|
|
saifile = check_aln(dbfile, readfile, cpus=opts.cpus) |
193
|
|
|
|
194
|
|
|
samfile, _, unmapped = get_samfile(readfile, dbfile, |
195
|
|
|
bam=opts.bam, unmapped=opts.unmapped) |
196
|
|
|
if not need_update((dbfile, saifile), samfile): |
197
|
|
|
logging.error("`{0}` exists. `bwa samse` already run.".format(samfile)) |
198
|
|
|
return "", samfile |
199
|
|
|
|
200
|
|
|
cmd = "bwa samse {0} {1} {2}".format(dbfile, saifile, readfile) |
201
|
|
|
cmd += " " + opts.extra |
202
|
|
|
if opts.uniq: |
203
|
|
|
cmd += " -n 1" |
204
|
|
|
|
205
|
|
|
return cmd, samfile |
206
|
|
|
|
207
|
|
|
|
208
|
|
|
def sampe(args, opts): |
209
|
|
|
""" |
210
|
|
View Code Duplication |
%prog sampe database.fasta read1.fq read2.fq |
|
|
|
|
211
|
|
|
|
212
|
|
|
Wrapper for `bwa sampe`. Output will be read1.sam. |
213
|
|
|
""" |
214
|
|
|
dbfile, read1file, read2file = args |
215
|
|
|
dbfile = check_index(dbfile) |
216
|
|
|
sai1file = check_aln(dbfile, read1file, cpus=opts.cpus) |
217
|
|
|
sai2file = check_aln(dbfile, read2file, cpus=opts.cpus) |
218
|
|
|
|
219
|
|
|
samfile, _, unmapped = get_samfile(read1file, dbfile, |
220
|
|
|
bam=opts.bam, unmapped=opts.unmapped) |
221
|
|
|
if not need_update((dbfile, sai1file, sai2file), samfile): |
222
|
|
|
logging.error("`{0}` exists. `bwa samse` already run.".format(samfile)) |
223
|
|
|
return "", samfile |
224
|
|
|
|
225
|
|
|
cmd = "bwa sampe " + " ".join((dbfile, sai1file, sai2file, |
226
|
|
|
read1file, read2file)) |
227
|
|
|
cmd += " " + opts.extra |
228
|
|
|
if opts.cutoff: |
229
|
|
|
cmd += " -a {0}".format(opts.cutoff) |
230
|
|
|
if opts.uniq: |
231
|
|
|
cmd += " -n 1" |
232
|
|
|
|
233
|
|
|
return cmd, samfile |
234
|
|
|
|
235
|
|
|
|
236
|
|
|
def mem(args, opts): |
237
|
|
|
""" |
238
|
|
|
%prog mem database.fasta read1.fq [read2.fq] |
239
|
|
|
|
240
|
|
|
Wrapper for `bwa mem`. Output will be read1.sam. |
241
|
|
|
""" |
242
|
|
|
dbfile, read1file = args[:2] |
243
|
|
|
readtype = opts.readtype |
244
|
|
|
pl = readtype or "illumina" |
245
|
|
|
|
246
|
|
|
pf = op.basename(read1file).split(".")[0] |
247
|
|
|
rg = opts.rg or r"@RG\tID:{0}\tSM:sm\tLB:lb\tPL:{1}".format(pf, pl) |
248
|
|
|
dbfile = check_index(dbfile) |
249
|
|
|
args[0] = dbfile |
250
|
|
|
samfile, _, unmapped = get_samfile(read1file, dbfile, |
251
|
|
|
bam=opts.bam, unmapped=opts.unmapped) |
252
|
|
|
if not need_update(read1file, samfile): |
253
|
|
|
logging.error("`{0}` exists. `bwa mem` already run.".format(samfile)) |
254
|
|
|
return "", samfile |
255
|
|
|
|
256
|
|
|
cmd = "{} mem".format(opts.bwa) |
257
|
|
|
''' |
258
|
|
|
-M Mark shorter split hits as secondary (for Picard compatibility). |
259
|
|
|
''' |
260
|
|
|
cmd += " -M -t {0}".format(opts.cpus) |
261
|
|
|
cmd += ' -R "{0}"'.format(rg) |
262
|
|
|
if readtype: |
263
|
|
|
cmd += " -x {0}".format(readtype) |
264
|
|
|
cmd += " " + opts.extra |
265
|
|
|
cmd += " ".join(args) |
266
|
|
|
|
267
|
|
|
return cmd, samfile |
268
|
|
|
|
269
|
|
|
|
270
|
|
|
def bwasw(args, opts): |
271
|
|
|
""" |
272
|
|
|
%prog bwasw database.fasta long_read.fastq |
273
|
|
|
|
274
|
|
|
Wrapper for `bwa bwasw`. Output will be long_read.sam. |
275
|
|
|
""" |
276
|
|
|
dbfile, readfile = args |
277
|
|
|
dbfile = check_index(dbfile) |
278
|
|
|
|
279
|
|
|
samfile, _, unmapped = get_samfile(readfile, dbfile, |
280
|
|
|
bam=opts.bam, unmapped=opts.unmapped) |
281
|
|
|
if not need_update(dbfile, samfile): |
282
|
|
|
logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile)) |
283
|
|
|
return "", samfile |
284
|
|
|
|
285
|
|
|
cmd = "bwa bwasw " + " ".join(args) |
286
|
|
|
cmd += " -t {0}".format(opts.cpus) |
287
|
|
|
cmd += " " + opts.extra |
288
|
|
|
return cmd, samfile |
289
|
|
|
|
290
|
|
|
|
291
|
|
|
if __name__ == '__main__': |
292
|
|
|
main() |
293
|
|
|
|