1
|
|
|
require 'json' |
2
|
|
|
# GeoDiver NameSpace |
3
|
|
|
module GeoDiver |
4
|
|
|
# module to run the Load the GEO dataset. |
5
|
|
|
module LoadGeoData |
6
|
|
|
    # To signal error in query sequence or options.
    #
    # Raised for user-facing input errors (e.g. a missing GEO accession
    # parameter or an accession that cannot be found/parsed). Shadows
    # Ruby's built-in ArgumentError within this namespace while still
    # being rescuable as one.
    class ArgumentError < ArgumentError
    end
11
|
|
|
|
12
|
|
|
    # To signal internal errors.
    #
    # RuntimeError is raised when there is a problem in writing the input file,
    # running R Script, writing the output etc. These are rare, infrastructure
    # errors, used internally, and of concern only to the admins/developers.
    # One example of a RuntimeError would be R libraries not installed.
    class RuntimeError < RuntimeError
    end
20
|
|
|
|
21
|
|
|
class << self |
22
|
|
|
extend Forwardable |
23
|
|
|
|
24
|
|
|
def_delegators GeoDiver, :logger, :public_dir, :db_dir |
25
|
|
|
|
26
|
|
|
# Check if the GEO database has already been downloaded, if not, then |
27
|
|
|
# download the GEO dataset and extract the meta data and convert into |
28
|
|
|
# RData |
29
|
|
|
def run(params, soft_link = true) |
30
|
|
|
init(params) |
31
|
|
|
geo_accession = params['geo_db'].upcase |
32
|
|
|
meta_json_file = File.join(db_dir, geo_accession, |
33
|
|
|
"#{geo_accession}.json") |
34
|
|
|
if File.exist? meta_json_file |
35
|
|
|
logger.debug("Found GeoDb at: '#{meta_json_file}'") |
36
|
|
|
logger.debug("Parsing GeoDb '#{geo_accession}'") |
37
|
|
|
meta_data = parse_meta_data(meta_json_file) |
38
|
|
|
else |
39
|
|
|
logger.debug("Local GeoDb for '#{geo_accession}' not found.") |
40
|
|
|
meta_data = download_and_parse_meta_data(geo_accession) |
41
|
|
|
write_to_json(meta_data, meta_json_file) |
42
|
|
|
end |
43
|
|
|
if soft_link |
44
|
|
|
soft_link_meta_json_to_public_dir(geo_accession, meta_json_file) |
45
|
|
|
end |
46
|
|
|
logger.debug('GeoDb loaded into memory') |
47
|
|
|
meta_data |
48
|
|
|
end |
49
|
|
|
|
50
|
|
|
def convert_geodb_into_rdata(geo_accession) |
51
|
|
|
geo_accession = geo_accession.upcase |
52
|
|
|
return if File.exist?(File.join(db_dir, geo_accession, |
53
|
|
|
"#{geo_accession}.RData")) |
54
|
|
|
logger.debug("Running: #{load_geo_db_cmd(geo_accession)}") |
55
|
|
|
Thread.new { run_load_geo_db_cmd(geo_accession) } |
56
|
|
|
end |
57
|
|
|
|
58
|
|
|
private |
59
|
|
|
|
60
|
|
|
# Verify paramaters |
61
|
|
|
def init(params) |
62
|
|
|
assert_geo_db_present(params) |
63
|
|
|
end |
64
|
|
|
|
65
|
|
View Code Duplication |
# |
|
|
|
|
66
|
|
|
def assert_geo_db_present(params) |
67
|
|
|
logger.debug('Checking if the GEO DB parameter is present.') |
68
|
|
|
return unless params['geo_db'].nil? || params['geo_db'].empty? |
69
|
|
|
raise ArgumentError, 'No GEO database provided.' |
70
|
|
|
end |
71
|
|
|
|
72
|
|
|
def parse_meta_data(meta_json_file) |
73
|
|
|
logger.debug("Parse the Meta JSON file at: #{meta_json_file}") |
74
|
|
|
meta_file_content = IO.read meta_json_file |
75
|
|
|
JSON.parse(meta_file_content) |
76
|
|
|
end |
77
|
|
|
|
78
|
|
|
def download_and_parse_meta_data(geo_accession) |
79
|
|
|
data = download_meta_data(geo_accession) |
80
|
|
|
meta = parse_gds_db(data) if geo_accession =~ /^GDS/ |
81
|
|
|
meta = parse_gse_db(data) if geo_accession =~ /^GSE/ |
82
|
|
|
assert_meta_data(meta) |
83
|
|
|
meta |
84
|
|
|
end |
85
|
|
|
|
86
|
|
|
# |
87
|
|
|
def download_meta_data(geo_accession) |
88
|
|
|
file = download_geo_file(geo_accession) |
89
|
|
|
read_geo_file(file) |
90
|
|
|
rescue |
91
|
|
|
raise ArgumentError, 'GeoDiver was unable to download the GEO Database' |
92
|
|
|
end |
93
|
|
|
|
94
|
|
|
def assert_meta_data(meta_data) |
95
|
|
|
return unless meta_data['Factors'].empty? || meta_data['Factors'].nil? |
96
|
|
|
raise ArgumentError, 'GeoDiver was unable to parse the GEO Database' |
97
|
|
|
end |
98
|
|
|
|
99
|
|
|
# |
100
|
|
|
def download_geo_file(geo_accession) |
101
|
|
|
remote_url = generate_remote_url(geo_accession) |
102
|
|
|
logger.debug "Remote URL: #{remote_url}" |
103
|
|
|
return if remote_url.empty? || remote_url.nil? |
104
|
|
|
output_dir = File.join(db_dir, geo_accession) |
105
|
|
|
FileUtils.mkdir(output_dir) unless Dir.exist? output_dir |
106
|
|
|
file = File.basename(remote_url).delete('*') |
107
|
|
|
compressed = File.join(output_dir, file) |
108
|
|
|
wget_geo_file(remote_url, compressed, geo_accession, output_dir) |
109
|
|
|
compressing_geo_file(compressed) |
110
|
|
|
end |
111
|
|
|
|
112
|
|
|
      # Download the remote GEO file with wget; on wget failure the shell
      # `||` fallback removes the partially created output directory.
      # Raises ArgumentError when the download did not succeed.
      #
      # NOTE(review): $CHILD_STATUS requires `require 'English'` —
      # presumably loaded elsewhere in the app; verify. Because of the
      # `|| rm -r` fallback, the exit status checked below is that of the
      # whole shell command line, not of wget alone.
      def wget_geo_file(remote_url, compressed, geo_accession, output_dir)
        logger.debug("Downloading from: #{remote_url} ==> #{compressed}")
        `wget -q #{remote_url} -O #{compressed} || rm -r #{output_dir}`
        return if $CHILD_STATUS.exitstatus.zero?
        logger.debug "Cannot find Geo Dataset on GEO: #{geo_accession}"
        raise ArgumentError, "Cannot find Geo Dataset on GEO: #{geo_accession}"
      end
119
|
|
|
|
120
|
|
|
def compressing_geo_file(compressed) |
121
|
|
|
logger.debug("Uncompressing file: #{compressed.gsub('.gz', '')}") |
122
|
|
|
system "gunzip --force -c #{compressed} > #{compressed.gsub('.gz', '')}" |
123
|
|
|
compressed.gsub('.gz', '') |
124
|
|
|
end |
125
|
|
|
|
126
|
|
|
# |
127
|
|
|
def generate_remote_url(geo_accession) |
128
|
|
|
cmd = "bionode-ncbi search gds #{geo_accession} |"\ |
129
|
|
|
" jq -cr 'select(.accession == \"#{geo_accession}\") | .ftplink'" |
130
|
|
|
url = `#{cmd}`.chomp! |
131
|
|
|
return if url.nil? || url.empty? |
132
|
|
|
if geo_accession =~ /^GDS/ |
133
|
|
|
url + 'soft/' + geo_accession + '.soft.gz' |
134
|
|
|
elsif geo_accession =~ /^GSE/ |
135
|
|
|
url + 'matrix/' + geo_accession + '*_series_matrix.txt.gz' |
136
|
|
|
end |
137
|
|
|
end |
138
|
|
|
|
139
|
|
|
# Loads the file into memory line by line |
140
|
|
|
# Stop loading the file once it has read all the meta data. |
141
|
|
|
def read_geo_file(file) |
142
|
|
|
data = [] |
143
|
|
|
IO.foreach(file) do |line| |
144
|
|
|
break if line =~ /^#ID_REF/ |
145
|
|
|
data << line |
146
|
|
|
end |
147
|
|
|
data.join |
148
|
|
|
end |
149
|
|
|
|
150
|
|
|
# |
151
|
|
|
def parse_gds_db(d) |
152
|
|
|
{ |
153
|
|
|
'Accession' => d.match(/\^DATASET = (.*)/)[1], |
154
|
|
|
'Title' => d.match(/!dataset_title = (.*)/)[1], |
155
|
|
|
'Description' => d.match(/!dataset_description = (.*)/)[1], |
156
|
|
|
'Sample_Organism' => d.match(/!dataset_platform_organism = (.*)/)[1], |
157
|
|
|
'Factors' => parse_gds_factors(d), |
158
|
|
|
'Reference' => d.match(/!Database_ref = (.*)/)[1], |
159
|
|
|
'Update_Date' => d.match(/!dataset_update_date = (.*)/)[1] |
160
|
|
|
} |
161
|
|
|
end |
162
|
|
|
|
163
|
|
|
def parse_gse_db(d) |
164
|
|
|
{ |
165
|
|
|
'Accession' => d.match(/!Series_geo_accession\t"(.*)"/)[1], |
166
|
|
|
'Title' => d.match(/!Series_title\t"(.*)"/)[1], |
167
|
|
|
'Description' => d.match(/!Series_summary\t"(.*)"/)[1], |
168
|
|
|
'Sample_Organism' => parse_sample_organism(d), |
169
|
|
|
'Factors' => parse_gse_factors(d), |
170
|
|
|
'Reference' => d.match(/!Series_relation\t"(.*)"/)[1], |
171
|
|
|
'Update_Date' => d.match(/!Series_last_update_date\t"(.*)"/)[1] |
172
|
|
|
} |
173
|
|
|
end |
174
|
|
|
|
175
|
|
|
# |
176
|
|
|
def parse_gds_factors(data) |
177
|
|
|
subsets = data.gsub(/\^DATA.*\n/, '').gsub(/\![dD]ata.*\n/, '') |
178
|
|
|
factors = {} |
179
|
|
|
subsets.lines.each_slice(5) do |subset| |
180
|
|
|
desc = subset[2].match(/\!subset_description = (.*)/)[1] |
181
|
|
|
type = subset[4].match(/\!subset_type = (.*)/)[1].tr(' ', '.') |
182
|
|
|
factors[type] ||= {} |
183
|
|
|
factors[type]['options'] ||= [] |
184
|
|
|
factors[type]['options'] << desc |
185
|
|
|
factors[type]['value'] = type |
186
|
|
|
end |
187
|
|
|
factors |
188
|
|
|
end |
189
|
|
|
|
190
|
|
|
def parse_gse_factors(data) |
191
|
|
|
subsets = data.scan(/!Sample_characteristics_ch1\t(.*)/) |
192
|
|
|
factors = {} |
193
|
|
|
subsets.each_with_index do |feature, idx| |
194
|
|
|
a = feature[0].split(/\"?\t?\"/) |
195
|
|
|
a.delete_if { |e| e =~ /^\s+$/ || e.empty? } |
196
|
|
|
a.each do |e| |
197
|
|
|
split = e.split(': ') |
198
|
|
|
type = split[0] |
199
|
|
|
factors[type] ||= {} |
200
|
|
|
factors[type]['value'] = 'characteristics_ch1' |
201
|
|
|
factors[type]['value'] += ".#{idx}" if idx > 0 |
202
|
|
|
factors[type]['options'] ||= [] |
203
|
|
|
factors[type]['options'] << e |
204
|
|
|
end |
205
|
|
|
end |
206
|
|
|
factors.each { |_, e| e['options'].uniq! } |
207
|
|
|
factors.delete_if { |_, e| e['options'].size == 1 } |
208
|
|
|
factors |
209
|
|
|
end |
210
|
|
|
|
211
|
|
|
def parse_sample_organism(data) |
212
|
|
|
subset = data.match(/!Sample_organism_ch1\t(.*)/)[1] |
213
|
|
|
organism = subset.split(/\"?\t?\"/) |
214
|
|
|
organism.shift |
215
|
|
|
organism.uniq |
216
|
|
|
end |
217
|
|
|
|
218
|
|
|
# |
219
|
|
|
def write_to_json(hash, output_json) |
220
|
|
|
logger.debug("Writing meta data to file: #{output_json}") |
221
|
|
|
File.open(output_json, 'w') { |f| f.puts hash.to_json } |
222
|
|
|
end |
223
|
|
|
|
224
|
|
|
# |
225
|
|
|
def soft_link_meta_json_to_public_dir(geo_accession, meta_json_file) |
226
|
|
|
public_meta_json = File.join(public_dir, 'GeoDiver/DBs/', |
227
|
|
|
"#{geo_accession}.json") |
228
|
|
|
logger.debug("Creating a Soft Link from: #{meta_json_file} ==>" \ |
229
|
|
|
" #{public_meta_json}") |
230
|
|
|
return if File.exist? public_meta_json |
231
|
|
|
FileUtils.ln_s(meta_json_file, public_meta_json) |
232
|
|
|
end |
233
|
|
|
|
234
|
|
|
# |
235
|
|
|
def load_geo_db_cmd(geo_accession) |
236
|
|
|
geo_db_dir = File.join(db_dir, geo_accession) |
237
|
|
|
rdata_file = File.join(geo_db_dir, "#{geo_accession}.RData") |
238
|
|
|
"Rscript #{File.join(GeoDiver.root, 'RCore/download_GEO.R')}" \ |
239
|
|
|
" --accession #{geo_accession}" \ |
240
|
|
|
" --outrdata #{rdata_file} --geodbDir #{geo_db_dir}" |
241
|
|
|
end |
242
|
|
|
|
243
|
|
|
      # Run the R conversion script synchronously (this is what the
      # Thread in convert_geodb_into_rdata executes). On success the raw
      # soft files are cleaned up; on failure a marker file named after
      # the script's exit status (e.g. '1.failed') is created so later
      # requests can detect the failed conversion.
      #
      # NOTE(review): $CHILD_STATUS requires `require 'English'` —
      # presumably loaded elsewhere in the app; verify.
      def run_load_geo_db_cmd(geo_accession)
        geo_db_dir = File.join(db_dir, geo_accession)
        rdata_file = File.join(geo_db_dir, "#{geo_accession}.RData")
        system(load_geo_db_cmd(geo_accession))
        if File.exist? rdata_file
          logger.debug("Finished creating Rdata file: #{rdata_file}")
          cleanup(geo_accession)
        else
          logger.debug("Did not create Rdata file: #{rdata_file}")
          FileUtils.touch File.join(db_dir, geo_accession,
                                    "#{$CHILD_STATUS.exitstatus}.failed")
        end
      end
256
|
|
|
|
257
|
|
|
def cleanup(geo_accession) |
258
|
|
|
soft_file = File.join(db_dir, geo_accession, '*.soft') |
259
|
|
|
`rm #{soft_file}` |
260
|
|
|
`rm #{soft_file}.gz` |
261
|
|
|
end |
262
|
|
|
end |
263
|
|
|
end |
264
|
|
|
end |
265
|
|
|
|