LoadGeoData.download_meta_data()   Rating: A

Complexity:  Conditions 2
Size:        Total Lines 6
Duplication: Lines 0, Ratio 0 %
Importance:  Changes 0

Metric  Value
cc      2
dl      0
loc     6
rs      9.4285
c       0
b       0
f       0
require 'English'
require 'fileutils'
require 'forwardable'
require 'json'

# GeoDiver NameSpace
module GeoDiver
  # Module to load the GEO dataset.
  module LoadGeoData
    # To signal error in query sequence or options.
    #
    # ArgumentError is raised when ... exit status is 1; see [1].
    class ArgumentError < ArgumentError
    end

    # To signal internal errors.
    #
    # RuntimeError is raised when there is a problem in writing the input file,
    # running the R script, writing the output, etc. These are rare,
    # infrastructure errors, used internally, and of concern only to the
    # admins/developers. One example of a RuntimeError would be R libraries
    # not being installed.
    class RuntimeError < RuntimeError
    end

    class << self
      extend Forwardable

      def_delegators GeoDiver, :logger, :public_dir, :db_dir

      # Check if the GEO database has already been downloaded; if not,
      # download the GEO dataset, extract the meta data and convert it into
      # RData.
      def run(params, soft_link = true)
        init(params)
        geo_accession  = params['geo_db'].upcase
        meta_json_file = File.join(db_dir, geo_accession,
                                   "#{geo_accession}.json")
        if File.exist? meta_json_file
          logger.debug("Found GeoDb at: '#{meta_json_file}'")
          logger.debug("Parsing GeoDb '#{geo_accession}'")
          meta_data = parse_meta_data(meta_json_file)
        else
          logger.debug("Local GeoDb for '#{geo_accession}' not found.")
          meta_data = download_and_parse_meta_data(geo_accession)
          write_to_json(meta_data, meta_json_file)
        end
        if soft_link
          soft_link_meta_json_to_public_dir(geo_accession, meta_json_file)
        end
        logger.debug('GeoDb loaded into memory')
        meta_data
      end

      def convert_geodb_into_rdata(geo_accession)
        geo_accession = geo_accession.upcase
        return if File.exist?(File.join(db_dir, geo_accession,
                                        "#{geo_accession}.RData"))
        logger.debug("Running: #{load_geo_db_cmd(geo_accession)}")
        Thread.new { run_load_geo_db_cmd(geo_accession) }
      end
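
      # Note: convert_geodb_into_rdata runs the R conversion in a background
      # Thread (see run_load_geo_db_cmd below). On success the RData file
      # appears under db_dir/<accession>/; on failure a '<exit status>.failed'
      # marker file is written to the same directory.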

      private

      # Verify parameters.
      def init(params)
        assert_geo_db_present(params)
      end

      # Raise an ArgumentError unless the geo_db parameter is present.
      def assert_geo_db_present(params)
        logger.debug('Checking if the GEO DB parameter is present.')
        return unless params['geo_db'].nil? || params['geo_db'].empty?
        raise ArgumentError, 'No GEO database provided.'
      end

      def parse_meta_data(meta_json_file)
        logger.debug("Parse the Meta JSON file at: #{meta_json_file}")
        meta_file_content = IO.read meta_json_file
        JSON.parse(meta_file_content)
      end

      def download_and_parse_meta_data(geo_accession)
        data = download_meta_data(geo_accession)
        meta = parse_gds_db(data) if geo_accession =~ /^GDS/
        meta = parse_gse_db(data) if geo_accession =~ /^GSE/
        assert_meta_data(meta)
        meta
      end

      # Download the GEO file and read its meta data section.
      def download_meta_data(geo_accession)
        file = download_geo_file(geo_accession)
        read_geo_file(file)
      rescue
        raise ArgumentError, 'GeoDiver was unable to download the GEO Database'
      end

      def assert_meta_data(meta_data)
        return unless meta_data['Factors'].nil? || meta_data['Factors'].empty?
        raise ArgumentError, 'GeoDiver was unable to parse the GEO Database'
      end

      # Download the compressed GEO file into db_dir and uncompress it.
      def download_geo_file(geo_accession)
        remote_url = generate_remote_url(geo_accession)
        logger.debug "Remote URL: #{remote_url}"
        return if remote_url.nil? || remote_url.empty?
        output_dir = File.join(db_dir, geo_accession)
        FileUtils.mkdir(output_dir) unless Dir.exist? output_dir
        file = File.basename(remote_url).delete('*')
        compressed = File.join(output_dir, file)
        wget_geo_file(remote_url, compressed, geo_accession, output_dir)
        compressing_geo_file(compressed)
      end

      def wget_geo_file(remote_url, compressed, geo_accession, output_dir)
        logger.debug("Downloading from: #{remote_url} ==> #{compressed}")
        `wget -q #{remote_url} -O #{compressed} || rm -r #{output_dir}`
        return if $CHILD_STATUS.exitstatus.zero?
        logger.debug "Cannot find Geo Dataset on GEO: #{geo_accession}"
        raise ArgumentError, "Cannot find Geo Dataset on GEO: #{geo_accession}"
      end

      def compressing_geo_file(compressed)
        logger.debug("Uncompressing file: #{compressed.gsub('.gz', '')}")
        system "gunzip --force -c #{compressed} > #{compressed.gsub('.gz', '')}"
        compressed.gsub('.gz', '')
      end

      # Query NCBI with bionode-ncbi and jq to build the remote URL of the
      # GEO file for the given accession.
      def generate_remote_url(geo_accession)
        cmd = "bionode-ncbi search gds #{geo_accession} |"\
              " jq -cr 'select(.accession == \"#{geo_accession}\") | .ftplink'"
        url = `#{cmd}`.chomp
        return if url.nil? || url.empty?
        if geo_accession =~ /^GDS/
          url + 'soft/' + geo_accession + '.soft.gz'
        elsif geo_accession =~ /^GSE/
          url + 'matrix/' + geo_accession + '*_series_matrix.txt.gz'
        end
      end

      # Loads the file into memory line by line.
      # Stops reading once all the meta data has been read (i.e. at the first
      # data row).
      def read_geo_file(file)
        data = []
        IO.foreach(file) do |line|
          break if line =~ /^#ID_REF/
          data << line
        end
        data.join
      end

      # Extract the meta data from a GDS SOFT file into a hash.
      def parse_gds_db(d)
        {
          'Accession' => d.match(/\^DATASET = (.*)/)[1],
          'Title' => d.match(/!dataset_title = (.*)/)[1],
          'Description' => d.match(/!dataset_description = (.*)/)[1],
          'Sample_Organism' => d.match(/!dataset_platform_organism = (.*)/)[1],
          'Factors' => parse_gds_factors(d),
          'Reference' => d.match(/!Database_ref = (.*)/)[1],
          'Update_Date' => d.match(/!dataset_update_date = (.*)/)[1]
        }
      end

      # Extract the meta data from a GSE series matrix file into a hash.
      def parse_gse_db(d)
        {
          'Accession' => d.match(/!Series_geo_accession\t"(.*)"/)[1],
          'Title' => d.match(/!Series_title\t"(.*)"/)[1],
          'Description' => d.match(/!Series_summary\t"(.*)"/)[1],
          'Sample_Organism' => parse_sample_organism(d),
          'Factors' => parse_gse_factors(d),
          'Reference' => d.match(/!Series_relation\t"(.*)"/)[1],
          'Update_Date' => d.match(/!Series_last_update_date\t"(.*)"/)[1]
        }
      end

      # Parse the experimental factors (subsets) out of a GDS SOFT file.
      def parse_gds_factors(data)
        subsets = data.gsub(/\^DATA.*\n/, '').gsub(/\![dD]ata.*\n/, '')
        factors = {}
        subsets.lines.each_slice(5) do |subset|
          desc = subset[2].match(/\!subset_description = (.*)/)[1]
          type = subset[4].match(/\!subset_type = (.*)/)[1].tr(' ', '.')
          factors[type] ||= {}
          factors[type]['options'] ||= []
          factors[type]['options'] << desc
          factors[type]['value'] = type
        end
        factors
      end

      # Parse the experimental factors out of a GSE series matrix file.
      def parse_gse_factors(data)
        subsets = data.scan(/!Sample_characteristics_ch1\t(.*)/)
        factors = {}
        subsets.each_with_index do |feature, idx|
          a = feature[0].split(/\"?\t?\"/)
          a.delete_if { |e| e =~ /^\s+$/ || e.empty? }
          a.each do |e|
            split = e.split(': ')
            type = split[0]
            factors[type] ||= {}
            factors[type]['value'] = 'characteristics_ch1'
            factors[type]['value'] += ".#{idx}" if idx > 0
            factors[type]['options'] ||= []
            factors[type]['options'] << e
          end
        end
        factors.each { |_, e| e['options'].uniq! }
        factors.delete_if { |_, e| e['options'].size == 1 }
        factors
      end
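
      # For illustration, a hypothetical series matrix line such as
      #   !Sample_characteristics_ch1\t"tissue: liver"\t"tissue: brain"
      # would be parsed by parse_gse_factors above into roughly:
      #   { 'tissue' => { 'value' => 'characteristics_ch1',
      #                   'options' => ['tissue: liver', 'tissue: brain'] } }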

      def parse_sample_organism(data)
        subset = data.match(/!Sample_organism_ch1\t(.*)/)[1]
        organism = subset.split(/\"?\t?\"/)
        organism.shift
        organism.uniq
      end

      # Write the meta data hash to a JSON file.
      def write_to_json(hash, output_json)
        logger.debug("Writing meta data to file: #{output_json}")
        File.open(output_json, 'w') { |f| f.puts hash.to_json }
      end

      # Soft link the meta data JSON file into the public DBs directory.
      def soft_link_meta_json_to_public_dir(geo_accession, meta_json_file)
        public_meta_json = File.join(public_dir, 'GeoDiver/DBs/',
                                     "#{geo_accession}.json")
        logger.debug("Creating a Soft Link from: #{meta_json_file} ==>" \
                     " #{public_meta_json}")
        return if File.exist? public_meta_json
        FileUtils.ln_s(meta_json_file, public_meta_json)
      end

      # Build the Rscript command that converts the GEO file into RData.
      def load_geo_db_cmd(geo_accession)
        geo_db_dir = File.join(db_dir, geo_accession)
        rdata_file = File.join(geo_db_dir, "#{geo_accession}.RData")
        "Rscript #{File.join(GeoDiver.root, 'RCore/download_GEO.R')}" \
        " --accession #{geo_accession}" \
        " --outrdata  #{rdata_file} --geodbDir #{geo_db_dir}"
      end

      def run_load_geo_db_cmd(geo_accession)
        geo_db_dir = File.join(db_dir, geo_accession)
        rdata_file = File.join(geo_db_dir, "#{geo_accession}.RData")
        system(load_geo_db_cmd(geo_accession))
        if File.exist? rdata_file
          logger.debug("Finished creating Rdata file: #{rdata_file}")
          cleanup(geo_accession)
        else
          logger.debug("Did not create Rdata file: #{rdata_file}")
          FileUtils.touch File.join(db_dir, geo_accession,
                                    "#{$CHILD_STATUS.exitstatus}.failed")
        end
      end

      def cleanup(geo_accession)
        soft_file = File.join(db_dir, geo_accession, '*.soft')
        `rm #{soft_file}`
        `rm #{soft_file}.gz`
      end
    end
  end
end
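
For reference, a minimal usage sketch (not part of the file above). It assumes GeoDiver has already been configured so that logger, public_dir and db_dir are set up, that wget, gunzip, Rscript, bionode-ncbi and jq are on the PATH, and it uses 'GDS724' purely as an example accession; the require path is likewise an assumption about how the gem is loaded.

require 'geodiver'

params    = { 'geo_db' => 'GDS724' }           # hypothetical GEO accession
meta_data = GeoDiver::LoadGeoData.run(params)  # download and parse the meta data
GeoDiver::LoadGeoData.convert_geodb_into_rdata(params['geo_db']) # build RData in the background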