|
1
|
|
|
# -*- coding: utf-8 -*- |
|
2
|
|
|
# vim:fileencoding=utf-8 |
|
3
|
|
|
# |
|
4
|
|
|
# Copyright (c) 2017-2018 Stefan Bender |
|
5
|
|
|
# |
|
6
|
|
|
# This module is part of sciapy. |
|
7
|
|
|
# sciapy is free software: you can redistribute it or modify |
|
8
|
|
|
# it under the terms of the GNU General Public License as published |
|
9
|
|
|
# by the Free Software Foundation, version 2. |
|
10
|
|
|
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html. |
|
11
|
1 |
|
"""SCIAMACHY data interface |
|
12
|
|
|
|
|
13
|
|
|
Data loading and selection routines for SCIAMACHY data regression fits. |
|
14
|
|
|
Includes reading the solar and geomagnetic index files used as proxies. |
|
15
|
|
|
""" |
|
16
|
|
|
|
|
17
|
1 |
|
import logging |
|
18
|
|
|
|
|
19
|
1 |
|
import numpy as np |
|
20
|
1 |
|
import pandas as pd |
|
21
|
1 |
|
import xarray as xr |
|
22
|
|
|
|
|
23
|
1 |
|
from astropy.time import Time |
|
24
|
|
|
|
|
25
|
1 |
|
__all__ = ["load_solar_gm_table", "load_scia_dzm"] |
|
26
|
|
|
|
|
27
|
1 |
|
_NPV_MAJOR, _NPV_MINOR, _NPV_PATCH = np.__version__.split('.') |
|
28
|
|
|
|
|
29
|
1 |
|
_seasons = { |
|
30
|
|
|
"summerNH": [ |
|
31
|
|
|
slice("2002-03-20", "2002-09-25"), |
|
32
|
|
|
slice("2003-03-20", "2003-10-13"), |
|
33
|
|
|
slice("2004-03-20", "2004-10-20"), |
|
34
|
|
|
slice("2005-03-20", "2005-10-20"), |
|
35
|
|
|
slice("2006-03-20", "2006-10-20"), |
|
36
|
|
|
slice("2007-03-21", "2007-10-20"), |
|
37
|
|
|
slice("2008-03-20", "2008-10-20"), |
|
38
|
|
|
slice("2009-03-20", "2009-10-20"), |
|
39
|
|
|
slice("2010-03-20", "2010-10-20"), |
|
40
|
|
|
slice("2011-03-20", "2011-08-31"), |
|
41
|
|
|
slice("2012-03-20", "2012-04-07"), |
|
42
|
|
|
], |
|
43
|
|
|
"summerSH": [ |
|
44
|
|
|
slice("2002-09-13", "2003-03-26"), |
|
45
|
|
|
slice("2003-09-13", "2004-04-01"), |
|
46
|
|
|
slice("2004-09-14", "2005-04-02"), |
|
47
|
|
|
slice("2005-09-13", "2006-04-02"), |
|
48
|
|
|
slice("2006-09-13", "2007-04-02"), |
|
49
|
|
|
slice("2007-09-13", "2008-04-01"), |
|
50
|
|
|
slice("2008-09-14", "2009-04-02"), |
|
51
|
|
|
slice("2009-09-03", "2010-04-02"), |
|
52
|
|
|
slice("2010-09-13", "2011-04-02"), |
|
53
|
|
|
slice("2011-09-13", "2012-04-01"), |
|
54
|
|
|
] |
|
55
|
|
|
} |
|
56
|
|
|
|
|
57
|
1 |
|
_SPEs = [pd.date_range("2002-04-20", "2002-05-01"), |
|
58
|
|
|
pd.date_range("2002-05-21", "2002-06-02"), |
|
59
|
|
|
pd.date_range("2002-07-15", "2002-07-27"), |
|
60
|
|
|
pd.date_range("2002-08-23", "2002-09-03"), |
|
61
|
|
|
pd.date_range("2002-09-06", "2002-09-17"), |
|
62
|
|
|
pd.date_range("2002-11-08", "2002-11-20"), |
|
63
|
|
|
pd.date_range("2003-05-27", "2003-06-07"), |
|
64
|
|
|
pd.date_range("2003-10-25", "2003-11-15"), |
|
65
|
|
|
pd.date_range("2004-07-24", "2004-08-05"), |
|
66
|
|
|
pd.date_range("2004-11-06", "2004-11-18"), |
|
67
|
|
|
pd.date_range("2005-01-15", "2005-01-27"), |
|
68
|
|
|
pd.date_range("2005-05-13", "2005-05-25"), |
|
69
|
|
|
pd.date_range("2005-07-13", "2005-08-05"), |
|
70
|
|
|
pd.date_range("2005-08-21", "2005-09-02"), |
|
71
|
|
|
pd.date_range("2005-09-07", "2005-09-21"), |
|
72
|
|
|
pd.date_range("2006-12-05", "2006-12-23"), |
|
73
|
|
|
pd.date_range("2012-01-22", "2012-02-07"), |
|
74
|
|
|
pd.date_range("2012-03-06", "2012-03-27")] |
|
75
|
|
|
|
|
76
|
|
|
|
|
77
|
1 |
|
def load_dailymeanAE(
        filename="data/indices/AE_Kyoto_1980-2016_daily2_shift12h.dat",
        name="AE", tfmt="jyear"):
    """Load the daily mean AE index table shipped with the package.

    Thin convenience wrapper around :func:`load_solar_gm_table` that
    resolves `filename` relative to the installed ``sciapy`` package and
    reads the first two columns as "time" and `name`.
    """
    from pkg_resources import resource_filename
    fullpath = resource_filename("sciapy", filename)
    return load_solar_gm_table(
        fullpath, cols=[0, 1], names=["time", name], tfmt=tfmt)
|
84
|
|
|
|
|
85
|
|
|
|
|
86
|
1 |
|
def load_dailymeanLya(filename="data/indices/lisird_lya3_1980-2017.dat",
        name="Lya", tfmt="jyear"):
    """Load the daily mean Lyman-alpha index table shipped with the package.

    Thin convenience wrapper around :func:`load_solar_gm_table` that
    resolves `filename` relative to the installed ``sciapy`` package and
    reads the first two columns as "time" and `name`.
    """
    from pkg_resources import resource_filename
    fullpath = resource_filename("sciapy", filename)
    return load_solar_gm_table(
        fullpath, cols=[0, 1], names=["time", name], tfmt=tfmt)
|
92
|
|
|
|
|
93
|
|
|
|
|
94
|
1 |
View Code Duplication |
def load_solar_gm_table(filename, cols, names, sep="\t", tfmt="jyear"):
    """Load proxy tables from ascii files

    This function wraps :func:`numpy.genfromtxt()` [#]_ with
    pre-defined settings to match the file format.
    It explicitly returns the times as UTC decimal years or julian epochs.

    .. [#] https://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html

    Parameters
    ----------
    filename: str
        The file to read
    cols: sequence
        The columns in the file to use, passed to `numpy.genfromtxt`'s
        `usecols` keyword. Should be at least of length 2, indicating
        time values in the first column, e.g., `cols=[0, 1]`.
    names: sequence
        The column names, passed as `names` to `numpy.genfromtxt()`.
        Should be at least of length 2, naming the proxy values in the
        second column, e.g., `names=["time", "proxy"]`.
    sep: str, optional
        The column separator character, passed as `sep`.
        Default: `\t`
    tfmt: str, optional
        The astropy.time "Time Format" for the time units,
        for example, "jyear", "decimalyear", "jd", "mjd", etc.
        See:
        http://docs.astropy.org/en/stable/time/index.html#time-format
        Default: "jyear"

    Returns
    -------
    times: array_like
        The proxy times according to the `tfmt` keyword (UTC) as
        :class:`numpy.ndarray` from the first column of "cols".
    values: tuple of array_like
        The proxy values combined into a structured array
        (:class:`numpy.recarray`) with fields set
        according to "names[1:]" and "cols[1:]" passed above.

    Raises
    ------
    ValueError
        If `cols` or `names` has fewer than two entries.

    See Also
    --------
    :class:`astropy.time.Time`
    :class:`numpy.recarray`
    """
    # Reject short argument lists up front with a clear message; the old
    # `and` test only fired when *both* were too short, letting e.g.
    # `cols=[0]` with two names fall through to a confusing genfromtxt error.
    if len(cols) < 2 or len(names) < 2:
        raise ValueError(
            "Both `cols` and `names` should be at least of length 2.")
    encoding = {}
    # Pass the `encoding` keyword only where numpy supports it (>= 1.14.0).
    # Compare (major, minor) as a tuple so numpy 2.x also qualifies; the old
    # `major >= 1 and minor >= 14` test wrongly failed for e.g. 2.0.
    if (int(_NPV_MAJOR), int(_NPV_MINOR)) >= (1, 14):
        encoding = dict(encoding=None)
    tab = np.genfromtxt(filename,
            delimiter=sep,
            dtype=None,
            names=names,
            usecols=cols,
            **encoding)
    # Interpret the first column as UTC times and convert to the
    # requested astropy time format (e.g. decimal years).
    times = Time(tab[names[0]], scale="utc")
    ts = getattr(times, tfmt)
    return ts, tab[names[1:]]
|
156
|
|
|
|
|
157
|
|
|
|
|
158
|
1 |
View Code Duplication |
def _greedy_select(ds, size, varname="NO_DENS_std", scale=1.): |
|
|
|
|
|
|
159
|
|
|
logging.info("Greedy subsampling to size: %s", size) |
|
160
|
|
|
var = np.ma.masked_array((ds[varname] * scale)**2, mask=[False]) |
|
161
|
|
|
var_p = np.ma.masked_array((ds[varname] * scale)**2, mask=[False]) |
|
162
|
|
|
sigma2_i = scale**2 |
|
163
|
|
|
sigma2_ip = scale**2 |
|
164
|
|
|
idxs = np.arange(len(var)) |
|
165
|
|
|
for _ in range(size): |
|
166
|
|
|
max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i)) |
|
167
|
|
|
min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip)) |
|
168
|
|
|
sigma2_i += 1. / var[max_entr_idx] |
|
169
|
|
|
sigma2_ip += 1. / var_p[min_entr_idx] |
|
170
|
|
|
var.mask[max_entr_idx] = True |
|
171
|
|
|
var_p.mask[max_entr_idx] = True |
|
172
|
|
|
return ds.isel(time=idxs[var.mask]) |
|
173
|
|
|
|
|
174
|
1 |
View Code Duplication |
def _greedy_idxs_post(x, xerr, size): |
|
|
|
|
|
|
175
|
|
|
logging.info("Greedy subsampling to size: %s", size) |
|
176
|
|
|
var = np.ma.masked_array(xerr**2, mask=[False]) |
|
177
|
|
|
var_p = np.ma.masked_array(xerr**2, mask=[False]) |
|
178
|
|
|
sigma2_i = 1. |
|
179
|
|
|
sigma2_ip = 1. |
|
180
|
|
|
idxs = np.arange(len(var)) |
|
181
|
|
|
for _ in range(size): |
|
182
|
|
|
max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i)) |
|
183
|
|
|
min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip)) |
|
184
|
|
|
sigma2_i += 1. / var[max_entr_idx] |
|
185
|
|
|
sigma2_ip += 1. / var_p[min_entr_idx] |
|
186
|
|
|
var.mask[max_entr_idx] = True |
|
187
|
|
|
var_p.mask[max_entr_idx] = True |
|
188
|
|
|
return idxs[var.mask] |
|
189
|
|
|
|
|
190
|
1 |
View Code Duplication |
def load_scia_dzm(filename, alt, lat, tfmt="jyear",
        scale=1, subsample_factor=1, subsample_method="greedy",
        akd_threshold=0.002, cnt_threshold=0,
        center=False, season=None, SPEs=False):
    """Load SCIAMACHY daily zonal mean data

    Interface function for SCIAMACHY daily zonal mean data files version 6.x.
    Uses :mod:`xarray` [#]_ to load and select the data. Possible selections are by
    hemispheric summer (NH summer ~ SH winter and SH summer ~ NH winter) and
    exclusion of strong solar proton events (SPE).

    .. [#] https://xarray.pydata.org

    Parameters
    ----------
    filename: str
        The input filename
    alt: float
        The altitude
    lat: float
        The latitude
    tfmt: string, optional
        The astropy.time "Time Format" for the time units,
        for example, "jyear", "decimalyear", "jd", "mjd", etc.
        See:
        http://docs.astropy.org/en/stable/time/index.html#time-format
        Default: "jyear"
    scale: float, optional
        Scale factor of the data (default: 1)
    subsample_factor: int, optional
        Factor to subsample the data by (see `subsample_method`)
        (default: 1 (no subsampling))
    subsample_method: "equal", "greedy", or "random", optional
        Method for subsampling the data (see `subsample_factor`).
        "equal" for equally spaced subsampling,
        "greedy" for selecting the data based on their uncertainty,
        and "random" for uniform random subsampling.
        (default: "greedy")
    akd_threshold: float, optional
        Exclude data whose averaging kernel diagonal element
        (`NO_AKDIAG`) is not larger than this value (default: 0.002)
    cnt_threshold: int, optional
        Exclude data whose measurement count is not larger than
        this value (default: 0)
    center: bool, optional
        Center the data by subtracting the global mean.
        (default: False)
    season: "summerNH", "summerSH", or `None`, optional
        Select the named season or `None` for all data (default: None)
    SPEs: bool, optional
        Set to `True` to exclude strong SPE events (default: False)

    Returns
    -------
    (times, dens, errs, szas): tuple of (N,) array_like
        The measurement times according to the `tfmt` keyword,
        the number densities, their uncertainties, and the
        mean solar zenith angles.
    """
    logging.info("Opening dataset: '%s'", filename)
    # Times are decoded manually below, hence `decode_times=False`.
    NO_ds = xr.open_mfdataset(filename, decode_times=False,
            chunks={"time": 400, "latitude": 9, "altitude": 11})
    logging.info("done.")
    # Decode time coordinate for selection
    try:
        # xarray <= 0.9.6
        NO_ds["time"] = xr.conventions.decode_cf_variable(NO_ds.time)
    except TypeError:
        # xarray => 0.10.0
        NO_ds["time"] = xr.conventions.decode_cf_variable("time", NO_ds.time)

    # The global mean is computed over the *full* dataset, before any
    # latitude/altitude selection, and subtracted at the end.
    NO_mean = 0.
    if center:
        NO_mean = NO_ds.NO_DENS.mean().values
        logging.info("Centering with global mean: %s", NO_mean)
    NO_tds = NO_ds.sel(latitude=lat, altitude=alt)

    # Exclude SPEs first if requested
    if SPEs:
        logging.info("Removing SPEs.")
        for spe in _SPEs:
            # NOTE(review): `drop(labels, dim=...)` is deprecated in newer
            # xarray (replaced by `drop_sel`) — confirm the supported
            # xarray version range.
            NO_tds = NO_tds.drop(
                    NO_tds.sel(time=slice(spe[0], spe[-1])).time.values,
                    dim="time")

    # Filter by season
    if season in _seasons.keys():
        logging.info("Restricting to season: %s", season)
        NO_tds = xr.concat([NO_tds.sel(time=s) for s in _seasons[season]],
                dim="time")
        NO_tds.load()
    else:
        logging.info("No season selected or unknown season, "
                "using all available data.")

    # The counts variable was renamed at some point; support both names.
    try:
        NO_counts = NO_tds.NO_DENS_cnt
    except AttributeError:
        NO_counts = NO_tds.counts

    # Select only useful data
    NO_tds = NO_tds.where(
            np.isfinite(NO_tds.NO_DENS) &
            (NO_tds.NO_DENS_std != 0) &
            (NO_tds.NO_AKDIAG > akd_threshold) &
            (NO_counts > cnt_threshold) &
            (NO_tds.NO_MASK == 0),
            drop=True)

    no_dens = scale * NO_tds.NO_DENS
    if center:
        no_dens -= scale * NO_mean
    # Standard error of the mean: std / sqrt(N)
    no_errs = scale * NO_tds.NO_DENS_std / np.sqrt(NO_counts)
    logging.debug("no_dens.shape (ntime,): %s", no_dens.shape)

    no_sza = NO_tds.mean_SZA

    # Convert to astropy.Time for Julian epoch or decimal year
    if NO_tds.time.size > 0:
        no_t = Time(pd.to_datetime(NO_tds.time.values, utc=True).to_pydatetime(),
                format="datetime", scale="utc")
        no_ys = getattr(no_t, tfmt)
    else:
        # Empty selection: return an empty float array of matching shape.
        no_ys = np.empty_like(NO_tds.time.values, dtype=np.float64)

    if subsample_factor > 1:
        new_data_size = no_dens.shape[0] // subsample_factor
        if subsample_method == "random":
            # random subsampling
            _idxs = np.random.choice(no_dens.shape[0],
                    new_data_size, replace=False)
        elif subsample_method == "equal":
            # equally spaced subsampling
            _idxs = slice(0, no_dens.shape[0], subsample_factor)
        else:
            # "greedy" subsampling (default fall-back)
            _idxs = _greedy_idxs_post(no_dens, no_errs, new_data_size)
        return (no_ys[_idxs],
                no_dens.values[_idxs],
                no_errs.values[_idxs],
                no_sza.values[_idxs])

    return no_ys, no_dens.values, no_errs.values, no_sza.values
|
326
|
|
|
|