# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data interface

Data loading and selection routines for SCIAMACHY data regression fits.
Includes reading the solar and geomagnetic index files used as proxies.
"""

import logging

import numpy as np
import pandas as pd
import xarray as xr

from astropy.time import Time

__all__ = ["load_solar_gm_table", "load_scia_dzm"]

_NPV_MAJOR, _NPV_MINOR, _NPV_PATCH = np.__version__.split('.')

_seasons = {
	"summerNH": [
		slice("2002-03-20", "2002-09-25"),
		slice("2003-03-20", "2003-10-13"),
		slice("2004-03-20", "2004-10-20"),
		slice("2005-03-20", "2005-10-20"),
		slice("2006-03-20", "2006-10-20"),
		slice("2007-03-21", "2007-10-20"),
		slice("2008-03-20", "2008-10-20"),
		slice("2009-03-20", "2009-10-20"),
		slice("2010-03-20", "2010-10-20"),
		slice("2011-03-20", "2011-08-31"),
		slice("2012-03-20", "2012-04-07"),
	],
	"summerSH": [
		slice("2002-09-13", "2003-03-26"),
		slice("2003-09-13", "2004-04-01"),
		slice("2004-09-14", "2005-04-02"),
		slice("2005-09-13", "2006-04-02"),
		slice("2006-09-13", "2007-04-02"),
		slice("2007-09-13", "2008-04-01"),
		slice("2008-09-14", "2009-04-02"),
		slice("2009-09-03", "2010-04-02"),
		slice("2010-09-13", "2011-04-02"),
		slice("2011-09-13", "2012-04-01"),
	]
}

_SPEs = [pd.date_range("2002-04-20", "2002-05-01"),
	pd.date_range("2002-05-21", "2002-06-02"),
	pd.date_range("2002-07-15", "2002-07-27"),
	pd.date_range("2002-08-23", "2002-09-03"),
	pd.date_range("2002-09-06", "2002-09-17"),
	pd.date_range("2002-11-08", "2002-11-20"),
	pd.date_range("2003-05-27", "2003-06-07"),
	pd.date_range("2003-10-25", "2003-11-15"),
	pd.date_range("2004-07-24", "2004-08-05"),
	pd.date_range("2004-11-06", "2004-11-18"),
	pd.date_range("2005-01-15", "2005-01-27"),
	pd.date_range("2005-05-13", "2005-05-25"),
	pd.date_range("2005-07-13", "2005-08-05"),
	pd.date_range("2005-08-21", "2005-09-02"),
	pd.date_range("2005-09-07", "2005-09-21"),
	pd.date_range("2006-12-05", "2006-12-23"),
	pd.date_range("2012-01-22", "2012-02-07"),
	pd.date_range("2012-03-06", "2012-03-27")]


def load_dailymeanAE(
		filename="data/indices/AE_Kyoto_1980-2018_daily2_shift12h.dat",
		name="AE", tfmt="jyear"):
	"""Load the packaged daily mean Kyoto AE (auroral electrojet) index."""
	from pkg_resources import resource_filename
	AEfilename = resource_filename("sciapy", filename)
	return load_solar_gm_table(AEfilename, cols=[0, 1],
			names=["time", name], tfmt=tfmt)


def load_dailymeanLya(filename="data/indices/lisird_lya3_1980-2021.dat",
		name="Lya", tfmt="jyear"):
	"""Load the packaged daily mean solar Lyman-alpha index."""
	from pkg_resources import resource_filename
	Lyafilename = resource_filename("sciapy", filename)
	return load_solar_gm_table(Lyafilename, cols=[0, 1],
			names=["time", name], tfmt=tfmt)


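# A small illustrative sketch (not used anywhere in sciapy, the helper name is
# ours) showing how the two wrappers above are meant to be called; both return
# the times and a structured array holding the proxy values.
def _example_load_packaged_proxies():
	ae_t, ae_tab = load_dailymeanAE(tfmt="jyear")
	lya_t, lya_tab = load_dailymeanLya(tfmt="jyear")
	return (ae_t, ae_tab["AE"]), (lya_t, lya_tab["Lya"])

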
def load_solar_gm_table(filename, cols, names, sep="\t", tfmt="jyear"):
	"""Load proxy tables from ASCII files

	This function wraps :func:`numpy.genfromtxt()` [#]_ with
	pre-defined settings to match the file format.
	It explicitly returns the times as UTC decimal years or julian epochs.

	.. [#] https://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html

	Parameters
	----------
	filename: str
		The file to read.
	cols: sequence
		The columns in the file to use, passed to `numpy.genfromtxt`'s
		`usecols` keyword. Should be at least of length 2, indicating
		time values in the first column, e.g., `cols=[0, 1]`.
	names: sequence
		The column names, passed as `names` to `numpy.genfromtxt()`.
		Should be at least of length 2, naming the proxy values in the
		second column, e.g., `names=["time", "proxy"]`.
	sep: str, optional
		The column separator character, passed to `numpy.genfromtxt()`
		as the `delimiter` keyword.
		Default: "\\t" (tab-separated)
	tfmt: str, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"

	Returns
	-------
	times: array_like
		The proxy times according to the `tfmt` keyword (UTC) as
		:class:`numpy.ndarray` from the first column of `cols`.
	values: array_like
		The proxy values combined into a structured array
		(:class:`numpy.recarray`) with fields set
		according to `names[1:]` and `cols[1:]` passed above.

	See Also
	--------
	:class:`astropy.time.Time`
	:class:`numpy.recarray`
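
	Examples
	--------
	A minimal sketch of typical use; the file name and column layout here
	are illustrative assumptions, not a file shipped with sciapy:

	>>> ts, vals = load_solar_gm_table(  # doctest: +SKIP
	...     "lya_daily.dat", cols=[0, 1], names=["time", "Lya"])
	>>> lya = vals["Lya"]  # doctest: +SKIP

	The packaged proxy files can be loaded with the convenience wrappers
	`load_dailymeanAE()` and `load_dailymeanLya()` defined above.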
	"""
	if len(cols) < 2 or len(names) < 2:
		raise ValueError(
			"Both `cols` and `names` should be at least of length 2.")
	encoding = {}
	# set the encoding for numpy >= 1.14.0
	if (int(_NPV_MAJOR), int(_NPV_MINOR)) >= (1, 14):
		encoding = dict(encoding=None)
	tab = np.genfromtxt(filename,
			delimiter=sep,
			dtype=None,
			names=names,
			usecols=cols,
			**encoding)
	times = Time(tab[names[0]], scale="utc")
	ts = getattr(times, tfmt)
	return ts, tab[names[1:]]


def _greedy_select(ds, size, varname="NO_DENS_std", scale=1.):
	# Greedily picks `size` points from `ds` by repeatedly selecting the point
	# with the largest entropy (information) gain, i.e. the largest remaining
	# (scaled) variance of `varname`, and returns the selected subset.
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	var_p = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	sigma2_i = scale**2
	sigma2_ip = scale**2
	idxs = np.arange(len(var))
	for _ in range(size):
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip))
		sigma2_i += 1. / var[max_entr_idx]
		sigma2_ip += 1. / var_p[min_entr_idx]
		var.mask[max_entr_idx] = True
		var_p.mask[max_entr_idx] = True
	return ds.isel(time=idxs[var.mask])

def _greedy_idxs_post(x, xerr, size):
	# Same greedy scheme as `_greedy_select()`, but operates on plain arrays
	# and returns the selected indices instead of a subset dataset.
	# Only the uncertainties `xerr` determine the selection; `x` is unused.
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array(xerr**2, mask=[False])
	var_p = np.ma.masked_array(xerr**2, mask=[False])
	sigma2_i = 1.
	sigma2_ip = 1.
	idxs = np.arange(len(var))
	for _ in range(size):
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip))
		sigma2_i += 1. / var[max_entr_idx]
		sigma2_ip += 1. / var_p[min_entr_idx]
		var.mask[max_entr_idx] = True
		var_p.mask[max_entr_idx] = True
	return idxs[var.mask]

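# A minimal sketch of how the greedy selection above behaves (illustrative
# only, not used by sciapy, the helper name is ours): with uncertainties
# [1, 10, 2, 8] and size=2 the two largest-uncertainty points are picked
# first, so the returned indices are [1, 3].
def _example_greedy_subsampling():
	xerr = np.array([1., 10., 2., 8.])
	x = np.zeros_like(xerr)  # the data values do not enter the selection
	return _greedy_idxs_post(x, xerr, size=2)
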
def load_scia_dzm(filename, alt, lat, tfmt="jyear",
		scale=1, subsample_factor=1, subsample_method="greedy",
		akd_threshold=0.002, cnt_threshold=0,
		center=False, season=None, SPEs=False):
	"""Load SCIAMACHY daily zonal mean data

	Interface function for SCIAMACHY daily zonal mean data files version 6.x.
	Uses :mod:`xarray` [#]_ to load and select the data. Possible selections are by
	hemispheric summer (NH summer ~ SH winter and SH summer ~ NH winter) and
	exclusion of strong solar proton events (SPE).

	.. [#] https://xarray.pydata.org

	Parameters
	----------
	filename: str
		The input filename
	alt: float
		The altitude
	lat: float
		The latitude
	tfmt: str, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"
	scale: float, optional
		Scale factor of the data (default: 1)
	subsample_factor: int, optional
		Factor to subsample the data by (see `subsample_method`)
		(default: 1 (no subsampling))
	subsample_method: "equal", "greedy", or "random", optional
		Method for subsampling the data (see `subsample_factor`).
		"equal" for equally spaced subsampling,
		"greedy" for selecting the data based on their uncertainty,
		and "random" for uniform random subsampling.
		(default: "greedy")
	akd_threshold: float, optional
		Exclude data whose averaging kernel diagonal element is at or
		below this threshold (default: 0.002)
	cnt_threshold: int, optional
		Exclude data with at most this many contributing measurements
		(default: 0)
	center: bool, optional
		Center the data by subtracting the global mean.
		(default: False)
	season: "summerNH", "summerSH", or `None`, optional
		Select the named season or `None` for all data (default: None)
	SPEs: bool, optional
		Set to `True` to exclude strong SPE events (default: False)

	Returns
	-------
	(times, dens, errs, szas): tuple of (N,) array_like
		The measurement times according to the `tfmt` keyword,
		the number densities, their uncertainties, and the
		mean solar zenith angles.
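
	Examples
	--------
	A minimal sketch of typical use; the file name is a placeholder and the
	altitude and latitude values are only meant as an illustration:

	>>> ts, dens, errs, szas = load_scia_dzm(  # doctest: +SKIP
	...     "scia_dzm_no_v6.2.nc", alt=70, lat=65,
	...     season="summerNH", SPEs=True)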
	"""
	logging.info("Opening dataset: '%s'", filename)
	NO_ds = xr.open_mfdataset(filename, decode_times=False,
			chunks={"time": 400, "latitude": 9, "altitude": 11})
	logging.info("done.")
	# Decode time coordinate for selection
	NO_ds["time"] = xr.conventions.decode_cf(NO_ds[["time"]]).time

	NO_mean = 0.
	if center:
		NO_mean = NO_ds.NO_DENS.mean().values
		logging.info("Centering with global mean: %s", NO_mean)
	NO_tds = NO_ds.sel(latitude=lat, altitude=alt)

	# Exclude SPEs first if requested
	if SPEs:
		logging.info("Removing SPEs.")
		for spe in _SPEs:
			NO_tds = NO_tds.drop(
				NO_tds.sel(time=slice(spe[0], spe[-1])).time.values,
				dim="time")

	# Filter by season
	if season in _seasons.keys():
		logging.info("Restricting to season: %s", season)
		NO_tds = xr.concat([NO_tds.sel(time=s) for s in _seasons[season]],
				dim="time")
		NO_tds.load()
	else:
		logging.info("No season selected or unknown season, "
				"using all available data.")

	try:
		NO_counts = NO_tds.NO_DENS_cnt
	except AttributeError:
		NO_counts = NO_tds.counts

	# Select only useful data
	NO_tds = NO_tds.where(
			np.isfinite(NO_tds.NO_DENS) &
			(NO_tds.NO_DENS_std != 0) &
			(NO_tds.NO_AKDIAG > akd_threshold) &
			(NO_counts > cnt_threshold) &
			(NO_tds.NO_MASK == 0),
			drop=True)

	no_dens = scale * NO_tds.NO_DENS
	if center:
		no_dens -= scale * NO_mean
	no_errs = scale * NO_tds.NO_DENS_std / np.sqrt(NO_counts)
	logging.debug("no_dens.shape (ntime,): %s", no_dens.shape)

	no_sza = NO_tds.mean_SZA

	# Convert to astropy.Time for Julian epoch or decimal year
	if NO_tds.time.size > 0:
		no_t = Time(pd.to_datetime(NO_tds.time.values, utc=True).to_pydatetime(),
				format="datetime", scale="utc")
		no_ys = getattr(no_t, tfmt)
	else:
		no_ys = np.empty_like(NO_tds.time.values, dtype=np.float64)

	if subsample_factor > 1:
		new_data_size = no_dens.shape[0] // subsample_factor
		if subsample_method == "random":
			# random subsampling
			_idxs = np.random.choice(no_dens.shape[0],
					new_data_size, replace=False)
		elif subsample_method == "equal":
			# equally spaced subsampling
			_idxs = slice(0, no_dens.shape[0], subsample_factor)
		else:
			# "greedy" subsampling (default fall-back)
			_idxs = _greedy_idxs_post(no_dens, no_errs, new_data_size)
		return (no_ys[_idxs],
				no_dens.values[_idxs],
				no_errs.values[_idxs],
				no_sza.values[_idxs])

	return no_ys, no_dens.values, no_errs.values, no_sza.values