1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
# vim:fileencoding=utf-8 |
3
|
|
|
# |
4
|
|
|
# Copyright (c) 2017-2018 Stefan Bender |
5
|
|
|
# |
6
|
|
|
# This module is part of sciapy. |
7
|
|
|
# sciapy is free software: you can redistribute it or modify |
8
|
|
|
# it under the terms of the GNU General Public License as published |
9
|
|
|
# by the Free Software Foundation, version 2. |
10
|
|
|
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html. |
11
|
1 |
|
"""SCIAMACHY data interface |
12
|
|
|
|
13
|
|
|
Data loading and selection routines for SCIAMACHY data regression fits. |
14
|
|
|
Includes reading the solar and geomagnetic index files used as proxies. |
15
|
|
|
""" |
16
|
|
|
|
17
|
1 |
|
import logging |
18
|
|
|
|
19
|
1 |
|
import numpy as np |
20
|
1 |
|
import pandas as pd |
21
|
1 |
|
import xarray as xr |
22
|
|
|
|
23
|
1 |
|
from astropy.time import Time |
24
|
|
|
|
25
|
1 |
|
__all__ = ["load_solar_gm_table", "load_scia_dzm"] |
26
|
|
|
|
27
|
1 |
|
# Hemispheric "summer" periods per SCIAMACHY mission year, given as
# pandas-style date-string slices.  Used by `load_scia_dzm()` to subset
# the time series via `xr.Dataset.sel(time=...)` when a season is
# requested ("summerNH" ~ SH winter, "summerSH" ~ NH winter).
_seasons = {
	"summerNH": [
		slice("2002-03-20", "2002-09-25"),
		slice("2003-03-20", "2003-10-13"),
		slice("2004-03-20", "2004-10-20"),
		slice("2005-03-20", "2005-10-20"),
		slice("2006-03-20", "2006-10-20"),
		slice("2007-03-21", "2007-10-20"),
		slice("2008-03-20", "2008-10-20"),
		slice("2009-03-20", "2009-10-20"),
		slice("2010-03-20", "2010-10-20"),
		slice("2011-03-20", "2011-08-31"),
		slice("2012-03-20", "2012-04-07"),
	],
	"summerSH": [
		slice("2002-09-13", "2003-03-26"),
		slice("2003-09-13", "2004-04-01"),
		slice("2004-09-14", "2005-04-02"),
		slice("2005-09-13", "2006-04-02"),
		slice("2006-09-13", "2007-04-02"),
		slice("2007-09-13", "2008-04-01"),
		slice("2008-09-14", "2009-04-02"),
		slice("2009-09-03", "2010-04-02"),
		slice("2010-09-13", "2011-04-02"),
		slice("2011-09-13", "2012-04-01"),
	]
}
54
|
|
|
|
55
|
1 |
|
# Date ranges of strong solar proton events (SPEs).  When requested via
# the `SPEs=True` keyword, `load_scia_dzm()` drops all measurement times
# falling inside these ranges (each range's first and last day are used
# as the bounds of a time slice).
_SPEs = [pd.date_range("2002-04-20", "2002-05-01"),
	pd.date_range("2002-05-21", "2002-06-02"),
	pd.date_range("2002-07-15", "2002-07-27"),
	pd.date_range("2002-08-23", "2002-09-03"),
	pd.date_range("2002-09-06", "2002-09-17"),
	pd.date_range("2002-11-08", "2002-11-20"),
	pd.date_range("2003-05-27", "2003-06-07"),
	pd.date_range("2003-10-25", "2003-11-15"),
	pd.date_range("2004-07-24", "2004-08-05"),
	pd.date_range("2004-11-06", "2004-11-18"),
	pd.date_range("2005-01-15", "2005-01-27"),
	pd.date_range("2005-05-13", "2005-05-25"),
	pd.date_range("2005-07-13", "2005-08-05"),
	pd.date_range("2005-08-21", "2005-09-02"),
	pd.date_range("2005-09-07", "2005-09-21"),
	pd.date_range("2006-12-05", "2006-12-23"),
	pd.date_range("2012-01-22", "2012-02-07"),
	pd.date_range("2012-03-06", "2012-03-27")]
73
|
|
|
|
74
|
|
|
|
75
|
1 |
|
def load_dailymeanAE(
		filename="data/indices/AE_Kyoto_1980-2016_daily2_shift12h.dat",
		name="AE", tfmt="jyear"):
	"""Load the daily-mean AE (auroral electrojet) index table.

	Resolves `filename` relative to the installed ``sciapy`` package data
	and delegates parsing to :func:`load_solar_gm_table`, reading the
	first two columns as time and index value.

	Returns
	-------
	(times, table): tuple
		Times in the `tfmt` format and the index values as a
		:class:`pandas.DataFrame` with one column named `name`.
	"""
	from pkg_resources import resource_filename
	fullpath = resource_filename("sciapy", filename)
	return load_solar_gm_table(
		fullpath,
		cols=[0, 1],
		names=["time", name],
		tfmt=tfmt,
	)
82
|
|
|
|
83
|
|
|
|
84
|
1 |
|
def load_dailymeanLya(filename="data/indices/lisird_lya3_1980-2017.dat",
		name="Lya", tfmt="jyear"):
	"""Load the daily-mean solar Lyman-alpha index table.

	Resolves `filename` relative to the installed ``sciapy`` package data
	and delegates parsing to :func:`load_solar_gm_table`, reading the
	first two columns as time and index value.

	Returns
	-------
	(times, table): tuple
		Times in the `tfmt` format and the index values as a
		:class:`pandas.DataFrame` with one column named `name`.
	"""
	from pkg_resources import resource_filename
	fullpath = resource_filename("sciapy", filename)
	return load_solar_gm_table(
		fullpath,
		cols=[0, 1],
		names=["time", name],
		tfmt=tfmt,
	)
90
|
|
|
|
91
|
|
|
|
92
|
1 |
View Code Duplication |
def load_solar_gm_table(filename, cols, names, sep="\t", tfmt="jyear"):
	r"""Load proxy tables from ascii files

	Thin wrapper around :func:`pandas.read_csv` [#]_ with pre-defined
	settings matching the proxy file format (comment lines start with
	``#``, the first used column holds the dates).
	Explicitly returns the times as UTC decimal years or julian epochs.

	.. [#] https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

	Parameters
	----------
	filename: str
		The file to read
	cols: array-like or callable
		The columns in the file to use, passed on as the
		`usecols` keyword.
	names: array-like
		The column names, passed on as `names`.
	sep: str, optional
		The column separator character, passed as `sep`.
		Default: `\t`
	tfmt: str, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"

	Returns
	-------
	(times, table): tuple
		The proxy times according to the `tfmt` keyword (UTC),
		the proxy values as a :class:`pandas.DataFrame`.

	See Also
	--------
	pandas.read_csv
	:class:`pandas.DataFrame`
	:class:`astropy.time.Time`
	"""
	# `read_csv` with an explicit `sep` behaves identically to
	# `read_table` here; the date column becomes the (naive) index.
	tab = pd.read_csv(
		filename,
		comment="#",
		index_col=0,
		parse_dates=[0],
		sep=sep,
		usecols=cols,
		names=names,
	)
	# Localize a copy to UTC only for the time conversion; the returned
	# table keeps its original (naive) datetime index.
	utc_index = tab.tz_localize("UTC").index
	times = Time(utc_index.to_pydatetime())
	return getattr(times, tfmt), tab
139
|
|
|
|
140
|
1 |
View Code Duplication |
def _greedy_select(ds, size, varname="NO_DENS_std", scale=1.):
	"""Greedily subsample `ds` along "time" to `size` entries.

	Repeatedly marks the entry with the largest (scaled, squared)
	`varname` value among the still-unmarked ones — i.e. the point with
	the largest entropy gain log(1 + var * sigma2) — and returns the
	dataset restricted to the marked time indices (in ascending order).

	NOTE(review): `min_entr_idx`, `var_p` and `sigma2_ip` track a
	parallel minimum-entropy selection, but `var_p` is masked at
	`max_entr_idx` (not `min_entr_idx`) and never influences the
	returned selection — possibly dead code or a latent bug; confirm
	the intent before touching it.
	"""
	logging.info("Greedy subsampling to size: %s", size)
	# Squared, scaled uncertainties; the mask marks already-picked points.
	var = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	var_p = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	sigma2_i = scale**2
	sigma2_ip = scale**2
	idxs = np.arange(len(var))
	for _ in range(size):
		# log() is monotonic and sigma2_i > 0, so this picks the largest
		# remaining variance.
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip))
		sigma2_i += 1. / var[max_entr_idx]
		sigma2_ip += 1. / var_p[min_entr_idx]
		var.mask[max_entr_idx] = True
		# NOTE(review): masks the *max*-entropy index here as well,
		# keeping var_p's mask identical to var's.
		var_p.mask[max_entr_idx] = True
	return ds.isel(time=idxs[var.mask])
155
|
|
|
|
156
|
1 |
View Code Duplication |
def _greedy_idxs_post(x, xerr, size): |
|
|
|
|
157
|
|
|
logging.info("Greedy subsampling to size: %s", size) |
158
|
|
|
var = np.ma.masked_array(xerr**2, mask=[False]) |
159
|
|
|
var_p = np.ma.masked_array(xerr**2, mask=[False]) |
160
|
|
|
sigma2_i = 1. |
161
|
|
|
sigma2_ip = 1. |
162
|
|
|
idxs = np.arange(len(var)) |
163
|
|
|
for _ in range(size): |
164
|
|
|
max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i)) |
165
|
|
|
min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip)) |
166
|
|
|
sigma2_i += 1. / var[max_entr_idx] |
167
|
|
|
sigma2_ip += 1. / var_p[min_entr_idx] |
168
|
|
|
var.mask[max_entr_idx] = True |
169
|
|
|
var_p.mask[max_entr_idx] = True |
170
|
|
|
return idxs[var.mask] |
171
|
|
|
|
172
|
1 |
View Code Duplication |
def load_scia_dzm(filename, alt, lat, tfmt="jyear",
		scale=1, subsample_factor=1, subsample_method="greedy",
		akd_threshold=0.002, cnt_threshold=0,
		center=False, season=None, SPEs=False):
	"""Load SCIAMACHY daily zonal mean data

	Interface function for SCIAMACHY daily zonal mean data files version 6.x.
	Uses :mod:`xarray` [#]_ to load and select the data. Possible selections are by
	hemispheric summer (NH summer ~ SH winter and SH summer ~ NH winter) and
	exclusion of strong solar proton events (SPE).

	.. [#] https://xarray.pydata.org

	Parameters
	----------
	filename: str
		The input filename (passed to `xr.open_mfdataset`, so it
		presumably may contain wildcards — confirm against callers).
	alt: float
		The altitude
	lat: float
		The latitude
	tfmt: string, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"
	scale: float, optional
		Scale factor of the data (default: 1)
	subsample_factor: int, optional
		Factor to subsample the data by (see `subsample_method`)
		(default: 1 (no subsampling))
	subsample_method: "equal", "greedy", or "random", optional
		Method for subsampling the data (see `subsample_factor`).
		"equal" for equally spaced subsampling,
		"greedy" for selecting the data based on their uncertainty,
		and "random" for uniform random subsampling.
		(default: "greedy")
	akd_threshold: float, optional
		Exclude data points whose averaging-kernel diagonal
		(`NO_AKDIAG`) is not greater than this value (default: 0.002)
	cnt_threshold: int, optional
		Exclude data points whose measurement count is not greater
		than this value (default: 0)
	center: bool, optional
		Center the data by subtracting the global mean.
		(default: False)
	season: "summerNH", "summerSH", or `None`, optional
		Select the named season or `None` for all data (default: None)
	SPEs: bool, optional
		Set to `True` to exclude strong SPE events (default: False)

	Returns
	-------
	(times, dens, errs, szas): tuple of (N,) array_like
		The measurement times according to the `tfmt` keyword,
		the number densities, their uncertainties
		(standard deviation / sqrt(counts)),
		and the mean solar zenith angles.
	"""
	logging.info("Opening dataset: '%s'", filename)
	# Times are decoded manually below (version-dependent API), hence
	# decode_times=False here.
	NO_ds = xr.open_mfdataset(filename, decode_times=False,
			chunks={"time": 400, "latitude": 9, "altitude": 11})
	logging.info("done.")
	# Decode time coordinate for selection
	try:
		# xarray <= 0.9.6
		NO_ds["time"] = xr.conventions.decode_cf_variable(NO_ds.time)
	except TypeError:
		# xarray => 0.10.0
		NO_ds["time"] = xr.conventions.decode_cf_variable("time", NO_ds.time)

	# Global mean is computed over the *full* dataset, before any
	# altitude/latitude selection or filtering.
	NO_mean = 0.
	if center:
		NO_mean = NO_ds.NO_DENS.mean().values
		logging.info("Centering with global mean: %s", NO_mean)
	NO_tds = NO_ds.sel(latitude=lat, altitude=alt)

	# Exclude SPEs first if requested
	if SPEs:
		logging.info("Removing SPEs.")
		for spe in _SPEs:
			# Drop all times within [first, last] day of the SPE range.
			NO_tds = NO_tds.drop(
					NO_tds.sel(time=slice(spe[0], spe[-1])).time.values,
					dim="time")

	# Filter by season
	if season in _seasons.keys():
		logging.info("Restricting to season: %s", season)
		NO_tds = xr.concat([NO_tds.sel(time=s) for s in _seasons[season]],
				dim="time")
		# Force evaluation of the lazily concatenated selection.
		NO_tds.load()
	else:
		logging.info("No season selected or unknown season, "
				"using all available data.")

	# Older file versions name the counts variable differently —
	# presumably "counts"; verify against the data files.
	try:
		NO_counts = NO_tds.NO_DENS_cnt
	except AttributeError:
		NO_counts = NO_tds.counts

	# Select only useful data
	NO_tds = NO_tds.where(
			np.isfinite(NO_tds.NO_DENS) &
			(NO_tds.NO_DENS_std != 0) &
			(NO_tds.NO_AKDIAG > akd_threshold) &
			(NO_counts > cnt_threshold) &
			(NO_tds.NO_MASK == 0),
			drop=True)

	no_dens = scale * NO_tds.NO_DENS
	if center:
		# Subtract the (scaled) global mean computed above.
		no_dens -= scale * NO_mean
	# Standard error of the mean: std / sqrt(n).
	no_errs = scale * NO_tds.NO_DENS_std / np.sqrt(NO_counts)
	logging.debug("no_dens.shape (ntime,): %s", no_dens.shape)

	no_sza = NO_tds.mean_SZA

	# Convert to astropy.Time for Julian epoch or decimal year
	no_t = Time(pd.to_datetime(NO_tds.time.values, utc=True).to_pydatetime())
	no_ys = getattr(no_t, tfmt)

	if subsample_factor > 1:
		new_data_size = no_dens.shape[0] // subsample_factor
		if subsample_method == "random":
			# random subsampling
			_idxs = np.random.choice(no_dens.shape[0],
					new_data_size, replace=False)
		elif subsample_method == "equal":
			# equally spaced subsampling
			_idxs = slice(0, no_dens.shape[0], subsample_factor)
		else:
			# "greedy" subsampling (default fall-back)
			_idxs = _greedy_idxs_post(no_dens, no_errs, new_data_size)
		return (no_ys[_idxs],
				no_dens.values[_idxs],
				no_errs.values[_idxs],
				no_sza.values[_idxs])

	return no_ys, no_dens.values, no_errs.values, no_sza.values
304
|
|
|
|