Passed
Push — master ( e18d2d...77f7c6 )
by Stefan
05:33
created

sciapy.regress.load_data.load_solar_gm_table()   A

Complexity

Conditions 3

Size

Total Lines 56
Code Lines 12

Duplication

Lines 56
Ratio 100 %

Code Coverage

Tests 6
CRAP Score 3.0261

Importance

Changes 0
Metric Value
cc 3
eloc 12
nop 5
dl 56
loc 56
ccs 6
cts 7
cp 0.8571
crap 3.0261
rs 9.8
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
# vim:fileencoding=utf-8
3
#
4
# Copyright (c) 2017-2018 Stefan Bender
5
#
6
# This module is part of sciapy.
7
# sciapy is free software: you can redistribute it or modify
8
# it under the terms of the GNU General Public License as published
9
# by the Free Software Foundation, version 2.
10
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
11 1
"""SCIAMACHY data interface
12
13
Data loading and selection routines for SCIAMACHY data regression fits.
14
Includes reading the solar and geomagnetic index files used as proxies.
15
"""
16
17 1
import logging
18
19 1
import numpy as np
20 1
import pandas as pd
21 1
import xarray as xr
22
23 1
from astropy.time import Time
24
25 1
__all__ = ["load_solar_gm_table", "load_scia_dzm"]
26
27 1
# Hemispheric "summer" time spans (inclusive date-string slices) used to
# restrict the SCIAMACHY data to the local summer of either hemisphere.
_seasons = {
	hemi: [slice(start, stop) for start, stop in spans]
	for hemi, spans in [
		("summerNH", [
			("2002-03-20", "2002-09-25"),
			("2003-03-20", "2003-10-13"),
			("2004-03-20", "2004-10-20"),
			("2005-03-20", "2005-10-20"),
			("2006-03-20", "2006-10-20"),
			("2007-03-21", "2007-10-20"),
			("2008-03-20", "2008-10-20"),
			("2009-03-20", "2009-10-20"),
			("2010-03-20", "2010-10-20"),
			("2011-03-20", "2011-08-31"),
			("2012-03-20", "2012-04-07"),
		]),
		("summerSH", [
			("2002-09-13", "2003-03-26"),
			("2003-09-13", "2004-04-01"),
			("2004-09-14", "2005-04-02"),
			("2005-09-13", "2006-04-02"),
			("2006-09-13", "2007-04-02"),
			("2007-09-13", "2008-04-01"),
			("2008-09-14", "2009-04-02"),
			("2009-09-03", "2010-04-02"),
			("2010-09-13", "2011-04-02"),
			("2011-09-13", "2012-04-01"),
		]),
	]
}
54
55 1
# Periods of strong solar proton events (SPEs); these daily date ranges can
# be excluded from the data via the `SPEs` keyword of `load_scia_dzm`.
_SPEs = [
	pd.date_range(start, stop)
	for start, stop in [
		("2002-04-20", "2002-05-01"),
		("2002-05-21", "2002-06-02"),
		("2002-07-15", "2002-07-27"),
		("2002-08-23", "2002-09-03"),
		("2002-09-06", "2002-09-17"),
		("2002-11-08", "2002-11-20"),
		("2003-05-27", "2003-06-07"),
		("2003-10-25", "2003-11-15"),
		("2004-07-24", "2004-08-05"),
		("2004-11-06", "2004-11-18"),
		("2005-01-15", "2005-01-27"),
		("2005-05-13", "2005-05-25"),
		("2005-07-13", "2005-08-05"),
		("2005-08-21", "2005-09-02"),
		("2005-09-07", "2005-09-21"),
		("2006-12-05", "2006-12-23"),
		("2012-01-22", "2012-02-07"),
		("2012-03-06", "2012-03-27"),
	]
]
73
74
75 1
def load_dailymeanAE(
		filename="data/indices/AE_Kyoto_1980-2016_daily2_shift12h.dat",
		name="AE", tfmt="jyear"):
	"""Load the packaged daily mean auroral electrojet (AE) index table.

	Resolves `filename` relative to the installed "sciapy" package and
	delegates the parsing to :func:`load_solar_gm_table`.
	"""
	from pkg_resources import resource_filename
	aefile = resource_filename("sciapy", filename)
	return load_solar_gm_table(
			aefile, cols=[0, 1], names=["time", name], tfmt=tfmt)
82
83
84 1
def load_dailymeanLya(filename="data/indices/lisird_lya3_1980-2017.dat",
		name="Lya", tfmt="jyear"):
	"""Load the packaged daily mean solar Lyman-alpha index table.

	Resolves `filename` relative to the installed "sciapy" package and
	delegates the parsing to :func:`load_solar_gm_table`.
	"""
	from pkg_resources import resource_filename
	lyafile = resource_filename("sciapy", filename)
	return load_solar_gm_table(
			lyafile, cols=[0, 1], names=["time", name], tfmt=tfmt)
90
91
92 1 View Code Duplication
def load_solar_gm_table(filename, cols, names, sep="\t", tfmt="jyear"):
	r"""Load proxy tables from ascii files

	This function wraps :func:`numpy.genfromtxt()` [#]_ with
	pre-defined settings to match the file format.
	It explicitly returns the times as UTC decimal years or julian epochs.

	.. [#] https://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html

	Parameters
	----------
	filename: str
		The file to read
	cols: sequence
		The columns in the file to use, passed to `numpy.genfromtxt`'s
		`usecols` keyword. Should be at least of length 2, indicating
		time values in the first column, e.g., `cols=[0, 1]`.
	names: sequence
		The column names, passed as `names` to `numpy.genfromtxt()`.
		Should be at least of length 2, naming the proxy values in the
		second column, e.g., `names=["time", "proxy"]`.
	sep: str, optional
		The column separator character, passed as `delimiter`.
		Default: `\t`
	tfmt: str, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"

	Returns
	-------
	times: array_like
		The proxy times according to the `tfmt` keyword (UTC) as
		:class:`numpy.ndarray` from the first column of "cols".
	values: tuple of array_like
		The proxy values combined into a structured array [#]_
		according to "names[1:]" and "cols[1:]" passed above.

	Raises
	------
	ValueError
		If either `cols` or `names` has fewer than two entries.

	See Also
	--------
	:class:`astropy.time.Time`
	.. [#] https://docs.scipy.org/doc/numpy/user/basics.rec.html
	"""
	# A time column and at least one value column are required; reject the
	# call if *either* list is too short. (This used `and`, which let a
	# mismatched `cols`/`names` pair slip through to `genfromtxt`.)
	if len(cols) < 2 or len(names) < 2:
		raise ValueError(
				"Both `cols` and `names` should be at least of length 2.")
	tab = np.genfromtxt(filename,
			delimiter=sep,
			dtype=None,
			names=names,
			usecols=cols)
	# Interpret the first column as UTC times and convert them to the
	# requested astropy time format (e.g., "jyear", "jd").
	times = Time(tab[names[0]], scale="utc")
	ts = getattr(times, tfmt)
	return ts, tab[names[1:]]
148
149
150 1 View Code Duplication
def _greedy_select(ds, size, varname="NO_DENS_std", scale=1.):
	"""Greedily subsample `ds` along its "time" dimension to `size` points.

	Iteratively selects the point with the largest entropy contribution
	``log(1 + var * sigma2)`` based on the (scaled, squared) uncertainties
	in ``ds[varname]`` and returns the dataset reduced to those points.
	"""
	logging.info("Greedy subsampling to size: %s", size)
	# Squared (scaled) uncertainties; the mask marks the selected points.
	var = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	var_p = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	sigma2_i = scale**2
	sigma2_ip = scale**2
	idxs = np.arange(len(var))
	for _ in range(size):
		# Largest entropy gain among the not-yet-selected points.
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip))
		sigma2_i += 1. / var[max_entr_idx]
		sigma2_ip += 1. / var_p[min_entr_idx]
		var.mask[max_entr_idx] = True
		# NOTE(review): masks `max_entr_idx` (not `min_entr_idx`), so the
		# `var_p`/`sigma2_ip`/`min_entr_idx` bookkeeping never influences
		# the returned selection; possibly a bug if a minimum-entropy
		# variant was intended — verify against the original intent.
		var_p.mask[max_entr_idx] = True
	# Keep only the selected (now masked) time indices.
	return ds.isel(time=idxs[var.mask])
165
166 1 View Code Duplication
def _greedy_idxs_post(x, xerr, size):
	"""Greedily pick `size` indices by decreasing entropy contribution.

	Iteratively selects the (not yet selected) index maximising
	``log(1 + xerr**2 * sigma2)`` and updates the precision estimate
	``sigma2 += 1 / xerr**2`` with the chosen point's variance.

	Parameters
	----------
	x: array_like
		Unused; kept for interface compatibility with existing callers.
	xerr: (N,) array_like
		The data uncertainties (standard deviations).
	size: int
		The number of indices to select.

	Returns
	-------
	idxs: (size,) numpy.ndarray
		The selected indices, in ascending order.
	"""
	logging.info("Greedy subsampling to size: %s", size)
	# Squared uncertainties; the mask marks the already-selected points.
	var = np.ma.masked_array(xerr**2, mask=[False])
	sigma2_i = 1.
	idxs = np.arange(len(var))
	for _ in range(size):
		# Largest entropy gain among the not-yet-selected points.
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		sigma2_i += 1. / var[max_entr_idx]
		var.mask[max_entr_idx] = True
	# The previous version also tracked a parallel minimum-entropy branch
	# (`var_p`, `sigma2_ip`, `min_entr_idx`) that never affected the
	# returned indices; that dead code has been removed.
	return idxs[var.mask]
181
182 1 View Code Duplication
def load_scia_dzm(filename, alt, lat, tfmt="jyear",
		scale=1, subsample_factor=1, subsample_method="greedy",
		akd_threshold=0.002, cnt_threshold=0,
		center=False, season=None, SPEs=False):
	"""Load SCIAMACHY daily zonal mean data

	Interface function for SCIAMACHY daily zonal mean data files version 6.x.
	Uses :mod:`xarray` [#]_ to load and select the data. Possible selections are by
	hemispheric summer (NH summer ~ SH winter and SH summer ~ NH winter) and
	exclusion of strong solar proton events (SPE).

	.. [#] https://xarray.pydata.org

	Parameters
	----------
	filename: str
		The input filename
	alt: float
		The altitude
	lat: float
		The latitude
	tfmt: string, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"
	scale: float, optional
		Scale factor of the data (default: 1)
	subsample_factor: int, optional
		Factor to subsample the data by (see `subsample_method`)
		(default: 1 (no subsampling))
	subsample_method: "equal", "greedy", or "random", optional
		Method for subsampling the data (see `subsample_factor`).
		"equal" for equally spaced subsampling,
		"greedy" for selecting the data based on their uncertainty,
		and "random" for uniform random subsampling.
		(default: "greedy")
	akd_threshold: float, optional
		Exclude data whose averaging kernel diagonal elements are not
		greater than this value (default: 0.002).
	cnt_threshold: int, optional
		Exclude data with measurement counts not greater than this
		value (default: 0).
	center: bool, optional
		Center the data by subtracting the global mean.
		(default: False)
	season: "summerNH", "summerSH", or `None`, optional
		Select the named season or `None` for all data (default: None)
	SPEs: bool, optional
		Set to `True` to exclude strong SPE events (default: False)

	Returns
	-------
	(times, dens, errs, szas): tuple of (N,) array_like
		The measurement times according to the `tfmt` keyword,
		the number densities, their uncertainties, and the
		mean solar zenith angles.
	"""
	logging.info("Opening dataset: '%s'", filename)
	NO_ds = xr.open_mfdataset(filename, decode_times=False,
				chunks={"time": 400, "latitude": 9, "altitude": 11})
	logging.info("done.")
	# Decode time coordinate for selection; the call signature of
	# decode_cf_variable changed between xarray versions.
	try:
		# xarray <= 0.9.6
		NO_ds["time"] = xr.conventions.decode_cf_variable(NO_ds.time)
	except TypeError:
		# xarray => 0.10.0
		NO_ds["time"] = xr.conventions.decode_cf_variable("time", NO_ds.time)

	# Global mean for optional centering (computed before any selection).
	NO_mean = 0.
	if center:
		NO_mean = NO_ds.NO_DENS.mean().values
		logging.info("Centering with global mean: %s", NO_mean)
	NO_tds = NO_ds.sel(latitude=lat, altitude=alt)

	# Exclude SPEs first if requested
	if SPEs:
		logging.info("Removing SPEs.")
		for spe in _SPEs:
			NO_tds = NO_tds.drop(
					NO_tds.sel(time=slice(spe[0], spe[-1])).time.values,
					dim="time")

	# Filter by season
	if season in _seasons.keys():
		logging.info("Restricting to season: %s", season)
		NO_tds = xr.concat([NO_tds.sel(time=s) for s in _seasons[season]],
					dim="time")
		NO_tds.load()
	else:
		logging.info("No season selected or unknown season, "
					"using all available data.")

	# The counts variable was renamed between data versions.
	try:
		NO_counts = NO_tds.NO_DENS_cnt
	except AttributeError:
		NO_counts = NO_tds.counts

	# Select only useful data
	NO_tds = NO_tds.where(
				np.isfinite(NO_tds.NO_DENS) &
				(NO_tds.NO_DENS_std != 0) &
				(NO_tds.NO_AKDIAG > akd_threshold) &
				(NO_counts > cnt_threshold) &
				(NO_tds.NO_MASK == 0),
				drop=True)

	no_dens = scale * NO_tds.NO_DENS
	if center:
		no_dens -= scale * NO_mean
	# Standard error of the (daily zonal) mean.
	no_errs = scale * NO_tds.NO_DENS_std / np.sqrt(NO_counts)
	logging.debug("no_dens.shape (ntime,): %s", no_dens.shape)

	no_sza = NO_tds.mean_SZA

	# Convert to astropy.Time for Julian epoch or decimal year
	no_t = Time(pd.to_datetime(NO_tds.time.values, utc=True).to_pydatetime())
	no_ys = getattr(no_t, tfmt)

	if subsample_factor > 1:
		new_data_size = no_dens.shape[0] // subsample_factor
		if subsample_method == "random":
			# random subsampling
			_idxs = np.random.choice(no_dens.shape[0],
					new_data_size, replace=False)
		elif subsample_method == "equal":
			# equally spaced subsampling
			_idxs = slice(0, no_dens.shape[0], subsample_factor)
		else:
			# "greedy" subsampling (default fall-back)
			_idxs = _greedy_idxs_post(no_dens, no_errs, new_data_size)
		return (no_ys[_idxs],
				no_dens.values[_idxs],
				no_errs.values[_idxs],
				no_sza.values[_idxs])

	return no_ys, no_dens.values, no_errs.values, no_sza.values
314