Passed
Push — master (353449...a10708) by Stefan, created 05:50

sciapy.regress.load_data.load_solar_gm_table()   (rating: A)

Complexity
  Conditions: 5

Size
  Total Lines: 61
  Code Lines: 16

Duplication
  Lines: 61
  Ratio: 100 %

Code Coverage
  Tests: 9
  CRAP Score: 5.025

Importance
  Changes: 0

Metric   Value
cc       5
eloc     16
nop      5
dl       61
loc      61
ccs      9
cts      10
cp       0.9
crap     5.025
rs       9.1333
c        0
b        0
f        0
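For reference, the listed CRAP Score is what the commonly used CRAP formula gives for this complexity and coverage; this is an inference from the numbers above, assuming the standard definition:

cc = 5                              # cyclomatic complexity ("Conditions")
cp = 0.9                            # statement coverage (ccs / cts = 9 / 10)
crap = cc**2 * (1. - cp)**3 + cc    # 25 * 0.001 + 5
print(crap)                         # 5.025, matching the reported CRAP Score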

How to fix: Long Method

Small methods make your code easier to understand, especially when combined with a good name. And if a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for its name (a minimal sketch of such an extraction follows).
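Purely as an illustration (the names below are invented for this example and do not come from sciapy), such an extraction might look like this:

# Before: one method doing two things, with a comment marking the second part.
def load_proxy(filename):
	lines = open(filename).read().splitlines()
	# parse the first column as decimal years
	return [float(line.split()[0]) for line in lines]

# After: the commented part becomes its own, descriptively named method.
def parse_decimal_years(lines):
	return [float(line.split()[0]) for line in lines]

def load_proxy(filename):
	lines = open(filename).read().splitlines()
	return parse_decimal_years(lines)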

Commonly applied refactorings include:

# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data interface

Data loading and selection routines for SCIAMACHY data regression fits.
Includes reading the solar and geomagnetic index files used as proxies.
"""

import logging

import numpy as np
import pandas as pd
import xarray as xr

from astropy.time import Time

__all__ = ["load_solar_gm_table", "load_scia_dzm"]

_NPV_MAJOR, _NPV_MINOR, _NPV_PATCH = np.__version__.split('.')

_seasons = {
	"summerNH": [
		slice("2002-03-20", "2002-09-25"),
		slice("2003-03-20", "2003-10-13"),
		slice("2004-03-20", "2004-10-20"),
		slice("2005-03-20", "2005-10-20"),
		slice("2006-03-20", "2006-10-20"),
		slice("2007-03-21", "2007-10-20"),
		slice("2008-03-20", "2008-10-20"),
		slice("2009-03-20", "2009-10-20"),
		slice("2010-03-20", "2010-10-20"),
		slice("2011-03-20", "2011-08-31"),
		slice("2012-03-20", "2012-04-07"),
	],
	"summerSH": [
		slice("2002-09-13", "2003-03-26"),
		slice("2003-09-13", "2004-04-01"),
		slice("2004-09-14", "2005-04-02"),
		slice("2005-09-13", "2006-04-02"),
		slice("2006-09-13", "2007-04-02"),
		slice("2007-09-13", "2008-04-01"),
		slice("2008-09-14", "2009-04-02"),
		slice("2009-09-03", "2010-04-02"),
		slice("2010-09-13", "2011-04-02"),
		slice("2011-09-13", "2012-04-01"),
	]
}
_SPEs = [pd.date_range("2002-04-20", "2002-05-01"),
		pd.date_range("2002-05-21", "2002-06-02"),
		pd.date_range("2002-07-15", "2002-07-27"),
		pd.date_range("2002-08-23", "2002-09-03"),
		pd.date_range("2002-09-06", "2002-09-17"),
		pd.date_range("2002-11-08", "2002-11-20"),
		pd.date_range("2003-05-27", "2003-06-07"),
		pd.date_range("2003-10-25", "2003-11-15"),
		pd.date_range("2004-07-24", "2004-08-05"),
		pd.date_range("2004-11-06", "2004-11-18"),
		pd.date_range("2005-01-15", "2005-01-27"),
		pd.date_range("2005-05-13", "2005-05-25"),
		pd.date_range("2005-07-13", "2005-08-05"),
		pd.date_range("2005-08-21", "2005-09-02"),
		pd.date_range("2005-09-07", "2005-09-21"),
		pd.date_range("2006-12-05", "2006-12-23"),
		pd.date_range("2012-01-22", "2012-02-07"),
		pd.date_range("2012-03-06", "2012-03-27")]

def load_dailymeanAE(
		filename="data/indices/AE_Kyoto_1980-2016_daily2_shift12h.dat",
		name="AE", tfmt="jyear"):
	from pkg_resources import resource_filename
	AEfilename = resource_filename("sciapy", filename)
	return load_solar_gm_table(AEfilename, cols=[0, 1],
			names=["time", name], tfmt=tfmt)


def load_dailymeanLya(filename="data/indices/lisird_lya3_1980-2017.dat",
		name="Lya", tfmt="jyear"):
	from pkg_resources import resource_filename
	Lyafilename = resource_filename("sciapy", filename)
	return load_solar_gm_table(Lyafilename, cols=[0, 1],
			names=["time", name], tfmt=tfmt)

[View Code Duplication] This code seems to be duplicated in your project.
def load_solar_gm_table(filename, cols, names, sep="\t", tfmt="jyear"):
	"""Load proxy tables from ascii files

	This function wraps :func:`numpy.genfromtxt()` [#]_ with
	pre-defined settings to match the file format.
	It explicitly returns the times as UTC decimal years or julian epochs.

	.. [#] https://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html

	Parameters
	----------
	filename: str
		The file to read
	cols: sequence
		The columns in the file to use, passed to `numpy.genfromtxt`'s
		`usecols` keyword. Should be at least of length 2, indicating
		time values in the first column, e.g., `cols=[0, 1]`.
	names: sequence
		The column names, passed as `names` to `numpy.genfromtxt()`.
		Should be at least of length 2, naming the proxy values in the
		second column, e.g., `names=["time", "proxy"]`.
	sep: str, optional
		The column separator character, passed as `delimiter`
		to `numpy.genfromtxt()`.
		Default: `\t`
	tfmt: str, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"

	Returns
	-------
	times: array_like
		The proxy times according to the `tfmt` keyword (UTC) as
		:class:`numpy.ndarray` from the first column of "cols".
	values: tuple of array_like
		The proxy values combined into a structured array [#]_
		according to "names[1:]" and "cols[1:]" passed above.

	See Also
	--------
	:class:`astropy.time.Time`
	.. [#] https://docs.scipy.org/doc/numpy/user/basics.rec.html
	"""
	if len(cols) < 2 and len(names) < 2:
		raise ValueError(
				"Both `cols` and `names` should be at least of length 2.")
	encoding = {}
	# set the encoding for numpy >= 1.14.0
	if int(_NPV_MAJOR) >= 1 and int(_NPV_MINOR) >= 14:
		encoding = dict(encoding=None)
	tab = np.genfromtxt(filename,
			delimiter=sep,
			dtype=None,
			names=names,
			usecols=cols,
			**encoding)
	times = Time(tab[names[0]], scale="utc")
	ts = getattr(times, tfmt)
	return ts, tab[names[1:]]

[View Code Duplication] This code seems to be duplicated in your project.
def _greedy_select(ds, size, varname="NO_DENS_std", scale=1.):
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	var_p = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	sigma2_i = scale**2
	sigma2_ip = scale**2
	idxs = np.arange(len(var))
	for _ in range(size):
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip))
		sigma2_i += 1. / var[max_entr_idx]
		sigma2_ip += 1. / var_p[min_entr_idx]
		var.mask[max_entr_idx] = True
		var_p.mask[max_entr_idx] = True
	return ds.isel(time=idxs[var.mask])

[View Code Duplication] This code seems to be duplicated in your project.
def _greedy_idxs_post(x, xerr, size):
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array(xerr**2, mask=[False])
	var_p = np.ma.masked_array(xerr**2, mask=[False])
	sigma2_i = 1.
	sigma2_ip = 1.
	idxs = np.arange(len(var))
	for _ in range(size):
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip))
		sigma2_i += 1. / var[max_entr_idx]
		sigma2_ip += 1. / var_p[min_entr_idx]
		var.mask[max_entr_idx] = True
		var_p.mask[max_entr_idx] = True
	return idxs[var.mask]

[View Code Duplication] This code seems to be duplicated in your project.
def load_scia_dzm(filename, alt, lat, tfmt="jyear",
		scale=1, subsample_factor=1, subsample_method="greedy",
		akd_threshold=0.002, cnt_threshold=0,
		center=False, season=None, SPEs=False):
	"""Load SCIAMACHY daily zonal mean data

	Interface function for SCIAMACHY daily zonal mean data files version 6.x.
	Uses :mod:`xarray` [#]_ to load and select the data. Possible selections are by
	hemispheric summer (NH summer ~ SH winter and SH summer ~ NH winter) and
	exclusion of strong solar proton events (SPE).

	.. [#] https://xarray.pydata.org

	Parameters
	----------
	filename: str
		The input filename
	alt: float
		The altitude
	lat: float
		The latitude
	tfmt: string, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"
	scale: float, optional
		Scale factor of the data (default: 1)
	subsample_factor: int, optional
		Factor to subsample the data by (see `subsample_method`)
		(default: 1 (no subsampling))
	subsample_method: "equal", "greedy", or "random", optional
		Method for subsampling the data (see `subsample_factor`).
		"equal" for equally spaced subsampling,
		"greedy" for selecting the data based on their uncertainty,
		and "random" for uniform random subsampling.
		(default: "greedy")
	center: bool, optional
		Center the data by subtracting the global mean.
		(default: False)
	season: "summerNH", "summerSH", or `None`, optional
		Select the named season or `None` for all data (default: None)
	SPEs: bool, optional
		Set to `True` to exclude strong SPE events (default: False)

	Returns
	-------
	(times, dens, errs, szas): tuple of (N,) array_like
		The measurement times according to the `tfmt` keyword,
		the number densities, their uncertainties,
		and the mean solar zenith angles.
	"""
	logging.info("Opening dataset: '%s'", filename)
	NO_ds = xr.open_mfdataset(filename, decode_times=False,
				chunks={"time": 400, "latitude": 9, "altitude": 11})
	logging.info("done.")
	# Decode time coordinate for selection
	try:
		# xarray <= 0.9.6
		NO_ds["time"] = xr.conventions.decode_cf_variable(NO_ds.time)
	except TypeError:
		# xarray >= 0.10.0
		NO_ds["time"] = xr.conventions.decode_cf_variable("time", NO_ds.time)

	NO_mean = 0.
	if center:
		NO_mean = NO_ds.NO_DENS.mean().values
		logging.info("Centering with global mean: %s", NO_mean)
	NO_tds = NO_ds.sel(latitude=lat, altitude=alt)

	# Exclude SPEs first if requested
	if SPEs:
		logging.info("Removing SPEs.")
		for spe in _SPEs:
			NO_tds = NO_tds.drop(
					NO_tds.sel(time=slice(spe[0], spe[-1])).time.values,
					dim="time")

	# Filter by season
	if season in _seasons.keys():
		logging.info("Restricting to season: %s", season)
		NO_tds = xr.concat([NO_tds.sel(time=s) for s in _seasons[season]],
					dim="time")
		NO_tds.load()
	else:
		logging.info("No season selected or unknown season, "
					"using all available data.")

	try:
		NO_counts = NO_tds.NO_DENS_cnt
	except AttributeError:
		NO_counts = NO_tds.counts

	# Select only useful data
	NO_tds = NO_tds.where(
				np.isfinite(NO_tds.NO_DENS) &
				(NO_tds.NO_DENS_std != 0) &
				(NO_tds.NO_AKDIAG > akd_threshold) &
				(NO_counts > cnt_threshold) &
				(NO_tds.NO_MASK == 0),
				drop=True)

	no_dens = scale * NO_tds.NO_DENS
	if center:
		no_dens -= scale * NO_mean
	no_errs = scale * NO_tds.NO_DENS_std / np.sqrt(NO_counts)
	logging.debug("no_dens.shape (ntime,): %s", no_dens.shape)

	no_sza = NO_tds.mean_SZA

	# Convert to astropy.Time for Julian epoch or decimal year
	no_t = Time(pd.to_datetime(NO_tds.time.values, utc=True).to_pydatetime())
	no_ys = getattr(no_t, tfmt)

	if subsample_factor > 1:
		new_data_size = no_dens.shape[0] // subsample_factor
		if subsample_method == "random":
			# random subsampling
			_idxs = np.random.choice(no_dens.shape[0],
					new_data_size, replace=False)
		elif subsample_method == "equal":
			# equally spaced subsampling
			_idxs = slice(0, no_dens.shape[0], subsample_factor)
		else:
			# "greedy" subsampling (default fall-back)
			_idxs = _greedy_idxs_post(no_dens, no_errs, new_data_size)
		return (no_ys[_idxs],
				no_dens.values[_idxs],
				no_errs.values[_idxs],
				no_sza.values[_idxs])

	return no_ys, no_dens.values, no_errs.values, no_sza.values
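For orientation, a minimal usage sketch of load_solar_gm_table; the file path and column names are placeholders, not taken from the report:

from sciapy.regress.load_data import load_solar_gm_table

# Returns the times (here as Julian epochs) and a structured array
# holding the proxy column(s).
times, proxies = load_solar_gm_table(
		"path/to/proxy_table.dat",          # placeholder path
		cols=[0, 1], names=["time", "Lya"], tfmt="jyear")
lya = proxies["Lya"]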
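Similarly, a sketch of calling load_scia_dzm; the filename, altitude, and latitude are placeholder values, while the keyword names follow the docstring above:

from sciapy.regress.load_data import load_scia_dzm

# Returns measurement times, number densities, their uncertainties,
# and the mean solar zenith angles, here restricted to NH summer
# with SPE periods removed.
ts, dens, errs, szas = load_scia_dzm(
		"path/to/scia_dzm_v6.nc",           # placeholder filename
		alt=70., lat=65.,                   # placeholder altitude / latitude
		tfmt="jyear", season="summerNH", SPEs=True)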