Passed
Push — master (b988a0...edd9bf) by Stefan
created 05:54

sciapy.regress.load_data  (rating: A)

Complexity
    Total Complexity: 18

Size/Duplication
    Total Lines: 304
    Duplicated Lines: 68.75%

Test Coverage
    Coverage: 61.46%

Importance
    Changes: 0
Metric  Value
eloc    161      (executable lines of code)
dl      209      (duplicated lines)
loc     304      (lines of code)
ccs     59       (covered statements)
cts     96       (total statements)
cp      0.6146   (coverage)
rs      10
c       0
b       0
f       0
wmc     18       (total complexity)

6 Functions

Rating  Name                   Duplication (lines)  Size (lines)  Complexity
A       load_dailymeanLya()    0                    6             1
A       load_dailymeanAE()     0                    7             1
A       _greedy_idxs_post()    15                   15            2
A       load_solar_gm_table()  47                   47            1
A       _greedy_select()       15                   15            2
C       load_scia_dzm()        132                  132           11

How to fix: Duplicated Code

Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places.

The most common fix is to extract the duplicated logic into a single function, class, or module that all call sites share.
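In this module, for instance, `_greedy_select()` and `_greedy_idxs_post()` repeat the same greedy selection loop almost line for line. A possible restructuring is sketched below; it is an illustration only, not the module's current code, and the helper name `_greedy_mask` is invented here:

import numpy as np


def _greedy_mask(variances, size, sigma2_init=1.):
	# Hypothetical shared helper: the greedy entropy-based selection loop
	# that _greedy_select() and _greedy_idxs_post() currently repeat.
	var = np.ma.masked_array(np.asarray(variances, dtype=float),
			mask=np.zeros(len(variances), dtype=bool))
	var_p = np.ma.masked_array(np.asarray(variances, dtype=float),
			mask=np.zeros(len(variances), dtype=bool))
	sigma2_i = sigma2_init
	sigma2_ip = sigma2_init
	for _ in range(size):
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip))
		sigma2_i += 1. / var[max_entr_idx]
		sigma2_ip += 1. / var_p[min_entr_idx]
		var.mask[max_entr_idx] = True
		var_p.mask[max_entr_idx] = True
	return var.mask


def _greedy_idxs_post(x, xerr, size):
	# Same interface as in the module, now delegating the shared loop.
	mask = _greedy_mask(np.asarray(xerr)**2, size)
	return np.arange(len(mask))[mask]


def _greedy_select(ds, size, varname="NO_DENS_std", scale=1.):
	# Same interface as in the module, scaled variances as before.
	mask = _greedy_mask((np.asarray(ds[varname]) * scale)**2, size,
			sigma2_init=scale**2)
	return ds.isel(time=np.arange(len(mask))[mask])

`load_solar_gm_table()` and `load_scia_dzm()` are flagged because similar blocks appear elsewhere in the project; the remedy is the same there: keep one copy in a shared module and import it from every call site. The module source, as analysed in this report, follows.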

# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data interface

Data loading and selection routines for SCIAMACHY data regression fits.
Includes reading the solar and geomagnetic index files used as proxies.
"""

import logging

import numpy as np
import pandas as pd
import xarray as xr

from astropy.time import Time

__all__ = ["load_solar_gm_table", "load_scia_dzm"]

_seasons = {
	"summerNH": [
		slice("2002-03-20", "2002-09-25"),
		slice("2003-03-20", "2003-10-13"),
		slice("2004-03-20", "2004-10-20"),
		slice("2005-03-20", "2005-10-20"),
		slice("2006-03-20", "2006-10-20"),
		slice("2007-03-21", "2007-10-20"),
		slice("2008-03-20", "2008-10-20"),
		slice("2009-03-20", "2009-10-20"),
		slice("2010-03-20", "2010-10-20"),
		slice("2011-03-20", "2011-08-31"),
		slice("2012-03-20", "2012-04-07"),
	],
	"summerSH": [
		slice("2002-09-13", "2003-03-26"),
		slice("2003-09-13", "2004-04-01"),
		slice("2004-09-14", "2005-04-02"),
		slice("2005-09-13", "2006-04-02"),
		slice("2006-09-13", "2007-04-02"),
		slice("2007-09-13", "2008-04-01"),
		slice("2008-09-14", "2009-04-02"),
		slice("2009-09-03", "2010-04-02"),
		slice("2010-09-13", "2011-04-02"),
		slice("2011-09-13", "2012-04-01"),
	]
}

_SPEs = [pd.date_range("2002-04-20", "2002-05-01"),
		pd.date_range("2002-05-21", "2002-06-02"),
		pd.date_range("2002-07-15", "2002-07-27"),
		pd.date_range("2002-08-23", "2002-09-03"),
		pd.date_range("2002-09-06", "2002-09-17"),
		pd.date_range("2002-11-08", "2002-11-20"),
		pd.date_range("2003-05-27", "2003-06-07"),
		pd.date_range("2003-10-25", "2003-11-15"),
		pd.date_range("2004-07-24", "2004-08-05"),
		pd.date_range("2004-11-06", "2004-11-18"),
		pd.date_range("2005-01-15", "2005-01-27"),
		pd.date_range("2005-05-13", "2005-05-25"),
		pd.date_range("2005-07-13", "2005-08-05"),
		pd.date_range("2005-08-21", "2005-09-02"),
		pd.date_range("2005-09-07", "2005-09-21"),
		pd.date_range("2006-12-05", "2006-12-23"),
		pd.date_range("2012-01-22", "2012-02-07"),
		pd.date_range("2012-03-06", "2012-03-27")]


def load_dailymeanAE(
		filename="data/indices/AE_Kyoto_1980-2016_daily2_shift12h.dat",
		name="AE", tfmt="jyear"):
	from pkg_resources import resource_filename
	AEfilename = resource_filename("sciapy", filename)
	return load_solar_gm_table(AEfilename, cols=[0, 1],
			names=["time", name], tfmt=tfmt)


def load_dailymeanLya(filename="data/indices/lisird_lya3_1980-2017.dat",
		name="Lya", tfmt="jyear"):
	from pkg_resources import resource_filename
	Lyafilename = resource_filename("sciapy", filename)
	return load_solar_gm_table(Lyafilename, cols=[0, 1],
			names=["time", name], tfmt=tfmt)
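# --- Illustrative usage (not part of the module source) ----------------------
# The two loaders above resolve the packaged index files and return the proxy
# times and values via load_solar_gm_table(), e.g.:
#
#     ae_t, ae_tab = load_dailymeanAE(tfmt="jyear")
#     lya_t, lya_tab = load_dailymeanLya(tfmt="jyear")
# ------------------------------------------------------------------------------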
def load_solar_gm_table(filename, cols, names, sep="\t", tfmt="jyear"):
	"""Load proxy tables from ascii files

	This function wraps :func:`pandas.read_table()` [#]_ with
	pre-defined settings to match the file format.
	It explicitly returns the times as UTC decimal years or julian epochs.

	.. [#] https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_table.html

	Parameters
	----------
	filename: str
		The file to read
	cols: array-like or callable
		The columns in the file to use, passed to `pandas.read_table`'s
		`usecols` keyword.
	names: array-like
		The column names, passed as `names` to `pandas.read_table()`.
	sep: str, optional
		The column separator character, passed as `sep`.
		Default: `\t`
	tfmt: str, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"

	Returns
	-------
	(times, table): tuple
		The proxy times according to the `tfmt` keyword (UTC),
		the proxy values as a :class:`pandas.DataFrame` returned by
		:func:`pandas.read_table()`.

	See Also
	--------
	pandas.read_table
	:class:`pandas.DataFrame`
	:class:`astropy.time.Time`
	"""
	tab = pd.read_table(filename, index_col=0, parse_dates=[0],
			comment="#",
			sep=sep, usecols=cols, names=names)
	times = Time(tab.tz_localize("UTC").index.to_pydatetime())
	ts = getattr(times, tfmt)
	return ts, tab
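# --- Illustrative usage (not part of the module source) ----------------------
# load_solar_gm_table() reads a two-column, tab-separated proxy file
# ("time<TAB>value"); the file name below is a made-up placeholder:
#
#     times, tab = load_solar_gm_table("proxy.dat", cols=[0, 1],
#             names=["time", "proxy"], tfmt="jyear")
#
# `times` holds the UTC times in the requested format (here Julian epoch
# years), `tab` is the pandas.DataFrame with the proxy values.
# ------------------------------------------------------------------------------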
def _greedy_select(ds, size, varname="NO_DENS_std", scale=1.):
	# Greedy entropy-based subsampling: repeatedly select the point that
	# maximizes log(1 + var * sigma2) until `size` points are chosen,
	# then return that subset of `ds` along "time".
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	var_p = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	sigma2_i = scale**2
	sigma2_ip = scale**2
	idxs = np.arange(len(var))
	for _ in range(size):
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip))
		sigma2_i += 1. / var[max_entr_idx]
		sigma2_ip += 1. / var_p[min_entr_idx]
		var.mask[max_entr_idx] = True
		var_p.mask[max_entr_idx] = True
	return ds.isel(time=idxs[var.mask])

def _greedy_idxs_post(x, xerr, size):
	# Same greedy selection as _greedy_select(), but operates on the
	# uncertainties `xerr` directly and returns the selected indices.
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array(xerr**2, mask=[False])
	var_p = np.ma.masked_array(xerr**2, mask=[False])
	sigma2_i = 1.
	sigma2_ip = 1.
	idxs = np.arange(len(var))
	for _ in range(size):
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		min_entr_idx = np.ma.argmin(np.log(1. + var_p * sigma2_ip))
		sigma2_i += 1. / var[max_entr_idx]
		sigma2_ip += 1. / var_p[min_entr_idx]
		var.mask[max_entr_idx] = True
		var_p.mask[max_entr_idx] = True
	return idxs[var.mask]
def load_scia_dzm(filename, alt, lat, tfmt="jyear",
		scale=1, subsample_factor=1, subsample_method="greedy",
		akd_threshold=0.002, cnt_threshold=0,
		center=False, season=None, SPEs=False):
	"""Load SCIAMACHY daily zonal mean data

	Interface function for SCIAMACHY daily zonal mean data files version 6.x.
	Uses :mod:`xarray` [#]_ to load and select the data. Possible selections are by
	hemispheric summer (NH summer ~ SH winter and SH summer ~ NH winter) and
	exclusion of strong solar proton events (SPE).

	.. [#] https://xarray.pydata.org

	Parameters
	----------
	filename: str
		The input filename
	alt: float
		The altitude
	lat: float
		The latitude
	tfmt: string, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See:
		http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"
	scale: float, optional
		Scale factor of the data (default: 1)
	subsample_factor: int, optional
		Factor to subsample the data by (see `subsample_method`)
		(default: 1 (no subsampling))
	subsample_method: "equal", "greedy", or "random", optional
		Method for subsampling the data (see `subsample_factor`).
		"equal" for equally spaced subsampling,
		"greedy" for selecting the data based on their uncertainty,
		and "random" for uniform random subsampling.
		(default: "greedy")
	akd_threshold: float, optional
		Exclude data with an averaging kernel diagonal element at or
		below this value (default: 0.002)
	cnt_threshold: int, optional
		Exclude data with at most this many measurement counts
		(default: 0)
	center: bool, optional
		Center the data by subtracting the global mean.
		(default: False)
	season: "summerNH", "summerSH", or `None`, optional
		Select the named season or `None` for all data (default: None)
	SPEs: bool, optional
		Set to `True` to exclude strong SPE events (default: False)

	Returns
	-------
	(times, dens, errs, szas): tuple of (N,) array_like
		The measurement times according to the `tfmt` keyword,
		the number densities, their uncertainties,
		and the mean solar zenith angles.
	"""
	logging.info("Opening dataset: '%s'", filename)
	NO_ds = xr.open_mfdataset(filename, decode_times=False,
				chunks={"time": 400, "latitude": 9, "altitude": 11})
	logging.info("done.")
	# Decode time coordinate for selection
	try:
		# xarray <= 0.9.6
		NO_ds["time"] = xr.conventions.decode_cf_variable(NO_ds.time)
	except TypeError:
		# xarray >= 0.10.0
		NO_ds["time"] = xr.conventions.decode_cf_variable("time", NO_ds.time)

	NO_mean = 0.
	if center:
		NO_mean = NO_ds.NO_DENS.mean().values
		logging.info("Centering with global mean: %s", NO_mean)
	NO_tds = NO_ds.sel(latitude=lat, altitude=alt)

	# Exclude SPEs first if requested
	if SPEs:
		logging.info("Removing SPEs.")
		for spe in _SPEs:
			NO_tds = NO_tds.drop(
					NO_tds.sel(time=slice(spe[0], spe[-1])).time.values,
					dim="time")

	# Filter by season
	if season in _seasons.keys():
		logging.info("Restricting to season: %s", season)
		NO_tds = xr.concat([NO_tds.sel(time=s) for s in _seasons[season]],
					dim="time")
		NO_tds.load()
	else:
		logging.info("No season selected or unknown season, "
					"using all available data.")

	try:
		NO_counts = NO_tds.NO_DENS_cnt
	except AttributeError:
		NO_counts = NO_tds.counts

	# Select only useful data
	NO_tds = NO_tds.where(
				np.isfinite(NO_tds.NO_DENS) &
				(NO_tds.NO_DENS_std != 0) &
				(NO_tds.NO_AKDIAG > akd_threshold) &
				(NO_counts > cnt_threshold) &
				(NO_tds.NO_MASK == 0),
				drop=True)

	no_dens = scale * NO_tds.NO_DENS
	if center:
		no_dens -= scale * NO_mean
	no_errs = scale * NO_tds.NO_DENS_std / np.sqrt(NO_counts)
	logging.debug("no_dens.shape (ntime,): %s", no_dens.shape)

	no_sza = NO_tds.mean_SZA

	# Convert to astropy.Time for Julian epoch or decimal year
	no_t = Time(pd.to_datetime(NO_tds.time.values, utc=True).to_pydatetime())
	no_ys = getattr(no_t, tfmt)

	if subsample_factor > 1:
		new_data_size = no_dens.shape[0] // subsample_factor
		if subsample_method == "random":
			# random subsampling
			_idxs = np.random.choice(no_dens.shape[0],
					new_data_size, replace=False)
		elif subsample_method == "equal":
			# equally spaced subsampling
			_idxs = slice(0, no_dens.shape[0], subsample_factor)
		else:
			# "greedy" subsampling (default fall-back)
			_idxs = _greedy_idxs_post(no_dens, no_errs, new_data_size)
		return (no_ys[_idxs],
				no_dens.values[_idxs],
				no_errs.values[_idxs],
				no_sza.values[_idxs])

	return no_ys, no_dens.values, no_errs.values, no_sza.values
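For reference, a minimal sketch of calling the main entry point; the file name, altitude, and latitude below are placeholders, not values taken from this report:

# Hypothetical call: select one altitude/latitude bin, restrict to NH summer,
# drop strong SPE periods, and greedily subsample by a factor of 2.
times, dens, errs, szas = load_scia_dzm(
		"scia_dzm_v6.nc", alt=70, lat=65,
		tfmt="jyear", season="summerNH", SPEs=True,
		subsample_factor=2, subsample_method="greedy")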