Conditions: 13
Total Lines: 104
Lines: 0
Ratio: 0 %
Small methods make your code easier to understand, particularly when combined with a good name. Besides, when a method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method; the comment is a good starting point for naming it.
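The listing below contains exactly such a comment ("strip out examples with missing features"). A minimal sketch of turning that comment into a method name via Extract Method (the extracted name is hypothetical, not from the source):

```python
# Before: a comment inside a long method explains what the next block does.
def prepare(content):
    # strip out examples with missing features
    return [line for line in content if '?' not in line]


# After: the comment has become the method name (a hypothetical name).
def strip_examples_with_missing_features(content):
    return [line for line in content if '?' not in line]
```

The behaviour is unchanged; the gain is that the call site now reads like the comment did.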
Commonly applied refactorings include Extract Method and, if many parameters or temporary variables are present, Replace Temp with Query or Introduce Parameter Object.
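A minimal sketch of these two refactorings, with hypothetical names chosen to echo convert_adult()'s parameters:

```python
import os
from collections import namedtuple

# Introduce Parameter Object: bundle parameters that always travel
# together into one object, shrinking long signatures.
ConversionPaths = namedtuple('ConversionPaths',
                             ['directory', 'output_directory',
                              'output_filename'])


# Replace Temp with Query: compute a value on demand instead of holding
# it in a temporary variable inside the long method.
def output_path(paths):
    return os.path.join(paths.output_directory, paths.output_filename)
```

Callers then pass one `ConversionPaths` instead of three loose strings, and the `output_path` temporary disappears from the method body.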
Complex classes and methods like convert_adult() often do a lot of different things. To break them down, we need to identify a cohesive component within them. A common approach is to look for fields or methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
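As a sketch of Extract Class guided by shared prefixes (all names here are hypothetical): two fields share the `output_` prefix, so they and the behaviour that uses them move to their own class.

```python
import os


# Before: one class owns two concerns; the shared 'output_' prefix on
# two of its fields hints at a hidden component.
class AdultConverter:
    def __init__(self, directory, output_directory, output_filename):
        self.directory = directory
        self.output_directory = output_directory
        self.output_filename = output_filename


# After (Extract Class): the 'output_' fields become their own class,
# together with the path-building behaviour that belongs to them.
class OutputTarget:
    def __init__(self, directory, filename):
        self.directory = directory
        self.filename = filename

    def path(self):
        return os.path.join(self.directory, self.filename)


class AdultConverterRefactored:
    def __init__(self, directory, target):
        self.directory = directory
        self.target = target
```

The prefix disappears because the component's name now carries that information.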
import os

import h5py
import numpy

from fuel.converters.base import fill_hdf5_file

# NOTE: the report elides lines 2-31 of the module (including the
# convert_to_one_hot helper used below); the imports above are
# reconstructed from what the method needs.


def convert_adult(directory, output_directory,
                  output_filename='adult.hdf5'):
    """
    Convert the Adult dataset to HDF5.

    Converts the Adult dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.Adult`. The converted dataset is saved as
    'adult.hdf5'.
    This method assumes the existence of the files `adult.data` and
    `adult.test`.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to `adult.hdf5`.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    train_path = os.path.join(directory, 'adult.data')
    test_path = os.path.join(directory, 'adult.test')
    output_path = os.path.join(output_directory, output_filename)

    train_content = open(train_path, 'r').readlines()
    test_content = open(test_path, 'r').readlines()
    train_content = train_content[:-1]
    test_content = test_content[1:-1]

    features_list = []
    targets_list = []
    for content in [train_content, test_content]:
        # strip out examples with missing features
        content = [line for line in content if line.find('?') == -1]
        # strip off endlines, separate entries
        content = list(map(lambda l: l[:-1].split(', '), content))

        features = list(map(lambda l: l[:-1], content))
        targets = list(map(lambda l: l[-1], content))
        del content
        y = list(map(lambda l: [l[0] == '>'], targets))
        y = numpy.array(y)
        del targets

        # Process features into a matrix
        variables = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'
        ]
        continuous = set([
            'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
            'hours-per-week'
        ])

        pieces = []
        for i, var in enumerate(variables):
            data = list(map(lambda l: l[i], features))
            if var in continuous:
                data = list(map(lambda l: float(l), data))
                data = numpy.array(data)
                data = data.reshape(data.shape[0], 1)
            else:
                unique_values = list(set(data))
                data = list(map(lambda l: unique_values.index(l), data))
                data = convert_to_one_hot(data)
            pieces.append(data)

        X = numpy.concatenate(pieces, axis=1)

        features_list.append(X)
        targets_list.append(y)

    # The largest value of the last variable in the test set is only 40,
    # so its one-hot representation has 40 columns in the second dimension,
    # while in the training set it has 41. Since it is the last variable,
    # it is safe to simply append a column of zeros.
    features_list[1] = numpy.concatenate(
        (features_list[1],
         numpy.zeros((features_list[1].shape[0], 1),
                     dtype=features_list[1].dtype)),
        axis=1)
    h5file = h5py.File(output_path, mode='w')
    data = (('train', 'features', features_list[0]),
            ('train', 'targets', targets_list[0]),
            ('test', 'features', features_list[1]),
            ('test', 'targets', targets_list[1]))

    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)
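One way to start shrinking convert_adult() is to extract its per-variable encoding loop into named helpers. A sketch under assumptions: the helper names are invented, `numpy.eye` stands in for the module's convert_to_one_hot helper (which the report does not show), and categories are sorted for a deterministic column order where the original uses an unordered `set`.

```python
import numpy


# Hypothetical helper extracted from the per-variable loop: continuous
# variables become a single float column, categorical ones a one-hot
# block (numpy.eye stands in for the module's convert_to_one_hot).
def encode_variable(values, is_continuous):
    if is_continuous:
        data = numpy.array([float(v) for v in values])
        return data.reshape(data.shape[0], 1)
    unique_values = sorted(set(values))
    indices = [unique_values.index(v) for v in values]
    return numpy.eye(len(unique_values))[indices]


# Hypothetical helper replacing the inner loop and concatenation.
def build_feature_matrix(features, variables, continuous):
    pieces = [encode_variable([row[i] for row in features],
                              var in continuous)
              for i, var in enumerate(variables)]
    return numpy.concatenate(pieces, axis=1)
```

With these in place, the body of the `for content in [...]` loop collapses to a few calls with intention-revealing names, and each helper is short enough to name and test on its own.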