| Conditions | 13 |
| Total Lines | 104 |
| Lines | 0 |
| Ratio | 0 % |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
Complex functions like convert_adult() often do a lot of different things. To break such a function down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | import os |
||
def convert_adult(directory, output_directory,
                  output_filename='adult.hdf5'):
    """Convert the Adult dataset to HDF5.

    Converts the Adult dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.Adult`. The converted dataset is saved as
    'adult.hdf5'. This method assumes the existence of the files
    `adult.data` and `adult.test` in `directory`.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to `adult.hdf5`.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    train_path = os.path.join(directory, 'adult.data')
    test_path = os.path.join(directory, 'adult.test')
    output_path = os.path.join(output_directory, output_filename)

    # Use context managers so the input file handles are closed even if
    # reading fails (the original left them to the garbage collector).
    with open(train_path, 'r') as f:
        train_content = f.readlines()
    with open(test_path, 'r') as f:
        test_content = f.readlines()
    # adult.data ends with a trailing blank line; adult.test additionally
    # begins with a header line, so drop those.
    train_content = train_content[:-1]
    test_content = test_content[1:-1]

    features_list = []
    targets_list = []
    for content in [train_content, test_content]:
        # Strip out examples with missing features (marked with '?').
        content = [line for line in content if line.find('?') == -1]
        # Strip off endlines, separate the comma-delimited entries.
        content = [line[:-1].split(', ') for line in content]

        features = [entry[:-1] for entry in content]
        targets = [entry[-1] for entry in content]
        del content
        # Binary target: income '>50K'. Only the first character is
        # compared because the test-set labels carry a trailing period.
        y = numpy.array([[target[0] == '>'] for target in targets])
        del targets

        # Process features into a matrix.
        variables = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'
        ]
        continuous = {
            'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
            'hours-per-week'
        }

        pieces = []
        for i, var in enumerate(variables):
            data = [entry[i] for entry in features]
            if var in continuous:
                data = numpy.array([float(value) for value in data])
                data = data.reshape(data.shape[0], 1)
            else:
                # Sort the category values so the one-hot column order is
                # deterministic; iterating a raw set depends on the hash
                # seed and made conversions irreproducible across runs.
                # NOTE(review): categories are still indexed per split, so
                # train/test one-hot columns only align when both splits
                # contain the same category sets -- confirm upstream.
                unique_values = sorted(set(data))
                data = [unique_values.index(value) for value in data]
                data = convert_to_one_hot(data)
            pieces.append(data)

        X = numpy.concatenate(pieces, axis=1)

        features_list.append(X)
        targets_list.append(y)

    # The largest value of the last variable (native-country) in the test
    # set is only 40, so its one-hot representation has 40 columns, while
    # in the training set it has 41. Since it is the last variable, it is
    # safe to simply pad the test features with a final column of zeros.
    features_list[1] = numpy.concatenate(
        (features_list[1],
         numpy.zeros((features_list[1].shape[0], 1),
                     dtype=features_list[1].dtype)),
        axis=1)
    # Context manager guarantees the HDF5 file is closed even on error.
    with h5py.File(output_path, mode='w') as h5file:
        data = (('train', 'features', features_list[0]),
                ('train', 'targets', targets_list[0]),
                ('test', 'features', features_list[1]),
                ('test', 'targets', targets_list[1]))

        fill_hdf5_file(h5file, data)
        h5file['features'].dims[0].label = 'batch'
        h5file['features'].dims[1].label = 'feature'
        h5file['targets'].dims[0].label = 'batch'
        h5file['targets'].dims[1].label = 'index'

        h5file.flush()

    return (output_path,)
||
| 136 | |||
| 140 |