| Conditions | 5 |
| Total Lines | 61 |
| Code Lines | 22 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
| 1 | ''' |
||
def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters:
    ----------

    data: 2D dataset to split.
        When ``target`` is a string, ``data`` must support ``data[target]`` and
        ``data.drop(target, axis=1)`` (e.g. a Pandas DataFrame).

    target: string, list, np.array or pd.Series
        Labels to split alongside ``data``. A string is taken as the name of the label column in
        ``data``; that column is removed from the features. A list, np.array or pd.Series is used
        as the labels directly (wrapped in a pd.Series).

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to
        include in the dev split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to
        include in the test split.

    stratify: array-like, default None
        If not None, the train/hold-out split is stratified using the input as the class labels,
        and the dev/test split is stratified on the hold-out labels. If None, neither split is
        stratified.

    random_state: integer, default 408
        Random_state is the seed used by the random number generator (same seed for both splits).

    Returns
    -------
    tuple: ``(X_train, X_dev, X_test, y_train, y_dev, y_test)``.
        If ``dev_size`` or ``test_size`` is 0, only one split is performed and the 4-tuple
        ``(X_train, X_dev_test, y_train, y_dev_test)`` is returned instead.
    '''

    # Validate Inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')

    target_data = []
    if isinstance(target, str):
        # Label given by column name: pull it out and keep only the features in ``data``.
        target_data = data[target]
        data = data.drop(target, axis=1)
    elif isinstance(target, (list, pd.Series, np.ndarray)):
        # Label given as values. The previous ``target = target.name`` was removed: it raised
        # AttributeError for list / np.ndarray inputs and its result was never used.
        target_data = pd.Series(target)

    # First split: train vs. the combined dev+test hold-out.
    holdout_size = dev_size + test_size
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(
        data,
        target_data,
        test_size=holdout_size,
        random_state=random_state,
        stratify=stratify,
    )

    # Only one hold-out set was requested, so the first split is already the final result.
    if dev_size == 0 or test_size == 0:
        return X_train, X_dev_test, y_train, y_dev_test

    # Second split: divide the hold-out into dev and test. Stratify only when the caller asked
    # for stratification; forcing it would fail for continuous or rare-class targets.
    holdout_stratify = y_dev_test if stratify is not None else None
    X_dev, X_test, y_dev, y_test = train_test_split(
        X_dev_test,
        y_dev_test,
        test_size=test_size / holdout_size,
        random_state=random_state,
        stratify=holdout_stratify,
    )
    return X_train, X_dev, X_test, y_train, y_dev, y_test
||
| 91 | |||
| 154 |