1
|
|
|
#! /usr/bin/env python |
|
|
|
|
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
1 |
|
import os |
7
|
1 |
|
import zipfile |
8
|
1 |
|
import logging |
9
|
|
|
|
10
|
1 |
|
LOGGER = logging.getLogger(__name__) |
11
|
|
|
|
12
|
1 |
|
import numpy as np |
|
|
|
|
13
|
|
|
|
14
|
1 |
|
from ... import io |
15
|
|
|
|
16
|
1 |
|
from .base import Converter, default_pipeline, contiguous_order |
17
|
1 |
|
from ...cross_validation import SimThresholdSplit |
18
|
|
|
|
19
|
1 |
|
class BursiAmesConverter(Converter):
    """Converter for the Bursi Ames mutagenicity dataset (``cas_4337.zip``).

    All work is performed by the constructor: the archive is unpacked, the
    SDF file is read, a binary ``is_mutagen`` label is derived, the
    structures are passed through the default pipeline, a train/valid/test
    split is computed and the result is handed to ``self.run``.
    """

    def __init__(self, directory, output_directory, output_filename='bursi_ames.h5'):
        """Build the converted dataset file from the raw archive.

        Args:
            directory: Directory containing ``cas_4337.zip``. The SDF member
                is extracted into this same directory.
            output_directory: Directory in which the output file is written.
            output_filename: Name of the output file inside
                ``output_directory``.
        """

        zip_path = os.path.join(directory, 'cas_4337.zip')
        output_path = os.path.join(output_directory, output_filename)

        # Extract next to the archive rather than into the current working
        # directory, so running the converter from elsewhere does not drop
        # the SDF file in an unrelated (possibly read-only) location.
        with zipfile.ZipFile(zip_path) as archive:
            sdf_path = archive.extract('cas_4337.sdf', path=directory)

        data = io.read_sdf(sdf_path)
        data.index.name = 'batch'

        # Binary target: 1 where the categorisation column reads 'mutagen',
        # 0 for every other value.
        data['is_mutagen'] = (
            data['Ames test categorisation'] == 'mutagen'
        ).astype(np.uint8)
        ms, y = data.structure, data.is_mutagen

        # NOTE(review): transform_filter presumably drops structures the
        # pipeline cannot process and keeps y aligned -- confirm in .base.
        pipeline = default_pipeline()
        ms, y = pipeline.transform_filter(ms, y)

        # NOTE(review): (70, 15, 15) is presumably the train/valid/test
        # ratio -- confirm against SimThresholdSplit.split.
        cv = SimThresholdSplit(min_threshold=0.6, n_jobs=-1).fit(ms)
        train, valid, test = cv.split((70, 15, 15))

        # NOTE(review): contiguous_order presumably reorders the data so
        # each split occupies a contiguous range -- confirm in .base.
        (ms, y, train, valid, test) = contiguous_order(
            (ms, y, train, valid, test), (train, valid, test)
        )

        splits = (('train', train), ('valid', valid), ('test', test))
        self.run(ms, y, output_path, splits=splits)
41
|
|
|
|
42
|
1 |
|
if __name__ == '__main__':
    # Script entry point: enable INFO-level logging so the progress message
    # below is shown, then run the conversion through the converter's
    # ``convert`` entry point with no explicit arguments.
    logging.basicConfig(level=logging.INFO)
    LOGGER.info('Converting Bursi Ames Dataset...')
    BursiAmesConverter.convert()
46
|
|
|
|
The coding style of this project requires that you add a docstring to this code element. Below you will find an example for methods:
If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.