|
1
|
|
|
#! /usr/bin/env python |
|
2
|
|
|
# |
|
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
|
4
|
|
|
# License: 3-clause BSD |
|
5
|
|
|
|
|
6
|
1 |
|
""" |
|
7
|
|
|
## skchem.data.transformers.tox21 |
|
8
|
|
|
|
|
9
|
|
|
Module defining transformation techniques for tox21. |
|
10
|
|
|
""" |
|
11
|
|
|
|
|
12
|
1 |
|
import zipfile |
|
13
|
1 |
|
import os |
|
14
|
1 |
|
import logging |
|
15
|
1 |
|
LOGGER = logging.getLogger(__name__) |
|
16
|
|
|
|
|
17
|
1 |
|
import numpy as np |
|
|
|
|
|
|
18
|
1 |
|
import pandas as pd |
|
|
|
|
|
|
19
|
|
|
|
|
20
|
1 |
|
from .base import Converter, default_pipeline |
|
21
|
1 |
|
from ... import io |
|
22
|
1 |
|
from ... import core |
|
23
|
|
|
|
|
24
|
1 |
|
class Tox21Converter(Converter):

    """ Class to build tox21 dataset.

    Reads the zipped SDF archives distributed for the Tox21 challenge,
    cleans and merges the train/valid/test folds into a single frame, and
    hands structures, assay targets and fold masks to the base ``Converter``.
    """

    def __init__(self, directory, output_directory, output_filename='tox21.h5'):
        """ Build the tox21 dataset.

        Args:
            directory (str):
                Directory holding ``train.sdf.zip``, ``valid.sdf.zip``,
                ``test.sdf.zip`` and the ``test.txt`` label table.
            output_directory (str):
                Directory in which to write the converted dataset.
            output_filename (str):
                Name of the output file (default ``'tox21.h5'``).
        """

        output_path = os.path.join(output_directory, output_filename)

        # extract data
        train, valid, test = self.extract(directory)

        # read data -- read_train must run first, as it records the assay
        # column names that read_valid/read_test reuse
        train = self.read_train(train)
        valid = self.read_valid(valid)
        test = self.read_test(test, os.path.join(directory, 'test.txt'))

        # combine into full dataset, keyed by fold name
        data = pd.concat([train, valid, test], keys=['train', 'valid', 'test']).sort_index()
        data.index.names = 'ds', 'id'

        # separate the structures from the assay targets
        ms, y = data.structure, data.drop('structure', axis=1)

        # standardize and filter the molecules with the project pipeline
        pipeline = default_pipeline()
        ms, y = pipeline.transform_filter(ms, y)

        # generate splits: move the fold level out of the index ...
        ms, y = ms.reset_index(0), y.reset_index(0)
        split_arr = ms.pop('ds')
        y.pop('ds')

        # ... and turn it into one boolean mask per fold
        splits = [(split, split_arr == split) for split in ('train', 'valid', 'test')]

        y.columns.name = 'tasks'

        # call the Converter to make the final dataset
        self.run(ms, y, output_path, splits=splits)

    @staticmethod
    def fix_id(s):
        """ Strip the batch suffix from a sample id,
        e.g. ``'NCGC00357062-01'`` -> ``'NCGC00357062'``. """
        return s.split('-')[0]

    @staticmethod
    def fix_assay_name(s):
        """ Make an assay name a valid identifier, e.g. ``'NR-AhR'`` -> ``'NR_AhR'``. """
        return s.replace('-', '_')

    @staticmethod
    def patch_test(test):
        """ Patch the test set with the compound NCGC00357062, which is
        missing from the distributed sdf, and return the patched frame. """
        test_1 = pd.Series({
            'structure': core.Mol.from_smiles('FC(F)(F)c1[nH]c(c(C#N)c1Br)C1=CC=C(Cl)C=C1', name='NCGC00357062'),
            'stochiometry': 0,  # [sic] key spelling kept -- matches the sdf-derived columns
            'Compound ID': 'NCGC00357062',
            'Sample ID': 'NCGC00357062-01'}, name='NCGC00357062')
        # BUG FIX: the original ``test['NCGC00357062'] = test_1`` created a
        # *column* named after the compound (all-NaN after index alignment,
        # and later dropped by the keep_cols selection), so the patch was a
        # no-op.  ``.loc`` performs setting-with-enlargement, appending the
        # record as a new *row* keyed by the compound id, as intended.
        test.loc['NCGC00357062'] = test_1
        return test

    def read_train(self, train):
        """ Read and clean the train fold from an extracted sdf file path. """

        train = io.read_sdf(train)
        train.columns = train.columns.to_series().apply(self.fix_assay_name)
        train.index = train.index.to_series().apply(self.fix_id)
        # the final 12 columns are the assay targets -- remember them (and the
        # column selection) so the valid/test readers stay consistent
        self.assays = train.columns[-12:]
        self.keep_cols = ['structure'] + self.assays.tolist()
        train[self.assays] = train[self.assays].astype(float)
        train = train[self.keep_cols]
        train = train.sort_index()
        # duplicate ids (one compound, several samples): keep one structure ...
        ms = train.structure[~train.index.duplicated()]
        # ... and merge their labels, a positive (1.0) winning over 0.0/NaN
        train = train[self.assays].groupby(train.index).max()
        train = ms.to_frame().join(train)
        return train

    def read_valid(self, valid):
        """ Read and clean the validation fold from an extracted sdf file path. """

        valid = io.read_sdf(valid)
        valid.columns = valid.columns.to_series().apply(self.fix_assay_name)
        valid = valid[self.keep_cols]
        valid[self.assays] = valid[self.assays].astype(float)
        return valid

    def read_test(self, test, test_data):
        """ Read the test fold, joining the sdf structures (*test*) with the
        tab-separated label table (*test_data*, the ``test.txt`` path). """

        test = io.read_sdf(test)
        test = self.patch_test(test)
        test_data = pd.read_table(test_data)
        test_data['Sample ID'] = test_data['Sample ID'].apply(self.fix_id)
        test = test.join(test_data.set_index('Sample ID'))

        test.columns = test.columns.to_series().apply(self.fix_assay_name)
        test = test[self.keep_cols]
        # the score table marks unmeasured results with 'x'
        test[test == 'x'] = np.nan
        test[self.assays] = test[self.assays].astype(float)
        return test

    def extract(self, directory):
        """ Extract the sdf files from the zip archives in *directory* and
        return the (train, valid, test) paths of the extracted files.

        NOTE(review): ``ZipFile.extract`` without a ``path`` argument writes
        into the current working directory, not *directory* -- the returned
        paths are used downstream so this works, but confirm it is intended.
        """

        with zipfile.ZipFile(os.path.join(directory, 'train.sdf.zip')) as f:
            train = f.extract('tox21_10k_data_all.sdf')

        with zipfile.ZipFile(os.path.join(directory, 'valid.sdf.zip')) as f:
            valid = f.extract('tox21_10k_challenge_test.sdf')

        with zipfile.ZipFile(os.path.join(directory, 'test.sdf.zip')) as f:
            test = f.extract('tox21_10k_challenge_score.sdf')

        return train, valid, test
|
129
|
|
|
|
|
130
|
1 |
|
if __name__ == '__main__':
    # Command-line entry point: configure root logging and run the conversion.
    logging.basicConfig(level=logging.INFO)
    LOGGER.info('Converting Tox21 Dataset...')
    # NOTE(review): ``convert`` is not defined in this module -- presumably a
    # classmethod on the ``Converter`` base class that parses CLI arguments
    # and instantiates this converter; confirm against ``.base``.
    Tox21Converter.convert()
|
134
|
|
|
|
This can be caused by one of the following:
1. Missing dependencies
This error can indicate a Pylint configuration issue. Make sure the libraries this module imports are installed in, and visible to, the environment in which Pylint runs.
2. Missing __init__.py files
This error can also result from missing `__init__.py` files in your module folders. Make sure you place one in each sub-folder of the package.