1
|
|
|
#! /usr/bin/env python |
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
1 |
|
""" |
7
|
|
|
## skchem.data.transformers.tox21 |
8
|
|
|
|
9
|
|
|
Module defining transformation techniques for tox21. |
10
|
|
|
""" |
11
|
|
|
|
12
|
1 |
|
import zipfile |
13
|
1 |
|
import os |
14
|
1 |
|
import logging |
15
|
1 |
|
LOGGER = logging.getLogger(__name__) |
16
|
|
|
|
17
|
1 |
|
import numpy as np |
|
|
|
|
18
|
1 |
|
import pandas as pd |
|
|
|
|
19
|
|
|
|
20
|
1 |
|
from .base import Converter, default_pipeline |
21
|
1 |
|
from ... import io |
22
|
1 |
|
from ... import core |
23
|
|
|
|
24
|
1 |
|
class Tox21Converter(Converter):

    """ Class to build tox21 dataset.

    Reads the raw Tox21 challenge downloads (train/valid/test SDF archives
    plus the test score table), normalizes ids and assay names, merges the
    three subsets into a single frame and hands it to the base `Converter`
    to write the final HDF5 dataset.
    """

    def __init__(self, directory, output_directory, output_filename='tox21.h5'):
        """ Build the tox21 dataset.

        Args:
            directory (str):
                Directory containing the raw downloads
                ('train.sdf.zip', 'valid.sdf.zip', 'test.sdf.zip',
                'test.txt').
            output_directory (str):
                Directory in which to write the converted dataset.
            output_filename (str):
                Name of the output file (default 'tox21.h5').
        """
        output_path = os.path.join(output_directory, output_filename)

        # extract data
        train, valid, test = self.extract(directory)

        # read data
        # NOTE: read_train must run first -- it sets self.assays and
        # self.keep_cols, which read_valid and read_test depend on.
        train = self.read_train(train)
        valid = self.read_valid(valid)
        test = self.read_test(test, os.path.join(directory, 'test.txt'))

        # combine into full dataset, keyed by subset name so the index
        # becomes a (subset, id) MultiIndex
        data = pd.concat([train, valid, test], keys=['train', 'valid', 'test']).sort_index()
        data.index.names = 'ds', 'id'

        # separate the molecules from the assay labels
        ms, y = data.structure, data.drop('structure', axis=1)

        # standardize/filter molecules and labels together so they stay aligned
        pipeline = default_pipeline()
        ms, y = pipeline.transform_filter(ms, y)

        # generate splits
        # move the subset level ('ds') out of the index into a column...
        ms, y = ms.reset_index(0), y.reset_index(0)
        split_arr = ms.pop('ds')
        y.pop('ds')

        # ...and turn it into one boolean membership mask per subset
        splits = [(split, split_arr == split) for split in ('train', 'valid', 'test')]

        y.columns.name = 'tasks'

        # call the Converter to make the final dataset
        self.run(ms, y, output_path, splits=splits)

    @staticmethod
    def fix_id(s):
        """ Normalize a sample id by dropping the batch suffix
        (e.g. 'NCGC00357062-01' -> 'NCGC00357062'). """
        return s.split('-')[0]

    @staticmethod
    def fix_assay_name(s):
        """ Make an assay name usable as a column identifier ('-' -> '_'). """
        return s.replace('-', '_')

    @staticmethod
    def patch_test(test):
        """ Patch the test set with the NCGC00357062 record, which is
        missing/broken in the raw download.

        NOTE(review): `test['NCGC00357062'] = test_1` adds a *column* if
        `test` is a DataFrame -- a row append may have been intended;
        confirm against what io.read_sdf returns.
        """
        test_1 = pd.Series({
            'structure': core.Mol.from_smiles('FC(F)(F)c1[nH]c(c(C#N)c1Br)C1=CC=C(Cl)C=C1', name='NCGC00357062'),
            'stochiometry': 0,
            'Compound ID': 'NCGC00357062',
            'Sample ID': 'NCGC00357062-01'}, name='NCGC00357062')
        test['NCGC00357062'] = test_1
        return test

    def read_train(self, train):
        """ Read the training subset from its extracted SDF path.

        Normalizes assay names and ids, then collapses duplicate compound
        ids by taking the max (worst-case) outcome per assay.

        Side effects: sets `self.assays` (the last 12 columns, i.e. the
        assay outcomes) and `self.keep_cols` ('structure' + assays), which
        `read_valid` and `read_test` reuse.
        """
        train = io.read_sdf(train)
        train.columns = train.columns.to_series().apply(self.fix_assay_name)
        train.index = train.index.to_series().apply(self.fix_id)
        # the final 12 columns of the raw SDF hold the assay outcomes
        self.assays = train.columns[-12:]
        self.keep_cols = ['structure'] + self.assays.tolist()
        train[self.assays] = train[self.assays].astype(float)
        train = train[self.keep_cols]
        train = train.sort_index()
        # keep one structure per id; aggregate duplicate ids' labels via max
        ms = train.structure[~train.index.duplicated()]
        train = train[self.assays].groupby(train.index).max()
        train = ms.to_frame().join(train)
        return train

    def read_valid(self, valid):
        """ Read the validation subset from its extracted SDF path.

        Relies on `self.assays`/`self.keep_cols` set by `read_train`.
        """
        valid = io.read_sdf(valid)
        valid.columns = valid.columns.to_series().apply(self.fix_assay_name)
        valid = valid[self.keep_cols]
        valid[self.assays] = valid[self.assays].astype(float)
        return valid

    def read_test(self, test, test_data):
        """ Read the test subset.

        Joins the structures from the extracted SDF (`test`) with the assay
        outcomes from the tab-separated score file (`test_data`), keyed on
        the normalized 'Sample ID'. Relies on `self.assays`/`self.keep_cols`
        set by `read_train`.
        """
        test = io.read_sdf(test)
        test = self.patch_test(test)
        test_data = pd.read_table(test_data)
        test_data['Sample ID'] = test_data['Sample ID'].apply(self.fix_id)
        test = test.join(test_data.set_index('Sample ID'))

        test.columns = test.columns.to_series().apply(self.fix_assay_name)
        test = test[self.keep_cols]
        # the score file marks unavailable outcomes with 'x'
        test[test == 'x'] = np.nan
        test[self.assays] = test[self.assays].astype(float)
        return test

    def extract(self, directory):
        """ Extract the three SDF archives (into the current working
        directory) and return the extracted file paths as
        (train, valid, test). """
        with zipfile.ZipFile(os.path.join(directory, 'train.sdf.zip')) as f:
            train = f.extract('tox21_10k_data_all.sdf')

        with zipfile.ZipFile(os.path.join(directory, 'valid.sdf.zip')) as f:
            valid = f.extract('tox21_10k_challenge_test.sdf')

        with zipfile.ZipFile(os.path.join(directory, 'test.sdf.zip')) as f:
            test = f.extract('tox21_10k_challenge_score.sdf')

        return train, valid, test
129
|
|
|
|
130
|
1 |
|
if __name__ == '__main__':
    # Script entry point: configure logging before emitting the first record,
    # then run the conversion.
    logging.basicConfig(level=logging.INFO)
    LOGGER.info('Converting Tox21 Dataset...')
    # `convert` is not defined in this module -- presumably a classmethod on
    # the Converter base that parses CLI arguments; confirm in .base.
    Tox21Converter.convert()
134
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing `__init__.py` files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one such file in each sub-folder.