1
|
|
|
#! /usr/bin/env python |
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
1 |
|
""" |
7
|
|
|
# skchem.data.converters.chembl |
8
|
|
|
|
9
|
|
|
Dataset constructor for ChEMBL |
10
|
|
|
""" |
11
|
1 |
|
import logging |
12
|
1 |
|
import pandas as pd |
|
|
|
|
13
|
1 |
|
import os |
14
|
|
|
|
15
|
1 |
|
from .base import Converter, default_pipeline, contiguous_order, Feature |
16
|
1 |
|
from ...cross_validation import SimThresholdSplit |
17
|
1 |
|
from ... import features |
18
|
|
|
|
19
|
1 |
|
LOGGER = logging.getLogger(__name__) |
20
|
|
|
|
21
|
|
|
|
22
|
1 |
|
class ChEMBLConverter(Converter):
    """Converter that turns the raw ChEMBL HDF5 file into a dataset file.

    Constructing an instance runs the whole conversion: the raw file is
    read, passed through the default pipeline, split into train / valid /
    test sets, and handed to ``self.run`` together with the featurizers.
    """

    def __init__(self, directory, output_directory,
                 output_filename='chembl.h5'):
        """Run the ChEMBL conversion.

        Args:
            directory: Directory that contains the ``chembl_raw.h5`` input.
            output_directory: Directory the output file is placed in.
            output_filename: File name of the output, joined onto
                ``output_directory``.
        """
        destination = os.path.join(output_directory, output_filename)
        raw_path = os.path.join(directory, 'chembl_raw.h5')

        structures, targets = self.parse_infile(raw_path)

        # The pipeline both transforms and filters, so structures and
        # targets are passed (and returned) together.
        standardizer = default_pipeline()
        structures, targets = standardizer.transform_filter(structures,
                                                            targets)

        splitter = SimThresholdSplit(min_threshold=0.6, n_jobs=-1)
        splitter = splitter.fit(structures)
        train, valid, test = splitter.split((70, 15, 15))

        # NOTE(review): presumably reorders the data so that each split
        # occupies a contiguous range -- confirm in ``.base``.
        structures, targets, train, valid, test = contiguous_order(
            (structures, targets, train, valid, test),
            (train, valid, test))

        split_names = ('train', 'valid', 'test')
        splits = tuple(zip(split_names, (train, valid, test)))

        morgan = Feature(fper=features.MorganFeaturizer(),
                         key='X_morg',
                         axis_names=['batch', 'features'])
        physchem = Feature(fper=features.PhysicochemicalFeaturizer(),
                           key='X_pc',
                           axis_names=['batch', 'features'])
        atoms = Feature(fper=features.AtomFeaturizer(max_atoms=100),
                        key='A',
                        axis_names=['batch', 'atom_idx', 'features'])
        graph_dist = Feature(
            fper=features.GraphDistanceTransformer(max_atoms=100),
            key='G',
            axis_names=['batch', 'atom_idx', 'atom_idx'])
        # NOTE(review): unlike the other features, no ``axis_names`` is
        # given here -- confirm that ``Feature`` tolerates the omission.
        spatial_dist = Feature(
            fper=features.SpacialDistanceTransformer(max_atoms=100),
            key='G_d')

        feats = (morgan, physchem, atoms, graph_dist, spatial_dist)

        self.run(structures, targets, destination,
                 features=feats, splits=splits)

    def parse_infile(self, filename):
        """Read the structures and targets out of the raw HDF5 file.

        Args:
            filename: Path of the HDF5 file to read.

        Returns:
            A 2-tuple of the objects stored under the ``'structure'`` and
            ``'targets/Y'`` keys, in that order.
        """
        hdf_keys = ('structure', 'targets/Y')
        return tuple(pd.read_hdf(filename, key) for key in hdf_keys)
66
|
|
|
|
67
|
1 |
|
if __name__ == '__main__':
    # Script entry point: enable verbose logging before anything else so
    # that the conversion's own log output is visible.
    logging.basicConfig(level=logging.DEBUG)

    LOGGER.info('Converting ChEMBL...')

    # ``convert`` is not defined in this module; presumably inherited from
    # ``Converter`` -- see ``.base`` for how it instantiates the class.
    ChEMBLConverter.convert()
71
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one such file in each sub-folder.