1
|
|
|
#! /usr/bin/env python |
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
1 |
|
""" |
7
|
|
|
# skchem.data.converters.chembl |
8
|
|
|
|
9
|
|
|
Dataset constructor for ChEMBL |
10
|
|
|
""" |
11
|
|
|
|
12
|
1 |
|
import pandas as pd |
|
|
|
|
13
|
1 |
|
import os |
14
|
|
|
|
15
|
1 |
|
from .base import Converter, default_pipeline, contiguous_order, Feature |
16
|
1 |
|
from ...cross_validation import SimThresholdSplit |
17
|
1 |
|
from ... import descriptors |
18
|
|
|
|
19
|
|
|
|
20
|
1 |
|
class ChEMBLConverter(Converter):

    """ Converter that builds the ChEMBL dataset file from a raw HDF5 dump.

    Construction does all of the work: the raw file is parsed, passed through
    the default pipeline, split into train/valid/test partitions and handed
    to ``self.run``.
    """

    def __init__(self, directory, output_directory, output_filename='chembl.h5'):
        """ Read ``chembl_raw.h5`` from `directory` and convert it.

        Args:
            directory (str):
                Directory containing the raw input file ``chembl_raw.h5``.
            output_directory (str):
                Directory component of the path handed to ``self.run``.
            output_filename (str):
                File name component of the path handed to ``self.run``.
        """

        raw_path = os.path.join(directory, 'chembl_raw.h5')
        destination = os.path.join(output_directory, output_filename)

        # Load the raw data, then let the default pipeline transform and
        # filter the structures together with their targets.
        ms, y = self.parse_infile(raw_path)
        ms, y = default_pipeline().transform_filter(ms, y)

        # Similarity-threshold based split; (70, 15, 15) is presumably the
        # train/valid/test proportion -- TODO confirm against
        # SimThresholdSplit.split.
        splitter = SimThresholdSplit(min_threshold=0.6, n_jobs=-1)
        train, valid, test = splitter.fit(ms).split((70, 15, 15))

        # Reorder the data and the split masks according to the splits.
        reordered = contiguous_order((ms, y, train, valid, test),
                                     (train, valid, test))
        ms, y, train, valid, test = reordered

        splits = tuple(zip(('train', 'valid', 'test'), (train, valid, test)))

        # NOTE(review): `features` is constructed but never passed on to
        # `self.run` below, so these featurizers currently have no effect on
        # the output -- confirm whether `run` should receive them.
        # NOTE(review): the last entry ('G_d') omits `axis_names`, unlike the
        # other four -- confirm that `Feature` allows this.
        features = (
            Feature(
                fper=descriptors.MorganFeaturizer(),
                key='X_morg',
                axis_names=['batch', 'features'],
            ),
            Feature(
                fper=descriptors.PhysicochemicalFeaturizer(),
                key='X_pc',
                axis_names=['batch', 'features'],
            ),
            Feature(
                fper=descriptors.AtomFeaturizer(max_atoms=100),
                key='A',
                axis_names=['batch', 'atom_idx', 'features'],
            ),
            Feature(
                fper=descriptors.GraphDistanceTransformer(max_atoms=100),
                key='G',
                axis_names=['batch', 'atom_idx', 'atom_idx'],
            ),
            Feature(
                fper=descriptors.SpacialDistanceTransformer(max_atoms=100),
                key='G_d',
            ),
        )

        self.run(ms, y, destination, splits=splits)

    def parse_infile(self, filename):
        """ Load the structures and targets stored in an HDF5 file.

        Args:
            filename (str):
                Path of the HDF5 file to read.

        Returns:
            tuple:
                The objects stored under the ``'structure'`` and
                ``'targets/Y'`` keys, in that order.
        """

        return (pd.read_hdf(filename, 'structure'),
                pd.read_hdf(filename, 'targets/Y'))
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a problem with Pylint's configuration. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one such file in each sub-folder.