1
|
|
|
#! /usr/bin/env python |
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
1 |
|
""" |
7
|
|
|
# skchem.data.converters.example |
8
|
|
|
|
9
|
|
|
Formatter for the example dataset. |
10
|
|
|
""" |
11
|
|
|
|
12
|
1 |
|
import os |
13
|
|
|
|
14
|
1 |
|
import pandas as pd |
|
|
|
|
15
|
1 |
|
import numpy as np |
|
|
|
|
16
|
|
|
|
17
|
1 |
|
from .base import Converter, contiguous_order, Feature |
18
|
1 |
|
from ...pipeline import Pipeline |
19
|
1 |
|
from ...io import read_sdf |
20
|
1 |
|
from ...cross_validation import SimThresholdSplit |
21
|
1 |
|
from ...features import MorganFeaturizer |
22
|
1 |
|
from ...standardizers import ChemAxonStandardizer |
23
|
|
|
|
24
|
1 |
|
class DiversityConverter(Converter):
    """Example converter built on the NCI DTP Diversity Set III.

    Instantiating the class runs the whole conversion: structures are read
    from ``structures.sdf``, paired with synthetic binary targets, split into
    train/valid/test subsets and handed to ``self.run``.
    """

    def __init__(self, directory, output_directory, output_filename='diversity.h5'):
        """Run the conversion for the dataset stored in *directory*.

        Args:
            directory: folder that contains ``structures.sdf``.
            output_directory: folder component of the output path.
            output_filename: file-name component of the output path.
        """
        output_path = os.path.join(output_directory, output_filename)

        sdf_path = os.path.join(directory, 'structures.sdf')
        molecules = self.parse_file(sdf_path)
        targets = self.synthetic_targets(molecules.index)

        # NOTE(review): this pipeline is constructed but never applied to the
        # molecules below. It is kept so construction-time behaviour is
        # unchanged -- confirm whether it was meant to transform the
        # structures before splitting.
        pipeline = Pipeline([ChemAxonStandardizer(keep_failed=True)])

        splitter = SimThresholdSplit(min_threshold=0.6, n_jobs=-1).fit(molecules)
        train, valid, test = splitter.split((70, 15, 15))

        # Reorder the data and the split indicators together, using the three
        # subsets as the ordering reference.
        subsets = (train, valid, test)
        molecules, targets, train, valid, test = contiguous_order(
            (molecules, targets) + subsets,
            subsets,
        )
        splits = tuple(zip(('train', 'valid', 'test'), (train, valid, test)))

        morgan = Feature(
            fper=MorganFeaturizer(),
            key='X_morg',
            axis_names=['batch', 'features'],
        )
        features = [morgan]

        self.run(molecules, targets, output_path, splits=splits, features=features)

    def parse_file(self, path):
        """Read the SDF file at *path* and return its ``structure`` attribute."""
        records = read_sdf(path)
        return records.structure

    def synthetic_targets(self, index):
        """Return a random 0/1 label for every entry of *index*.

        The labels are drawn with ``np.random.choice`` (NumPy's global random
        state) and returned as a ``pandas.Series`` indexed by *index*.
        """
        labels = np.random.choice([0, 1], size=len(index))
        return pd.Series(labels, index=index)
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a Pylint configuration issue. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one such file in each sub-folder.