#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <[email protected]>
# License: 3-clause BSD

"""
## skchem.cross_validation.similarity_threshold

Similarity threshold dataset partitioning functionality.
"""

import numpy as np
import pandas as pd

from scipy.spatial.distance import pdist, squareform, cdist
from scipy.sparse import dok_matrix, triu

from .. import descriptors


class SimThresholdSplit(object):

    def __init__(self, inp, threshold=0.5, fper='morgan',
                 similarity_metric='jaccard', memory_optimized=False,
                 fingerprints=False, similarity_matrix=None):
        """ Threshold similarity split for chemical datasets.

        This class implements a splitting technique that will pool compounds
        with similarity above a threshold into the same splits.

        Machine learning techniques should be able to extrapolate outside of
        a molecular series, or scaffold; however, random splits will result
        in some 'easy' test set compounds that are either *identical* to, in
        the same molecular series as, or share a significant scaffold with
        training set compounds.

        This splitting technique reduces or eliminates (depending on the
        threshold set) this effect, making the problem harder.

        Args:
            inp (pd.Series or pd.DataFrame):
                Either:
                - a series of skchem.Mols
                - a dataframe of precalculated fingerprints

            threshold (float):
                The similarity threshold, above which compounds will all be
                assigned to the same split.

            fper (str or skchem.Fingerprinter):
                The fingerprinting technique to use to generate the
                similarity matrix.

            similarity_metric (str):
                The scipy distance metric whose complement (1 - distance)
                is used as the similarity.

            memory_optimized (bool):
                Whether to use the slower but memory efficient
                implementation of the similarity matrix calculation.

            fingerprints (bool):
                Whether precalculated fingerprints were passed directly.

            similarity_matrix (scipy.sparse.dok_matrix):
                A precalculated similarity matrix.

        Notes:
            The splits will not always be exactly the size requested, due to
            the similarity constraint and the requirement to maintain random
            shuffling.
        """

        if isinstance(fper, str):
            fper = descriptors.get(fper)

        self.fper = fper
        fps = inp if fingerprints else self.fper.transform(inp)

        self.n_instances = len(inp)

        self.threshold = threshold
        self.similarity_metric = similarity_metric
        self.memory_optimized = memory_optimized

        # the truth value of a sparse matrix is ambiguous, so test for None
        if similarity_matrix is None:
            similarity_matrix = self.similarity_matrix(fps)

        self.clusters = pd.Series(self._cluster(similarity_matrix),
                                  index=fps.index,
                                  name='clusters')

    def _cluster_cumsum(self, shuffled=True):
        """ Cumulative sizes of the clusters, in shuffled order. """

        nums = self.clusters.value_counts()
        if shuffled:
            # .ix is deprecated: use label based .loc for the permutation
            nums = nums.loc[np.random.permutation(nums.index)].cumsum()
        return nums

    def split(self, ratio):

        """ Return splits of the data with thresholded similarity according
        to a specified ratio.

        Args:
            ratio (tuple[int]):
                The ratio to use.

        Returns:
            generator[pd.Series]:
                Generator of boolean split masks for the requested splits.

        Example:
            st = SimThresholdSplit(ms, fper='morgan', similarity_metric='jaccard')
            train, valid, test = st.split(ratio=(70, 15, 15))
        """

        ratio = self.split_sizes(ratio)
        nums = self._cluster_cumsum()
        res = pd.Series(np.nan, index=nums.index, name='split')

        for i, _ in enumerate(ratio):
            lower = 0 if i == 0 else sum(ratio[:i])
            # the last boundary is the full dataset size, guarding against
            # floating point error in the cumulative split sizes
            upper = self.n_instances if i == len(ratio) - 1 else sum(ratio[:i + 1])
            res[nums[(nums > lower) & (nums <= upper)].index] = i

        res = res.sort_index()
        res = self.clusters.to_frame().join(res, on='clusters')['split']
        return (res == i for i, _ in enumerate(ratio))

    def k_fold(self, n_folds):

        """ Returns k-fold cross-validated folds with thresholded similarity.

        Args:
            n_folds (int):
                The number of folds to provide.

        Returns:
            generator[(pd.Series, pd.Series)]:
                The (train, test) boolean masks for each fold.
        """

        folds = self.split((1,) * n_folds)
        return ((~fold, fold) for fold in folds)

    def split_sizes(self, ratio):
        """ Calculate the sizes of the splits. """

        tot = sum(ratio)
        return [self.n_instances * rat / tot for rat in ratio]

    def similarity_matrix(self, fps):
        """ Calculate the similarity matrix for fingerprints. """

        if self.memory_optimized:
            return self._sim_low_mem(fps)
        else:
            return self._sim(fps)

    def _sim(self, fps):
        """ Fast but memory intensive implementation of similarity matrix
        calculation. """

        D = squareform(pdist(fps, self.similarity_metric))
        D = 1 - D  # similarity is 1 - distance
        return triu(D >= self.threshold, k=1).todok()

    def _sim_low_mem(self, fps):
        """ Slow but memory efficient implementation of similarity matrix
        calculation. """

        fps = np.asarray(fps)  # iterate over rows, not dataframe columns
        S = dok_matrix((len(fps), len(fps)), dtype=bool)
        for i, fp in enumerate(fps):
            # compare each instance against subsequent instances only
            D = cdist(fp[np.newaxis, :], fps[i + 1:], self.similarity_metric)
            D = 1 - D  # similarity is 1 - distance
            S[i, i + 1:] = dok_matrix(D >= self.threshold)
        return S

    def _cluster(self, S):

        """ Assign instances to clusters. """

        pairs = sorted(S.keys(), key=lambda x: x[0])  # sort pairs by first index
        clustered = np.arange(self.n_instances)

        # propagate the first index's cluster label along each similar pair
        for i, j in pairs:
            clustered[j] = clustered[i]

        return clustered
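

if __name__ == '__main__':

    # Minimal usage sketch, not part of the original module: the random
    # binary 'fingerprints' below are a hypothetical stand-in for real
    # skchem fingerprints, passed directly so no fingerprinter is needed.
    # Because of the relative import above, run this as a module, e.g.
    # ``python -m skchem.cross_validation.similarity_threshold``.
    fps = pd.DataFrame(np.random.randint(0, 2, size=(100, 64)))

    st = SimThresholdSplit(fps, threshold=0.8, fper=None,
                           similarity_metric='jaccard', fingerprints=True)

    # three boolean masks, pooling similar compounds into the same split
    train, valid, test = st.split(ratio=(70, 15, 15))
    print(train.sum(), valid.sum(), test.sum())

    # five (train, test) folds for cross-validation
    for train_mask, test_mask in st.k_fold(5):
        print(train_mask.sum(), test_mask.sum())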