#! /usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <[email protected]>
# License: 3-clause BSD

"""
## skchem.cross_validation.similarity_threshold

Similarity threshold dataset partitioning functionality.
"""

import logging
import multiprocessing
from functools import partial, wraps

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.spatial.distance import pdist, squareform
from scipy.sparse import triu
from scipy.optimize import minimize_scalar

from .. import descriptors

LOGGER = logging.getLogger(__name__)


def returns_pairs(func):
    """ Wraps a function that returns a ((i, j), sim) list to return a dataframe. """
    @wraps(func)
    def inner(*args, **kwargs):
        pairs = func(*args, **kwargs)
        return pd.DataFrame([(p[0][0], p[0][1], p[1]) for p in pairs],
                            columns=['i', 'j', 'sim']).sort_values('sim')
    return inner
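
# For illustration: a decorated function returning [((0, 1), 0.9), ((2, 3), 0.4)]
# yields a DataFrame with columns ['i', 'j', 'sim'], sorted ascending by 'sim'.

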
def _above_minimum(args, X, metric, threshold, size):
    """ Find the pairs with similarity above a minimum for one block of the similarity matrix. """
    # local imports, so the function can be shipped to worker processes
    from scipy.spatial.distance import cdist
    from scipy.sparse import dok_matrix
    import numpy as np

    i, j = slice(*args[0]), slice(*args[1])
    x_i, x_j = X[i], X[j]
    C = 1 - cdist(x_i, x_j, metric=metric)  # similarity is 1 - distance
    if i == j:
        # diagonal block: keep the strict upper triangle to avoid duplicate pairs
        C = np.triu(C, k=1)
    C[C <= threshold] = 0
    M = dok_matrix((size, size), dtype=float)
    M[i, j] = C
    return list(M.items())
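
# For illustration: _above_minimum(((0, 500), (500, 1000)), X, 'jaccard', 0.45, len(X))
# computes the block of similarities between instances 0-499 and 500-999 of X,
# and returns the entries above the threshold as [((i, j), sim), ...] pairs in
# the coordinates of the full matrix.

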
class SimThresholdSplit(object):

    def __init__(self, inp, pairs=None, min_threshold=0.45,
                 largest_cluster_fraction=0.1, fper='morgan',
                 similarity_metric='jaccard', memory_optimized=True,
                 n_jobs=1, block_width=1000, verbose=False):

        """ Threshold similarity split for chemical datasets.

        This class implements a splitting technique that will pool compounds
        with similarity above a threshold into the same splits.  The threshold
        value is decided by specifying the maximum fraction of the dataset
        that may be pooled into a single cluster, since the density of
        compounds varies between datasets.

        Machine learning techniques should be able to extrapolate outside of
        a molecular series, or scaffold; however, random splits will produce
        'easy' test sets, whose compounds are either *identical* to, in the
        same molecular series as, or share a significant scaffold with,
        training set compounds.

        This splitting technique reduces or eliminates (depending on the
        threshold set) this effect, making the problem harder.

        Args:
            inp (pd.Series or pd.DataFrame or np.array):
                - `pd.Series` of `skchem.Mol` instances
                - `pd.DataFrame` with `skchem.Mol` instances in a `structure` column
                - `pd.DataFrame` of fingerprints if `fper` is `None`
                - `pd.DataFrame` of similarity matrix if `similarity_metric` is `None`
                - `np.array` of similarity matrix if `similarity_metric` is `None`

            pairs (list<tuple<tuple(i, j), k>>):
                An optional precalculated list of pairwise distances.

            min_threshold (float):
                The minimum similarity threshold.  Lower will be slower.

            largest_cluster_fraction (float):
                The fraction of the total dataset that the largest cluster
                may be.  This decides the final similarity threshold.

            fper (str or skchem.Fingerprinter):
                The fingerprinting technique to use to generate the
                similarity matrix.

            similarity_metric (str or callable):
                The similarity metric to use.

            memory_optimized (bool):
                Whether to use the memory optimized implementation.

            n_jobs (int):
                If memory_optimized is True, how many processes to run it
                over.

            block_width (int):
                If memory_optimized is True, the width of the submatrices
                that are calculated at a time.

        Notes:
            The splits will not always be exactly the size requested, due to
            the clustering constraint and the requirement to maintain random
            shuffling.
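
        Example:
            A minimal sketch, constructing the split directly from a
            precomputed binary fingerprint frame `fps` (so no fingerprinter
            is needed); note that `block_width` may not exceed the number
            of instances:

                st = SimThresholdSplit(fps, fper=None,
                                       similarity_metric='jaccard',
                                       block_width=len(fps))
                train, valid, test = st.split(ratio=(70, 15, 15))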
        """

        if isinstance(fper, str):
            fper = descriptors.get(fper)

        self.n_instances_ = len(inp)
        self.fper = fper
        self.similarity_metric = similarity_metric
        self.memory_optimized = memory_optimized
        self.n_jobs = n_jobs
        self.block_width = block_width
        self.min_threshold = min_threshold
        self.largest_cluster = largest_cluster_fraction
        self.pairs_ = pairs

        if self.fper:
            self.fper.verbose = verbose

        if isinstance(inp, (pd.Series, pd.DataFrame)):
            self.index = inp.index
        else:
            self.index = pd.RangeIndex(len(inp), name='batch')

        if similarity_metric is None:
            # we were passed a similarity matrix directly
            self.pairs_ = self._pairs_from_sim_mat(inp)

        elif fper is None:
            # we were passed fingerprints directly
            self.fps = inp
            if pairs is None:
                self.pairs_ = self._pairs_from_fps(inp)

        else:
            # we were passed Mols
            if pairs is None:
                self.fps = self.fper.transform(inp)
                self.pairs_ = self._pairs_from_fps(self.fps)

        self._optimal_thresh()

    @property
    def n_jobs(self):
        """ The number of processes to use to calculate the distance matrix.  -1 for all available. """
        return self._n_jobs

    @n_jobs.setter
    def n_jobs(self, val):
        if val == -1:
            self._n_jobs = multiprocessing.cpu_count()
        else:
            self._n_jobs = val

    @property
    def block_width(self):
        """ The width of the blocks of instances used to compute the similarity matrix.
        Only used by the memory optimized implementation. """
        return self._block_width

    @block_width.setter
    def block_width(self, val):
        assert val <= self.n_instances_, \
            'The block width should be less than or equal to the number of instances'
        self._block_width = val

    def _cluster_cumsum(self, shuffled=True):
        nums = self.clusters.value_counts()
        if shuffled:
            # randomize the cluster order before accumulating
            nums = nums.loc[np.random.permutation(nums.index)].cumsum()
        return nums

    def split(self, ratio):
        """ Return splits of the data with thresholded similarity according to a
        specified ratio.

        Args:
            ratio (tuple[int]):
                the ratio to use.

        Returns:
            generator[pd.Series]:
                Generator of boolean split masks for the requested splits.

        Example:
            st = SimThresholdSplit(ms, fper='morgan', similarity_metric='jaccard')
            train, valid, test = st.split(ratio=(70, 15, 15))
        """
        ratio = self._split_sizes(ratio)
        nums = self._cluster_cumsum()
        res = pd.Series(np.nan, index=nums.index, name='split')

        for i in range(len(ratio)):
            lower = 0 if i == 0 else sum(ratio[:i])
            # the last split takes all remaining clusters, which also guards
            # against floating point error in the cumulative sums
            upper = np.inf if i == len(ratio) - 1 else sum(ratio[:i + 1])
            res[nums[(nums > lower) & (nums <= upper)].index] = i

        res = res.sort_index()
        res = self.clusters.to_frame().join(res, on='clusters')['split']
        return (res == i for i in range(len(ratio)))

    def k_fold(self, n_folds):
        """ Returns k-fold cross-validated folds with thresholded similarity.

        Args:
            n_folds (int):
                The number of folds to provide.

        Returns:
            generator[(pd.Series, pd.Series)]:
                Generator of (train, test) boolean mask pairs, one per fold.
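
        Example:
            A sketch, for an already constructed instance `st` and a
            hypothetical feature frame `X` aligned with the input:

                for train, test in st.k_fold(5):
                    X_train, X_test = X[train], X[test]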
        """
        folds = self.split((1,) * n_folds)
        return ((~fold, fold) for fold in folds)

    def _split_sizes(self, ratio):
        """ Calculate the sizes of the splits. """
        tot = sum(ratio)
        return [self.n_instances_ * rat / tot for rat in ratio]

    @staticmethod
    @returns_pairs
    def _pairs_from_sim_mat(S):
        # keep only the strict upper triangle, so each pair appears once
        S = triu(S, k=1).todok()
        return list(S.items())

    @returns_pairs
    def _pairs_from_fps(self, fps):
        """ Pairs from fingerprints. """
        if self.memory_optimized:
            pairs = self._pairs_from_fps_mem_opt(fps)
        else:
            pairs = self._pairs_from_fps_mem_intensive(fps)

        return pairs

    def _pairs_from_fps_mem_intensive(self, fps):
        """ Fast, single process, but memory intensive generation of pairs. """
        LOGGER.debug('Generating pairs using memory intensive technique.')
        D = squareform(pdist(fps, self.similarity_metric))
        S = 1 - D  # similarity is 1 - distance
        S[S <= self.min_threshold] = 0
        return self._pairs_from_sim_mat(S)

    def _pairs_from_fps_mem_opt(self, fps):
        """ Fast, multiprocessed and memory efficient generation of pairwise
        similarities above the minimum threshold. """

        def slice_generator(low, high, width, end=False):
            """ Generate the pairs of intervals tiling the upper triangle of the matrix in blocks. """
            while low < high:
                res = (low, min(low + width, high))
                if end:
                    yield res
                else:
                    yield from ((res, j) for j in slice_generator(low, high, width, end=True))
                low += width

        size = len(fps)
        fps = fps.values
        f = partial(_above_minimum, X=fps, threshold=self.min_threshold,
                    metric=self.similarity_metric, size=size)
        slices = slice_generator(0, size, self.block_width)

        if self.n_jobs == 1:
            # single process
            LOGGER.debug('Generating pairs using memory optimized technique.')
            return sum((f(block) for block in slices), [])
        else:
            # multiprocessed
            LOGGER.debug('Generating pairs using memory optimized technique '
                         'with %s processes', self.n_jobs)
            with multiprocessing.Pool(self.n_jobs) as pool:
                return sum(pool.map(f, list(slices)), [])

    def _cluster(self, pairs):
        """ Assign instances to clusters, merging the clusters of each close pair. """
        LOGGER.debug('Generating clusters with %s close pairs', len(pairs))
        clustered = np.arange(self.n_instances_)

        for i, j in pairs.values.tolist():  # faster as list
            i_clust, j_clust = clustered[i], clustered[j]
            if i_clust < j_clust:
                # relabel the higher cluster to the lower, merging the two
                clustered[clustered == j_clust] = i_clust
            else:
                clustered[clustered == i_clust] = j_clust
        return clustered
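
    # For illustration: with 5 instances and close pairs [(0, 2), (2, 4)],
    # _cluster returns array([0, 1, 0, 3, 0]); instances 0, 2 and 4 share a
    # label, giving single linkage clusters at the chosen threshold.
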
    def _optimal_thresh(self):
        """ Calculate the optimal threshold giving the requested maximum cluster size. """

        def f(threshold):
            pairs = self.pairs_.loc[self.pairs_.sim > threshold, ['i', 'j']]
            res = pd.Series(self._cluster(pairs))
            return np.abs(res.value_counts().max() - self.largest_cluster * self.n_instances_)

        self.threshold_ = minimize_scalar(f, bounds=(self.min_threshold, 1),
                                          method='bounded').x
        LOGGER.info('Optimal threshold: %s', self.threshold_)
        self.clusters = pd.Series(self._cluster(self.pairs_.loc[self.pairs_.sim > self.threshold_,
                                                                ['i', 'j']]),
                                  index=self.index, name='clusters')
        return self.threshold_

    def visualize_similarities(self, subsample=5000, ax=None):
        """ Plot a histogram of similarities, with the threshold plotted.

        Args:
            subsample (int):
                For a large dataset, subsample the number of compounds to
                consider.

            ax (matplotlib.axis):
                Axis to make the plot on.

        Returns:
            matplotlib.axes
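
        Example:
            A sketch, for an already fitted instance `st`:

                ax = st.visualize_similarities()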
        """
        if not ax:
            ax = plt.gca()

        if subsample and len(self.fps) > subsample:
            fps = self.fps.sample(subsample)
        else:
            fps = self.fps

        sims = 1 - squareform(pdist(fps, self.similarity_metric))
        # zero the diagonal, removing each compound's self-similarity of one
        sims = (sims - np.identity(sims.shape[0])).flatten()
        hist = ax.hist(sims, bins=50)
        ax.vlines(self.threshold_, 0, max(hist[0]))
        ax.set_xlabel('similarity')
        return ax

    def visualize_space(self, dim_reducer='tsne', dim_red_kw=None, subsample=5000, ax=None, c=None):
        """ Plot a two dimensional representation of the chemical space, using a
        dimensionality reduction transformer.

        Args:
            dim_reducer (str or sklearn object):
                Technique to use to reduce fingerprint space.  One of 'tsne'
                or 'mds', or a preconfigured transformer instance.

            dim_red_kw (dict):
                Keyword arguments to pass to the dimensionality reducer.

            subsample (int):
                For a large dataset, subsample the number of compounds to
                consider.

            ax (matplotlib.axis):
                Axis to make the plot on.

            c:
                Colors for the points, which must align with the (possibly
                subsampled) compounds.

        Returns:
            matplotlib.axes
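
        Example:
            A sketch coloring compounds by cluster, for a fitted instance
            `st` with fewer compounds than `subsample` (so points and
            colors align):

                ax = st.visualize_space(dim_reducer='tsne', c=st.clusters)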
        """
        if dim_red_kw is None:
            dim_red_kw = {}

        if isinstance(dim_reducer, str):
            if dim_reducer not in ('tsne', 'mds'):
                raise NotImplementedError('Dimensionality reducer {} not available'.format(dim_reducer))
            from sklearn.manifold import TSNE, MDS
            reducers = {'tsne': TSNE, 'mds': MDS}
            dim_reducer = reducers[dim_reducer](**dim_red_kw)

        if subsample and len(self.fps) > subsample:
            fps = self.fps.sample(subsample)
        else:
            fps = self.fps

        two_d = dim_reducer.fit_transform(fps)

        if not ax:
            ax = plt.gca()

        return ax.scatter(two_d[:, 0], two_d[:, 1], c=c)