#!/usr/bin/env python
#
# Copyright (C) 2016 Rich Lewis <[email protected]>
# License: 3-clause BSD

"""
## skchem.cross_validation.similarity_threshold

Similarity threshold dataset partitioning functionality.
"""

import logging
import multiprocessing
from functools import partial, wraps

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.spatial.distance import pdist, cdist, squareform
from scipy.sparse import triu, dok_matrix
from scipy.optimize import minimize_scalar

from sklearn.manifold import TSNE, MDS

from .. import features

LOGGER = logging.getLogger(__name__)


def returns_pairs(func):
    """ Wraps a function that returns a ((i, j), sim) list to return a
    dataframe. """

    @wraps(func)
    def inner(*args, **kwargs):
        pairs = func(*args, **kwargs)
        return pd.DataFrame([(p[0][0], p[0][1], p[1]) for p in pairs],
                            columns=['i', 'j', 'sim']).sort_values('sim')
    return inner
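
# For illustration only (not executed): an item list such as
#     [((0, 3), 0.72), ((1, 2), 0.51)]
# becomes a dataframe with columns i, j and sim, sorted ascending by sim,
# i.e. row (1, 2, 0.51) before row (0, 3, 0.72).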


def _above_minimum(args, inp, metric, threshold, size):
    """ Find the pairs above a minimum similarity, in chunks. """
    # args is a pair of (start, stop) tuples selecting one block of the
    # full pairwise similarity matrix
    i, j = slice(*args[0]), slice(*args[1])
    x_i, x_j = inp[i], inp[j]
    sim_mat = 1 - cdist(x_i, x_j, metric=metric)
    if i == j:
        # diagonal block: keep only the strict upper triangle, so each
        # pair is counted once
        sim_mat = np.triu(sim_mat, k=1)
    sim_mat[sim_mat <= threshold] = 0
    sparse_mat = dok_matrix((size, size), dtype=float)
    sparse_mat[i, j] = sim_mat
    return list(sparse_mat.items())
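
# Illustrative call with toy values (not part of the module API): for three
# 3-bit fingerprints, the block covering all rows against all columns is
#
#     toy = np.array([[1, 0, 1], [1, 0, 0], [0, 1, 1]], dtype=bool)
#     _above_minimum(((0, 3), (0, 3)), toy, 'jaccard', 0.4, 3)
#     # -> [((0, 1), 0.5)]  only the pair with similarity above 0.4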


class SimThresholdSplit(object):

    def __init__(self, min_threshold=0.45, largest_cluster_fraction=0.1,
                 fper='morgan', similarity_metric='jaccard',
                 memory_optimized=True, n_jobs=1, block_width=1000,
                 verbose=False):

        """ Threshold similarity split for chemical datasets.

        This class implements a splitting technique that will pool compounds
        with similarity above a threshold into the same splits. The
        threshold value is decided by specifying the maximum fraction of the
        dataset that may be pooled into a single cluster, as the density of
        compounds varies between datasets.

        Machine learning techniques should be able to extrapolate outside a
        molecular series, or scaffold; however, random splits will produce
        some 'easy' test compounds that are *identical* to, in the same
        molecular series as, or share a significant scaffold with, training
        set compounds.

        This splitting technique reduces or eliminates (depending on the
        threshold set) this effect, making the problem harder.

        Args:
            min_threshold (float):
                The minimum similarity threshold. Lower will be slower.

            largest_cluster_fraction (float):
                The fraction of the total dataset the largest cluster can
                be. This decides the final similarity threshold.

            fper (str or skchem.Fingerprinter):
                The fingerprinting technique to use to generate the
                similarity matrix.

            similarity_metric (str or callable):
                The similarity metric to use.

            memory_optimized (bool):
                Whether to use the memory optimized implementation.

            n_jobs (int):
                If memory_optimized is True, how many processes to run it
                over.

            block_width (int):
                If memory_optimized, what block width to use. This is the
                width of the submatrices that are calculated at a time.

        Notes:
            The splits will not always be exactly the size requested, due to
            the clustering constraint and the requirement to maintain random
            shuffling.
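
        Example:
            An illustrative sketch (assumes a ``pd.Series`` of ``Mol``
            instances, ``ms``; not run on import):

            >>> cv = SimThresholdSplit(min_threshold=0.5) # doctest: +SKIP
            >>> cv.fit(ms) # doctest: +SKIP
            >>> train, test = cv.split(ratio=(4, 1)) # doctest: +SKIP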
        """

        self.index = self.fps = self._n_jobs = self._n_instances_ = None
        self.pairs_ = self.threshold_ = None

        if isinstance(fper, str):
            fper = features.get(fper)

        self.fper = fper
        self.similarity_metric = similarity_metric
        self.memory_optimized = memory_optimized
        self.n_jobs = n_jobs
        self._block_width = block_width
        self.min_threshold = min_threshold
        self.largest_cluster = largest_cluster_fraction

        if self.fper:
            self.fper.verbose = verbose

    def fit(self, inp, pairs=None):
        """ Fit the cross validator to the data.

        Args:
            inp (pd.Series or pd.DataFrame or np.array):
                - `pd.Series` of `Mol` instances
                - `pd.DataFrame` with `Mol` instances as a `structure` row.
                - `pd.DataFrame` of fingerprints if `fper` is `None`
                - `pd.DataFrame` of sim matrix if `similarity_metric` is
                  `None`
                - `np.array` of sim matrix if `similarity_metric` is `None`

            pairs (list<tuple<tuple(i, j), sim>>):
                An optional precalculated list of pairwise similarities.
        """

        self.n_instances_ = len(inp)
        self.pairs_ = pairs

        if isinstance(inp, (pd.Series, pd.DataFrame)):
            self.index = inp.index
        else:
            self.index = pd.RangeIndex(len(inp), name='batch')

        if self.similarity_metric is None:
            # we were passed a similarity matrix directly
            self.pairs_ = self._pairs_from_sim_mat(inp)

        elif self.fper is None:
            # we were passed fingerprints directly
            self.fps = inp
            if self.pairs_ is None:
                self.pairs_ = self._pairs_from_fps(inp)

        else:
            # we were passed `Mol`s, so featurize first
            if self.pairs_ is None:
                self.fps = self.fper.transform(inp)
                self.pairs_ = self._pairs_from_fps(self.fps)

        self.threshold_ = self._optimal_thresh()

        return self
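
    # Illustrative only (not executed): precomputed fingerprints can be
    # passed directly by disabling the fingerprinter, e.g.
    #     SimThresholdSplit(fper=None).fit(fp_frame)
    # where ``fp_frame`` is a hypothetical dataframe of fingerprints.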

    @property
    def n_jobs(self):
        """ The number of processes to use to calculate the distance matrix.

        -1 for all available. """
        return self._n_jobs

    @n_jobs.setter
    def n_jobs(self, val):
        if val == -1:
            self._n_jobs = multiprocessing.cpu_count()
        else:
            self._n_jobs = val

    @property
    def block_width(self):
        """ The width of the subsets of features.

        Note:
            Only used in the memory optimized implementation. """
        return self._block_width

    @block_width.setter
    def block_width(self, val):
        assert val <= self.n_instances_, ('The block width should be less '
                                          'than or equal to the number of '
                                          'instances')
        self._block_width = val

    @property
    def n_instances_(self):
        """ The number of instances that were used to fit the object. """
        return self._n_instances_

    @n_instances_.setter
    def n_instances_(self, val):
        assert val >= self._block_width, ('The block width should be less '
                                          'than or equal to the number of '
                                          'instances')
        self._n_instances_ = val

    def _cluster_cumsum(self, shuffled=True):
        """ Cumulative sizes of the clusters, optionally in shuffled
        order. """
        nums = self.clusters.value_counts()
        if shuffled:
            # `.ix` is deprecated; index with `.loc` on a permutation
            nums = nums.loc[np.random.permutation(nums.index)].cumsum()
        return nums

    def split(self, ratio):
        """ Return splits of the data with thresholded similarity according
        to a specified ratio.

        Args:
            ratio (tuple[int]):
                the ratio to use.

        Returns:
            generator[pd.Series]:
                Generator of boolean split masks for the requested splits.

        Example:
            >>> ms = skchem.data.Diversity.read_frame('structure') # doctest: +SKIP
            >>> st = SimThresholdSplit(fper='morgan', # doctest: +SKIP
            ...                        similarity_metric='jaccard')
            >>> st.fit(ms) # doctest: +SKIP
            >>> train, valid, test = st.split(ratio=(70, 15, 15)) # doctest: +SKIP

        """

        ratio = self._split_sizes(ratio)
        nums = self._cluster_cumsum()
        res = pd.Series(np.nan, index=nums.index, name='split')

        for i in range(len(ratio)):
            lower = 0 if i == 0 else sum(ratio[:i])
            # the last boundary is the full dataset size, to guard against
            # floating point error in `sum(ratio)`
            upper = (self.n_instances_ if i == len(ratio) - 1
                     else sum(ratio[:i + 1]))
            res[nums[(nums > lower) & (nums <= upper)].index] = i

        res = res.sort_index()
        res = self.clusters.to_frame().join(res, on='clusters')['split']
        return (res == i for i in range(len(ratio)))

    def k_fold(self, n_folds):
        """ Returns k-fold cross-validated folds with thresholded
        similarity.

        Args:
            n_folds (int):
                The number of folds to provide.

        Returns:
            generator[(pd.Series, pd.Series)]:
                The (train, test) splits in series.
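
        Example:
            An illustrative sketch (``st`` is a fitted instance; ``model``,
            ``X`` and ``y`` are placeholders):

            >>> for train, test in st.k_fold(5): # doctest: +SKIP
            ...     model.fit(X[train], y[train])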
        """

        folds = self.split((1,) * n_folds)
        return ((~fold, fold) for fold in folds)

    def _split_sizes(self, ratio):
        """ Calculate the sizes of the splits. """
        tot = sum(ratio)
        return [self.n_instances_ * rat / tot for rat in ratio]

    @staticmethod
    @returns_pairs
    def _pairs_from_sim_mat(sim_mat):
        """ Pairs from a precomputed similarity matrix. """
        sim_mat = triu(sim_mat, k=1).todok()
        return list(sim_mat.items())

    @returns_pairs
    def _pairs_from_fps(self, fps):
        """ Pairs from fps. """
        if self.memory_optimized:
            pairs = self._pairs_from_fps_mem_opt(fps)
        else:
            pairs = self._pairs_from_fps_mem_intensive(fps)

        return pairs

    def _pairs_from_fps_mem_intensive(self, fps):
        """ Fast, single process, memory intensive implementation. """
        LOGGER.debug('Generating pairs using memory intensive technique.')
        dist_mat = squareform(pdist(fps, self.similarity_metric))
        sim_mat = 1 - dist_mat  # similarity is 1 - distance
        sim_mat[sim_mat <= self.min_threshold] = 0
        # return the raw ((i, j), sim) items; the `returns_pairs` wrapper
        # on the caller builds the dataframe
        return list(triu(sim_mat, k=1).todok().items())

    def _pairs_from_fps_mem_opt(self, fps):
        """ Fast, multi-processed and memory efficient pairs. """

        def slice_generator(low, high, width, end=False):
            """ Checkerboard indices for the upper triangle of a matrix. """
            while low < high:
                res = (low, low + width if low + width < high else high)
                if end:
                    yield res
                else:
                    gen = slice_generator(low, high, width, end=True)
                    # py2 compat
                    # yield from ((res, j) for j in
                    #             slice_generator(low, high, width, end=True))
                    for slice_ in ((res, j) for j in gen):
                        yield slice_
                low += width

        size = len(fps)

        fps = fps.values
        f = partial(_above_minimum, inp=fps, threshold=self.min_threshold,
                    metric=self.similarity_metric, size=size)
        slices = slice_generator(0, len(fps), self.block_width)

        if self.n_jobs == 1:
            # single processed
            LOGGER.debug('Generating pairs using memory optimized technique.')
            return sum((f(slice_) for slice_ in slices), [])
        else:
            # multiprocessed
            LOGGER.debug('Generating pairs using memory optimized technique '
                         'with %s processes', self.n_jobs)
            # py2 compat
            # with multiprocessing.Pool(self.n_jobs) as p:
            #     return sum(p.map(f, [(i, j) for i, j in slices]), [])
            p = multiprocessing.Pool(self.n_jobs)
            res = sum(p.map(f, [(i, j) for i, j in slices]), [])
            p.close()
            return res
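
    # Illustrative only: with 2500 fingerprints and block_width=1000, the
    # slice generator yields the upper-triangle block pairs
    #     ((0, 1000), (0, 1000)), ((0, 1000), (1000, 2000)),
    #     ((0, 1000), (2000, 2500)), ((1000, 2000), (1000, 2000)),
    #     ((1000, 2000), (2000, 2500)), ((2000, 2500), (2000, 2500))
    # so each submatrix fits in memory and each pair is computed once.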

    def _cluster(self, pairs):
        """ Assign instances to clusters. """

        LOGGER.debug('Generating clusters with %s close pairs', len(pairs))
        # each instance starts in its own cluster, labelled by its index
        clustered = np.arange(self.n_instances_)

        for i, j in pairs.values.tolist():  # faster as list
            i_clust, j_clust = clustered[i], clustered[j]
            # merge the two clusters under the lower label
            if i_clust < j_clust:
                clustered[clustered == j_clust] = i_clust
            else:
                clustered[clustered == i_clust] = j_clust
        return clustered
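
    # Illustrative only: for 4 instances and close pairs (0, 1) and (1, 3),
    # the labels evolve [0, 1, 2, 3] -> [0, 0, 2, 3] -> [0, 0, 2, 0],
    # i.e. instances 0, 1 and 3 end up in one cluster (single linkage).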

    def _optimal_thresh(self):
        """ Calculate the optimal threshold for the given maximum cluster
        fraction. """
        def f(threshold):
            pairs = self.pairs_.loc[self.pairs_.sim > threshold, ('i', 'j')]
            res = pd.Series(self._cluster(pairs))
            # distance of the largest cluster from the target size
            return np.abs(res.value_counts().max() -
                          self.largest_cluster * self.n_instances_)

        self.threshold_ = minimize_scalar(f,
                                          bounds=(self.min_threshold, 1),
                                          method='bounded').x

        LOGGER.info('Optimal threshold: %s', self.threshold_)
        good_pairs = self.pairs_.loc[self.pairs_.sim > self.threshold_,
                                     ('i', 'j')]
        self.clusters = pd.Series(self._cluster(good_pairs),
                                  index=self.index,
                                  name='clusters')
        return self.threshold_

    def visualize_similarities(self, subsample=5000, ax=None):
        """ Plot a histogram of pairwise similarities, with the threshold
        marked.

        Args:
            subsample (int):
                For a large dataset, subsample the number of compounds to
                consider.
            ax (matplotlib.axis):
                Axis to make the plot on.

        Returns:
            matplotlib.axes
        """

        if not ax:
            ax = plt.gca()

        if subsample and len(self.fps) > subsample:
            fps = self.fps.sample(subsample)
        else:
            fps = self.fps

        sims = 1 - squareform(pdist(fps, self.similarity_metric))
        # zero the self-similarity diagonal before flattening
        sims = (sims - np.identity(sims.shape[0])).flatten()
        hist = ax.hist(sims, bins=50)
        ax.vlines(self.threshold_, 0, max(hist[0]))
        ax.set_xlabel('similarity')
        return ax

    def visualize_space(self, dim_reducer='tsne', dim_red_kw=None,
                        subsample=5000, ax=None, plt_kw=None):
        """ Plot a representation of chemical space using a transformer.

        Args:
            dim_reducer (str or sklearn object):
                Technique to use to reduce fingerprint space.

            dim_red_kw (dict):
                Keyword args to pass to the dim_reducer.

            subsample (int):
                For a large dataset, subsample the number of compounds to
                consider.

            ax (matplotlib.axis):
                Axis to make the plot on.

            plt_kw (dict):
                Keyword args to pass to the plot.

        Returns:
            matplotlib.axes
        """

        if dim_red_kw is None:
            dim_red_kw = {}

        if plt_kw is None:
            plt_kw = {}

        if isinstance(dim_reducer, str):
            if dim_reducer not in ('tsne', 'mds'):
                msg = 'Dimensionality reducer {} not available'.format(
                    dim_reducer)
                raise NotImplementedError(msg)
            reducers = {'tsne': TSNE, 'mds': MDS}
            dim_reducer = reducers[dim_reducer](**dim_red_kw)

        if subsample and len(self.fps) > subsample:
            fps = self.fps.sample(subsample)
        else:
            fps = self.fps

        two_d = dim_reducer.fit_transform(fps)

        if not ax:
            ax = plt.gca()

        return ax.scatter(two_d[:, 0], two_d[:, 1], **plt_kw)