#! /usr/bin/env python
#
# Copyright (C) 2007-2009 Rich Lewis <[email protected]>
# License: 3-clause BSD

# The map functions are a stand-in before parallelism is applied,
# so ignore the errors for using map + lambdas.

# pylint: disable=W0110

"""
skchem.target_prediction.PIDGIN

Wrapper for the PIDGIN models.
"""
|
16
|
|
|
|
|
17
|
|
|
import pandas as pd |
|
|
|
|
|
|
18
|
|
|
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect |
|
|
|
|
|
|
19
|
|
|
import gzip |
|
20
|
|
|
|
|
21
|
|
|
import sys |
|
22
|
|
|
|
|
23
|
|
|
# if cpickle available, import it. otherwise use pickle |
|
24
|
|
|
try: |
|
25
|
|
|
import cPickle as pickle |
|
26
|
|
|
except ImportError: |
|
27
|
|
|
import pickle |
|
28
|
|
|
|
|
29
|
|
|
from .target_prediction import AbstractTargetPredictionAlgorithm |
|
30
|
|
|
from ..descriptors import skchemize |
|
31
|
|
|
from ..data import resource |
|
32
|
|
|
|
|
33
|
|
|
class PIDGIN(AbstractTargetPredictionAlgorithm):

    """ Class implementing the PIDGIN target prediction algorithm.

    On construction the pickled per-target models are loaded from the
    packaged ``PIDGIN`` resource and a Morgan bit-vector fingerprint
    function (radius 2, 2048 bits) is prepared.

    Attributes set by ``__init__``:

    * ``models`` -- mapping of target identifier to a fitted model; each
      model is used through ``predict``, ``predict_proba`` and
      ``predict_log_proba``.
    * ``fingerprint`` -- callable that turns a molecule into the
      fingerprint fed to the models.
    * ``targets`` -- list of the keys of ``models``; it fixes the order of
      every returned index / column set.

    Method naming: ``_m_*`` methods work on a single molecule, ``_df_*``
    methods on a scikit-chem style dataframe with a ``structure`` column,
    and the ``*_map_*`` variants compute the same result through ``map`` so
    that a parallel map can be swapped in later.
    NOTE(review): these private methods are presumably dispatched to by
    ``AbstractTargetPredictionAlgorithm`` -- confirm against the base class.
    """

    def __init__(self):
        """ Load the pickled models and set up the fingerprint function. """

        # fix py3 incompat by creating py2k and py3k PIDGIN models
        filename = 'models_{}{}.pkl.gz'.format(*sys.version_info[:2])

        # NOTE(review): unpickling executes arbitrary code.  This is only
        # acceptable because the file is a resource shipped with the
        # package; never point this at user-supplied data.
        with gzip.open(resource('PIDGIN', filename), 'rb') as f:
            self.models = pickle.load(f)

        self.fingerprint = skchemize(GetMorganFingerprintAsBitVect,
                                     radius=2, nBits=2048)

        # Materialise the keys: on Python 3 ``dict.keys()`` is a view, and
        # the targets are reused as a pandas index in every method below.
        self.targets = list(self.models)

    def __call__(self, m):
        """ Shorthand for ``self.predict_proba(m)``. """
        return self.predict_proba(m)

    # ------------------------------------------------------------------
    # single molecule
    # ------------------------------------------------------------------

    def _m_predict(self, m):
        """ Predict binary binding profile for a molecule against 1080
        protein targets.

        Args:
            m: the molecule to predict for (passed to ``self.fingerprint``).

        Returns:
            pd.Series: one prediction per target, indexed by target.
        """
        fp = self.fingerprint(m)
        # predict() returns one value per sample; there is a single sample.
        return pd.Series([self.models[targ].predict(fp)[0]
                          for targ in self.targets],
                         index=self.targets)

    def _map_predict(self, m):
        """ Map based prediction for binary binding profile.

        Same contract as ``_m_predict``.  The first element of each
        model's output is taken so that both methods return scalars.
        """
        fp = self.fingerprint(m)
        # list(...) because map is lazy on Python 3.
        return pd.Series(
            list(map(lambda k: self.models[k].predict(fp)[0], self.targets)),
            index=self.targets)

    def _m_predict_proba(self, m):
        """ Predict probability of molecule m binding to 1080 protein
        targets.

        Args:
            m: the molecule to predict for.

        Returns:
            pd.Series: column 1 of each model's ``predict_proba`` output
            for the single sample, indexed by target.
        """
        fp = self.fingerprint(m)
        return pd.Series([self.models[target].predict_proba(fp)[:, 1][0]
                          for target in self.targets],
                         index=self.targets)

    def _map_predict_proba(self, m):
        """ Predict probability of molecule m binding to the 1080 proteins
        using map, for simple parallelism.

        Same contract as ``_m_predict_proba``.
        """
        fp = self.fingerprint(m)
        return pd.Series(
            list(map(lambda k: self.models[k].predict_proba(fp)[:, 1][0],
                     self.targets)),
            index=self.targets)

    def _m_predict_log_proba(self, m):
        """ Predict the log probability of molecule m binding to the 1080
        proteins.

        Args:
            m: the molecule to predict for.

        Returns:
            pd.Series: column 1 of each model's ``predict_log_proba``
            output for the single sample, indexed by target.
        """
        fp = self.fingerprint(m)
        # Look models up by target rather than calling dict.iteritems(),
        # which does not exist on Python 3.
        return pd.Series([self.models[target].predict_log_proba(fp)[:, 1][0]
                          for target in self.targets],
                         index=self.targets)

    def _map_predict_log_proba(self, m):
        """ Predict the log probability of molecule m binding to the 1080
        proteins using map, for simple parallelism.

        Same contract as ``_m_predict_log_proba``.
        """
        fp = self.fingerprint(m)
        # The indexing belongs on the prediction, not on the fingerprint.
        return pd.Series(
            list(map(lambda k: self.models[k].predict_log_proba(fp)[:, 1][0],
                     self.targets)),
            index=self.targets)

    # Original (un-prefixed) name, kept so existing callers keep working.
    map_predict_log_proba = _map_predict_log_proba

    # ------------------------------------------------------------------
    # dataframes
    # ------------------------------------------------------------------

    def _df_predict(self, df):
        """ More efficient way to call the predict on large scikit-chem
        style dataframes.

        Args:
            df: dataframe with a ``structure`` column; ``self.fingerprint``
                is applied to each entry of that column.

        Returns:
            pd.DataFrame: one row per entry of ``df.structure`` and one
            column per target.
        """
        fps = df.structure.apply(self.fingerprint)
        return pd.DataFrame(
            {target: self.models[target].predict(fps)
             for target in self.targets},
            index=fps.index, columns=self.targets)

    def _df_map_predict(self, df):
        """ More efficient way to call the predict on large scikit-chem
        style dataframes, with a map implementation for easy parallelism.

        Same contract as ``_df_predict``.
        """
        fps = df.structure.apply(self.fingerprint)
        # One row per target is built first, then transposed so that
        # targets end up as columns.
        return pd.DataFrame(
            list(map(lambda k: self.models[k].predict(fps), self.targets)),
            columns=fps.index, index=self.targets).T

    def _df_predict_proba(self, df):
        """ More efficient way to call the predict_proba on large
        scikit-chem style dataframes.

        Args:
            df: dataframe with a ``structure`` column.

        Returns:
            pd.DataFrame: column 1 of each model's ``predict_proba``
            output; one row per structure, one column per target.
        """
        fps = df.structure.apply(self.fingerprint)
        # parallelize here
        return pd.DataFrame(
            {target: self.models[target].predict_proba(fps)[:, 1]
             for target in self.targets},
            index=fps.index, columns=self.targets)

    def _df_map_predict_proba(self, df):
        """ Map based way to call the predict_proba on large scikit-chem
        style dataframes.

        Same contract as ``_df_predict_proba``.
        """
        fps = df.structure.apply(self.fingerprint)
        # parallelize here trivially
        return pd.DataFrame(
            list(map(lambda k: self.models[k].predict_proba(fps)[:, 1],
                     self.targets)),
            columns=fps.index, index=self.targets).T

    def _df_predict_log_proba(self, df):
        """ More efficient way to call the predict_log_proba on large
        scikit-chem style dataframes.

        Args:
            df: dataframe with a ``structure`` column.

        Returns:
            pd.DataFrame: column 1 of each model's ``predict_log_proba``
            output; one row per structure, one column per target.
        """
        fps = df.structure.apply(self.fingerprint)
        return pd.DataFrame(
            {target: self.models[target].predict_log_proba(fps)[:, 1]
             for target in self.targets},
            index=fps.index, columns=self.targets)

    def _df_map_predict_log_proba(self, df):
        """ More efficient way to call the predict_log_proba on large
        scikit-chem style dataframes, with a map implementation for easy
        parallelism.

        Same contract as ``_df_predict_log_proba``.
        """
        fps = df.structure.apply(self.fingerprint)
        return pd.DataFrame(
            list(map(lambda k: self.models[k].predict_log_proba(fps)[:, 1],
                     self.targets)),
            columns=fps.index, index=self.targets).T
|
168
|
|
|
|