| Total Complexity | 27 | 
| Total Lines | 135 | 
| Duplicated Lines | 15.56 % | 
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
| 1 | #! /usr/bin/env python  | 
            ||
class PIDGIN(AbstractTargetPredictionAlgorithm):

    """ Class implementing the PIDGIN target prediction algorithm.

    Attributes:
        models: mapping of target identifier -> fitted classifier, loaded
            from a gzipped pickle bundled with the package.
        fingerprint: callable turning a molecule into a Morgan fingerprint
            (radius 2, 2048 bits) in the form the classifiers are fed.
        targets: list of the target identifiers (the keys of ``models``).

    Method naming: ``_m_*`` variants loop explicitly over the models,
    ``*map_*`` variants express the same computation with ``map`` so that a
    parallel map can be substituted, and ``_df_*`` variants operate on a
    whole dataframe of structures at once.
    """

    def __init__(self):
        # fix py3 incompat by creating py2k and py3k PIDGIN models:
        # the file name carries the interpreter's major and minor version.
        filename = 'models_{}{}.pkl.gz'.format(*sys.version_info[:2])

        # NOTE(review): pickle.load executes arbitrary code from the file;
        # this is only acceptable because the file is a packaged resource.
        with gzip.open(resource('PIDGIN', filename), 'rb') as f:
            self.models = pickle.load(f)

        self.fingerprint = skchemize(GetMorganFingerprintAsBitVect,
                                     radius=2, nBits=2048)
        # Materialise the keys: on Python 3 ``dict.keys()`` is a live view,
        # whereas a plain list is a stable sequence to use as an index.
        self.targets = list(self.models.keys())

    def __call__(self, m):
        """ Shorthand for ``self.predict_proba(m)``.

        ``predict_proba`` is not defined in this class; presumably it is
        provided by the base class -- TODO confirm.
        """
        return self.predict_proba(m)

    def _m_predict(self, m):

        """ Predict binary binding profile for a molecule against 1080 protein targets.

        Returns a pandas Series indexed by target, holding the first
        element of each model's ``predict`` output.
        """

        fp = self.fingerprint(m)
        return pd.Series([self.models[targ].predict(fp)[0] for targ in self.targets],
                         index=self.targets)

    def _map_predict(self, m):

        """ Map based prediction for binary binding profile.

        Returns the same Series as ``_m_predict``: the first element of each
        model's ``predict`` output, indexed by target.
        """

        fp = self.fingerprint(m)
        # list(...) because map is lazy on Python 3; [0] unwraps the single
        # prediction so the values are scalars, matching _m_predict.
        return pd.Series(list(map(lambda k: self.models[k].predict(fp)[0], self.targets)),
                         index=self.targets)

    def _m_predict_proba(self, m):

        """ Predict probability of molecule m binding to 1080 protein targets.

        Returns a float Series indexed by target.  Column 1 of
        ``predict_proba`` is taken; presumably that is the "binds" class --
        TODO confirm against the trained models' ``classes_``.
        """

        fp = self.fingerprint(m)
        res = pd.Series(index=self.targets, dtype=float)
        for target in self.models:
            res[target] = self.models[target].predict_proba(fp)[:, 1][0]
        return res

    def _map_predict_proba(self, m):

        """ Predict the probability of molecule m binding to the 1080 proteins,
        using map, for simple parallelism.

        Returns a Series indexed by target with column 1 of each model's
        ``predict_proba`` output.
        """

        fp = self.fingerprint(m)
        return pd.Series(list(map(lambda k: self.models[k].predict_proba(fp)[:, 1][0],
                                  self.targets)), index=self.targets)

    def _m_predict_log_proba(self, m):

        """ Predict the log probability of molecule m binding to the 1080 proteins.

        Returns a float Series indexed by target with column 1 of each
        model's ``predict_log_proba`` output.
        """

        fp = self.fingerprint(m)
        res = pd.Series(index=self.targets, dtype=float)
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for target, model in self.models.items():
            res[target] = model.predict_log_proba(fp)[:, 1][0]
        return res

    def map_predict_log_proba(self, m):

        """ Predict the log probability of molecule m binding to the 1080 proteins
        using map, for simple parallelism.

        Returns a Series indexed by target with column 1 of each model's
        ``predict_log_proba`` output.
        """

        fp = self.fingerprint(m)
        # The result of predict_log_proba is sliced, not the fingerprint.
        return pd.Series(list(map(lambda k: self.models[k].predict_log_proba(fp)[:, 1][0],
                                  self.targets)), index=self.targets)

    def _df_predict(self, df):

        """ More efficient way to call the predict on large scikit-chem style dataframes.

        Fingerprints every entry of ``df.structure`` and returns a DataFrame
        with one row per structure (``df``'s index) and one column per target.
        """

        fps = df.structure.apply(self.fingerprint)
        res = pd.DataFrame(index=fps.index, columns=self.targets)
        for target in self.models:
            res[target] = self.models[target].predict(fps)
        return res

    def _df_map_predict(self, df):

        """ More efficient way to call the predict on large scikit-chem style dataframes,
        with a map implementation for easy parallelism.

        Returns a DataFrame with one row per structure and one column per
        target.
        """

        fps = df.structure.apply(self.fingerprint)

        # Each mapped result is one target's row; transpose so that rows are
        # structures and columns are targets.
        return pd.DataFrame(list(map(lambda k: self.models[k].predict(fps), self.targets)),
                            columns=fps.index, index=self.targets).T

    def _df_predict_proba(self, df):

        """ More efficient way to call the predict_proba on large scikit-chem style dataframes.

        Returns a DataFrame (rows: structures, columns: targets) holding
        column 1 of each model's ``predict_proba`` output.
        """

        fps = df.structure.apply(self.fingerprint)
        res = pd.DataFrame(index=fps.index, columns=self.targets)

        # parallelize here
        for target in self.models:
            res[target] = self.models[target].predict_proba(fps)[:, 1]
        return res

    def _df_map_predict_proba(self, df):

        """ Map based way to call the predict_proba on large scikit-chem style dataframes.

        Returns a DataFrame (rows: structures, columns: targets) holding
        column 1 of each model's ``predict_proba`` output.
        """

        fps = df.structure.apply(self.fingerprint)

        # parallelize here trivially
        return pd.DataFrame(list(map(lambda k: self.models[k].predict_proba(fps)[:, 1],
                                     self.targets)),
                            columns=fps.index, index=self.targets).T

    def _df_predict_log_proba(self, df):

        """ More efficient way to call the predict_log_proba on large scikit-chem style dataframes.

        Returns a DataFrame (rows: structures, columns: targets) holding
        column 1 of each model's ``predict_log_proba`` output.
        """

        fps = df.structure.apply(self.fingerprint)
        res = pd.DataFrame(index=fps.index, columns=self.targets)

        for target in self.models:
            res[target] = self.models[target].predict_log_proba(fps)[:, 1]
        return res

    def _df_map_predict_log_proba(self, df):

        """
        More efficient way to call the predict_log_proba on large scikit-chem
        style dataframes, with a map implementation for easy parallelism.

        Returns a DataFrame (rows: structures, columns: targets) holding
        column 1 of each model's ``predict_log_proba`` output.
        """

        fps = df.structure.apply(self.fingerprint)

        return pd.DataFrame(list(map(lambda k: self.models[k].predict_log_proba(fps)[:, 1],
                                     self.targets)),
                            columns=fps.index, index=self.targets).T
| 168 |