1
|
|
|
#! /usr/bin/env python |
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2007-2009 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
|
|
""" |
7
|
|
|
# skchem.filters.smarts |
8
|
|
|
|
9
|
|
|
Module defines SMARTS filters. |
10
|
|
|
""" |
11
|
|
|
|
12
|
|
|
from rdkit import RDConfig |
|
|
|
|
13
|
|
|
import pandas as pd |
|
|
|
|
14
|
|
|
import os |
15
|
|
|
import pandas as pd |
|
|
|
|
16
|
|
|
|
17
|
|
|
from .base import Filter |
18
|
|
|
from ..core import Mol |
19
|
|
|
|
20
|
|
|
class SMARTSFilter(Filter):

    """ Filter molecules according to the SMARTS patterns they contain.

    Args:
        smarts (pd.Series):
            The SMARTS patterns to test for.  Anything accepted by the
            `pd.Series` constructor (e.g. a dict) may be given.  String
            entries are parsed into query molecules; entries of any other
            type are stored unchanged.
        agg (function):
            Option specifying the mode of the filter (forwarded, together
            with any other keyword arguments, to the base `Filter`).

            - None : No filtering takes place
            - any: If any of the substructures are in molecule return True.
            - all: If all of the substructures are in molecule return True.

    Examples:

        >>> import skchem
        >>> m1 = skchem.Mol.from_smiles('CC')
        >>> m2 = skchem.Mol.from_smiles('c1ccccc1')
        >>> m3 = skchem.Mol.from_smiles('c1ccccc1-c2c(C=O)ccnc2')
        >>> ms = pd.Series({'ethane': m1, 'benzene': m2, 'big': m3})
        >>> f = skchem.filters.SMARTSFilter({'benzene': 'c1ccccc1', 'pyridine': 'c1ccccn1', 'acetyl': 'C=O'})
        >>> f.apply(ms)
                acetyl benzene pyridine
        benzene  False    True    False
        big       True    True     True
        ethane   False   False    False

        >>> f.filter(ms, agg=any)
        benzene                <Mol: c1ccccc1>
        big        <Mol: O=Cc1ccncc1-c1ccccc1>
        dtype: object

        >>> f.filter(ms, agg=all)
        big    <Mol: O=Cc1ccncc1-c1ccccc1>
        dtype: object
    """

    def __init__(self, smarts, **kwargs):
        patterns = pd.Series(smarts)

        # Parse plain SMARTS strings; leave any other entry as supplied.
        self.smarts = patterns.apply(
            lambda entry: (Mol.from_smarts(entry, mergeHs=True)
                           if isinstance(entry, str) else entry))

        self.index = self.smarts.index
        super(SMARTSFilter, self).__init__(self.func, **kwargs)

    def func(self, mol):
        """ Test `mol` against every stored pattern.

        Returns:
            pd.Series: one `pattern in mol` result per pattern, indexed like
            `self.smarts`.
        """

        def is_present(pattern):
            return pattern in mol

        return self.smarts.apply(is_present)
74
|
|
|
|
75
|
|
|
|
76
|
|
|
|
77
|
|
|
class PAINSFilter(SMARTSFilter):

    """ Whether a molecule passes the Pan Assay INterference (PAINS) filters.

    These are supplied with RDKit, and were originally proposed by Baell et al.

    The filter is constructed with `agg=any` and `neg=True`.

    References:
        [The original paper](http://dx.doi.org/10.1021/jm901137j)

    Examples:

        Basic usage as a function on molecules:

        >>> import skchem
        >>> m1 = skchem.Mol.from_smiles('c1ccccc1', name='benzene')
        >>> no_pains = PAINSFilter()
        >>> no_pains(m1)
        True
        >>> m2 = skchem.Mol.from_smiles('Oc1c(O)cccc1', name='catechol')
        >>> no_pains(m2)
        False

        More useful in combination with pandas DataFrames:

        >>> import gzip
        >>> sdf = gzip.open(skchem.data.resource('ames_mutagenicity.sdf.gz'))
        >>> data = skchem.read_sdf(sdf)
        >>> no_pains.apply(data).value_counts()
        True     3855
        False     482
        dtype: int64

        >>> len(no_pains.filter(data))
        3855
    """

    def __init__(self):
        super(PAINSFilter, self).__init__(self._load_pains(), agg=any, neg=True)

    @classmethod
    def _load_pains(cls):

        """ Load PAINS included in rdkit into a pandas series and cache as class attribute.

        The patterns are read from `Pains/wehi_pains.csv` in the RDKit data
        directory the first time this is called; later calls (from any
        instance) reuse the cached series.

        Returns:
            pd.Series: query molecules built with `Mol.from_smarts`, indexed
            by the name of each pattern.
        """

        # Being a classmethod, `cls` is the class even when this is invoked
        # as `self._load_pains()`, so the cache is stored once on the class
        # rather than separately on every instance.
        if not hasattr(cls, '_pains'):
            path = os.path.join(RDConfig.RDDataDir, 'Pains', 'wehi_pains.csv')
            pains = pd.read_csv(path, names=['pains', 'names'])

            # Names are stored as '<regId=NAME>'.  The prefix is removed with
            # an anchored regex: `str.lstrip('<regId=')` strips a *set of
            # characters* and would also eat the start of names beginning
            # with e.g. 'e', 'd' or 'r'.
            names = pains.names.str.replace(r'^<regId=', '', regex=True)
            pains['names'] = names.str.rstrip('>')

            pains = pains.set_index('names').pains.apply(Mol.from_smarts,
                                                         mergeHs=True)
            cls._pains = pains
        return cls._pains
128
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder.