1 | #! /usr/bin/env python |
||
2 | # |
||
3 | # Copyright (C) 2016 Rich Lewis <[email protected]> |
||
4 | # License: 3-clause BSD |
||
5 | |||
6 | 1 | """ |
|
7 | # skchem.data.converters.base |
||
8 | |||
9 | Defines the base converter class. |
||
10 | """ |
||
11 | |||
12 | 1 | import warnings |
|
13 | 1 | import logging |
|
14 | 1 | import os |
|
15 | 1 | from collections import namedtuple |
|
16 | |||
17 | 1 | import numpy as np |
|
0 ignored issues
–
show
|
|||
18 | 1 | import pandas as pd |
|
0 ignored issues
–
show
The import pandas could not be resolved.
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
# .scrutinizer.yml
before_commands:
- sudo pip install abc # Python2
- sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use the command for the correct version.
2. Missing __init__.py files
This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder. |
|||
19 | 1 | import h5py |
|
0 ignored issues
–
show
The import h5py could not be resolved.
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
# .scrutinizer.yml
before_commands:
- sudo pip install abc # Python2
- sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use the command for the correct version.
2. Missing __init__.py files
This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder. |
|||
20 | 1 | from fuel.datasets import H5PYDataset |
|
0 ignored issues
–
show
The import fuel.datasets could not be resolved.
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
# .scrutinizer.yml
before_commands:
- sudo pip install abc # Python2
- sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use the command for the correct version.
2. Missing __init__.py files
This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder. |
|||
21 | |||
22 | 1 | from ... import forcefields |
|
23 | 1 | from ... import filters |
|
24 | 1 | from ... import features |
|
25 | 1 | from ... import standardizers |
|
26 | 1 | from ... import pipeline |
|
27 | |||
# Module-level logger, named after this module; used by the Converter methods
# below to report progress.
logger = logging.getLogger(__name__)
|
0 ignored issues
–
show
The name
logger does not conform to the constant naming conventions ((([A-Z_][A-Z0-9_]*)|(__.*__))$ ).
This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. ![]() |
|||
29 | |||
30 | |||
def default_pipeline():
    """ Build the default pipeline to be used for general datasets.

    The pipeline is assembled from five steps, in this order: a ChemAxon
    standardizer, a UFF forcefield step, an organic filter, an atom number
    filter and a mass filter.

    Returns:
        pipeline.Pipeline: A freshly constructed pipeline.
    """

    steps = [
        standardizers.ChemAxonStandardizer(keep_failed=True, warn_on_fail=False),
        forcefields.UFF(add_hs=True, warn_on_fail=False),
        filters.OrganicFilter(),
        filters.AtomNumberFilter(above=5, below=100, include_hydrogens=True),
        filters.MassFilter(below=1000),
    ]
    return pipeline.Pipeline(steps)
||
40 | |||
# Default keyword arguments handed to pandas ``to_hdf`` when writing data:
# bzip2 compression at level 9.
DEFAULT_PYTABLES_KW = dict(complib='bzip2', complevel=9)
||
45 | |||
def contiguous_order(to_order, splits):
    """ Determine a contiguous order from non-overlapping splits, and put data in that order.

    Rows are grouped by the split they belong to, in the order the splits are
    given.  Within a group the original row order is kept.  Rows that belong
    to no split are placed after every split, so that they cannot break up the
    first split.

    Args:
        to_order (iterable<pd.Series, pd.DataFrame, pd.Panel>):
            The pandas objects to put in contiguous order.
        splits (iterable<pd.Series>):
            The non-overlapping splits, as boolean masks.  The index of the
            first mask defines the rows that are ordered.

    Returns:
        iterable<pd.Series, pd.DataFrame, pd.Panel>: The data in contiguous order.
    """

    # Materialise so that any iterable (not just a sequence) can be indexed.
    splits = list(splits)
    if not splits:
        # No splits means there is no grouping to impose: pass data through.
        return (order for order in to_order)

    # Label every row with the position of its split.  Unclaimed rows start
    # with a label larger than any split position so that they sort last.
    member = pd.Series(len(splits), index=splits[0].index)
    for i, split in enumerate(splits):
        member[split] = i

    # A stable sort keeps the original relative order inside each split.
    idx = member.sort_values(kind='mergesort').index
    return (order.reindex(idx) for order in to_order)
||
64 | |||
# Description of one feature set to compute:
#   fper       -- the featurizer object; its ``transform`` is applied to the molecules
#   key        -- the name under which the computed features are saved
#   axis_names -- names for the axes of the computed features
Feature = namedtuple(typename='Feature',
                     field_names=['fper', 'key', 'axis_names'])
|
66 | |||
67 | |||
def default_features():
    """ Return the default features to calculate for a dataset.

    Returns:
        tuple<Feature>: Seven feature descriptions, each pairing a freshly
            constructed featurizer with its key and axis names.
    """

    morgan = Feature(features.MorganFeaturizer(),
                     'X_morg',
                     ['batch', 'features'])
    physicochemical = Feature(features.PhysicochemicalFeaturizer(),
                              'X_pc',
                              ['batch', 'features'])
    atom = Feature(features.AtomFeaturizer(max_atoms=100),
                   'A',
                   ['batch', 'atom_idx', 'features'])
    graph_distance = Feature(features.GraphDistanceTransformer(max_atoms=100),
                             'G',
                             ['batch', 'atom_idx', 'atom_idx'])
    spacial_distance = Feature(features.SpacialDistanceTransformer(max_atoms=100),
                               'G_d',
                               ['batch', 'atom_idx', 'atom_idx'])
    chemaxon = Feature(features.ChemAxonFeaturizer(features='all'),
                       'X_cx',
                       ['batch', 'features'])
    chemaxon_atom = Feature(features.ChemAxonAtomFeaturizer(features='all', max_atoms=100),
                            'A_cx',
                            ['batch', 'atom_idx', 'features'])

    return (morgan, physicochemical, atom, graph_distance, spacial_distance,
            chemaxon, chemaxon_atom)
||
92 | |||
93 | |||
class Split(object):
    """ A named subset of a dataset, described by a mask over all rows.

    Args:
        mask (pd.Series):
            Mask over the whole dataset; non-zero entries belong to the split.
        name (str):
            The name of the split.
        converter (Converter):
            The converter the split belongs to.  Its `data_file` and
            `source_names` attributes are used.
    """

    def __init__(self, mask, name, converter):
        self.mask = mask
        self.name = name
        self.converter = converter

    @property
    def contiguous(self):
        """ bool: Whether the members of the split form one unbroken run.

        An empty split is reported as not contiguous, so that `to_dict`
        never has to derive an interval from zero indices.
        """
        idx = self.indices
        if len(idx) == 0:
            return False
        # Indices are sorted and unique, so they form a single run exactly
        # when the covered span has as many positions as there are indices.
        return bool(idx[-1] - idx[0] + 1 == len(idx))

    @property
    def indices(self):
        """ np.ndarray: The positions of the members of the split. """
        # Convert to a plain array first so the result does not depend on
        # the mask object providing its own `nonzero` method.
        return np.nonzero(np.asarray(self.mask))[0]

    def save(self):
        """ Write the indices and the mask of the split to the data file. """
        self.converter.data_file[self.name + '_indices'] = self.indices
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            self.mask.to_hdf(self.converter.data_file.filename, '/indices/' + self.name)

    @property
    def ref(self):
        """ The reference to the saved indices of the split in the data file.

        Requires `save` to have stored the indices beforehand.
        """
        return self.converter.data_file[self.name + '_indices'].ref

    def to_dict(self):
        """ Describe the split for every source of the converter.

        Returns:
            dict: Maps each source name to `(start, stop)` for a contiguous
                split, with `stop` exclusive, or to `(-1, -1, ref)` otherwise.
        """
        sources = self.converter.source_names
        if self.contiguous:
            idx = self.indices
            # The interval is half-open, hence one past the last member.
            interval = (int(idx[0]), int(idx[-1]) + 1)
            return {source: interval for source in sources}

        reference = self.ref
        return {source: (-1, -1, reference) for source in sources}
||
131 | |||
132 | |||
class Converter(object):
    """ Create a fuel dataset from molecules and targets.

    This base class cannot be instantiated directly: its constructor raises
    `NotImplementedError`.  The work is done by `run`, which stores its
    configuration on the instance and writes everything to an h5 file.
    """

    def __init__(self, directory, output_directory, output_filename='default.h5'):
        """ Not implemented on the base class.

        Args:
            directory (str): Not used here.
            output_directory (str): Not used here.
            output_filename (str): Not used here.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def run(self, ms, y, output_path, splits=None, features=None, pytables_kws=None):
        """ Compute features and save the complete dataset.

        Args:
            ms (pd.Series):
                The molecules of the dataset.
            y (pd.Series or pd.DataFrame):
                The target labels of the dataset.
            output_path (str):
                The path to which the dataset should be saved.
            splits (iterable<(name, split)>):
                An iterable of name, split tuples. Splits are provided as
                boolean arrays of the whole data. No splits are saved if
                `None`.
            features (list[Feature]):
                The features to calculate. Defaults are used if `None`.
            pytables_kws (dict):
                Keyword arguments passed to `to_hdf`.
                `DEFAULT_PYTABLES_KW` is used if `None`.
        """

        if pytables_kws is None:
            pytables_kws = DEFAULT_PYTABLES_KW
        if splits is None:
            splits = ()

        self.output_path = output_path
        # Copy so the caller's dict (or the module default) is never shared.
        self.pytables_kws = dict(pytables_kws)
        self.features = features if features is not None else default_features()
        self.feature_names = [feat.key for feat in self.features]
        self.task_names = ['y']
        self.splits = [Split(split, name, self) for name, split in splits]

        self.create_file(output_path)

        self.save_splits()
        self.save_molecules(ms)
        self.save_targets(y)
        self.save_features(ms)

    @property
    def source_names(self):
        """ list<str>: The feature names followed by the task names. """
        return self.feature_names + self.task_names

    @property
    def split_names(self):
        """ list<str>: The names of the splits, in order. """
        return [split.name for split in self.splits]

    def create_file(self, path):
        """ Create the h5 data file at `path`, opened for writing.

        Args:
            path (str): Where to create the file.

        Returns:
            h5py.File: The opened file, also stored as `self.data_file`.
        """
        logger.info('Creating h5 file at %s...', path)
        self.data_file = h5py.File(path, 'w')
        return self.data_file

    def save_molecules(self, mols):
        """ Save the molecules to the data file.

        The molecules are stored under the key `structure`, and their SMILES
        strings (utf-8 encoded) under the key `smiles`.
        """

        logger.info('Writing molecules to file...')
        logger.debug('Writing %s molecules to %s', len(mols), self.data_file.filename)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            mols.to_hdf(self.data_file.filename, 'structure', **self.pytables_kws)
            smiles = mols.apply(lambda m: m.to_smiles().encode('utf-8'))
            smiles.to_hdf(self.data_file.filename, 'smiles')

    def save_frame(self, data, name, prefix='targets'):
        """ Save a frame to the data file.

        The data is written under `/<prefix>/<name>`, a soft link called
        `name` is pointed at the stored values, and the dimensions of the
        linked dataset are labelled with the names of the axes of `data`.

        Args:
            data (pd.Series, pd.DataFrame or pd.Panel): The data to save.
            name (str): The name of the data.
            prefix (str): The group in which the data is stored.
        """

        logger.info('Writing %s', name)
        logger.debug('Writing data of shape %s to %s', data.shape, self.data_file.filename)

        # `Panel` does not exist in every pandas version; look it up lazily
        # so that saving series and frames never depends on it.
        panel_type = getattr(pd, 'Panel', None)
        group = '/{prefix}/{name}'.format(prefix=prefix, name=name)

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            if len(data.shape) > 2:
                data = data.transpose(2, 1, 0)  # panel serializes backwards for some reason...
            data.to_hdf(self.data_file.filename, key=group, **self.pytables_kws)

            if isinstance(data, pd.Series):
                self.data_file[name] = h5py.SoftLink(group + '/values')
                self.data_file[name].dims[0].label = data.index.name

            elif isinstance(data, pd.DataFrame):
                self.data_file[name] = h5py.SoftLink(group + '/block0_values')
                self.data_file[name].dims[0].label = data.index.name
                self.data_file[name].dims[1].label = data.columns.name

            elif panel_type is not None and isinstance(data, panel_type):
                self.data_file[name] = h5py.SoftLink(group + '/block0_values')
                self.data_file[name].dims[0].label = data.minor_axis.name  # as panel serializes backwards
                self.data_file[name].dims[1].label = data.major_axis.name
                self.data_file[name].dims[2].label = data.items.name

    def save_targets(self, y):
        """ Save the targets to the data file, named `y` in group `targets`. """

        self.save_frame(y, name='y', prefix='targets')

    def save_features(self, ms):
        """ Save all features for the dataset. """

        logger.debug('Saving features')
        for feat in self.features:
            self._save_feature(ms, feat)

    def _save_feature(self, ms, feat):
        """ Calculate and save a feature to the data file. """

        logger.info('Calculating %s', feat.key)

        fps = feat.fper.transform(ms)
        self.save_frame(fps, name=feat.key, prefix='feats')

    def save_splits(self):
        """ Save the splits to the data file.

        Each split is saved first, then the combined split array is stored
        in the `split` attribute of the data file.  Nothing is written when
        there are no splits.
        """

        logger.info('Producing dataset splits...')
        if not self.splits:
            # NOTE(review): the split array is presumably not buildable from
            # an empty description, so skip it -- confirm against fuel.
            logger.debug('No splits to save')
            return

        for split in self.splits:
            split.save()
        split_dict = {split.name: split.to_dict() for split in self.splits}
        splits = H5PYDataset.create_split_array(split_dict)
        logger.debug('split: %s', splits)
        logger.info('Saving splits...')
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            self.data_file.attrs['split'] = splits

    @classmethod
    def convert(cls, **kwargs):
        """ Instantiate the converter and report where it saved its output.

        `directory` and `output_directory` default to the current working
        directory if they are not given.

        Returns:
            tuple<str>: A one element tuple holding the output path.
        """
        kwargs.setdefault('directory', os.getcwd())
        kwargs.setdefault('output_directory', os.getcwd())

        return (cls(**kwargs).output_path,)

    @classmethod
    def fill_subparser(cls, subparser):
        """ Return the conversion function; `subparser` is left untouched. """
        return cls.convert
||
267 |
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing
__init__.py
files in your module folders. Make sure that you place one file in each sub-folder.