1
|
|
|
#! /usr/bin/env python |
|
|
|
|
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
1 |
|
import os |
7
|
1 |
|
import logging |
8
|
1 |
|
logger = logging.getLogger(__name__) |
|
|
|
|
9
|
|
|
|
10
|
1 |
|
import pandas as pd |
|
|
|
|
11
|
|
|
|
12
|
1 |
|
from .base import Converter, default_pipeline, contiguous_order |
13
|
1 |
|
from ...core import Mol |
14
|
1 |
|
from ...cross_validation import SimThresholdSplit |
15
|
|
|
|
16
|
1 |
|
class BradleyOpenMPConverter(Converter):
    """Converter for the Bradley Open Melting Point dataset.

    Constructing an instance performs the whole conversion: the spreadsheet
    is parsed, manually flagged rows and unparseable structures are dropped,
    melting points are converted to Kelvin, and the result is handed to
    ``self.run`` together with train/valid/test splits.
    """

    def __init__(self, directory, output_directory,
                 output_filename='bradley_open_mp.h5'):
        """Run the conversion.

        Args:
            directory: Directory containing
                ``bradley_melting_point_dataset.xlsx``.
            output_directory: Directory in which the output file is placed.
            output_filename: Name of the output file inside
                ``output_directory``.
        """
        output_path = os.path.join(output_directory, output_filename)
        data = self.parse_data(
            os.path.join(directory, 'bradley_melting_point_dataset.xlsx'))
        data = self.filter_bad(data)

        def parse_smiles(smi):
            # A SMILES string that fails with ValueError maps to None so
            # that the row can be filtered out with ``notnull`` below.
            try:
                return Mol.from_smiles(smi)
            except ValueError:
                return None

        data['structure'] = data.smiles.apply(parse_smiles)
        data = data[data.structure.notnull()]
        ms, y = data.structure, self.fix_mp(data)

        # NOTE(review): ``default_pipeline`` comes from ``.base``; presumably
        # it standardises the molecules and drops those that fail - confirm.
        pipeline = default_pipeline()
        ms, y = pipeline.transform_filter(ms, y)

        cv = SimThresholdSplit(min_threshold=0.6, n_jobs=-1).fit(ms)

        # NOTE(review): (70, 15, 15) are presumably the train/valid/test
        # ratios - confirm against ``SimThresholdSplit.split``.
        train, valid, test = cv.split((70, 15, 15))
        (ms, y, train, valid, test) = contiguous_order(
            (ms, y, train, valid, test), (train, valid, test))

        splits = (('train', train), ('valid', valid), ('test', test))

        self.run(ms, y, output_path=output_path, splits=splits)

    @staticmethod
    def parse_data(path):
        """Read the Excel file at ``path``, using its first column as index.

        Args:
            path: Path to the spreadsheet.

        Returns:
            The ``pandas.DataFrame`` produced by ``pd.read_excel``.
        """
        logger.info('Parsing data at %s...', path)
        return pd.read_excel(path, index_col=0)

    @staticmethod
    def filter_bad(data):
        """Drop rows that carry a value in the ``donotuse`` column.

        Args:
            data: Frame with a ``donotuse`` column; any non-null entry marks
                the row as one to discard.

        Returns:
            The rows of ``data`` whose ``donotuse`` entry is null.
        """
        logger.info('Removing manually annotated errors...')
        bad_data = data.donotuse.notnull()
        logger.debug('Removed %s', bad_data.sum())
        return data[~bad_data]

    @staticmethod
    def fix_mp(data):
        """Return the ``mpC`` column converted from Celsius to Kelvin.

        Args:
            data: Frame with an ``mpC`` column holding melting points in
                degrees Celsius.

        Returns:
            ``data.mpC`` shifted by 273.15.
        """
        logger.info('Converting temperature to Kelvin...')
        # 0 degrees Celsius is 273.15 K.
        return data.mpC + 273.15
60
|
|
|
|
61
|
1 |
|
if __name__ == '__main__':
    # Script entry point: enable verbose logging, then run the conversion.
    logging.basicConfig(level=logging.DEBUG)
    # Log through the module-level ``logger``; no ``LOGGER`` name exists in
    # this module, so using it would raise NameError.
    logger.info('Converting Bradley Open Melting Point Dataset...')
    # NOTE(review): ``convert`` is not defined in this file; presumably it is
    # a classmethod inherited from ``Converter`` - confirm.
    BradleyOpenMPConverter.convert()
65
|
|
|
|
The coding style of this project requires that you add a docstring to this code element. Below you will find an example for methods:
If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.