|
1
|
|
|
#! /usr/bin/env python |
|
|
|
|
|
|
2
|
|
|
# |
|
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
|
4
|
|
|
# License: 3-clause BSD |
|
5
|
|
|
|
|
6
|
|
|
import os |
|
7
|
|
|
import logging |
|
8
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
9
|
|
|
|
|
10
|
|
|
import pandas as pd |
|
|
|
|
|
|
11
|
|
|
|
|
12
|
|
|
from ... import core |
|
|
|
|
|
|
13
|
|
|
from .base import Converter |
|
14
|
|
|
|
|
15
|
|
|
class BradleyOpenMPConverter(Converter):
    """Converter for the Bradley open melting point dataset.

    All of the work happens in ``__init__``: the spreadsheet is parsed, rows
    carrying a manual "do not use" annotation are dropped, the SMILES column
    is standardized into a ``structure`` column, the table is filtered, and
    the structures plus melting points (in Kelvin) are handed to ``run``.

    ``standardize``, ``filter``, ``run`` and ``convert`` are not defined in
    this class; they are expected to come from ``Converter``.
    """

    # Offset between the Celsius and Kelvin scales: 0 degC == 273.15 K.
    _CELSIUS_TO_KELVIN = 273.15

    def __init__(self, directory, output_directory,
                 output_filename='bradley_open_mp.h5'):
        """Run the full conversion.

        Args:
            directory: Directory that contains
                ``bradley_melting_point_dataset.xlsx``.
            output_directory: Directory the output file is placed in.
            output_filename: Name of the output file inside
                ``output_directory``.
        """
        output_path = os.path.join(output_directory, output_filename)
        data = self.parse_data(
            os.path.join(directory, 'bradley_melting_point_dataset.xlsx'))
        data = self.filter_bad(data)
        data['structure'] = self.standardize(data.smiles)
        data = self.filter(data)
        # Targets are computed after filtering so they stay aligned with the
        # structures that survived.
        ms, y = data.structure, self.fix_mp(data)

        self.run(ms, y, output_path=output_path)

    def parse_data(self, path):
        """Read the spreadsheet at ``path``.

        Args:
            path: Path of the Excel file to read.

        Returns:
            The result of ``pandas.read_excel`` with the first column used
            as the index.
        """
        logger.info('Parsing data at %s...', path)
        return pd.read_excel(path, index_col=0)

    def filter_bad(self, data):
        """Drop rows that have a value in the ``donotuse`` column.

        Args:
            data: Table with a ``donotuse`` column; any non-null entry marks
                the row as a manually annotated error.

        Returns:
            ``data`` restricted to the rows whose ``donotuse`` is null.
        """
        logger.info('Removing manually annotated errors...')
        bad_data = data.donotuse.notnull()
        logger.debug('Removed %s', bad_data.sum())
        return data[~bad_data]

    def fix_mp(self, data):
        """Convert the ``mpC`` column from degrees Celsius to Kelvin.

        Args:
            data: Table with a numeric ``mpC`` column holding melting points
                in degrees Celsius.

        Returns:
            ``data.mpC`` shifted by 273.15.
        """
        logger.info('Converting temperature to Kelvin...')
        # T[K] = t[degC] + 273.15; an offset of 278.15 would leave every
        # value 5 K too high.
        return data.mpC + self._CELSIUS_TO_KELVIN
|
40
|
|
|
|
|
41
|
|
|
if __name__ == '__main__':
    # Script entry point: configure the root logger at INFO level so the
    # progress messages emitted during conversion are shown, then start the
    # conversion. ``convert`` is not defined in this module's class, so it is
    # presumably provided by the ``Converter`` base class.
    logging.basicConfig(level=logging.INFO)
    BradleyOpenMPConverter.convert()
|
44
|
|
|
|
The coding style of this project requires that you add a docstring to this code element. Below you will find an example for methods:
If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.