Passed
Push — main ( 9813db...5006f2 )
by Douglas
01:43
created

CompoundIdFiller._get_pubchem()   B

Complexity

Conditions 6

Size

Total Lines 13
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 12
nop 3
dl 0
loc 13
rs 8.6666
c 0
b 0
f 0
1
from __future__ import annotations
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
from dataclasses import dataclass
3
from typing import Set, Optional, Mapping, Tuple, Dict
0 ignored issues
show
Unused Code introduced by
Unused Set imported from typing
Loading history...
4
5
from mandos import logger
6
from pocketutils.tools.common_tools import CommonTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.common_tools'
Loading history...
7
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
8
9
from mandos.model import CompoundNotFoundError, CompoundStruct
0 ignored issues
show
introduced by
Imports from package mandos are not grouped
Loading history...
10
11
from mandos.entries.api_singletons import Apis
12
from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
13
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
14
15
16
# Typed DataFrame class for compound-ID matching tables, built with the TypedDfs builder.
# Every column named below is declared via ``reserve`` with dtype=str: the input identifiers
# (inchikey, inchi, compound_id, compound_name, library), the per-database IDs and structures
# (chembl_*, pubchem_*, hmdb_id), and the ``origin_*`` copies of the user-supplied values.
# NOTE(review): ``reserve`` presumably marks columns as optional-but-typed, and
# ``strict(index=True, cols=False)`` presumably forbids extra index levels while allowing
# extra columns -- confirm against the typeddfs documentation.
IdMatchFrame = (
17
    TypedDfs.typed("IdMatchFrame")
18
    .reserve("inchikey", dtype=str)
19
    .reserve("compound_id", "compound_name", "library", dtype=str)
20
    .reserve("inchi", dtype=str)
21
    .reserve("chembl_id", "pubchem_id", "hmdb_id", dtype=str)
22
    .reserve("chembl_inchikey", "pubchem_inchikey", dtype=str)
23
    .reserve("chembl_inchi", "pubchem_inchi", dtype=str)
24
    .reserve("origin_inchi", "origin_inchikey", dtype=str)
25
    .strict(index=True, cols=False)
26
).build()
27
28
29
# Columns that CompoundIdFiller.fill() writes for every row. If any of these already exist
# in the input, CompoundIdFiller._prep() renames them to "origin_<name>" first so the
# user-supplied values are not overwritten.
FILL_IDS = [
30
    "inchi",
31
    "inchikey",
32
    "chembl_id",
33
    "pubchem_id",
34
    "chembl_inchi",
35
    "chembl_inchikey",
36
    "pubchem_inchi",
37
    "pubchem_inchikey",
38
]
39
# Columns moved to the front of the output, in this order; names that are absent from the
# frame are skipped when the final column order is assembled in fill().
PUT_FIRST = [
40
    "compound_id",
41
    "compound_name",
42
    "library",
43
    "inchikey",
44
    "chembl_id",
45
    "pubchem_id",
46
    "g2p_id",
47
    "chembl_inchikey",
48
    "pubchem_inchikey",
49
    "origin_inchikey",
50
]
51
# Columns moved to the end of the output, in this order; absent names are skipped likewise.
PUT_LAST = ["inchi", "chembl_inchi", "pubchem_inchi", "origin_inchi", "smiles"]
52
53
# Alias used in type annotations where a string names a database (see _choose's return type).
Db = str
54
55
56
def look(obj, attrs):
    """
    Looks up ``attrs`` on ``obj`` with ``CommonTools.look`` and normalizes null-like results.

    Arguments:
        obj: The object to read from
        attrs: The attribute specification, passed through to ``CommonTools.look``

    Returns:
        ``None`` if the value found is a string equal to "N/A" (compared case-insensitively)
        or if ``CommonTools.is_probable_null`` reports it as null; otherwise the value itself.
    """
    found = CommonTools.look(obj, attrs)
    # The "N/A" test comes first so that is_probable_null is not consulted for such strings.
    is_na_text = isinstance(found, str) and found.upper() == "N/A"
    if is_na_text or CommonTools.is_probable_null(found):
        return None
    return found
61
62
63
@dataclass(frozen=True, repr=True)
class CompoundIdFiller:
    """
    Fills in ChEMBL and PubChem IDs, InChIs, and InChIKeys for a table of compounds.

    The entry point is :meth:`fill`. Each input row is resolved against ChEMBL and PubChem
    (see :meth:`_process`), and the columns listed in ``FILL_IDS`` are written back.
    """

    # NOTE(review): neither flag is read by any method of this class -- confirm whether they
    # are meant to switch the ChEMBL / PubChem lookups on and off.
    chembl: bool = True
    pubchem: bool = True

    def fill(self, df: IdMatchFrame) -> IdMatchFrame:
        """
        Resolves every row of ``df`` and returns a frame with the ``FILL_IDS`` columns filled.

        User-supplied columns that collide with ``FILL_IDS`` are kept as ``origin_*`` columns
        unless they turn out to be identical to the filled column, in which case they are
        dropped. Columns are then reordered: ``PUT_FIRST``, everything else, ``PUT_LAST``.

        Arguments:
            df: The input compounds; must not already contain columns starting with "origin_"

        Returns:
            The filled and reordered frame

        Raises:
            ValueError: If ``df`` has a column whose name starts with "origin_"
            CompoundNotFoundError: Allowed to propagate when a lookup by a user-supplied
                                   ChEMBL or PubChem ID fails
        """
        df = self._prep(df)
        n_rows = len(df)  # the row count never changes below
        logger.info(f"Processing {n_rows} input compounds...")
        filled = []
        for i, row in enumerate(df.itertuples()):
            # progress: a "notice" every 200 rows, an "info" every 20 rows otherwise
            if i > 0 and i % 200 == 0:
                logger.notice(f"Processed {i:,} / {n_rows:,}")
            elif i > 0 and i % 20 == 0:
                logger.info(f"Processed {i:,} / {n_rows:,}")
            filled.append(
                self._process(
                    compound_id=look(row, "compound_id"),
                    library=look(row, "library"),
                    inchi=look(row, "origin_inchi"),
                    inchikey=look(row, "origin_inchikey"),
                    pubchem_id=look(row, "origin_pubchem_id"),
                    chembl_id=look(row, "origin_chembl_id"),
                    line_no=i,
                )
            )
        for col in FILL_IDS:
            df[col] = [record[col] for record in filled]
        # an origin_* column that ended up identical to the filled column carries no information
        duplicate_cols = []
        for col in FILL_IDS:
            origin_col = "origin_" + col
            if col in df.columns and origin_col in df.columns:
                if df[col].values.tolist() == df[origin_col].values.tolist():
                    duplicate_cols.append(origin_col)
        logger.notice(f"Done. Filled {n_rows:,} rows.")
        if duplicate_cols:
            df = df.drop_cols(duplicate_cols)
            logger.notice(f"Dropped duplicated columns {', '.join(duplicate_cols)}")
        order = [name for name in PUT_FIRST if name in df.columns]
        order += [name for name in df.columns if name not in PUT_FIRST and name not in PUT_LAST]
        order += [name for name in PUT_LAST if name in df.columns]
        df = df.cfirst(order)
        have_chembl = int(df["chembl_id"].notnull().sum())
        have_pubchem = int(df["pubchem_id"].notnull().sum())
        logger.notice(f"{have_chembl:,}/{n_rows:,} have ChEMBL IDs")
        logger.notice(f"{have_pubchem:,}/{n_rows:,} have PubChem IDs")
        return df

    def _process(
        self,
        compound_id: Optional[str],
        library: Optional[str],
        inchi: Optional[str],
        inchikey: Optional[str],
        pubchem_id: Optional[str],
        chembl_id: Optional[str],
        line_no: int,
    ) -> Dict[str, Optional[str]]:
        """
        Resolves a single compound against ChEMBL and PubChem.

        Arguments:
            compound_id: The user's identifier; only used for messages and the "input" struct
            library: Accepted for interface compatibility; not used
            inchi: The user-supplied InChI, if any
            inchikey: The user-supplied InChIKey, if any
            pubchem_id: The user-supplied PubChem ID, if any
            chembl_id: The user-supplied ChEMBL ID, if any
            line_no: The row number, used as a prefix in log messages

        Returns:
            A dict with exactly the keys in ``FILL_IDS``
        """
        if inchikey is None and pubchem_id is None and chembl_id is None:
            # nothing to search by
            logger.error(f"[line {line_no}] No data for {compound_id}")
            return dict(
                inchi=inchi,
                inchikey=inchikey,
                chembl_id=None,
                chembl_inchi=None,
                chembl_inchikey=None,
                pubchem_id=None,
                pubchem_inchi=None,
                pubchem_inchikey=None,
            )
        fake_x = CompoundStruct("input", compound_id, inchi, inchikey)
        chembl_x = self._get_chembl(inchikey, chembl_id)
        pubchem_x = self._get_pubchem(inchikey, pubchem_id)
        #################################################################################
        # This is important and weird!
        # Where DNE = does not exist and E = exists
        # If chembl DNE and pubchem E ==> fill chembl
        # THEN: If chembl E and (pubchem E or pubchem DNE) ==> fill pubchem
        # we might therefore go from pubchem --> chembl --> pubchem
        # The advantage is that chembl might have a good parent compound
        # Whereas pubchem does not
        # This is often true: chembl is much better at this than pubchem
        # In contrast, only fill ChEMBL if it's missing
        if chembl_x is None and pubchem_x is not None:
            chembl_x = self._get_chembl(pubchem_x.inchikey, None)
        if chembl_x is not None:
            pubchem_x = self._get_pubchem(chembl_x.inchikey, None)
        #################################################################################
        # the order is from best to worst
        prioritize_choices = [chembl_x, pubchem_x, fake_x]
        db_to_struct = {o.db: o for o in prioritize_choices if o is not None}
        inchikey, inchikey_choices = self._choose(db_to_struct, "inchikey")
        inchi, inchi_choices = self._choose(db_to_struct, "inchi")
        about = " ; ".join([x.simple_str for x in prioritize_choices if x is not None])
        # *_choices holds the distinct values reported by the non-input databases
        if len(inchikey_choices) == 0:
            logger.error(f"[line {line_no}] no database inchikeys found :: {about}")
        elif len(inchikey_choices) > 1:
            logger.error(f"[line {line_no}] inchikey mismatch :: {about} :: {inchikey_choices}")
        elif len(inchi_choices) > 1:
            logger.debug(f"[line {line_no}] inchi mismatch :: {about} :: {inchi_choices}")
        return dict(
            inchi=inchi,
            inchikey=inchikey,
            chembl_id=look(chembl_x, "id"),
            chembl_inchi=look(chembl_x, "inchi"),
            chembl_inchikey=look(chembl_x, "inchikey"),
            pubchem_id=look(pubchem_x, "id"),
            pubchem_inchi=look(pubchem_x, "inchi"),
            pubchem_inchikey=look(pubchem_x, "inchikey"),
        )

    def _choose(
        self,
        db_to_struct: Mapping[str, CompoundStruct],
        what: str,
    ) -> Tuple[Optional[str], Dict[str, Db]]:
        """
        Chooses the best what="inchi" or what="inchikey".

        Arguments:
            db_to_struct: Should be in order from most preferred to least
            what: The name of the CompoundStruct attribute to access

        Returns:
            A tuple ``(best, non_input)``. ``best`` is the value from the most preferred
            struct that has one, or ``None`` if no struct has one. ``non_input`` maps each
            distinct value reported by a database other than "input" to a database that
            reported it, so more than one key means the databases disagree.
        """
        options = {}
        for struct in db_to_struct.values():
            value = look(struct, what)
            if value is not None:
                options[struct.db] = value
        if not options:
            return None, {}
        non_input_dbs = {value: db for db, value in options.items() if db != "input"}
        # dicts keep insertion order, so the first value belongs to the most preferred database;
        # taking an element from a set here would pick an arbitrary one on a mismatch
        best = next(iter(options.values()))
        return best, non_input_dbs

    def _prep(self, df: IdMatchFrame) -> IdMatchFrame:
        """
        Prepares the input: renames columns that will be filled and drops all-null columns.

        Arguments:
            df: The raw input frame

        Returns:
            A frame in which each existing ``FILL_IDS`` column is renamed to ``origin_<name>``
            and every column containing only nulls is removed

        Raises:
            ValueError: If any input column already starts with "origin_"
        """
        bad_cols = [c for c in df.columns if c.startswith("origin_")]
        if bad_cols:
            raise ValueError(f"Columns {', '.join(bad_cols)} start with 'origin_'")
        rename_cols = {c: "origin_" + c for c in FILL_IDS if c in df.columns}
        if rename_cols:
            logger.notice(f"Renaming columns: {', '.join(rename_cols.keys())}")
        df = df.rename(columns=rename_cols)
        # a list (not a set) so that the log message lists columns in frame order
        drop_cols = [c for c in df.columns if df[c].isnull().all()]
        if drop_cols:
            logger.warning(f"Dropping empty columns: {', '.join(drop_cols)}")
        df = df.drop_cols(drop_cols)
        return df

    def _get_pubchem(self, inchikey: Optional[str], cid: Optional[int]) -> Optional[CompoundStruct]:
        """
        Fetches a compound from PubChem by ID if one is given, else by InChIKey.

        Arguments:
            inchikey: The InChIKey to search by; ignored if ``cid`` is given
            cid: The PubChem compound ID, if known

        Returns:
            The struct view of the PubChem record, or ``None`` if there is nothing to search
            by, the record has no InChIKey, or the InChIKey is not found

        Raises:
            CompoundNotFoundError: If ``cid`` is given and its lookup fails
        """
        api = Apis.Pubchem
        if cid is not None:
            # let it raise a CompoundNotFoundError
            inchikey = api.fetch_data(int(cid)).names_and_identifiers.inchikey
        if inchikey is None:
            return None
        try:
            data: Optional[PubchemData] = api.fetch_data(inchikey)
        except CompoundNotFoundError:
            return None
        return None if data is None else data.struct_view

    def _get_chembl(self, inchikey: Optional[str], cid: Optional[str]) -> Optional[CompoundStruct]:
        """
        Fetches a compound from ChEMBL by ID if one is given, else by InChIKey.

        Arguments:
            inchikey: The InChIKey to search by; ignored if ``cid`` is given
            cid: The ChEMBL ID, if known

        Returns:
            The struct view of the ChEMBL compound, or ``None`` if there is nothing to search
            by or the InChIKey is not found

        Raises:
            CompoundNotFoundError: If ``cid`` is given and its lookup fails
        """
        util = ChemblUtils(Apis.Chembl)
        if cid is not None:
            # let it raise a CompoundNotFoundError
            return util.get_compound(cid).struct_view
        if inchikey is None:
            # nothing to search by (e.g. the row only had a PubChem ID); mirrors _get_pubchem
            return None
        try:
            return util.get_compound(inchikey).struct_view
        except CompoundNotFoundError:
            return None
229
230
231
# Names exported by ``from <this module> import *``; helpers such as ``look`` and the
# column-order constants are deliberately left out of this list.
__all__ = ["CompoundIdFiller", "IdMatchFrame"]
232