Passed
Push — main ( 82dd22...9813db )
by Douglas
01:58
created

mandos.entries.filler.IdType.primary()   A

Complexity

Conditions 1

Size

Total Lines 4
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nop 1
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
from __future__ import annotations
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
import enum
3
from typing import Set, Mapping, Optional, Union, List, Dict
0 ignored issues
show
Unused Code introduced by
Unused Mapping imported from typing
Loading history...
4
5
from loguru import logger
0 ignored issues
show
introduced by
Unable to import 'loguru'
Loading history...
6
from mandos.model import CleverEnum
7
8
from mandos.entries.api_singletons import Apis
9
from mandos.entries.searcher import IdMatchFrame, ChemFinder
10
from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
11
from mandos.model.apis.pubchem_api import PubchemCompoundLookupError
12
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
13
14
15
# Columns that are moved to the front of a filled table, in this order.
PUT_FIRST = [
    "compound_id",
    "library",
    "inchikey",
    "chembl_id",
    "pubchem_id",
    "g2p_id",
    "common_name",
]
# Columns that are moved to the end of a filled table, in this order.
# Each name must be its own element: adjacent string literals without a comma
# are silently concatenated by Python (e.g. "smiles" "iupac" == "smilesiupac").
PUT_LAST = [
    "inchi",
    "smiles",
    "iupac",
    "origin_inchikey",
    "origin_inchi",
    "origin_smiles",
]
0 ignored issues
show
introduced by
Implicit string concatenation found in list
Loading history...
25
26
27
class IdType(CleverEnum):
    """
    Enumeration of the compound ID columns handled by this module.

    Member names double as data frame column names.
    """

    inchikey = enum.auto()
    chembl_id = enum.auto()
    pubchem_id = enum.auto()
    # g2p_id = enum.auto()
    common_name = enum.auto()
    iupac = enum.auto()
    inchi = enum.auto()
    smiles = enum.auto()

    @classmethod
    def parse(cls, fill: str) -> Set[IdType]:
        """
        Converts a user-supplied specification into a set of ID types.

        Args:
            fill: Either ``"@all"`` (every member), ``"@primary"`` (see ``primary``),
                  or a comma-separated list of member names.
                  Names are stripped and lowercased before being passed to ``IdType.of``.

        Returns:
            The set of matching members
        """
        if fill == "@all":
            return set(IdType)
        if fill == "@primary":
            return IdType.primary()
        names = (piece.strip().lower() for piece in fill.split(","))
        return {IdType.of(name) for name in names}

    @property
    def is_primary(self) -> bool:
        """Whether this member is one of the members returned by ``primary``."""
        primaries = type(self).primary()
        return self in primaries

    @classmethod
    def primary(cls) -> Set[IdType]:
        """
        Returns the primary ID types: InChIKey, ChEMBL ID, and PubChem ID.

        Listed here from best to worst, but note that the returned set
        itself carries no ordering.
        """
        return {IdType.inchikey, IdType.chembl_id, IdType.pubchem_id}
54
55
56
class CompoundIdFiller:
    """
    Fills in compound ID columns of an ``IdMatchFrame`` from one existing ID column.

    One primary ID column (see ``IdType.primary``) that is present in the table
    is used as the source; every wanted ID type is then looked up per row.
    """

    def __init__(
        self, wanted: Optional[Set[Union[str, IdType]]] = None, replace: bool = False
    ):
        """
        Args:
            wanted: ID types (or their names) to fill in.
                    ``None`` means every member of ``IdType``.
            replace: If true, overwrite wanted columns that already exist;
                     otherwise only add columns that are missing.
        """
        if wanted is None:
            # The original default of None could not be iterated and raised TypeError
            wanted = set(IdType)
        self.wanted = [IdType.of(s) for s in wanted]
        self.replace = replace

    def fill(
        self,
        df: IdMatchFrame,
    ) -> IdMatchFrame:
        """
        Returns a copy of ``df`` with the wanted ID columns added (or replaced).

        All-null columns are dropped first. The result has the ``PUT_FIRST`` columns
        at the front and the ``PUT_LAST`` columns at the end.

        Args:
            df: Table containing at least one primary ID column

        Returns:
            A new, reordered table; ``df`` itself is not modified

        Raises:
            ValueError: If ``df`` has no primary ID column
            AssertionError: If a value in the source column is ``None``
        """
        df = df.copy()
        df = df.dropna(how="all", axis=1)
        source = self._pick_source(df)
        targets: Set[IdType] = {
            s for s in self.wanted if self.replace or s.name not in df.columns
        }
        # noinspection PyUnresolvedReferences
        logger.notice(f"Getting {', '.join([s.name for s in targets])} from {source.name}")
        # watch out! these are simply in row order, nothing more
        remapped: Dict[IdType, List[Optional[str]]] = {t: [] for t in IdType}
        n_rows = len(df)
        # count from 1 so that the progress messages report rows actually completed
        for i, source_val in enumerate(df[source.name].values, start=1):
            if source_val is None:
                raise AssertionError(f"Missing {source.name} value in row {i}")
            matches = self._matches(source, source_val, targets)
            for target, target_val in matches.items():
                remapped[target].append(target_val)
            logger.info(f"Processed {source_val} ({i} of {n_rows})")
            if i % 20 == 0:
                # noinspection PyUnresolvedReferences
                logger.notice(f"Processed {i} / {n_rows}")
        for target in targets:
            df[target.name] = remapped[target]
        return df.cfirst(self._column_order(df.columns))

    @staticmethod
    def _pick_source(df: IdMatchFrame) -> IdType:
        """Returns the best primary ID type that has a column in ``df``."""
        # Iterate the enum (declaration order: inchikey, chembl_id, pubchem_id, ...)
        # rather than the set from IdType.primary(), whose iteration order is arbitrary.
        primaries = IdType.primary()
        for candidate in IdType:
            if candidate in primaries and candidate.name in df.columns:
                return candidate
        raise ValueError(f"No valid sources in list {df.columns.values}")

    @staticmethod
    def _column_order(columns) -> List[str]:
        """Orders columns as: PUT_FIRST entries, everything else, PUT_LAST entries."""
        order = [o for o in PUT_FIRST if o in columns]
        order += [c for c in columns if c not in PUT_FIRST and c not in PUT_LAST]
        order += [o for o in PUT_LAST if o in columns]
        return order

    def _matches(
        self, source: IdType, source_val: str, targets: Set[IdType]
    ) -> Dict[IdType, Optional[str]]:
        """
        Looks up the IDs for a single compound.

        Args:
            source: The ID type of ``source_val``; must be a primary ID type
            source_val: The known ID of the compound
            targets: The ID types that will actually be written to the table

        Returns:
            A dict with a key for every wanted ID type plus ``IdType.inchikey``;
            values that could not be found are ``None``

        Raises:
            AssertionError: If ``source`` is not a primary ID type
        """
        if source is IdType.pubchem_id:
            inchikey = Apis.Pubchem.find_inchikey(int(source_val))
        elif source is IdType.chembl_id:
            # TODO
            # get_compound wants an inchikey,
            # but we're secretly passing a CHEMBLxxxx ID instead
            # we just know that that works
            inchikey = ChemblUtils(Apis.Chembl).get_compound(source_val).inchikey
        elif source is IdType.inchikey:
            # the value itself is the inchikey (not the enum member)
            inchikey = source_val
        else:
            raise AssertionError(source.name)
        matched: Dict[IdType, Optional[str]] = {k: None for k in self.wanted}
        matched[IdType.inchikey] = inchikey
        # Every one of these comes from the same PubChem record, so fetch it
        # whenever any of them is needed, not only when pubchem_id is a target.
        from_pubchem = {
            IdType.pubchem_id,
            IdType.common_name,
            IdType.iupac,
            IdType.smiles,
            IdType.inchi,
        }
        if not from_pubchem.isdisjoint(targets):
            try:
                pubchem_data: Optional[PubchemData] = Apis.Pubchem.fetch_data(inchikey)
            except PubchemCompoundLookupError:
                pubchem_data = None
            if pubchem_data is not None:
                if IdType.pubchem_id in targets:
                    matched[IdType.pubchem_id] = str(pubchem_data.cid)
                if IdType.common_name in targets:
                    matched[IdType.common_name] = pubchem_data.name
                if IdType.iupac in targets:
                    matched[IdType.iupac] = pubchem_data.names_and_identifiers.iupac
                if IdType.smiles in targets:
                    matched[IdType.smiles] = pubchem_data.names_and_identifiers.isomeric_smiles
                if IdType.inchi in targets:
                    matched[IdType.inchi] = pubchem_data.names_and_identifiers.inchi
        if IdType.chembl_id in targets:
            chembl_id = ChemFinder.chembl().find(inchikey)
            if chembl_id is not None:
                matched[IdType.chembl_id] = chembl_id
        return matched
129
130
131
# Names exported by a star-import of this module.
__all__ = ["IdType", "CompoundIdFiller"]
132