mandos.entries.filler.CompoundIdFiller.fill() - Code Metrics - Inspection of "feat: fill missing compound info" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( 82dd22...9813db )

by Douglas

created 2021-08-03 04:49 UTC

mandos.entries.filler.CompoundIdFiller.fill() B

↳ Parent: mandos.entries.filler

Complexity

Conditions

Size

Total Lines	32
Code Lines	29

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	8
eloc	29
nop	2
dl	0
loc	32
rs	7.3173
c	0
b	0
f	0

from __future__ import annotations

import enum
from typing import Set, Mapping, Optional, Union, List, Dict


from loguru import logger

from mandos.model import CleverEnum

from mandos.entries.api_singletons import Apis
from mandos.entries.searcher import IdMatchFrame, ChemFinder
from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
from mandos.model.apis.pubchem_api import PubchemCompoundLookupError
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData


# Column names that are moved to the front of the output frame, in this order
# (only those actually present in the frame are used).
PUT_FIRST = [
    "compound_id",
    "library",
    "inchikey",
    "chembl_id",
    "pubchem_id",
    "g2p_id",
    "common_name",
]
# Column names that are moved to the end of the output frame, in this order.
# NOTE: "smiles" and "iupac" must be separate items; without the comma Python
# silently concatenates adjacent string literals into the single "smilesiupac".
PUT_LAST = [
    "inchi",
    "smiles",
    "iupac",
    "origin_inchikey",
    "origin_inchi",
    "origin_smiles",
]



class IdType(CleverEnum):
    """
    The kinds of compound identifier that can be read from or written to a frame.

    Member names double as column names.
    """

    inchikey = enum.auto()
    chembl_id = enum.auto()
    pubchem_id = enum.auto()
    # g2p_id = enum.auto()
    common_name = enum.auto()
    iupac = enum.auto()
    inchi = enum.auto()
    smiles = enum.auto()

    @classmethod
    def parse(cls, fill: str) -> Set[IdType]:
        """
        Converts a user-supplied string into a set of identifier types.

        Args:
            fill: Either ``"@all"`` (every member), ``"@primary"`` (see ``primary``),
                  or a comma-separated list of member names; each name is stripped
                  and lower-cased before being passed to ``IdType.of``.

        Returns:
            The matching set of members.
        """
        if fill == "@all":
            return set(IdType)
        if fill == "@primary":
            return IdType.primary()
        cleaned = (piece.strip().lower() for piece in fill.split(","))
        return {IdType.of(name) for name in cleaned}

    @property
    def is_primary(self) -> bool:
        """
        Whether this member is one of the identifiers returned by ``primary``.
        """
        primaries = type(self).primary()
        return self in primaries

    @classmethod
    def primary(cls) -> Set[IdType]:
        """
        Returns the identifier types that can serve as a lookup key.

        The members are written from best to worst, but the result is a set,
        so callers must not rely on that order when iterating.
        """
        primaries: Set[IdType] = {IdType.inchikey, IdType.chembl_id, IdType.pubchem_id}
        return primaries


class CompoundIdFiller:
    """
    Fills in missing compound-identifier columns of an ``IdMatchFrame``.

    One primary identifier column already present in the frame (InChIKey,
    ChEMBL ID, or PubChem CID) is used as the lookup key. The wanted
    identifiers are then looked up via PubChem and ChEMBL, row by row.
    """

    def __init__(self, wanted: Set[Union[str, IdType]] = None, replace: bool = False):
        """
        Args:
            wanted: Identifier types to fill, given as ``IdType`` members or as
                    names accepted by ``IdType.of``.
                    ``None`` (the default) selects every ``IdType``; previously
                    the default raised ``TypeError`` because ``None`` was iterated.
            replace: If true, wanted columns that already exist are overwritten;
                     otherwise only wanted columns missing from the frame are filled.
        """
        if wanted is None:
            wanted = list(IdType)
        self.wanted = [IdType.of(s) for s in wanted]
        self.replace = replace

    def fill(
        self,
        df: IdMatchFrame,
    ) -> IdMatchFrame:
        """
        Returns a copy of ``df`` with the wanted identifier columns filled in.

        Columns that are entirely empty are dropped first, so they count as
        missing. After filling, columns are reordered: ``PUT_FIRST`` columns,
        then everything else, then ``PUT_LAST`` columns.

        Args:
            df: The frame to fill; it is not modified.

        Returns:
            The filled and reordered copy.

        Raises:
            ValueError: If the frame has no InChIKey, ChEMBL ID, or PubChem ID column.
            AssertionError: If a value in the chosen source column is ``None``
                            (only a literal ``None`` is detected here).
        """
        df = df.copy()
        df = df.dropna(how="all", axis=1)
        targets: Set[IdType] = {s for s in self.wanted if s.name not in df.columns or self.replace}
        # Explicit best-to-worst order: iterating the set from IdType.primary()
        # would pick an arbitrary source because sets are unordered.
        preference = (IdType.inchikey, IdType.chembl_id, IdType.pubchem_id)
        source: Optional[IdType] = next((s for s in preference if s.name in df.columns), None)
        if source is None:
            raise ValueError(f"No valid sources in list {df.columns.values}")
        # "notice" is not a stock loguru level; presumably it is registered
        # elsewhere in the project -- hence the inspection suppression.
        # noinspection PyUnresolvedReferences
        logger.notice(f"Getting {', '.join([s.name for s in targets])} from {source.name}")
        # watch out! these are simply in row order, nothing more
        remapped: Dict[IdType, List[Optional[str]]] = {t: [] for t in targets}
        n_rows = len(df)
        for i, source_val in enumerate(df[source.name].values):
            if source_val is None:
                raise AssertionError(f"Missing {source.name} value in row {i}")
            matches = self._matches(source, source_val, targets)
            # exactly one value per target per row keeps every list as long as the frame
            for target in targets:
                remapped[target].append(matches.get(target))
            logger.info(f"Processed {source_val} ({i} of {n_rows})")
            if i % 20 == 0 and i > 0:
                # noinspection PyUnresolvedReferences
                logger.notice(f"Processed {i} / {n_rows}")
        for target, values in remapped.items():
            df[target.name] = values
        order = [o for o in PUT_FIRST if o in df.columns]
        order += [c for c in df.columns if c not in PUT_FIRST and c not in PUT_LAST]
        order += [o for o in PUT_LAST if o in df.columns]
        df = df.cfirst(order)
        return df

    def _matches(
        self, source: IdType, source_val: str, targets: Set[IdType]
    ) -> Dict[IdType, Optional[str]]:
        """
        Looks up the target identifiers for a single compound.

        Args:
            source: Which kind of identifier ``source_val`` is; must be
                    ``pubchem_id``, ``chembl_id``, or ``inchikey``.
            source_val: The identifier value for this row.
            targets: The identifier types that should be looked up.

        Returns:
            A dict with a key for every wanted type plus ``inchikey``;
            values that could not be found are ``None``.

        Raises:
            AssertionError: If ``source`` is not one of the three supported types.
        """
        # Step 1: resolve whatever we were given to an InChIKey.
        if source is IdType.pubchem_id:
            inchikey = Apis.Pubchem.find_inchikey(int(source_val))
        elif source is IdType.chembl_id:
            # TODO
            # get_compound wants an inchikey,
            # but we're secretly passing a CHEMBLxxxx ID instead
            # we just know that that works
            inchikey = ChemblUtils(Apis.Chembl).get_compound(source_val).inchikey
        elif source is IdType.inchikey:
            # the value itself is the key (not the enum member ``source``)
            inchikey = source_val
        else:
            raise AssertionError(source.name)
        matched: Dict[IdType, Optional[str]] = {k: None for k in self.wanted}
        matched[IdType.inchikey] = inchikey
        # Step 2: PubChem supplies the CID and all of the name/structure fields,
        # so query it when ANY of them is requested, not only when the CID is.
        from_pubchem = {
            IdType.pubchem_id,
            IdType.common_name,
            IdType.iupac,
            IdType.smiles,
            IdType.inchi,
        }
        if from_pubchem.intersection(targets):
            try:
                pubchem_data: Optional[PubchemData] = Apis.Pubchem.fetch_data(inchikey)
            except PubchemCompoundLookupError:
                # best effort: leave the PubChem-derived fields as None
                pubchem_data = None
            if pubchem_data is not None:
                if IdType.pubchem_id in targets:
                    matched[IdType.pubchem_id] = str(pubchem_data.cid)
                if IdType.common_name in targets:
                    matched[IdType.common_name] = pubchem_data.name
                if IdType.iupac in targets:
                    matched[IdType.iupac] = pubchem_data.names_and_identifiers.iupac
                if IdType.smiles in targets:
                    matched[IdType.smiles] = pubchem_data.names_and_identifiers.isomeric_smiles
                if IdType.inchi in targets:
                    matched[IdType.inchi] = pubchem_data.names_and_identifiers.inchi
        # Step 3: ChEMBL ID, if requested.
        if IdType.chembl_id in targets:
            chembl_id = ChemFinder.chembl().find(inchikey)
            if chembl_id is not None:
                matched[IdType.chembl_id] = chembl_id
        return matched


# Public API of this module: the names exported by a star-import.
__all__ = ["IdType", "CompoundIdFiller"]


1			from __future__ import annotations
			0 ignored issues – show introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Missing module docstring Loading history...
2			import enum
3			from typing import Set, Mapping, Optional, Union, List, Dict
			0 ignored issues – show Unused Code introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Unused Mapping imported from typing Loading history...
4
5			from loguru import logger
			0 ignored issues – show introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Unable to import 'loguru' Loading history...
6			from mandos.model import CleverEnum
7
8			from mandos.entries.api_singletons import Apis
9			from mandos.entries.searcher import IdMatchFrame, ChemFinder
10			from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
11			from mandos.model.apis.pubchem_api import PubchemCompoundLookupError
12			from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
13
14
15			PUT_FIRST = [
16			"compound_id",
17			"library",
18			"inchikey",
19			"chembl_id",
20			"pubchem_id",
21			"g2p_id",
22			"common_name",
23			]
24			PUT_LAST = ["inchi", "smiles" "iupac", "origin_inchikey", "origin_inchi", "origin_smiles"]
			0 ignored issues – show introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Implicit string concatenation found in list Loading history...
25
26
27			class IdType(CleverEnum):
			0 ignored issues – show introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
28			inchikey = enum.auto()
29			chembl_id = enum.auto()
30			pubchem_id = enum.auto()
31			# g2p_id = enum.auto()
32			common_name = enum.auto()
33			iupac = enum.auto()
34			inchi = enum.auto()
35			smiles = enum.auto()
36
37			@classmethod
38			def parse(cls, fill: str) -> Set[IdType]:
			0 ignored issues – show introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
39			if fill == "@all":
			0 ignored issues – show unused-code introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Unnecessary "elif" after "return" Loading history...
40			return set(IdType)
41			elif fill == "@primary":
42			return IdType.primary()
43			else:
44			return {IdType.of(s.strip().lower()) for s in fill.split(",")}
45
46			@property
47			def is_primary(self) -> bool:
			0 ignored issues – show introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
48			return self in self.__class__.primary()
49
50			@classmethod
51			def primary(cls) -> Set[IdType]:
			0 ignored issues – show introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
52			# in order from best to worst
53			return {IdType.inchikey, IdType.chembl_id, IdType.pubchem_id}
54
55
56			class CompoundIdFiller:
			0 ignored issues – show introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
57			def __init__(self, wanted: Set[Union[str, IdType]] = None, replace: bool = False):
58			self.wanted = [IdType.of(s) for s in wanted]
59			self.replace = replace
60
61			def fill(
			0 ignored issues – show introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history... Coding Style Naming introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
62			self,
			0 ignored issues – show Coding Style introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
63			df: IdMatchFrame,
			0 ignored issues – show Coding Style introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
64			) -> IdMatchFrame:
65			df = df.copy()
66			df = df.dropna(how="all", axis=1)
67			sources: Set[IdType] = {s for s in IdType.primary() if s.name in df.columns}
68			targets: Set[IdType] = {s for s in self.wanted if s.name not in df.columns or self.replace}
69			if len(sources) == 0:
70			raise ValueError(f"No valid sources in list {df.columns.values}")
71			source = next(iter(sources))
72			# noinspection PyUnresolvedReferences
73			logger.notice(f"Getting {', '.join([s.name for s in targets])} from {source.name}")
74			# watch out! these are simply in order, nothing more
75			remapped: Dict[IdType, List[str]] = {t: [] for t in IdType}
76			for i, source_val in enumerate(df[source.name].values):
77			if source_val is None:
78			raise AssertionError()
79			matches: Dict[IdType, str] = self._matches(source, source_val, targets)
80			for target, target_val in matches.items():
81			remapped[target].append(target_val)
82			logger.info(f"Processed {source_val} ({i} of {len(df)}")
83			if i % 20 == 0 and i > 0:
84			logger.notice(f"Processed {i} / {len(df)}")
85			for target in targets:
86			rx = remapped[target]
			0 ignored issues – show Coding Style Naming introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Variable name "rx" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
87			df[target.name] = rx
88			order = [o for o in PUT_FIRST if o in df.columns]
89			order += [c for c in df.columns if c not in PUT_FIRST and c not in PUT_LAST]
90			order += [o for o in PUT_LAST if o in df.columns]
91			df = df.cfirst(order)
92			return df
93
94			def _matches(self, source: IdType, source_val: str, targets: Set[IdType]) -> Dict[IdType, str]:
95			if source is IdType.pubchem_id:
96			inchikey = Apis.Pubchem.find_inchikey(int(source_val))
97			elif source is IdType.chembl_id:
98			# TODO
			0 ignored issues – show Coding Style introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
99			# get_compound wants an inchikey,
100			# but we're secretly passing a CHEMBLxxxx ID instead
101			# we just know that that works
102			inchikey = ChemblUtils(Apis.Chembl).get_compound(source_val).inchikey
103			elif source is IdType.inchikey:
104			inchikey = source
105			else:
106			raise AssertionError(source.name)
107			matched: Dict[IdType, str] = {k: None for k in self.wanted}
108			matched[IdType.inchikey] = inchikey
109			if IdType.pubchem_id in targets:
110			try:
111			pubchem_data: Optional[PubchemData] = Apis.Pubchem.fetch_data(inchikey)
112			except PubchemCompoundLookupError:
113			pubchem_data = None
114			if pubchem_data is not None:
115			matched[IdType.pubchem_id] = str(pubchem_data.cid)
116			if IdType.common_name in targets:
117			matched[IdType.common_name] = pubchem_data.name
118			if IdType.iupac in targets:
119			matched[IdType.iupac] = pubchem_data.names_and_identifiers.iupac
120			if IdType.smiles in targets:
121			matched[IdType.smiles] = pubchem_data.names_and_identifiers.isomeric_smiles
122			if IdType.inchi in targets:
123			matched[IdType.inchi] = pubchem_data.names_and_identifiers.inchi
124			if IdType.chembl_id in targets:
125			chembl_id = ChemFinder.chembl().find(inchikey)
126			if chembl_id is not None:
127			matched[IdType.chembl_id] = chembl_id
128			return matched
129
130
131			__all__ = ["IdType", "CompoundIdFiller"]
132

dmyersturnbull / mandos

Push — main ( 82dd22...9813db )

mandos.entries.filler.CompoundIdFiller.fill() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like