Passed
Push — dependabot/pip/pyarrow-4.0.1 ( ca09ce...b2836e )
by
unknown
02:18 queued 20s
created

mandos.entries.searcher.SearcherUtils.dl()   A

Complexity

Conditions 3

Size

Total Lines 15
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 14
nop 6
dl 0
loc 15
rs 9.7
c 0
b 0
f 0
1
"""
2
Run searches and write files.
3
"""
4
5
from __future__ import annotations
6
7
from dataclasses import dataclass
8
from pathlib import Path
9
from typing import Callable, Optional, Sequence
10
11
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
12
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
13
from pocketutils.tools.common_tools import CommonTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.common_tools'
Loading history...
14
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
15
16
from mandos import logger
17
from mandos.entries.api_singletons import Apis
18
from mandos.entries.paths import EntryPaths
19
from mandos.model import CompoundNotFoundError
20
from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
21
from mandos.model.searches import Search
22
from mandos.search.chembl import ChemblSearch
23
from mandos.search.pubchem import PubchemSearch
24
25
# Input table of compounds; the only column it requires is "inchikey".
InputFrame = TypedDfs.typed("InputFrame").require("inchikey").build()


# Table pairing each InChI Key with the IDs found for it in each database.
# "inchikey" is required; the three ID columns are reserved (optional) names.
# NOTE(review): strict() presumably rejects columns other than those declared
# here -- confirm against the typeddfs documentation.
IdMatchFrame = (
    TypedDfs.typed("IdMatchFrame")
    .require("inchikey", dtype=str)
    .reserve("chembl_id", "pubchem_id", "hmdb_id", dtype=str)
    .strict()
    .build()
)
33
34
35
@dataclass(frozen=True, repr=True)
class ChemFinder:
    """
    Resolves an InChI Key to a database-specific compound ID.

    Attributes:
        what: Name of the database; only used to label log messages
        how: Function that maps an InChI Key to an ID and signals a miss
             by raising ``CompoundNotFoundError``
        complain: If True, a miss is also logged at INFO level
                  (it is always logged at DEBUG level)
    """

    what: str
    how: Callable[[str], str]
    complain: bool = False

    @classmethod
    def chembl(cls, complain: bool = False) -> ChemFinder:
        """
        Creates a finder that looks compounds up in ChEMBL.

        Args:
            complain: Passed through to the ``complain`` attribute

        Returns:
            A finder labeled "ChEMBL"
        """

        def how(inchikey: str) -> str:
            return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid

        # Use ``cls`` (not a hard-coded class name) so subclasses get their own type.
        return cls("ChEMBL", how, complain=complain)

    @classmethod
    def pubchem(cls, complain: bool = False) -> ChemFinder:
        """
        Creates a finder labeled "PubChem".

        Args:
            complain: Passed through to the ``complain`` attribute

        Returns:
            A finder labeled "PubChem"
        """

        # NOTE(review): this lookup is identical to the one in ``chembl()``:
        # it queries the ChEMBL API and returns a ChEMBL ID even though the
        # finder is labeled "PubChem". It looks like a copy-paste error; the
        # PubChem lookup call is not visible from here, so the behavior is
        # left unchanged -- confirm and replace with the PubChem API.
        def how(inchikey: str) -> str:
            return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid

        return cls("PubChem", how, complain=complain)

    def find(self, inchikey: str) -> Optional[str]:
        """
        Looks up one compound, treating "not found" as a normal outcome.

        Args:
            inchikey: The InChI Key to look up

        Returns:
            The ID returned by ``how``, or None if ``how`` raised
            ``CompoundNotFoundError``; any other exception propagates
        """
        try:
            return self.how(inchikey)
        except CompoundNotFoundError:
            if self.complain:
                logger.info(f"NOT FOUND: {self.what.rjust(8)}  ] {inchikey}")
            # Logged inside the handler so that exc_info can pick up the active exception.
            logger.debug(f"Did not find {self.what} {inchikey}", exc_info=True)
        return None
63
64
65
class SearcherUtils:
    """
    Helpers for matching compounds to database IDs and for reading input files.
    """

    @classmethod
    def dl(
        cls,
        inchikeys: Sequence[str],
        pubchem: bool = True,
        chembl: bool = True,
        hmdb: bool = True,
        complain: bool = False,
    ) -> IdMatchFrame:
        """
        Looks up each compound in the requested databases.

        Args:
            inchikeys: The InChI Keys to look up
            pubchem: If True, fill a "pubchem_id" column
            chembl: If True, fill a "chembl_id" column
            hmdb: Currently ignored; no HMDB lookup is performed
            complain: If True, compounds that are not found are logged at INFO level

        Returns:
            A frame with an "inchikey" column plus one ID column per requested
            database; an ID is None where the compound was not found
        """
        # Build from a column dict rather than a list of per-row Series:
        # with empty input a list of rows produces a frame with no columns,
        # and the ``df["inchikey"]`` lookups below would raise KeyError.
        df = IdMatchFrame({"inchikey": list(inchikeys)})
        if chembl:
            df["chembl_id"] = df["inchikey"].map(ChemFinder.chembl(complain=complain).find)
        if pubchem:
            df["pubchem_id"] = df["inchikey"].map(ChemFinder.pubchem(complain=complain).find)
        return df

    @classmethod
    def read(cls, input_path: Path) -> InputFrame:
        """
        Reads the input compounds and logs how many were read.

        Args:
            input_path: Path of the file, passed to ``InputFrame.read_file``

        Returns:
            The frame read from ``input_path``
        """
        df = InputFrame.read_file(input_path)
        logger.info(f"Read {len(df)} input compounds")
        return df
87
88
89
class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run
            to: Output paths, paired with ``searches`` by ``CommonTools.zip_list``
                (presumably this requires equal lengths -- confirm in pocketutils);
                each pair is resolved through ``EntryPaths.output_path_of``
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey
        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # Stays None until search() runs; search() uses it as the "already ran" marker.
        self.input_df: Optional[InputFrame] = None
        self.output_paths = {
            what.key: EntryPaths.output_path_of(what, input_path, path)
            for what, path in CommonTools.zip_list(searches, to)
        }

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        For every search, the results are written as CSV to that search's
        output path, and its key, search class, and parameters are written
        as JSON to the same path with the suffix replaced by ".metadata.json".

        Returns:
            This instance

        Raises:
            ValueError: If a search was already run on this instance
        """
        if self.input_df is not None:
            raise ValueError("Already ran a search")
        self.input_df = SearcherUtils.read(self.input_path)
        inchikeys = self.input_df["inchikey"].unique()
        has_pubchem = any(isinstance(query, PubchemSearch) for query in self.what)
        has_chembl = any(isinstance(query, ChemblSearch) for query in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
        for query in self.what:
            output_path = self.output_paths[query.key]
            metadata_path = output_path.with_suffix(".metadata.json")
            df = query.find_to_df(inchikeys)
            # TODO keep any other columns in input_df
            df.to_csv(output_path)
            # "key" and "api" are left out of the parameters; the key is stored separately below.
            params = {
                k: str(v) for k, v in query.get_params().items() if k not in {"key", "api"}
            }
            metadata = NestedDotDict(dict(key=query.key, search=query.search_class, params=params))
            metadata.write_json(metadata_path)
            logger.info(f"Wrote {query.key} to {output_path}")
        return self
136
137
138
# Names exported by ``from <this module> import *``.
__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"]
139