Passed
Push — main ( d08a4e...a07aa0 )
by Douglas
01:59
created

mandos.entries.searcher.SearcherUtils._get_sep()   B

Complexity

Conditions 6

Size

Total Lines 14
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 13
nop 2
dl 0
loc 14
rs 8.6666
c 0
b 0
f 0
1
"""
2
Run searches and write files.
3
"""
4
5
from __future__ import annotations
6
7
import gzip
8
import logging
9
from pathlib import Path
10
from typing import Sequence, Optional, Dict
11
12
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
13
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
14
from pocketutils.tools.path_tools import PathTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.path_tools'
Loading history...
15
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
16
17
from mandos.model import CompoundNotFoundError
18
from mandos.model.chembl_support.chembl_utils import ChemblUtils
19
from mandos.model.searches import Search
20
from mandos.model.settings import MANDOS_SETTINGS
21
from mandos.search.chembl import ChemblSearch
22
from mandos.search.pubchem import PubchemSearch
23
from mandos.entries.api_singletons import Apis
24
25
# Module-level shorthands for the API handles exposed by ``Apis``.
Chembl = Apis.Chembl
Pubchem = Apis.Pubchem

logger = logging.getLogger(__package__)

# Typed data frame of ID matches; built to require the columns
# ``inchikey``, ``chembl_id``, and ``pubchem_id``, with ``strict()`` applied.
IdMatchFrame = (
    TypedDfs.typed("IdMatchFrame")
    .require("inchikey")
    .require("chembl_id")
    .require("pubchem_id")
    .strict()
).build()
35
36
37
class SearcherUtils:
    """
    Helpers for reading InChI Key input files and pre-fetching compound IDs.
    """

    @classmethod
    def dl(
        cls, inchikeys: Sequence[str], pubchem: bool = True, chembl: bool = True
    ) -> IdMatchFrame:
        """
        Looks up the PubChem and ChEMBL IDs of compounds, caching the resulting table.

        Args:
            inchikeys: The InChI Keys to look up
            pubchem: Whether to query PubChem
            chembl: Whether to query ChEMBL

        Returns:
            An ``IdMatchFrame`` with one row per InChI Key, in input order.
            An ID is missing where the compound was not found or the lookup was disabled.
        """
        # We cache the results here even though the underlying APIs also cache.
        # When running a Searcher, we want to fetch before the FIRST search so that the typer
        # commands replicate the ``Entry.run`` methods; with multiple searches
        # (as in ``mandos search --config``), we only want that at the beginning.
        # The alternative was having ``mandos search`` dynamically subclass each ``Entry``.
        # TODO: If the cached results under /pubchem and /chembl are deleted, a cache hit here
        # TODO: skips re-fetching them; the cached /match dataframes must be deleted too.
        import hashlib

        # The builtin hash() of a str is salted per process, so it cannot name a file that
        # must be found again in a later run; use a stable digest instead.
        # The flags are part of the key so a frame built with one lookup disabled
        # is not returned for a call that enables it.
        cache_id = f"{int(pubchem)}{int(chembl)}:" + ",".join(inchikeys)
        key = hashlib.sha256(cache_id.encode("utf8")).hexdigest()
        cached_path = (MANDOS_SETTINGS.match_cache_path / key).with_suffix(".feather")
        if cached_path.exists():
            logger.info("Found ID matching results at %s", cached_path)
            return IdMatchFrame.read_feather(cached_path)
        found_chembl: Dict[str, str] = {}
        found_pubchem: Dict[str, str] = {}
        if pubchem:
            found_pubchem = cls._fetch_ids(
                inchikeys, lambda inchikey: str(Pubchem.fetch_data(inchikey).cid)
            )
        if chembl:
            chembl_utils = ChemblUtils(Chembl)
            found_chembl = cls._fetch_ids(
                inchikeys, lambda inchikey: chembl_utils.get_compound(inchikey).chid
            )
        # building from a dict keeps the 'inchikey' column even when there are no rows
        df = pd.DataFrame({"inchikey": list(inchikeys)})
        df["chembl_id"] = df["inchikey"].map(found_chembl.get)
        df["pubchem_id"] = df["inchikey"].map(found_pubchem.get)
        df = IdMatchFrame(df)
        cached_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_feather(cached_path)
        return df

    @classmethod
    def _fetch_ids(cls, inchikeys: Sequence[str], fetch) -> Dict[str, str]:
        """
        Maps each InChI Key to the ID returned by ``fetch``.

        Args:
            inchikeys: The InChI Keys to look up
            fetch: A function mapping an InChI Key to an ID;
                   it may raise ``CompoundNotFoundError``

        Returns:
            A dict from InChI Key to ID, omitting compounds that were not found
        """
        found: Dict[str, str] = {}
        for inchikey in inchikeys:
            try:
                found[inchikey] = fetch(inchikey)
            except CompoundNotFoundError:
                # short message at ERROR; the traceback only at DEBUG
                logger.error("Did not find compound %s", inchikey)
                logger.debug("Did not find compound %s", inchikey, exc_info=True)
        return found

    @classmethod
    def read(cls, input_path: Path) -> Sequence[str]:
        """
        Reads InChI Keys from a file, choosing the parser from the filename suffix.

        Args:
            input_path: A delimited or feather file with a column called inchikey,
                        or a (possibly gzipped) text file with one InChI Key per line

        Returns:
            The InChI Keys, in file order

        Raises:
            ValueError: If the filename suffix is not recognized
            KeyError: If a tabular file has no inchikey column
        """
        sep = cls._get_sep(input_path)
        if sep in {"\t", ","}:
            return cls._from_df(pd.read_csv(input_path, sep=sep))
        if sep == "feather":
            return cls._from_df(pd.read_feather(input_path))
        if sep == "gz":
            # same encoding as the uncompressed branch below
            with gzip.open(input_path, "rt", encoding="utf8") as f:
                return cls._from_txt(f.read())
        if sep == "txt":
            return cls._from_txt(input_path.read_text(encoding="utf8"))
        # _get_sep only returns the values handled above
        raise AssertionError(sep)

    @classmethod
    def _from_df(cls, df: pd.DataFrame) -> Sequence[str]:
        """
        Extracts the inchikey column, matching the column name case-insensitively.

        The passed data frame is not modified.

        Raises:
            KeyError: If there is no column called inchikey
        """
        lowered = [c.lower() if isinstance(c, str) else c for c in df.columns]
        if "inchikey" not in lowered:
            raise KeyError("For a CSV or TSV file, include a column called 'inchikey'")
        return df.iloc[:, lowered.index("inchikey")].tolist()

    @classmethod
    def _from_txt(cls, text: str) -> Sequence[str]:
        """
        Splits text into lines stripped of whitespace, dropping blank lines.
        """
        stripped = (line.strip() for line in text.splitlines())
        return [line for line in stripped if line]

    @classmethod
    def _get_sep(cls, input_path: Path) -> str:
        """
        Determines how to parse a file from its filename suffix.

        Returns:
            A tab or comma for delimited files (gzipped or not), or one of
            "feather", "gz" (gzipped lines), or "txt" (plain lines)

        Raises:
            ValueError: If the suffix is not recognized
        """
        name = str(input_path)
        kinds = (
            ((".tab", ".tsv", ".tab.gz", ".tsv.gz"), "\t"),
            ((".csv", ".csv.gz"), ","),
            ((".feather",), "feather"),
            ((".txt.gz", ".lines.gz"), "gz"),
            ((".txt", ".lines"), "txt"),
        )
        for suffixes, kind in kinds:
            if name.endswith(suffixes):
                return kind
        raise ValueError(
            f"{input_path} should end in .tab, .tsv, .csv, .txt, .lines, .feather, or .gz"
        )
123
124
125
class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(self, searches: Sequence[Search], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run, in order
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey
        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # None means "not run yet"; ``search`` uses this to refuse a second run
        self.inchikeys: Optional[Sequence[str]] = None

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        For each search, writes the results to ``output_path_of(search)``
        and the search metadata to the same path with a .json suffix.

        Returns:
            This instance

        Raises:
            ValueError: If a search was already run on this instance
        """
        if self.inchikeys is not None:
            raise ValueError("Already ran a search")
        self.inchikeys = SearcherUtils.read(self.input_path)
        has_pubchem = any(isinstance(what, PubchemSearch) for what in self.what)
        has_chembl = any(isinstance(what, ChemblSearch) for what in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(self.inchikeys, pubchem=has_pubchem, chembl=has_chembl)
        for what in self.what:
            output_path = self.output_path_of(what)
            df = what.find_to_df(self.inchikeys)
            df.to_csv(output_path)
            metadata = NestedDotDict(
                dict(key=what.key, search=what.search_class, params=what.get_params())
            )
            metadata.write_json(output_path.with_suffix(".json"))
        return self

    def paths(self) -> Sequence[Path]:
        """
        Returns the output path of each search, in the order of the searches.
        """
        return [self.output_path_of(what) for what in self.what]

    def output_path_of(self, what: Search) -> Path:
        """
        Returns the path that the results of ``what`` are written to.

        The file is placed next to the input file and named from the input file's stem
        followed by the search key and ".tab", after sanitizing the filename.
        """
        parent = self.input_path.parent
        child = self.input_path.stem + what.key + ".tab"
        node = PathTools.sanitize_path_node(child)
        if (parent / node).resolve() != (parent / child).resolve():
            logger.debug("Path %s sanitized to %s", child, node)
        return parent / node
176
177
178
# The public names of this module, as exported by a star-import.
__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"]
179