TaxonomyFactories.from_fixed_file()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nop 2
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
"""
2
Caching.
3
"""
4
5
from __future__ import annotations
6
7
import abc
8
import logging
9
import shutil
10
from pathlib import Path
11
from typing import Union
12
13
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
14
import requests
0 ignored issues
show
introduced by
Unable to import 'requests'
Loading history...
15
from pocketutils.core.hashers import Hasher
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.hashers'
Loading history...
16
17
from mandos import MandosResources
18
from mandos.model.taxonomy import Taxonomy
19
20
# Logger is keyed on the package name (``__package__``) rather than this module's own name.
logger = logging.getLogger(__package__)
21
# Module-level hasher constructed with the "sha1" algorithm name.
# NOTE(review): presumably records a SHA-1 checksum for downloaded files -- confirm against
# the pocketutils ``Hasher`` API.
hasher = Hasher("sha1")
22
23
24
class TaxonomyFactory(metaclass=abc.ABCMeta):
    """
    Base type for objects that can produce a :class:`Taxonomy` for a given taxon.

    Subclasses are expected to override :meth:`load`; the implementation here only raises.
    """

    def load(self, taxon: int) -> Taxonomy:
        """
        Returns the taxonomy for ``taxon``.

        Args:
            taxon: The integer ID of the taxon to load

        Raises:
            NotImplementedError: Always, unless overridden by a subclass
        """
        raise NotImplementedError()
27
28
29
class UniprotTaxonomyCache(TaxonomyFactory, metaclass=abc.ABCMeta):
    """
    Preps a new taxonomy file for use in mandos.
    Just returns if a corresponding file already exists in the resources dir
    or mandos cache (``~/.mandos``).
    Otherwise, downloads a tab-separated file from UniProt.
    (To find manually, follow the ``All lower taxonomy nodes`` link and click ``Download``.)
    Then applies fixes and reduces the file size, creating a new file alongside.
    Puts both the raw data and fixed data in the cache under ``~/.mandos/taxonomy/``.

    Subclasses decide where the raw and fixed files live by implementing
    :meth:`_resolve_non_vertebrate_final` and :meth:`_resolve_non_vertebrate_raw`.
    """

    def load(self, taxon: int) -> Taxonomy:
        """
        Returns the taxonomy rooted at ``taxon``, downloading and fixing it if needed.

        Lookup order: an existing fixed file, then a subtree of the bundled ``7742.tab.gz``
        taxonomy, then a raw UniProt file (downloaded when missing) that is converted
        into the fixed file.

        Args:
            taxon: The integer ID of the root taxon

        Returns:
            The taxonomy read from the fixed file, or a subtree of the bundled taxonomy
        """
        path = self._resolve_non_vertebrate_final(taxon)
        if path.exists():
            return Taxonomy.from_path(path)
        vertebrata = Taxonomy.from_path(MandosResources.path("7742.tab.gz"))
        if taxon in vertebrata:
            return vertebrata.subtree(taxon)
        raw_path = self._resolve_non_vertebrate_raw(taxon)
        if not raw_path.exists():
            self._download(raw_path, taxon)
        # The fixed file is known to be missing at this point, so it must be built even when
        # the raw file was already cached (e.g. a previous run downloaded it but never
        # finished converting it). Otherwise the read below fails on a nonexistent file.
        self._fix(raw_path, taxon, path)
        return Taxonomy.from_path(path)

    def _resolve_non_vertebrate_final(self, taxon: int) -> Path:
        """
        Returns the path where the fixed taxonomy file for ``taxon`` is (or will be) stored.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass
        """
        raise NotImplementedError()

    def _resolve_non_vertebrate_raw(self, taxon: int) -> Path:
        """
        Returns the path where the raw UniProt download for ``taxon`` is (or will be) stored.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass
        """
        raise NotImplementedError()

    def _download(self, raw_path: Path, taxon: int) -> None:
        """
        Streams the UniProt taxonomy listing for descendants of ``taxon`` into ``raw_path``.

        Args:
            raw_path: Destination file; it is overwritten
            taxon: The integer ID of the ancestor taxon to query for

        Raises:
            requests.HTTPError: If UniProt responds with an error status
        """
        # this is faster and safer than using pd.read_csv(url)
        # https://uniprot.org/taxonomy/?query=ancestor:7742&format=tab&force=true&columns=id&compress=yes
        url = (
            f"https://uniprot.org/taxonomy/?query=ancestor:{taxon}"
            "&format=tab&force=true&columns=id&compress=yes"
        )
        with requests.get(url, stream=True) as response:
            # Check the status before touching the destination so that an error page
            # is never stored (and later reused) as if it were taxonomy data.
            response.raise_for_status()
            with raw_path.open("wb") as out:
                shutil.copyfileobj(response.raw, out)
        # NOTE(review): presumably records a checksum for the downloaded file -- confirm
        # against the pocketutils ``Hasher`` API.
        hasher.to_write(raw_path).write()

    def _fix(self, raw_path: Path, taxon: int, final_path: Path) -> None:
        """
        Converts a raw UniProt download into the reduced, fixed file.

        Keeps only the ``Taxon``, ``Scientific name``, and ``Parent`` columns
        (renamed to ``taxon``, ``scientific_name``, and ``parent``) and adds a row
        for the root ``taxon`` itself with ``parent=0``.

        Args:
            raw_path: The tab-separated file downloaded from UniProt
            taxon: The integer ID of the root taxon
            final_path: Where to write the tab-separated result

        Raises:
            ValueError: If the root's scientific name cannot be inferred
        """
        # unfortunately the download won't include an entry for the root ancestor (`taxon`)
        # so, we'll add it in
        raw = pd.read_csv(raw_path, sep="\t")
        # find the scientific name of the root before the Lineage column is dropped
        scientific_name = self._determine_name(raw, taxon)
        # now fix the columns
        fixed = raw[["Taxon", "Scientific name", "Parent"]].rename(
            columns={"Taxon": "taxon", "Scientific name": "scientific_name", "Parent": "parent"}
        )
        # now add the ancestor back in
        # (DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions)
        root = pd.DataFrame([dict(taxon=taxon, scientific_name=scientific_name, parent=0)])
        fixed = pd.concat([fixed, root], ignore_index=True)
        # write it to a csv.gz
        fixed["parent"] = fixed["parent"].astype(int)
        fixed.to_csv(final_path, index=False, sep="\t")

    def _determine_name(self, df: pd.DataFrame, taxon: int) -> str:
        """
        Infers the scientific name of ``taxon`` from the lineage of one of its direct children.

        Args:
            df: The raw table; must have ``Parent`` and ``Lineage`` columns
            taxon: The integer ID of the taxon whose name is wanted

        Returns:
            The last ``"; "``-separated element of the first child's ``Lineage``, stripped

        Raises:
            ValueError: If no row has ``taxon`` as its ``Parent``
        """
        children = df[df["Parent"] == taxon]
        if len(children) == 0:
            raise ValueError(f"Could not infer scientific name for {taxon}")
        # assumes the child's lineage ends with its immediate parent -- TODO confirm
        lineage = str(children["Lineage"].iloc[0])
        return lineage.split("; ")[-1].strip()
92
93
94
class FixedTaxonomyFactory(TaxonomyFactory):
    """
    Factory backed by a single, already-loaded :class:`Taxonomy`.
    """

    def __init__(self, tax: Taxonomy):
        """
        Args:
            tax: The taxonomy from which subtrees will be taken
        """
        self._tax = tax

    def load(self, taxon: int) -> Taxonomy:
        """
        Returns the subtree of the wrapped taxonomy rooted at ``taxon``.

        Args:
            taxon: The integer ID of the subtree's root
        """
        return self._tax.subtree(taxon)
100
101
102
class FixedFileTaxonomyFactory(TaxonomyFactory):
    """
    Factory backed by a single taxonomy file on disk.

    The file is read again on every call to :meth:`load`.
    """

    def __init__(self, path: Path):
        """
        Args:
            path: The taxonomy file to read with ``Taxonomy.from_path``
        """
        self._path = path

    def load(self, taxon: int) -> Taxonomy:
        """
        Reads the file and returns the subtree rooted at ``taxon``.

        Args:
            taxon: The integer ID of the subtree's root
        """
        full_tree = Taxonomy.from_path(self._path)
        return full_tree.subtree(taxon)
108
109
110
class CacheDirTaxonomyCache(UniprotTaxonomyCache):
    """
    UniProt taxonomy cache that stores its files in a caller-supplied directory.

    Files that exist among the bundled mandos resources take precedence over the cache dir.
    """

    def __init__(self, cache_dir: Path):
        """
        Args:
            cache_dir: Directory for raw and fixed files; created on demand
        """
        self.cache_dir = cache_dir

    def _resolve_non_vertebrate_final(self, taxon: int) -> Path:
        """
        Returns the location of the fixed file, named ``<taxon>.tab.gz``.
        """
        return self._get_resource(f"{taxon}.tab.gz")

    def _resolve_non_vertebrate_raw(self, taxon: int) -> Path:
        """
        Returns the location of the raw download, named ``taxonomy-ancestor_<taxon>.tab.gz``.
        """
        return self._get_resource(f"taxonomy-ancestor_{taxon}.tab.gz")

    def _get_resource(self, *nodes: Union[Path, str]) -> Path:
        """
        Resolves ``nodes`` to a bundled resource if one exists, else to a path in the cache dir.

        As a side effect, creates the cache dir (and parents) when falling back to it.
        The returned cache path itself is not created and may not exist.

        Args:
            nodes: Path components, relative to the resources dir or the cache dir
        """
        bundled = MandosResources.path(*nodes)
        if bundled.exists():
            return bundled
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        return Path(self.cache_dir, *nodes)
126
127
128
class TaxonomyFactories:
    """
    Collection of static factory methods.
    """

    @classmethod
    def from_vertebrata(cls) -> TaxonomyFactory:
        """
        Returns a factory that serves subtrees of the bundled ``7742.tab.gz`` taxonomy.
        """
        # The bundled resource is a file, not a directory, so it cannot serve as the
        # cache dir of a CacheDirTaxonomyCache: that class calls ``mkdir`` on its cache dir,
        # which raises FileExistsError when the path is an existing file.
        return FixedFileTaxonomyFactory(MandosResources.path("7742.tab.gz"))

    @classmethod
    def from_uniprot(cls, cache_dir: Path) -> TaxonomyFactory:
        """
        Returns a factory that downloads from UniProt as needed, caching under ``cache_dir``.

        Args:
            cache_dir: Directory in which raw and fixed taxonomy files are stored
        """
        return CacheDirTaxonomyCache(cache_dir)

    @classmethod
    def from_fixed_file(cls, cache_dir: Path) -> TaxonomyFactory:
        """
        Returns a factory that serves subtrees of a single taxonomy file.

        Args:
            cache_dir: Despite the name, the path to the taxonomy *file* to read
        """
        return FixedFileTaxonomyFactory(cache_dir)
144
145
146
# Public API: only the base factory type and the collection of factory constructors
# are exported; the concrete factory classes are left out of ``__all__``.
__all__ = ["TaxonomyFactory", "TaxonomyFactories"]
147