mandos.model.caches.TaxonomyFactories.from_fixed_file() - Code Metrics - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

TaxonomyFactories.from_fixed_file() A
last analyzed 2021-01-25 23:07 UTC

↳ Parent: mandos.model.caches

Complexity

Conditions

Size

Total Lines	3
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	3
nop	2
dl	0
loc	3
rs	10
c	0
b	0
f	0

"""
Caching.
"""

from __future__ import annotations

import abc
import logging
import shutil
from pathlib import Path
from typing import Union

import pandas as pd

import requests

from pocketutils.core.hashers import Hasher


from mandos import MandosResources
from mandos.model.taxonomy import Taxonomy

# Module-level logger, named after the enclosing package (``__package__``).
logger = logging.getLogger(__package__)
# Shared pocketutils ``Hasher`` constructed with the algorithm name "sha1".
# It is invoked as ``hasher.to_write(path).write()`` right after a raw file is downloaded.
hasher = Hasher("sha1")


class TaxonomyFactory(metaclass=abc.ABCMeta):
    """
    Base type for anything that can supply a ``Taxonomy`` for a given taxon ID.

    ``load`` is not marked abstract, so this class can still be instantiated;
    calling ``load`` on it without an override raises ``NotImplementedError``.
    """

    def load(self, taxon: int) -> Taxonomy:
        """
        Returns the taxonomy for ``taxon``.

        Args:
            taxon: The taxon ID to load

        Raises:
            NotImplementedError: Always, unless overridden by a subclass
        """
        raise NotImplementedError


class UniprotTaxonomyCache(TaxonomyFactory, metaclass=abc.ABCMeta):
    """
    Preps a new taxonomy file for use in mandos.
    Just returns if a corresponding processed file already exists at the location
    chosen by the subclass (see ``_resolve_non_vertebrate_final``).

    Otherwise, downloads a tab-separated file from UniProt.
    (To find manually, follow the ``All lower taxonomy nodes`` link and click ``Download``.)
    Then applies fixes and reduces the file size, creating a new file alongside.
    Where the raw and fixed files are stored is decided by the subclass.
    """

    def load(self, taxon: int) -> Taxonomy:
        """
        Returns the taxonomy rooted at ``taxon``, preparing files if needed.

        Resolution order:
            1. An already-processed file for ``taxon``.
            2. A subtree of the bundled vertebrata taxonomy (``7742.tab.gz``).
            3. A raw UniProt download (fetched if missing), which is then processed.

        Args:
            taxon: The UniProt taxon ID of the root

        Returns:
            The taxonomy read from the processed file, or a vertebrata subtree
        """
        path = self._resolve_non_vertebrate_final(taxon)
        if path.exists():
            return Taxonomy.from_path(path)
        vertebrata = Taxonomy.from_path(MandosResources.path("7742.tab.gz"))
        if taxon in vertebrata:
            return vertebrata.subtree(taxon)
        raw_path = self._resolve_non_vertebrate_raw(taxon)
        if not raw_path.exists():
            self._download(raw_path, taxon)
        # The processed file is known to be missing at this point, so always build it,
        # including when the raw file was already present (e.g. after an interrupted run).
        self._fix(raw_path, taxon, path)
        return Taxonomy.from_path(path)

    def _resolve_non_vertebrate_final(self, taxon: int) -> Path:
        """
        Returns the path of the processed file for ``taxon``. Must be overridden.
        """
        raise NotImplementedError()

    def _resolve_non_vertebrate_raw(self, taxon: int) -> Path:
        """
        Returns the path of the raw downloaded file for ``taxon``. Must be overridden.
        """
        raise NotImplementedError()

    def _download(self, raw_path: Path, taxon: int) -> None:
        """
        Streams the UniProt listing of all descendants of ``taxon`` into ``raw_path``.

        Args:
            raw_path: Where to write the response body, byte-for-byte
            taxon: The ancestor taxon ID to query for

        Raises:
            requests.HTTPError: If UniProt responds with an error status
        """
        # this is faster and safer than using pd.read_csv(url)
        # https://uniprot.org/taxonomy/?query=ancestor:7742&format=tab&force=true&columns=id&compress=yes
        url = f"https://uniprot.org/taxonomy/?query=ancestor:{taxon}&format=tab&force=true&columns=id&compress=yes"
        with requests.get(url, stream=True) as r:
            # fail before writing anything, so an error page is never cached as taxonomy data
            r.raise_for_status()
            with raw_path.open("wb") as f:
                shutil.copyfileobj(r.raw, f)
        hasher.to_write(raw_path).write()

    def _fix(self, raw_path: Path, taxon: int, final_path: Path):
        """
        Converts a raw UniProt download into the reduced file that mandos reads.

        Keeps only the ``Taxon``, ``Scientific name``, and ``Parent`` columns
        (renamed to ``taxon``, ``scientific_name``, and ``parent``) and appends a row
        for ``taxon`` itself with ``parent=0``.

        Args:
            raw_path: The tab-separated file written by ``_download``
            taxon: The root taxon ID, which is absent from the raw file
            final_path: Where to write the tab-separated result

        Raises:
            ValueError: If the scientific name of ``taxon`` cannot be inferred
        """
        # now process it!
        # unfortunately it won't include an entry for the root ancestor (`taxon`)
        # so, we'll add it in
        df = pd.read_csv(raw_path, sep="\t")
        # find the scientific name of the parent
        # (must happen before the column selection below, which drops "Lineage")
        scientific_name = self._determine_name(df, taxon)
        # now fix the columns
        df = df[["Taxon", "Scientific name", "Parent"]]
        df.columns = ["taxon", "scientific_name", "parent"]
        # now add the ancestor back in
        # (DataFrame.append was removed in pandas 2.0; concat works on old and new versions)
        root = pd.DataFrame([dict(taxon=taxon, scientific_name=scientific_name, parent=0)])
        df = pd.concat([df, root], ignore_index=True)
        # write it out; pandas infers any compression from the file extension
        df["parent"] = df["parent"].astype(int)
        df.to_csv(final_path, index=False, sep="\t")

    def _determine_name(self, df: pd.DataFrame, taxon: int) -> str:
        """
        Infers the scientific name of ``taxon`` from one of its direct children.

        Takes the first row whose ``Parent`` equals ``taxon`` and returns the last
        ``"; "``-separated entry of that row's ``Lineage`` value.

        Args:
            df: The raw UniProt table, with ``Parent`` and ``Lineage`` columns
            taxon: The taxon ID whose name is wanted

        Raises:
            ValueError: If no row has ``taxon`` as its parent
        """
        got = df[df["Parent"] == taxon]
        if got.empty:
            raise ValueError(f"Could not infer scientific name for {taxon}")
        lineage = str(got["Lineage"].iloc[0])
        return lineage.split("; ")[-1].strip()


class FixedTaxonomyFactory(TaxonomyFactory):
    """
    Factory that serves subtrees of a single taxonomy supplied at construction.
    """

    def __init__(self, tax: Taxonomy):
        """
        Args:
            tax: The taxonomy that every ``load`` call takes its subtree from
        """
        self._tax = tax

    def load(self, taxon: int) -> Taxonomy:
        """
        Returns ``subtree(taxon)`` of the wrapped taxonomy.

        Args:
            taxon: The taxon ID to use as the subtree root
        """
        whole = self._tax
        return whole.subtree(taxon)


class FixedFileTaxonomyFactory(TaxonomyFactory):
    """
    Factory that serves subtrees of the taxonomy stored in one fixed file.
    """

    def __init__(self, path: Path):
        """
        Args:
            path: The taxonomy file handed to ``Taxonomy.from_path`` on each ``load``
        """
        self._path = path

    def load(self, taxon: int) -> Taxonomy:
        """
        Reads the file (on every call) and returns ``subtree(taxon)`` of it.

        Args:
            taxon: The taxon ID to use as the subtree root
        """
        whole = Taxonomy.from_path(self._path)
        return whole.subtree(taxon)


class CacheDirTaxonomyCache(UniprotTaxonomyCache):
    """
    UniProt taxonomy cache that keeps its files in a caller-chosen directory,
    preferring a file from the mandos resources whenever one exists.
    """

    def __init__(self, cache_dir: Path):
        """
        Args:
            cache_dir: Directory used for files that are not found in the resources;
                       created on demand
        """
        self.cache_dir = cache_dir

    def _resolve_non_vertebrate_final(self, taxon: int) -> Path:
        """
        Returns the location of the processed file, ``{taxon}.tab.gz``.
        """
        filename = f"{taxon}.tab.gz"
        return self._get_resource(filename)

    def _resolve_non_vertebrate_raw(self, taxon: int) -> Path:
        """
        Returns the location of the raw download, ``taxonomy-ancestor_{taxon}.tab.gz``.
        """
        filename = f"taxonomy-ancestor_{taxon}.tab.gz"
        return self._get_resource(filename)

    def _get_resource(self, *nodes: Union[Path, str]) -> Path:
        """
        Resolves ``nodes`` against the resources first, then against the cache dir.

        Returns the resource path if it exists. Otherwise makes sure the cache dir
        exists and returns the path under it; that path itself may not exist yet.
        """
        bundled = MandosResources.path(*nodes)
        if not bundled.exists():
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            return Path(self.cache_dir, *nodes)
        return bundled


class TaxonomyFactories:
    """
    Collection of static factory methods.
    """

    @classmethod
    def from_vertebrata(cls) -> TaxonomyFactory:
        """
        Returns a factory that serves subtrees of the bundled vertebrata taxonomy.

        The bundled ``7742.tab.gz`` is a file, not a directory, so it is wrapped in a
        ``FixedFileTaxonomyFactory``. Passing it to ``CacheDirTaxonomyCache`` as the
        cache dir would make ``mkdir`` fail on the existing file for any taxon
        that has no bundled resource.
        """
        return FixedFileTaxonomyFactory(MandosResources.path("7742.tab.gz"))

    @classmethod
    def from_uniprot(cls, cache_dir: Path) -> TaxonomyFactory:
        """
        Returns a factory that downloads from UniProt as needed and caches files.

        Args:
            cache_dir: Directory for downloaded and processed taxonomy files
        """
        return CacheDirTaxonomyCache(cache_dir)

    @classmethod
    def from_fixed_file(cls, cache_dir: Path) -> TaxonomyFactory:
        """
        Returns a factory that serves subtrees of the taxonomy in one file.

        Args:
            cache_dir: Despite the name, the path of the taxonomy *file* to read;
                       it is passed straight to ``FixedFileTaxonomyFactory``
        """
        return FixedFileTaxonomyFactory(cache_dir)


# Names exported by ``from ... import *``: the base factory type and the factory-method collection.
__all__ = ["TaxonomyFactory", "TaxonomyFactories"]


1			"""
2			Caching.
3			"""
4
5			from __future__ import annotations
6
7			import abc
8			import logging
9			import shutil
10			from pathlib import Path
11			from typing import Union
12
13			import pandas as pd
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
14			import requests
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'requests' Loading history...
15			from pocketutils.core.hashers import Hasher
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.hashers' Loading history...
16
17			from mandos import MandosResources
18			from mandos.model.taxonomy import Taxonomy
19
20			logger = logging.getLogger(__package__)
21			hasher = Hasher("sha1")
22
23
24			class TaxonomyFactory(metaclass=abc.ABCMeta):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
25			def load(self, taxon: int) -> Taxonomy:
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
26			raise NotImplementedError()
27
28
29			class UniprotTaxonomyCache(TaxonomyFactory, metaclass=abc.ABCMeta):
30			"""
31			Preps a new taxonomy file for use in mandos.
32			Just returns if a corresponding file already exists in the resources dir or mandos cache (``~/.mandos``).
			0 ignored issues – show Coding Style introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (109/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
33			Otherwise, downloads a tab-separated file from UniProt.
34			(To find manually, follow the ``All lower taxonomy nodes`` link and click ``Download``.)
35			Then applies fixes and reduces the file size, creating a new file alongside.
36			Puts both the raw data and fixed data in the cache under ``~/.mandos/taxonomy/``.
37			"""
38
39			def load(self, taxon: int) -> Taxonomy:
40			path = self._resolve_non_vertebrate_final(taxon)
41			if path.exists():
42			return Taxonomy.from_path(path)
43			vertebrata = Taxonomy.from_path(MandosResources.path("7742.tab.gz"))
44			if taxon in vertebrata:
45			return vertebrata.subtree(taxon)
46			raw_path = self._resolve_non_vertebrate_raw(taxon)
47			if not raw_path.exists():
48			self._download(raw_path, taxon)
49			self._fix(raw_path, taxon, path)
50			return Taxonomy.from_path(path)
51
52			def _resolve_non_vertebrate_final(self, taxon: int) -> Path:
53			raise NotImplementedError()
54
55			def _resolve_non_vertebrate_raw(self, taxon: int) -> Path:
56			raise NotImplementedError()
57
58			def _download(self, raw_path: Path, taxon: int) -> None:
			0 ignored issues – show Coding Style introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report This method could be written as a function/class method. If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example class Foo: def some_method(self, x, y): return x + y; could be written as class Foo: @classmethod def some_method(cls, x, y): return x + y; Loading history...
59			# this is faster and safer than using pd.read_csv(url)
60			# https://uniprot.org/taxonomy/?query=ancestor:7742&format=tab&force=true&columns=id&compress=yes
61			url = f"https://uniprot.org/taxonomy/?query=ancestor:{taxon}&format=tab&force=true&columns=id&compress=yes"
			0 ignored issues – show Coding Style introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (115/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
62			with requests.get(url, stream=True) as r:
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "r" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
63			with raw_path.open("wb") as f:
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "f" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
64			shutil.copyfileobj(r.raw, f)
65			hasher.to_write(raw_path).write()
66
67			def _fix(self, raw_path: Path, taxon: int, final_path: Path):
68			# now process it!
69			# unfortunately it won't include an entry for the root ancestor (`taxon`)
70			# so, we'll add it in
71			df = pd.read_csv(raw_path, sep="\t")
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
72			# find the scientific name of the parent
73			scientific_name = self._determine_name(df, taxon)
74			# now fix the columns
75			df = df[["Taxon", "Scientific name", "Parent"]]
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
76			df.columns = ["taxon", "scientific_name", "parent"]
77			# now add the ancestor back in
78			df = df.append(
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
79			pd.Series(dict(taxon=taxon, scientific_name=scientific_name, parent=0)),
80			ignore_index=True,
81			)
82			# write it to a csv.gz
83			df["parent"] = df["parent"].astype(int)
84			df.to_csv(final_path, index=False, sep="\t")
85
86			def _determine_name(self, df: pd.DataFrame, taxon: int) -> str:
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... Coding Style introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report This method could be written as a function/class method. If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example class Foo: def some_method(self, x, y): return x + y; could be written as class Foo: @classmethod def some_method(cls, x, y): return x + y; Loading history...
87			got = df[df["Parent"] == taxon]
88			if len(got) == 0:
89			raise ValueError(f"Could not infer scientific name for {taxon}")
90			z = str(list(got["Lineage"])[0])
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "z" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
91			return z.split("; ")[-1].strip()
92
93
94			class FixedTaxonomyFactory(TaxonomyFactory):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
95			def __init__(self, tax: Taxonomy):
96			self._tax = tax
97
98			def load(self, taxon: int) -> Taxonomy:
99			return self._tax.subtree(taxon)
100
101
102			class FixedFileTaxonomyFactory(TaxonomyFactory):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
103			def __init__(self, path: Path):
104			self._path = path
105
106			def load(self, taxon: int) -> Taxonomy:
107			return Taxonomy.from_path(self._path).subtree(taxon)
108
109
110			class CacheDirTaxonomyCache(UniprotTaxonomyCache):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
111			def __init__(self, cache_dir: Path):
112			self.cache_dir = cache_dir
113
114			def _resolve_non_vertebrate_final(self, taxon: int) -> Path:
115			return self._get_resource(f"{taxon}.tab.gz")
116
117			def _resolve_non_vertebrate_raw(self, taxon: int) -> Path:
118			return self._get_resource(f"taxonomy-ancestor_{taxon}.tab.gz")
119
120			def _get_resource(self, *nodes: Union[Path, str]) -> Path:
121			path = MandosResources.path(*nodes)
122			if path.exists():
123			return path
124			self.cache_dir.mkdir(parents=True, exist_ok=True)
125			return Path(self.cache_dir, *nodes)
126
127
128			class TaxonomyFactories:
129			"""
130			Collection of static factory methods.
131			"""
132
133			@classmethod
134			def from_vertebrata(cls) -> TaxonomyFactory:
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
135			return CacheDirTaxonomyCache(MandosResources.path("7742.tab.gz"))
136
137			@classmethod
138			def from_uniprot(cls, cache_dir: Path) -> TaxonomyFactory:
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
139			return CacheDirTaxonomyCache(cache_dir)
140
141			@classmethod
142			def from_fixed_file(cls, cache_dir: Path) -> TaxonomyFactory:
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
143			return FixedFileTaxonomyFactory(cache_dir)
144
145
146			__all__ = ["TaxonomyFactory", "TaxonomyFactories"]
147

dmyersturnbull / mandos

TaxonomyFactories.from_fixed_file() A last analyzed 2021-01-25 23:07 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

TaxonomyFactories.from_fixed_file() A
last analyzed 2021-01-25 23:07 UTC