|
1
|
|
|
""" |
|
2
|
|
|
Caching. |
|
3
|
|
|
""" |
|
4
|
|
|
|
|
5
|
|
|
from __future__ import annotations |
|
6
|
|
|
|
|
7
|
|
|
import abc |
|
8
|
|
|
import shutil |
|
9
|
|
|
from pathlib import Path |
|
10
|
|
|
from typing import AbstractSet, Collection, Iterable, Mapping, Optional, Set, Union |
|
|
|
|
|
|
11
|
|
|
|
|
12
|
|
|
import pandas as pd |
|
|
|
|
|
|
13
|
|
|
import requests |
|
|
|
|
|
|
14
|
|
|
from pocketutils.core.exceptions import LookupFailedError, XValueError |
|
|
|
|
|
|
15
|
|
|
from typeddfs import TypedDfs |
|
|
|
|
|
|
16
|
|
|
from typeddfs.checksums import Checksums |
|
|
|
|
|
|
17
|
|
|
|
|
18
|
|
|
from mandos.model.settings import SETTINGS, Globals |
|
19
|
|
|
from mandos.model.taxonomy import Taxonomy, TaxonomyDf |
|
20
|
|
|
from mandos.model.utils.resources import MandosResources |
|
21
|
|
|
from mandos.model.utils.setup import logger |
|
22
|
|
|
|
|
23
|
|
|
|
|
24
|
|
|
class TaxonomyFactory(metaclass=abc.ABCMeta):
    """
    Base interface for objects that can produce a taxonomy for a taxon.
    """

    def load(self, taxon: Union[int, str]) -> Taxonomy:
        """
        Returns the taxonomy for ``taxon``.

        Args:
            taxon: A taxon given as an ``int`` or a ``str``

        Raises:
            NotImplementedError: Always, in this base class; subclasses override this
        """
        raise NotImplementedError
|
27
|
|
|
|
|
28
|
|
|
|
|
29
|
|
|
class CachedTaxonomyCache(TaxonomyFactory, metaclass=abc.ABCMeta):
    """
    Preps a new taxonomy file for use in mandos.
    Just returns if a corresponding file already exists in the resources dir or mandos cache (``~/.mandos``).
    Otherwise, downloads a tab-separated file from UniProt.
    (To find manually, follow the ``All lower taxonomy nodes`` link and click ``Download``.)
    Then applies fixes and reduces the file size, creating a new file alongside.
    The fixed data is kept in the cache under ``~/.mandos/taxonomy/``;
    the raw download is deleted once the fixed file has been written.
    """

    def __init__(self, *, cache_dir: Path = SETTINGS.taxonomy_cache_path, local_only: bool):
        """
        Args:
            cache_dir: Directory for files that are not found among the bundled resources
            local_only: If True, never download; only use files that already exist
        """
        self.cache_dir = cache_dir
        self.local_only = local_only

    def load(self, taxon: Union[int, str]) -> Taxonomy:
        """
        Tries, in order:

            1. A cached file exactly matching the taxon ID
            2. A taxon ID under vertebrata
            3. The UNIQUE name of a taxon under vertebrata
            4. Downloads the taxonomy with the specified ID
        """
        tree = self.load_exact(taxon)
        if tree is None:
            vert = self.load_vertebrate(Globals.vertebrata)
            # load_vertebrate returns None when the subtree is empty, so guard before ``in``
            if vert is not None and taxon in vert:
                logger.info(f"Taxon {taxon} found in the vertebrata cache")
                # pass a list, like the other callers of subtrees_by_ids_or_names
                tree = vert.subtrees_by_ids_or_names([taxon])
            else:
                tree = self._load_or_dl(taxon)
        logger.info(f"Taxonomy has {len(tree)} taxa with {len(tree.roots)} roots")
        return tree

    def load_exact(self, taxon: int) -> Optional[Taxonomy]:
        """
        Loads the cached file for exactly ``taxon``, if there is a usable one.

        Returns:
            The taxonomy read from the file, or None if the file does not exist
            or (unless ``local_only`` is set) is reported as expired
        """
        path = self._resolve_non_vertebrate_final(taxon)
        if (self._check_has(taxon, path) or self.local_only) and path.exists():
            return Taxonomy.from_path(path)
        return None

    def load_vertebrate(self, taxon: Union[int, str]) -> Optional[Taxonomy]:
        """
        Loads (or downloads) the vertebrata taxonomy and extracts the subtree for ``taxon``.

        Returns:
            The subtree, or None if it contains no taxa
        """
        vertebrata = self._load_or_dl(Globals.vertebrata)
        vertebrate = vertebrata.subtrees_by_ids_or_names([taxon])
        return vertebrate if vertebrate.n_taxa() > 0 else None

    def _check_has(self, taxon: Union[str, int], path: Path) -> bool:
        """
        Returns True iff ``path`` exists and ``MandosResources.check_expired`` does not flag it.
        """
        if path.exists():
            return not MandosResources.check_expired(
                path,
                max_sec=SETTINGS.taxon_expire_sec,
                what=f"Cached taxa under {taxon}",
            )
        return False

    def _load_or_dl(self, taxon: Union[int, str]) -> Taxonomy:
        """
        Loads the cached taxonomy for ``taxon``, downloading and fixing it first if needed.

        With ``local_only`` set, the file is read without any download attempt,
        so whatever ``Taxonomy.from_path`` does for a missing file applies.
        """
        path = self._resolve_non_vertebrate_final(taxon)
        raw_path = self._resolve_non_vertebrate_raw(taxon)
        if self._check_has(taxon, path) or self.local_only:
            return Taxonomy.from_path(path)
        logger.notice(f"Downloading new taxonomy file for taxon {taxon}")
        self._download_raw(raw_path, taxon)
        # _fix writes the cleaned table to ``path`` (and removes the raw download)
        self._fix(raw_path, taxon, path)
        logger.notice(f"Cached taxonomy at {path} .")
        # _fix returns a TaxonomyDf, not a Taxonomy; read the file back the same way
        # a later cache hit would so that the declared return type holds
        return Taxonomy.from_path(path)

    def rebuild(self, *taxa: int, replace: bool) -> None:
        """
        Regenerates the cached files for ``taxa``.

        Args:
            taxa: Taxon IDs to regenerate
            replace: If True, regenerate even if a file already exists

        Does nothing except log an error if ``local_only`` is set.
        """
        if self.local_only:
            logger.error("Cannot rebuild -- local_only is set")
            # bail out: continuing would delete cached files that cannot be re-downloaded
            return
        for taxon in taxa:
            path = self.resolve_path(taxon)
            if replace or not path.exists():
                self.delete_exact(taxon)
                self._load_or_dl(taxon)
                logger.notice(f"Regenerated {taxon} taxonomy")

    def delete_exact(self, taxon: int) -> None:
        """
        Deletes the raw download, the final cached file, and its checksum file for ``taxon``.
        Files that do not exist are skipped.
        """
        raw = self._resolve_non_vertebrate_raw(taxon)
        raw.unlink(missing_ok=True)
        # the *final* file (this previously resolved the raw path a second time,
        # so the cached taxonomy itself was never removed)
        p = self._resolve_non_vertebrate_final(taxon)
        if p.exists():
            p.unlink()
            logger.warning(f"Deleted cached taxonomy file {p}")
        # delete either way:
        checksum_file = Checksums.get_hash_file(p, algorithm=SETTINGS.checksum_algorithm)
        checksum_file.unlink(missing_ok=True)

    def resolve_path(self, taxon: int) -> Path:
        """
        Returns the path of the final (fixed) file for ``taxon``, whether or not it exists.
        """
        return self._resolve_non_vertebrate_final(taxon)

    def _resolve_non_vertebrate_final(self, taxon: int) -> Path:
        """
        Returns the path of the final file: ``<taxon><SETTINGS.archive_filename_suffix>``.
        """
        return self._get_resource(f"{taxon}{SETTINGS.archive_filename_suffix}")

    def _resolve_non_vertebrate_raw(self, taxon: int) -> Path:
        """
        Returns the path the raw download for ``taxon`` is written to.
        """
        # this is what is downloaded from UniProt
        # the filename is the same
        return self._get_resource(f"taxonomy-ancestor_{taxon}.tsv.gz")

    def _get_resource(self, *nodes: Union[Path, str]) -> Path:
        """
        Returns the bundled resource at ``nodes`` if it exists.
        Otherwise creates ``cache_dir`` if needed and returns the path under it.
        """
        path = MandosResources.path(*nodes)
        if path.exists():
            return path
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        return Path(self.cache_dir, *nodes)

    def _download_raw(self, raw_path: Path, taxon: int) -> None:
        """
        Streams the UniProt listing of taxa under ``taxon`` into ``raw_path``.

        Raises:
            requests.HTTPError: If the server responds with an error status
        """
        # this is faster and safer than using pd.read_csv(url)
        # https://uniprot.org/taxonomy/?query=ancestor:7742&format=tab&force=true&columns=id&compress=yes
        url = f"https://uniprot.org/taxonomy/?query=ancestor:{taxon}&format=tab&force=true&columns=id&compress=yes"
        with requests.get(url, stream=True) as r:
            # fail before opening the file so an error page is never stored as data
            r.raise_for_status()
            with raw_path.open("wb") as f:
                shutil.copyfileobj(r.raw, f)

    def _fix(self, raw_path: Path, taxon: int, final_path: Path) -> TaxonomyDf:
        """
        Converts the raw download into the reduced table, writes it, and deletes the raw file.

        Args:
            raw_path: The downloaded file; it is read with columns
                      "Taxon", "Mnemonic", "Scientific name", "Common name", "Parent", "Lineage"
            taxon: The root ancestor that was queried
            final_path: Where the fixed table is written

        Returns:
            The fixed table
        """
        # now process it!
        # unfortunately it won't include an entry for the root ancestor (`taxon`)
        # so, we'll add it in (in ``pd.concat`` below)
        # noinspection PyPep8Naming
        raw_type = TypedDfs.untyped("Raw")
        df = raw_type.read_file(raw_path)
        # find the scientific name of the parent
        scientific_name = self._determine_name(df, taxon)
        # now fix the columns
        df = df[["Taxon", "Mnemonic", "Scientific name", "Common name", "Parent"]]
        df.columns = ["taxon", "mnemonic", "scientific_name", "common_name", "parent"]
        # now add the ancestor back in
        # (DataFrame.append was removed in pandas 2.0; concat works in old and new versions)
        root_row = pd.DataFrame([dict(taxon=taxon, scientific_name=scientific_name, parent=0)])
        df = pd.concat([df, root_row], ignore_index=True)
        df["parent"] = df["parent"].fillna(0).astype(int)
        # write it to a feather / csv / whatever
        df = TaxonomyDf.convert(df)
        df.write_file(final_path, dir_hash=True)
        raw_path.unlink()
        return df

    def _determine_name(self, df: pd.DataFrame, taxon: int) -> str:
        """
        Infers the scientific name of ``taxon`` from the rows whose "Parent" is ``taxon``.

        Raises:
            XValueError: If no row has ``taxon`` as its parent
        """
        got = df[df["Parent"] == taxon]
        if len(got) == 0:
            raise XValueError(f"Could not infer scientific name for {taxon}")
        # presumably the last "; "-separated entry of a direct child's lineage
        # is the parent itself -- TODO confirm against UniProt's format
        z = str(list(got["Lineage"])[0])
        return z.split("; ")[-1].strip()
|
173
|
|
|
|
|
174
|
|
|
|
|
175
|
|
|
class FixedTaxonomyFactory(TaxonomyFactory):
    """
    Serves subtrees of one taxonomy supplied up front.
    Mostly for testing.
    """

    def __init__(self, tax: Taxonomy):
        self._tax = tax

    def load(self, taxon: Union[int, str]) -> Taxonomy:
        """
        Returns the subtree of the wrapped taxonomy for ``taxon``.

        A ``str`` is first looked up with ``req_only_by_name`` and replaced by
        the ``id`` of the result; anything else is passed to ``subtree`` unchanged.
        """
        if not isinstance(taxon, str):
            return self._tax.subtree(taxon)
        matched = self._tax.req_only_by_name(taxon)
        return self._tax.subtree(matched.id)
|
187
|
|
|
|
|
188
|
|
|
|
|
189
|
|
|
class TaxonomyFactories:
    """
    Collection of static factory methods.
    """

    @classmethod
    def list_cached_files(cls) -> Mapping[int, Path]:
        """
        Lists the files in ``SETTINGS.taxonomy_cache_path`` named
        ``<integer><SETTINGS.archive_filename_suffix>``.

        Returns:
            A mapping from the integer in the filename to the path;
            empty if the directory does not exist
        """
        suffix = SETTINGS.archive_filename_suffix
        cache_dir = SETTINGS.taxonomy_cache_path
        if not cache_dir.is_dir():
            return {}
        found = {}
        for p in cache_dir.iterdir():
            # compare against the full name: ``Path.suffix`` is only the last
            # extension, so it can never end with a multi-part suffix
            if not p.name.endswith(suffix):
                continue
            stem = p.name[: len(p.name) - len(suffix)]
            # skip files that share the suffix but are not named by an integer
            if stem.isdecimal():
                found[int(stem)] = p
        return found

    @classmethod
    def main(
        cls,
        cache_dir: Path = SETTINGS.taxonomy_cache_path,
        local_only: bool = False,
    ) -> CachedTaxonomyCache:
        """
        Creates a ``CachedTaxonomyCache``.

        Args:
            cache_dir: Passed through as the cache directory
            local_only: Passed through; if True, the cache will not download
        """
        return CachedTaxonomyCache(local_only=local_only, cache_dir=cache_dir)

    @classmethod
    def get_smart_taxonomy(
        cls,
        *,
        allow: Iterable[Union[int, str]],
        forbid: Iterable[Union[int, str]],
        ancestors: Union[int, Collection[int]] = Globals.cellular_taxon,
        cache_dir: Path = SETTINGS.taxonomy_cache_path,
        local_only: bool,
    ) -> Taxonomy:
        """
        Builds a taxonomy from the subtrees of ``allow``, minus the subtrees of ``forbid``.
        Only taxa under vertebrata are currently considered.

        Args:
            allow: IDs or names of taxa whose subtrees are included
            forbid: IDs or names of taxa whose subtrees are removed afterward
            ancestors: Currently unused (see the TODO below)
            cache_dir: Cache directory for the ``CachedTaxonomyCache``
            local_only: If True, nothing is downloaded

        Raises:
            LookupFailedError: If the vertebrata taxonomy could not be obtained
        """
        cache = CachedTaxonomyCache(local_only=local_only, cache_dir=cache_dir)
        vertebrata = cache.load_vertebrate(Globals.vertebrata)
        if vertebrata is None:
            # load_vertebrate returns None for an empty subtree
            raise LookupFailedError(f"No taxa found under vertebrata ({Globals.vertebrata})")
        return vertebrata.subtrees_by_ids_or_names(allow).exclude_subtrees_by_ids_or_names(forbid)
        # TODO: support taxa outside of vertebrata.
        # Sketch: split ``allow`` into taxa that are in ``vertebrata`` and taxa that are not;
        # load each vertebrate taxon with ``cache.load``; for the rest, load the ``ancestors``
        # trees (or the taxa themselves if no ancestors are given), take the subtrees for
        # those taxa, merge everything with ``Taxonomy.from_trees``, and finally apply
        # ``exclude_subtrees_by_ids_or_names(forbid)``.
|
235
|
|
|
|
|
236
|
|
|
|
|
237
|
|
|
# Names exported by ``from <module> import *``.
# CachedTaxonomyCache and FixedTaxonomyFactory are not listed here.
__all__ = [
    "TaxonomyFactory",
    "TaxonomyFactories",
]
|
238
|
|
|
|