Passed
Push — main ( 2b775d...83a9fb )
by Douglas
04:59 queued 02:43
created

mandos.model.taxonomy._fix_tax_df()   A

Complexity

Conditions 1

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
from __future__ import annotations
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
3
import enum
4
from collections import defaultdict
5
from dataclasses import dataclass
6
from functools import total_ordering
7
from pathlib import Path
8
from typing import FrozenSet, Iterable, List, Mapping, Optional, Sequence, Set, Union
9
10
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
11
from pocketutils.core.exceptions import DataIntegrityError, LookupFailedError, XTypeError
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.exceptions'
Loading history...
12
13
from mandos.model import MultipleMatchesError
14
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
15
16
from mandos.model.utils.setup import logger
0 ignored issues
show
introduced by
Imports from package mandos are not grouped
Loading history...
17
from mandos.model.utils import CleverEnum
18
19
20
class KnownTaxa:
    """
    Taxa whose IDs are used in the code.

    Each attribute is the integer ID of one taxon; the trailing comments
    give the approximate number of nodes below that taxon.
    """

    biota = 131567  # 2 million nodes
    eukaryota = 2759  # 1.5 million nodes
    metazoa = 33208  # 1 million
    vertebrata = 7742  # 100,000 nodes
    euteleostomi = 117571  # 100,000 nodes
    human = 9606
    rat = 10116
    mouse = 10090
33
34
35
class NameType(CleverEnum):
    """
    Scientific name, common name, or mnemonic.

    These are the three kinds of names a taxon can carry.
    """

    scientific = enum.auto()
    common = enum.auto()
    mnemonic = enum.auto()
43
44
45
# Typed DataFrame schema for taxonomy tables, built with typeddfs.
# Requires integer ``taxon`` and ``parent`` columns and string ``mnemonic``,
# ``scientific_name``, and ``common_name`` columns.
# NOTE(review): the exact guarantees of ``strict()`` and ``secure()`` are defined
# by typeddfs -- confirm against its documentation before relying on them.
TaxonomyDf = (
    TypedDfs.typed("TaxonomyDf")
    .require("taxon", "parent", dtype=int)
    .require("mnemonic", "scientific_name", "common_name", dtype=str)
    .strict()
    .secure()
).build()
52
53
54
@total_ordering
@dataclass()
class Taxon:
    """
    A node in a taxonomic tree: an ID, its names, a parent link, and a set of children.

    Instances are built positionally as
    ``Taxon(id, scientific_name, common_name, mnemonic, parent, children)``.
    Equality, hashing, and ordering are based on the ID alone.
    """

    # we can't use frozen=True because we have both parents and children
    # instead, just use properties
    # NOTE: inside the class body these names are mangled to ``_Taxon__<name>``
    __id: int
    __scientific_name: str
    __common_name: Optional[str]
    __mnemonic: Optional[str]
    __parent: Optional["Taxon"]
    __children: Set["Taxon"]

    @property
    def id(self) -> int:
        """
        Returns the UniProt ID of this taxon.
        """
        return self.__id

    @property
    def scientific_name(self) -> str:
        """
        Returns the scientific name of this taxon.
        """
        return self.__scientific_name

    @property
    def common_name(self) -> Optional[str]:
        """
        Returns the common name of this taxon, or None if it has none.
        """
        return self.__common_name

    @property
    def mnemonic(self) -> Optional[str]:
        """
        Returns the mnemonic of this taxon, or None if it has none.
        Only ~16 taxa have mnemonics as of 2021-08.
        For example: "BOVIN" `<https://www.uniprot.org/taxonomy/9913>`_.
        """
        return self.__mnemonic

    @property
    def keys(self) -> FrozenSet[Union[int, str]]:
        """
        Returns the IDs and names that can be used to find this taxon.
        Specifically, includes the ID (int), scientific name (str),
        common name (str; if any), and mnemonic (str; if any).
        """
        candidates = (self.id, self.scientific_name, self.common_name, self.mnemonic)
        return frozenset(key for key in candidates if key is not None)

    @property
    def as_series(self) -> pd.Series:
        """
        Returns this taxon as a row with the columns
        ``taxon``, ``scientific_name``, ``common_name``, ``mnemonic``, and ``parent``.

        A taxon without a parent gets ``parent=0``; this is the same value that
        ``Taxonomy.from_df`` interprets as "no parent".
        """
        parent = self.__parent
        return pd.Series(
            dict(
                taxon=self.id,
                scientific_name=self.scientific_name,
                common_name=self.common_name,
                mnemonic=self.mnemonic,
                parent=0 if parent is None else parent.id,
            )
        )

    @property
    def parent(self) -> Optional["Taxon"]:
        """
        Returns the parent of this taxon, or None if it has no parent.
        """
        return self.__parent

    @property
    def children(self) -> Set["Taxon"]:
        """
        Returns the immediate descendents of this taxon (as a copy of the internal set).
        """
        return set(self.__children)

    @property
    def ancestors(self) -> Sequence["Taxon"]:
        """
        Returns all taxa that are ancestors of this taxon, nearest first.
        The taxon itself is not included; a taxon with no parent has no ancestors.
        """
        lst = []
        self._ancestors(lst)
        return lst

    @property
    def descendents(self) -> Sequence["Taxon"]:
        """
        Returns all taxa that are descendents of this taxon.
        The taxon itself is not included.
        """
        lst = []
        self._descendents(lst)
        return lst

    def _ancestors(self, values: List["Taxon"]) -> None:
        # Walk up the parent links and stop at the root (whose parent is None),
        # rather than appending None and dereferencing it.
        node = self.__parent
        while node is not None:
            values.append(node)
            node = node.__parent

    def _descendents(self, values: List["Taxon"]) -> None:
        # All children of this node are appended first, then each child's own subtree.
        children = self.__children
        values.extend(children)
        for child in children:
            child._descendents(values)

    def __str__(self):
        return repr(self)

    def __repr__(self):
        parent = self.__parent
        parent_id = "none" if parent is None else parent.id
        return f"{self.__class__.__name__}({self.id}: {self.scientific_name} (parent={parent_id}))"

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        # Let Python try the reflected operation (and fall back to identity)
        # instead of raising AttributeError for unrelated types.
        if not isinstance(other, Taxon):
            return NotImplemented
        return self.id == other.id

    def __lt__(self, other):
        if not isinstance(other, Taxon):
            return NotImplemented
        return self.id < other.id
176
177
178
# Lookup arguments: ``TaxonIdOrName`` is a single ID (int), name (str), or Taxon;
# ``TaxaIdsAndNames`` is one of those or an iterable of them.
TaxaIdsAndNames = Union[int, str, Taxon, Iterable[Union[int, str, Taxon]]]
TaxonIdOrName = Union[int, str, Taxon]
180
181
182
@dataclass()
class _Taxon(Taxon):
    """
    An internal, modifiable taxon for building the tree.

    The setters assign to double-underscore attributes. Python strips the leading
    underscore of the class name when mangling private names, so ``self.__x`` here
    becomes ``self._Taxon__x`` -- the same attribute that ``Taxon`` reads.
    """

    def set_names(self, scientific: str, common: Optional[str], mnemonic: Optional[str]):
        """
        Replaces the scientific name, common name, and mnemonic.
        """
        self.__scientific_name = scientific
        self.__common_name = common
        self.__mnemonic = mnemonic

    def set_parent(self, parent: _Taxon):
        """
        Sets the parent link. Does not register this taxon as a child of ``parent``.
        """
        self.__parent = parent

    def add_child(self, child: _Taxon):
        """
        Adds a child. Does not set the parent link of ``child``.
        """
        self.__children.add(child)

    # ``@dataclass()`` generates field-based ``__repr__`` and ``__eq__`` for any class
    # that does not define them itself, and sets ``__hash__`` to None when ``__eq__``
    # is generated. Re-binding the Taxon implementations here keeps the ID-based
    # behavior; ``__str__`` and ``__lt__`` are bound too, matching what this class
    # has always defined explicitly.
    __str__ = Taxon.__str__
    __repr__ = Taxon.__repr__
    __hash__ = Taxon.__hash__
    __eq__ = Taxon.__eq__
    __lt__ = Taxon.__lt__
215
216
217
class Taxonomy:
    """
    A taxonomic tree of organisms from UniProt.

    Taxa can be looked up by ID using ``__getitem__``, ``get``, and ``req``,
    and by name using ``get_by_name``, ``get_one_by_name``, and related methods.
    Name keys are stored lower-cased, and name lookups fall back to the lower-cased name.

    Trees derived from this one (``subtree``, ``exclude_subtree``, etc.) share their
    ``Taxon`` objects with this tree; parent and child links are not rewritten.
    """

    def __init__(self, by_id: Mapping[int, Taxon], by_name: Mapping[str, FrozenSet[Taxon]]):
        """
        Copies both mappings; logs a warning if the tree is empty.

        Args:
            by_id: Maps each taxon ID to its taxon
            by_name: Maps each name to the taxa that carry it
                     (``_build_by_name`` produces lower-cased keys)
        """
        # constructor provided for consistency with the members
        self._by_id = dict(by_id)
        self._by_name = dict(by_name)
        # this probably isn't actually possible
        if len(self) == 0:
            logger.warning(f"{self} contains 0 taxa")

    @classmethod
    def from_trees(cls, taxonomies: Sequence[Taxonomy]) -> Taxonomy:
        """
        Merges several trees into one by concatenating their DataFrames,
        dropping duplicate rows, and rebuilding with ``from_df``.
        """
        # we need to rewrite the ancestors, which from_df already does
        # so we'll just use that
        dfs = [tree.to_df() for tree in taxonomies]
        df = TaxonomyDf(pd.concat(dfs, ignore_index=True))
        df = df.drop_duplicates().sort_values("taxon")
        return Taxonomy.from_df(df)

    @classmethod
    def from_list(cls, taxa: Sequence[Taxon]) -> Taxonomy:
        """
        Builds a tree from already-linked taxa.

        Raises:
            DataIntegrityError: If two taxa share an ID
        """
        by_id = {x.id: x for x in taxa}
        by_name = cls._build_by_name(by_id.values())
        tax = Taxonomy(by_id, by_name)
        # catch duplicate values
        if len(tax._by_id) != len(taxa):
            raise DataIntegrityError(f"{len(tax._by_id)} != {len(taxa)}")
        return tax

    @classmethod
    def from_path(cls, path: Path) -> Taxonomy:
        """
        Reads from a DataFrame file.
        """
        df = TaxonomyDf.read_file(path)
        return cls.from_df(df)

    @classmethod
    def from_df(cls, df: TaxonomyDf) -> Taxonomy:
        """
        Builds a tree from a DataFrame from a file provided by a UniProt download.
        A ``parent`` value of 0 means that the row's taxon has no parent.

        Args:
            df: A TaxonomyDf DataFrame

        Returns:
            The corresponding taxonomic tree

        Raises:
            DataIntegrityError: If any taxon ends up with an empty or whitespace-only
                                scientific name; this includes parents that are
                                referenced but have no row of their own
        """
        # just build up a tree, sticking the elements in by_id
        tax = {}
        for row in df.itertuples():
            _new_child = _Taxon(
                row.taxon, row.scientific_name, row.common_name, row.mnemonic, None, set()
            )
            child = tax.setdefault(row.taxon, _new_child)
            # the taxon may already exist as a nameless placeholder parent of an earlier row
            child.set_names(row.scientific_name, row.common_name, row.mnemonic)
            if row.parent != 0:
                _new_parent = _Taxon(row.parent, "", None, None, None, set())
                parent = tax.setdefault(row.parent, _new_parent)
                child.set_parent(parent)
                parent.add_child(child)
        bad = [t for t in tax.values() if t.scientific_name.strip() == ""]
        if len(bad) > 0:
            raise DataIntegrityError(
                f"{len(bad)} taxa with missing or empty scientific names: {bad}."
            )
        # building is done: turn the modifiable builders into plain taxa
        for v in tax.values():
            v.__class__ = Taxon
        by_name = cls._build_by_name(tax.values())
        return Taxonomy(tax, by_name)

    def to_df(self) -> TaxonomyDf:
        """
        Returns a TaxonomyDf with one row per taxon (see ``Taxon.as_series``).
        """
        return TaxonomyDf.convert(pd.DataFrame([taxon.as_series for taxon in self.taxa]))

    @property
    def taxa(self) -> Sequence[Taxon]:
        """
        Returns all taxa in the tree.
        """
        return list(self._by_id.values())

    @property
    def roots(self) -> Sequence[Taxon]:
        """
        Returns the roots of the tree: the taxa that have no parent
        or whose parent is not contained in this tree.
        """
        return [k for k in self.taxa if k.parent is None or k.parent not in self]

    @property
    def leaves(self) -> Sequence[Taxon]:
        """
        Returns the leaves (typically species or sub-species) of the tree.
        """
        return [k for k in self.taxa if len(k.children) == 0]

    def exclude_subtree(self, item: Union[int, Taxon]) -> Taxonomy:
        """
        Returns a new tree that excludes a single specified taxon and its descendents.
        If ``item`` matches nothing, the new tree contains the same taxa as this one.
        """
        excluded_ids: Set[int] = set()
        for taxon in self.get_by_id_or_name(item):
            excluded_ids.add(taxon.id)
            excluded_ids.update(d.id for d in taxon.descendents)
        return self._without_ids(excluded_ids)

    def exclude_subtrees_by_ids_or_names(self, items: TaxaIdsAndNames) -> Taxonomy:
        """
        Returns a new tree that excludes the specified taxa and their descendents.
        If a name is used in multiple taxa, all of those will be used to exclude.

        Args:
            items: An ID, scientific name, common name, mnemonic, or taxon;
                   or a sequence of them
        """
        if isinstance(items, (int, str, Taxon)):
            items = [items]
        # compare by ID: the keys of ``_by_id`` are ints, not taxa
        excluded_ids = {t.id for t in self.subtrees_by_ids_or_names(items).taxa}
        return self._without_ids(excluded_ids)

    def subtree(self, item: int) -> Taxonomy:
        """
        Returns the tree that is rooted at a single taxon (by ID).

        Raises:
            LookupFailedError: If the ID is not in this tree
        """
        return self._rooted_at([self[item]])

    def subtrees_by_ids_or_names(self, items: TaxaIdsAndNames) -> Taxonomy:
        """
        Returns the tree that is rooted at the specified taxa (by name or ID).
        Items that match nothing are ignored.

        Args:
            items: An ID, scientific name, common name, mnemonic, or taxon;
                   or a sequence of them
        """
        if isinstance(items, (int, str, Taxon)):
            items = [items]
        matched: Set[Taxon] = set()
        for item in items:
            matched.update(self.get_by_id_or_name(item))
        return self._rooted_at(matched)

    def subtrees_by_name(self, item: str) -> Taxonomy:
        """
        Returns the tree rooted at the taxa with the specified name.

        Args:
            item: A scientific name, common name, or mnemonic
        """
        # wrap in a list: a bare string would be iterated character by character
        return self.subtrees_by_names([item])

    def subtrees_by_names(self, items: Iterable[str]) -> Taxonomy:
        """
        Returns the tree rooted at the specified taxa (by name).
        Names that match nothing are ignored.

        Args:
            items: A sequence of scientific names, common names, and/or mnemonics
                   (a single string is treated as one name)
        """
        if isinstance(items, str):
            items = [items]
        matched: Set[Taxon] = set()
        for item in items:
            matched.update(self._find_by_name(item))
        return self._rooted_at(matched)

    def req_one_by_name(self, item: str) -> Taxon:
        """
        Gets a single taxon by its name.
        If there are multiple, returns the first (lowest ID) and logs a warning.
        Raises an error if there are no matches.

        Args:
            item: A scientific name, common name, or mnemonic

        Raises:
            LookupFailedError: If not found
        """
        one = self.get_one_by_name(item)
        if one is None:
            raise LookupFailedError(f"No taxa for {item}")
        return one

    def req_only_by_name(self, item: str) -> Taxon:
        """
        Gets a single taxon by its name.
        Raises an error if there are multiple matches for the name, or if there are no matches.

        Args:
            item: A scientific name, common name, or mnemonic

        Raises:
            LookupFailedError: If not found
            MultipleMatchesError: If multiple are found
        """
        taxa = self.get_by_name(item)
        if len(taxa) > 1:
            ids = ",".join([str(t.id) for t in sorted(taxa)])
            raise MultipleMatchesError(f"Got multiple results for {item}: {ids}")
        if len(taxa) == 0:
            raise LookupFailedError(f"No taxa for {item}")
        return next(iter(taxa))

    def get_one_by_name(self, item: str) -> Optional[Taxon]:
        """
        Gets a single taxon by its name.
        If there are multiple, returns the first (lowest ID).
        If there are none, returns ``None``.
        Logs at warning level if multiple matched.

        Args:
            item: A scientific name, common name, or mnemonic
        """
        taxa = self.get_by_name(item)
        if len(taxa) == 0:
            return None
        if len(taxa) > 1:
            ids = ",".join([str(t.id) for t in sorted(taxa)])
            logger.warning(f"Got multiple results for {item}: {ids}")
        # sets are unordered, so pick the lowest ID explicitly (taxa order by ID)
        return min(taxa)

    def get_by_name(self, item: str) -> FrozenSet[Taxon]:
        """
        Gets all taxa that match a name (scientific name, common name, or mnemonic).
        If a taxon is passed, its scientific name is used.
        """
        if isinstance(item, Taxon):
            item = item.scientific_name
        return self._find_by_name(item)

    def get_all_by_id_or_name(self, items: Iterable[Union[int, str, Taxon]]) -> FrozenSet[Taxon]:
        """
        Gets all taxa that match any number of IDs or names, without duplicates.
        """
        matching: Set[Taxon] = set()
        for item in items:
            matching.update(self.get_by_id_or_name(item))
        return frozenset(matching)

    def get_by_id_or_name(self, item: Union[int, str, Taxon]) -> FrozenSet[Taxon]:
        """
        Gets all taxa that match an ID or name.
        A taxon is looked up by its ID.

        Raises:
            XTypeError: If ``item`` is not an int, str, or Taxon
        """
        if isinstance(item, Taxon):
            item = item.id
        if isinstance(item, int):
            taxon = self._by_id.get(item)
            return frozenset([]) if taxon is None else frozenset([taxon])
        if isinstance(item, str):
            return self._find_by_name(item)
        raise XTypeError(f"Unknown type {type(item)} of {item}")

    def req(self, item: int) -> Taxon:
        """
        Gets a single taxon by its ID.
        Raises an error if it is not found.
        """
        if isinstance(item, Taxon):
            item = item.id
        return self[item]

    def get(self, item: Union[int, Taxon]) -> Optional[Taxon]:
        """
        Corresponds to ``dict.get``.

        Args:
            item: The UniProt ID, or a taxon (looked up by its ID)

        Returns:
            The taxon, or None if it was not found

        Raises:
            XTypeError: If ``item`` is neither an int nor a Taxon
        """
        if isinstance(item, Taxon):
            item = item.id
        if isinstance(item, int):
            return self._by_id.get(item)
        raise XTypeError(f"Type {type(item)} of {item} not applicable")

    def __getitem__(self, item: int) -> Taxon:
        """
        Corresponds to ``dict[_]``.

        Args:
            item: The UniProt ID

        Returns:
            The taxon

        Raises:
            LookupFailedError: If the taxon was not found
        """
        got = self.get(item)
        if got is None:
            raise LookupFailedError(f"{item} not found in {self}")
        return got

    def contains(self, item: Union[Taxon, int, str]):
        """
        Returns whether the ID, taxon, or name is in this tree; same as ``item in self``.
        """
        return item in self

    def n_taxa(self) -> int:
        """
        Returns the number of taxa in this tree; same as ``len(self)``.
        """
        return len(self._by_id)

    def __contains__(self, item: Union[Taxon, int, str]):
        # strings are names; ``get`` only accepts IDs and taxa
        if isinstance(item, str):
            return len(self._find_by_name(item)) > 0
        return self.get(item) is not None

    def __len__(self) -> int:
        return len(self._by_id)

    def __str__(self) -> str:
        return repr(self)

    def __repr__(self) -> str:
        roots = ", ".join(r.scientific_name for r in self.roots)
        return f"{self.__class__.__name__}(n={len(self._by_id)} (roots={roots}) @ {hex(id(self))})"

    def _find_by_name(self, name: str) -> FrozenSet[Taxon]:
        # Tries the name as given, then lower-cased (the form ``_build_by_name`` stores).
        found = self._by_name.get(name)
        if found is None and isinstance(name, str):
            found = self._by_name.get(name.lower())
        return frozenset() if found is None else found

    def _rooted_at(self, roots: Iterable[Taxon]) -> Taxonomy:
        # Builds the tree consisting of the given taxa plus all of their descendents.
        members: Set[Taxon] = set()
        for root in roots:
            members.add(root)
            members.update(root.descendents)
        by_id = {d.id: d for d in members}
        by_name = self.__class__._build_by_name(by_id.values())
        return Taxonomy(by_id, by_name)

    def _without_ids(self, excluded_ids: Set[int]) -> Taxonomy:
        # Builds the tree of all taxa in this tree whose IDs are not in ``excluded_ids``.
        by_id = {i: t for i, t in self._by_id.items() if i not in excluded_ids}
        by_name = self.__class__._build_by_name(by_id.values())
        return Taxonomy(by_id, by_name)

    @classmethod
    def _build_by_name(cls, tax: Iterable[Taxon]) -> Mapping[str, FrozenSet[Taxon]]:
        # Groups taxa under each of their names (mnemonic, scientific name, common name).
        # Materialize first: the input is traversed once per kind of name.
        taxa = list(tax)
        by_name = defaultdict(set)
        # NOTE: lower-casing the keys for lookup
        # Lower-casing happens while grouping so that names differing only by case
        # are merged into one entry instead of overwriting each other.
        for attribute in ("mnemonic", "scientific_name", "common_name"):
            for t in taxa:
                name = getattr(t, attribute)
                # skip missing names (None, or non-string placeholders)
                if isinstance(name, str):
                    by_name[name.lower()].add(t)
        return {k: frozenset(v) for k, v in by_name.items()}
566
567
568
# Names exported by a star-import of this module.
# NOTE: ``NameType`` and the ``TaxaIdsAndNames`` / ``TaxonIdOrName`` aliases
# are defined in this module but are not listed here.
__all__ = ["Taxon", "Taxonomy", "TaxonomyDf", "KnownTaxa"]
569