Passed
Push — main ( 9813db...5006f2 )
by Douglas
01:43
created

mandos.model.taxonomy.Taxonomy._build_by_name()   B

Complexity

Conditions 6

Size

Total Lines 15
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 12
nop 2
dl 0
loc 15
rs 8.6666
c 0
b 0
f 0
1
from __future__ import annotations
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
3
import enum
4
from collections import defaultdict
5
from dataclasses import dataclass
6
from functools import total_ordering
7
from pathlib import Path
8
from typing import FrozenSet, Iterable, List, Mapping, Optional, Sequence, Set, Union
9
10
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
11
from mandos.model import MultipleMatchesError
12
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
13
14
from mandos import logger
0 ignored issues
show
introduced by
Imports from package mandos are not grouped
Loading history...
15
from mandos.model.utils import CleverEnum
16
17
18
class KnownTaxa:
    """
    Taxon IDs that the code refers to by name.
    Trailing comments give the approximate number of nodes under each taxon.
    """

    # large clades
    biota = 131567  # 2 million nodes
    eukaryota = 2759  # 1.5 million nodes
    metazoa = 33208  # 1 million
    vertebrata = 7742  # 100,000 nodes
    euteleostomi = 117571  # 100,000 nodes

    # individual species
    human = 9606
    rat = 10116
    mouse = 10090
31
32
33
class NameType(CleverEnum):
    """
    The kind of name a taxon can be referred to by:
    its scientific name, its common name, or its mnemonic.
    """

    scientific = enum.auto()
    common = enum.auto()
    mnemonic = enum.auto()
41
42
43
def _fix_tax_df(df):
0 ignored issues
show
Coding Style Naming introduced by
Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
44
    df = df.rename(columns={c.replace(" ", "_").lower() for c in df.columns})
45
    df["parent"] = df["parent"].fillna(0).astype(int)
46
47
48
# Typed DataFrame holding one row per taxon.
TaxonomyDf = (
    TypedDfs.typed("TaxonomyDf")
    # integer IDs; a parent of 0 means "no parent"
    .require("taxon", "parent", dtype=int)
    # names are text, not numbers
    .require("scientific_name", dtype=str)
    .require("common_name", "mnemonic", dtype=str)
    .post(_fix_tax_df)
).build()
55
56
57
@total_ordering
@dataclass()
class Taxon:
    """
    A node in a taxonomic tree.
    Taxa are compared, ordered, and hashed by their IDs alone.
    """

    # we can't use frozen=True because we have both parents and children
    # instead, just use properties
    __id: int
    __scientific_name: str
    __common_name: Optional[str]
    __mnemonic: Optional[str]
    __parent: Optional[Taxon]
    __children: Set[Taxon]

    @property
    def id(self) -> int:
        """
        Returns the UniProt ID of this taxon.
        """
        return self.__id

    @property
    def scientific_name(self) -> str:
        """
        Returns the scientific name of this taxon.
        """
        return self.__scientific_name

    @property
    def common_name(self) -> Optional[str]:
        """
        Returns the common name of this taxon, or None if it has none.
        """
        return self.__common_name

    @property
    def mnemonic(self) -> Optional[str]:
        """
        Returns the mnemonic of this taxon, or None if it has none.
        Only ~16 taxa have mnemonics as of 2021-08.
        For example: "BOVIN" `<https://www.uniprot.org/taxonomy/9913>`_.
        """
        return self.__mnemonic

    @property
    def keys(self) -> FrozenSet[Union[int, str]]:
        """
        Returns the IDs and names that can be used to find this taxon.
        Specifically, includes the ID (int), scientific name (str),
        common name (str; if any), and mnemonic (str; if any).
        """
        candidates = (self.id, self.scientific_name, self.common_name, self.mnemonic)
        return frozenset(key for key in candidates if key is not None)

    @property
    def as_series(self) -> pd.Series:
        """
        Returns this taxon as a row with the columns
        ``taxon``, ``scientific_name``, ``common_name``, ``mnemonic``, and ``parent``.
        A taxon without a parent gets ``parent=0``,
        the same "no parent" sentinel that ``Taxonomy.from_df`` reads.
        """
        parent = self.parent
        return pd.Series(
            dict(
                taxon=self.id,
                scientific_name=self.scientific_name,
                common_name=self.common_name,
                mnemonic=self.mnemonic,
                parent=0 if parent is None else parent.id,
            )
        )

    @property
    def parent(self) -> Optional[Taxon]:
        """
        Returns the parent of this taxon, or None if it has none.
        """
        return self.__parent

    @property
    def children(self) -> Set[Taxon]:
        """
        Returns the immediate descendents of this taxon (as a copy).
        """
        return set(self.__children)

    @property
    def ancestors(self) -> Sequence[Taxon]:
        """
        Returns all ancestors of this taxon, nearest first.
        The taxon itself is not included.
        """
        lst = []
        self._ancestors(lst)
        return lst

    @property
    def descendents(self) -> Sequence[Taxon]:
        """
        Returns all descendents of this taxon, in no guaranteed order.
        The taxon itself is not included.
        """
        lst = []
        self._descendents(lst)
        return lst

    def _ancestors(self, values: List[Taxon]) -> None:
        # walk upward until we run out of parents
        taxon = self.parent
        while taxon is not None:
            values.append(taxon)
            taxon = taxon.parent

    def _descendents(self, values: List[Taxon]) -> None:
        # explicit stack, so the depth of the tree can't hit the recursion limit
        pending = [self]
        while pending:
            kids = pending.pop().__children
            values.extend(kids)
            pending.extend(kids)

    def __str__(self):
        return repr(self)

    def __repr__(self):
        parent = self.parent.id if self.parent else "none"
        return f"{self.__class__.__name__}({self.id}: {self.scientific_name} (parent={parent}))"

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        # let Python try the reflected operation for non-taxa rather than failing on .id
        if not isinstance(other, Taxon):
            return NotImplemented
        return self.id == other.id

    def __lt__(self, other):
        if not isinstance(other, Taxon):
            return NotImplemented
        return self.id < other.id
179
180
181
# a single taxon reference: an ID, a name, or the taxon itself
TaxonIdOrName = Union[int, str, Taxon]
# either one taxon reference or any iterable of them
TaxaIdsAndNames = Union[TaxonIdOrName, Iterable[TaxonIdOrName]]
183
184
185
# repr=False and eq=False stop dataclass from generating __repr__ and __eq__ here
# (and from resetting __hash__ to None), so Taxon's versions are inherited as-is
@dataclass(repr=False, eq=False)
class _Taxon(Taxon):
    """
    An internal, modifiable taxon for building the tree.
    """

    # Name mangling drops the leading underscore of the class name,
    # so ``self.__x`` here is ``self._Taxon__x``: the very attribute that Taxon reads.

    def set_names(self, scientific: str, common: Optional[str], mnemonic: Optional[str]):
        """
        Replaces the scientific name, common name, and mnemonic.
        """
        self.__scientific_name = scientific
        self.__common_name = common
        self.__mnemonic = mnemonic

    def set_parent(self, parent: _Taxon):
        """
        Sets the parent; does not register this taxon as a child of ``parent``.
        """
        self.__parent = parent

    def add_child(self, child: _Taxon):
        """
        Adds an immediate descendent; does not set the parent of ``child``.
        """
        self.__children.add(child)
218
219
220
class Taxonomy:
    """
    A taxonomic tree of organisms from UniProt.
    Taxa can be looked up by ID using ``__getitem__`` and ``get``,
    and by name using the ``*_by_name`` methods. Name lookups ignore case.

    Trees derived from another tree (subtrees, exclusions) share its ``Taxon`` objects,
    so a taxon's ``parent`` and ``children`` may point outside the derived tree.
    """

    def __init__(self, by_id: Mapping[int, Taxon], by_name: Mapping[str, FrozenSet[Taxon]]):
        """
        Wraps prebuilt indices; both mappings are shallow-copied.

        Args:
            by_id: Maps each taxon ID to its taxon
            by_name: Maps each lower-cased name to the taxa carrying that name,
                     as built by ``_build_by_name``
        """
        # constructor provided for consistency with the members
        self._by_id = dict(by_id)
        self._by_name = dict(by_name)
        # this probably isn't actually possible
        if len(self) == 0:
            logger.warning(f"{self} contains 0 taxa")

    @classmethod
    def from_trees(cls, taxonomies: Sequence[Taxonomy]) -> Taxonomy:
        """
        Merges several trees into one by concatenating their tables,
        dropping duplicate rows, and rebuilding via ``from_df``.
        """
        # we need to rewrite the ancestors, which from_df already does
        # so we'll just use that
        tables = [tree.to_df() for tree in taxonomies]
        merged = TaxonomyDf(pd.concat(tables, ignore_index=True))
        merged = merged.drop_duplicates().sort_values("taxon")
        return cls.from_df(merged)

    @classmethod
    def from_list(cls, taxa: Sequence[Taxon]) -> Taxonomy:
        """
        Builds a tree from already-linked taxa.

        Raises:
            AssertionError: If two taxa share an ID
        """
        by_id = {x.id: x for x in taxa}
        # catch duplicate values
        if len(by_id) != len(taxa):
            raise AssertionError(f"{len(by_id)} != {len(taxa)}")
        return Taxonomy(by_id, cls._build_by_name(by_id.values()))

    @classmethod
    def from_path(cls, path: Path) -> Taxonomy:
        """
        Reads from a DataFrame file.
        """
        return cls.from_df(TaxonomyDf.read_file(path))

    @classmethod
    def from_df(cls, df: TaxonomyDf) -> Taxonomy:
        """
        Builds a tree from a DataFrame from a file provided by a UniProt download.
        A ``parent`` of 0 marks a taxon that has no parent.

        Args:
            df: A TaxonomyDf DataFrame

        Returns:
            The corresponding taxonomic tree

        Raises:
            ValueError: If any taxon has a missing or empty scientific name;
                        this includes a parent that is referenced by ID but has no row
        """
        # just build up a tree, sticking the elements in by_id
        tax = {}
        for row in df.itertuples():
            scientific = cls._text_or_none(row.scientific_name) or ""
            common = cls._text_or_none(row.common_name)
            mnemonic = cls._text_or_none(row.mnemonic)
            fresh = _Taxon(row.taxon, scientific, common, mnemonic, None, set())
            # the taxon may already exist as a nameless placeholder made for one of its children
            child = tax.setdefault(row.taxon, fresh)
            child.set_names(scientific, common, mnemonic)
            if row.parent != 0:
                placeholder = _Taxon(row.parent, "", None, None, None, set())
                parent = tax.setdefault(row.parent, placeholder)
                child.set_parent(parent)
                parent.add_child(child)
        bad = [t for t in tax.values() if t.scientific_name.strip() == ""]
        if len(bad) > 0:
            raise ValueError(f"{len(bad)} taxa with missing or empty scientific names: {bad}.")
        # building is done: swap each node to the read-only public class
        for taxon in tax.values():
            taxon.__class__ = Taxon
        return Taxonomy(tax, cls._build_by_name(tax.values()))

    def to_df(self) -> TaxonomyDf:
        """
        Returns a table with one row per taxon (see ``Taxon.as_series``).
        """
        return TaxonomyDf.convert(pd.DataFrame([taxon.as_series for taxon in self.taxa]))

    @property
    def taxa(self) -> Sequence[Taxon]:
        """
        Returns all taxa in the tree.
        """
        return list(self._by_id.values())

    @property
    def roots(self) -> Sequence[Taxon]:
        """
        Returns the taxa that have no parent, or whose parent is not in this tree.
        """
        return [k for k in self.taxa if k.parent is None or k.parent not in self]

    @property
    def leaves(self) -> Sequence[Taxon]:
        """
        Returns the leaves (typically species or sub-species) of the tree.
        """
        return [k for k in self.taxa if len(k.children) == 0]

    def exclude_subtree(self, item: Union[int, Taxon]) -> Taxonomy:
        """
        Returns a new tree that excludes a single specified taxon and its descendents.
        If the taxon is not in this tree, nothing is excluded.
        """
        return self.exclude_subtrees_by_ids_or_names(item)

    def exclude_subtrees_by_ids_or_names(self, items: TaxaIdsAndNames) -> Taxonomy:
        """
        Returns a new tree that excludes the specified taxa and their descendents.
        If a name is used in multiple taxa, all of those will be used to exclude.

        Args:
            items: An ID, taxon, scientific name, common name, or mnemonic;
                   or an iterable of them
        """
        if isinstance(items, (int, str, Taxon)):
            items = [items]
        excluded = self._with_descendents(self.get_all_by_id_or_name(items))
        # compare IDs with IDs; the keys of _by_id are ints, not taxa
        excluded_ids = {t.id for t in excluded}
        return self._from_taxa(t for i, t in self._by_id.items() if i not in excluded_ids)

    def subtree(self, item: int) -> Taxonomy:
        """
        Returns the tree that is rooted at a single taxon (by ID).

        Raises:
            KeyError: If the taxon was not found
        """
        return self._from_taxa(self._with_descendents([self[item]]))

    def subtrees_by_ids_or_names(self, items: TaxaIdsAndNames) -> Taxonomy:
        """
        Returns the tree that is rooted at the specified taxa (by name or ID).
        The roots are drawn from the matched taxa;
        a single name can match more than one taxon.

        Args:
            items: An ID, taxon, scientific name, common name, or mnemonic;
                   or an iterable of them
        """
        if isinstance(items, (int, str, Taxon)):
            items = [items]
        return self._from_taxa(self._with_descendents(self.get_all_by_id_or_name(items)))

    def subtrees_by_name(self, item: str) -> Taxonomy:
        """
        Returns the tree rooted at the taxa with the specified name.

        Args:
            item: A scientific name, common name, or mnemonic
        """
        # wrap it: a bare str would be iterated character by character
        return self.subtrees_by_names([item])

    def subtrees_by_names(self, items: Iterable[str]) -> Taxonomy:
        """
        Returns the tree rooted at the specified taxa (by name).

        Args:
            items: Scientific names, common names, and/or mnemonics;
                   a single str is treated as one name
        """
        if isinstance(items, str):
            items = [items]
        matched: Set[Taxon] = set()
        for item in items:
            matched.update(self._by_name.get(item.lower(), ()))
        return self._from_taxa(self._with_descendents(matched))

    def req_one_by_name(self, item: str) -> Taxon:
        """
        Gets a single taxon by its name.
        If there are multiple, returns the one with the lowest ID and logs a warning.
        Raises an error if there are no matches.

        Args:
            item: A scientific name, common name, or mnemonic

        Raises:
            LookupError: If not found
        """
        one = self.get_one_by_name(item)
        if one is None:
            raise LookupError(f"No taxa for {item}")
        return one

    def req_only_by_name(self, item: str) -> Taxon:
        """
        Gets a single taxon by its name.
        Raises an error if there are multiple matches for the name, or if there are no matches.

        Args:
            item: A scientific name, common name, or mnemonic

        Raises:
            LookupError: If not found
            MultipleMatchesError: If multiple are found
        """
        taxa = self.get_by_name(item)
        if len(taxa) == 0:
            raise LookupError(f"No taxa for {item}")
        if len(taxa) > 1:
            ids = ",".join(str(i) for i in sorted(t.id for t in taxa))
            raise MultipleMatchesError(f"Got multiple results for {item}: {ids}")
        return next(iter(taxa))

    def get_one_by_name(self, item: str) -> Optional[Taxon]:
        """
        Gets a single taxon by its name.
        If there are multiple, returns the one with the lowest ID.
        If there are none, returns ``None``.
        Logs at warning level if multiple matched.

        Args:
            item: A scientific name, common name, or mnemonic
        """
        taxa = self.get_by_name(item)
        if len(taxa) == 0:
            return None
        if len(taxa) > 1:
            ids = ",".join(str(i) for i in sorted(t.id for t in taxa))
            logger.warning(f"Got multiple results for {item}: {ids}")
        # sets have no order; taxa are ordered by ID, so min() is the lowest ID
        return min(taxa)

    def get_by_name(self, item: str) -> FrozenSet[Taxon]:
        """
        Gets all taxa that match a scientific name, common name, or mnemonic.
        If a taxon is passed, its scientific name is looked up.
        """
        if isinstance(item, Taxon):
            item = item.scientific_name
        return self._by_name.get(item.lower(), frozenset())

    def get_all_by_id_or_name(self, items: Iterable[Union[int, str, Taxon]]) -> FrozenSet[Taxon]:
        """
        Gets all taxa that match any number of IDs or names, without duplicates.
        """
        matching: Set[Taxon] = set()
        for item in items:
            matching.update(self.get_by_id_or_name(item))
        return frozenset(matching)

    def get_by_id_or_name(self, item: Union[int, str, Taxon]) -> FrozenSet[Taxon]:
        """
        Gets all taxa that match an ID or name.
        A taxon is looked up by its ID.

        Raises:
            TypeError: If ``item`` is not an int, str, or Taxon
        """
        if isinstance(item, Taxon):
            item = item.id
        if isinstance(item, int):
            taxon = self._by_id.get(item)
            return frozenset() if taxon is None else frozenset([taxon])
        if isinstance(item, str):
            return self._by_name.get(item.lower(), frozenset())
        raise TypeError(f"Unknown type {type(item)} of {item}")

    def req(self, item: int) -> Taxon:
        """
        Gets a single taxon by its ID.
        Raises an error if it is not found.

        Raises:
            KeyError: If the taxon was not found
        """
        if isinstance(item, Taxon):
            item = item.id
        return self[item]

    def get(self, item: Union[int, Taxon]) -> Optional[Taxon]:
        """
        Corresponds to ``dict.get``.

        Args:
            item: The UniProt ID, or a taxon whose ID is looked up

        Returns:
            The taxon, or None if it was not found

        Raises:
            TypeError: If ``item`` is neither an int nor a Taxon (names are not accepted)
        """
        if isinstance(item, Taxon):
            item = item.id
        if not isinstance(item, int):
            raise TypeError(f"Type {type(item)} of {item} not applicable")
        return self._by_id.get(item)

    def __getitem__(self, item: int) -> Taxon:
        """
        Corresponds to ``dict[_]``.

        Args:
            item: The UniProt ID

        Returns:
            The taxon

        Raises:
            KeyError: If the taxon was not found
        """
        got = self.get(item)
        if got is None:
            raise KeyError(f"{item} not found in {self}")
        return got

    def contains(self, item: Union[Taxon, int, str]):
        """
        Returns whether an ID, a taxon (by ID), or a name matches a taxon in this tree.
        Equivalent to ``item in self``.
        """
        return item in self

    def n_taxa(self) -> int:
        """
        Returns the number of taxa in the tree; equivalent to ``len(self)``.
        """
        return len(self._by_id)

    def __contains__(self, item: Union[Taxon, int, str]):
        # get() rejects str, so go through the lookup that also handles names
        return len(self.get_by_id_or_name(item)) > 0

    def __len__(self) -> int:
        return len(self._by_id)

    def __str__(self) -> str:
        return repr(self)

    def __repr__(self) -> str:
        roots = ", ".join(r.scientific_name for r in self.roots)
        return f"{self.__class__.__name__}(n={len(self._by_id)} (roots={roots}) @ {hex(id(self))})"

    @staticmethod
    def _text_or_none(value):
        # pandas marks missing cells with NaN/NA rather than None
        return None if pd.isna(value) else value

    @staticmethod
    def _with_descendents(roots: Iterable[Taxon]) -> Set[Taxon]:
        # the given taxa plus everything below them
        found: Set[Taxon] = set()
        for root in roots:
            found.add(root)
            found.update(root.descendents)
        return found

    @classmethod
    def _from_taxa(cls, taxa: Iterable[Taxon]) -> Taxonomy:
        # indexes the taxa by ID and by name and wraps them in a new tree
        by_id = {t.id: t for t in taxa}
        return Taxonomy(by_id, cls._build_by_name(by_id.values()))

    @classmethod
    def _build_by_name(cls, tax: Iterable[Taxon]) -> Mapping[str, FrozenSet[Taxon]]:
        # Maps each lower-cased mnemonic, scientific name, and common name to its taxa.
        # Lower-casing happens on insertion so that names differing only by case are
        # merged rather than overwriting each other. A single pass also supports generators.
        by_name = defaultdict(set)
        for taxon in tax:
            for name in (taxon.mnemonic, taxon.scientific_name, taxon.common_name):
                if name is not None:
                    by_name[name.lower()].add(taxon)
        return {name: frozenset(taxa) for name, taxa in by_name.items()}
567
568
569
# the public names of this module
__all__ = [
    "Taxon",
    "Taxonomy",
    "TaxonomyDf",
    "KnownTaxa",
]
570