Passed
Push — main ( 9813db...5006f2 )
by Douglas
01:43
created

mandos.model.utils.mappings._Compiler.__init__()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 1
dl 0
loc 2
rs 10
c 0
b 0
f 0
1
from pathlib import Path
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
from typing import Sequence
3
4
import regex
0 ignored issues
show
introduced by
Unable to import 'regex'
Loading history...
5
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
6
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
7
8
from mandos.model import MandosResources
9
10
# Frame class created through typeddfs; the helper functions defined in this module
# are attached to it as ``targets``, ``patterns`` and ``get`` further down.
# NOTE(review): presumably a pandas DataFrame subclass -- confirm against typeddfs.
MappingFrame = TypedDfs.untyped("MappingFrame")
11
12
13
def _patterns(self: pd.DataFrame) -> Sequence[str]:
14
    return self[self.columns[0]].values.tolist()
15
16
17
def _targets(self: pd.DataFrame) -> Sequence[str]:
18
    return self.columns[1:].values.tolist()
19
20
21
def _get(self: pd.DataFrame, s: str) -> Sequence[str]:
0 ignored issues
show
Coding Style Naming introduced by
Argument name "s" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
22
    for irow, pattern in enumerate(self[self.columns[0]].values):
23
        try:
24
            match: regex.Match = pattern.fullmatch(s)
25
        except AttributeError:
26
            raise ValueError(f"Failed on regex {pattern}") from None
27
        if match is not None:
28
            return [pattern.sub(t, s.strip()) for t in self.T[irow] if isinstance(t, str)]
29
    return s
30
31
32
MappingFrame.__doc__ = r"""
33
A list of regex patterns and replacements.
34
The first column is the pattern, and the next n columns are the targets.
35
Has an important function, ``MappingFrame.get``, described below.
36
These DataFrames are used in a few places to clean up, simplify, or otherwise process
37
predicate and object names.
38
39
Example:
40
41
    For the input string "cyp450 2A3", consider we have these two rows:
42
    row 1: ``['^Juggle protein [xy]', 'Juggle \1', 'J\1']``
43
    row 2: ``['^CYP *450 (\d+)[A-Z]\d*$', 'Cytochrome P450 \1', 'CYP\1']``
44
    First, we try to match against the first pattern. It doesn't match, so we try the next.
45
    This one does match our input string, so we return ``["Cytochrome P450 2", "CYP2"]``.
46
    The first returned element (here "Cytochrome P450 2"), is considered the primary,
47
    while the remaining elements are -- for most usages -- considered optional extras.
48
"""
49
MappingFrame.targets = _targets
50
MappingFrame.patterns = _patterns
51
MappingFrame.get = _get
52
53
54
class _Compiler:
    """
    Compiles regex patterns one after another and reports failures by position.

    Every pattern is stripped of surrounding whitespace, anchored with ``^`` and ``$``,
    and compiled case-insensitively.
    """

    def __init__(self):
        # Number of patterns handed to ``compile`` so far.
        self._i = 0

    def compile(self, s: str) -> regex.Pattern:
        """
        Compiles a single pattern.

        Args:
            s: The pattern source; surrounding whitespace is removed before anchoring.

        Returns:
            The compiled pattern.

        Raises:
            ValueError: If the pattern cannot be compiled; the message contains the
                        pattern and its 1-based position among the patterns seen so far.
        """
        # Count first so that the position in the error message is 1-based.
        self._i += 1
        try:
            compiled = regex.compile(f"^{s.strip()}$", flags=regex.IGNORECASE | regex.V1)
        except Exception:
            raise ValueError(
                f"Failed to parse '{s}' on line {self._i} (excluding comments and blank lines)"
            ) from None
        return compiled
71
72
73
class Mappings:
    """
    Factory for ``MappingFrame`` instances.
    See the documentation of ``MappingFrame`` for what they contain.
    """

    @classmethod
    def from_resource(cls, name: str) -> MappingFrame:
        """
        Loads a mapping from the ``mappings`` resources by name.

        Args:
            name: Passed to ``MandosResources.a_path`` to locate the file.

        Returns:
            The mapping read from the resolved path, as by ``from_path``.
        """
        return cls.from_path(MandosResources.a_path("mappings", name))

    @classmethod
    def from_path(cls, path: Path) -> MappingFrame:
        """
        Reads a mapping from a CSV-like file or a ``.regexes`` file.
        Feather and Parquet are fine, too.
        The ``.regexes`` suffix is a simple extension of CSV that uses ``--->`` as the
        delimiter and ignores empty lines and lines beginning with ``#``.
        It's just nice for easily editing in a text editor.

        Args:
            path: The file to read.

        Returns:
            The frame that was read, with its first column replaced by compiled patterns.
        """
        frame = MappingFrame.read_file(path)
        pattern_column = frame.columns[0]
        # A single compiler is used for the whole column so that it can count the patterns.
        frame[pattern_column] = frame[pattern_column].map(_Compiler().compile)
        return frame
97
98
99
__all__ = ["MappingFrame", "Mappings"]
100