|
1
|
|
|
from pathlib import Path |
|
|
|
|
|
|
2
|
|
|
from typing import Sequence |
|
3
|
|
|
|
|
4
|
|
|
import regex |
|
|
|
|
|
|
5
|
|
|
import pandas as pd |
|
|
|
|
|
|
6
|
|
|
from typeddfs import TypedDfs |
|
|
|
|
|
|
7
|
|
|
|
|
8
|
|
|
from mandos.model import MandosResources |
|
9
|
|
|
|
|
10
|
|
|
# Frame type for pattern -> target mappings.
# Its docstring and its ``targets`` / ``patterns`` / ``get`` methods are attached further down.
MappingFrame = TypedDfs.untyped("MappingFrame")
|
11
|
|
|
|
|
12
|
|
|
|
|
13
|
|
|
def _patterns(self: pd.DataFrame) -> Sequence[str]: |
|
14
|
|
|
return self[self.columns[0]].values.tolist() |
|
15
|
|
|
|
|
16
|
|
|
|
|
17
|
|
|
def _targets(self: pd.DataFrame) -> Sequence[str]: |
|
18
|
|
|
return self.columns[1:].values.tolist() |
|
19
|
|
|
|
|
20
|
|
|
|
|
21
|
|
|
def _get(self: pd.DataFrame, s: str) -> Sequence[str]: |
|
|
|
|
|
|
22
|
|
|
for irow, pattern in enumerate(self[self.columns[0]].values): |
|
23
|
|
|
try: |
|
24
|
|
|
match: regex.Match = pattern.fullmatch(s) |
|
25
|
|
|
except AttributeError: |
|
26
|
|
|
raise ValueError(f"Failed on regex {pattern}") from None |
|
27
|
|
|
if match is not None: |
|
28
|
|
|
return [pattern.sub(t, s.strip()) for t in self.T[irow] if isinstance(t, str)] |
|
29
|
|
|
return s |
|
30
|
|
|
|
|
31
|
|
|
|
|
32
|
|
|
MappingFrame.__doc__ = r"""
A list of regex patterns and replacements.
The first column is the pattern, and the next n columns are the targets.
The main entry point is ``MappingFrame.get``, which is illustrated in the example below.
These DataFrames are used in a few places to clean up, simplify, or otherwise process
predicate and object names.

Example:

    For the input string "cyp450 2A3", consider we have these two rows:
    row 1: ``['^Juggle protein ([xy])', 'Juggle \1', 'J\1']``
    row 2: ``['^CYP *450 (\d+)[A-Z]\d*$', 'Cytochrome P450 \1', 'CYP\1']``
    First, we try to match against the first pattern. It doesn't match, so we try the next.
    This one does match our input string, so we return ``["Cytochrome P450 2", "CYP2"]``.
    The first returned element (here "Cytochrome P450 2") is considered the primary,
    while the remaining ones are -- for most usages -- considered optional extras.
"""
# Attach the module-level helper functions as methods of MappingFrame.
MappingFrame.targets = _targets
MappingFrame.patterns = _patterns
MappingFrame.get = _get
|
52
|
|
|
|
|
53
|
|
|
|
|
54
|
|
|
class _Compiler:
    """
    Compiles multiple regex patterns, providing nice error messages.
    All patterns are global (i.e. ^ and $ are affixed) and case-insensitive.
    """

    def __init__(self):
        # Number of patterns passed to compile() so far; only used in error messages.
        self._i = 0

    def compile(self, s: str) -> regex.Pattern:
        """
        Compiles one pattern, anchored at both ends and case-insensitive.

        Args:
            s: The pattern source; surrounding whitespace is stripped.

        Returns:
            The compiled pattern.

        Raises:
            ValueError: If anything goes wrong while building or compiling the pattern;
                        the message includes the pattern and its running number.
        """
        self._i += 1  # header is the first
        try:
            # Wrap the pattern in a non-capturing group so the anchors apply to the
            # whole pattern rather than only to the first/last alternative of a
            # top-level ``a|b``. Group numbers (\1, \2, ...) are unaffected.
            return regex.compile(
                "^(?:" + s.strip() + ")$", flags=regex.V1 | regex.IGNORECASE
            )
        except Exception:
            # Deliberately broad: any failure is reported uniformly with the
            # offending pattern and its position.
            raise ValueError(
                f"Failed to parse '{s}' on line {self._i} (excluding comments and blank lines)"
            ) from None
|
71
|
|
|
|
|
72
|
|
|
|
|
73
|
|
|
class Mappings:
    """
    Factory for ``MappingFrame`` instances.
    See the ``MappingFrame`` documentation for what they contain.
    """

    @classmethod
    def from_resource(cls, name: str) -> MappingFrame:
        """
        Loads the mapping found at ``MandosResources.a_path("mappings", name)``.
        """
        return cls.from_path(MandosResources.a_path("mappings", name))

    @classmethod
    def from_path(cls, path: Path) -> MappingFrame:
        """
        Reads a mapping from a CSV-like file or ``.regexes`` file.
        Feather and Parquet are fine, too.
        The ``.regexes`` suffix is a simple extension of CSV that uses ``--->`` as the delimiter
        and ignores empty lines and lines beginning with ``#``.
        It's just nice for easily editing in a text editor.
        """
        frame = MappingFrame.read_file(path)
        pattern_column = frame.columns[0]
        # One compiler per file, so its running counter in error messages starts fresh.
        compile_pattern = _Compiler().compile
        frame[pattern_column] = frame[pattern_column].map(compile_pattern)
        return frame
|
97
|
|
|
|
|
98
|
|
|
|
|
99
|
|
|
# Public names exported by this module.
__all__ = ["MappingFrame", "Mappings"]
|
100
|
|
|
|