1
|
|
|
from pathlib import Path |
|
|
|
|
2
|
|
|
from typing import Sequence |
3
|
|
|
|
4
|
|
|
import regex |
|
|
|
|
5
|
|
|
import pandas as pd |
|
|
|
|
6
|
|
|
from typeddfs import TypedDfs |
|
|
|
|
7
|
|
|
|
8
|
|
|
from mandos.model import MandosResources |
9
|
|
|
|
10
|
|
|
# DataFrame class built by typeddfs; its docstring and the ``targets``/``patterns``/``get``
# helpers are attached to it further down in this module.
MappingFrame = TypedDfs.untyped("MappingFrame")
11
|
|
|
|
12
|
|
|
|
13
|
|
|
def _patterns(self: pd.DataFrame) -> Sequence[str]:
    """
    Returns the values of the first column (the patterns), in row order.

    Returns:
        A list with one item per row, holding the first column's values
        exactly as they are stored in the frame.
    """
    pattern_column = self.columns[0]
    return self[pattern_column].values.tolist()
15
|
|
|
|
16
|
|
|
|
17
|
|
|
def _targets(self: pd.DataFrame) -> Sequence[str]:
    """
    Returns the names of the target columns, i.e. every column after the first.

    Returns:
        A list of column labels, in column order; empty if the frame has
        only the pattern column.
    """
    target_columns = self.columns[1:]
    return target_columns.values.tolist()
19
|
|
|
|
20
|
|
|
|
21
|
|
|
def _get(self: pd.DataFrame, s: str) -> Sequence[str]:
    """
    Maps ``s`` through the first row whose pattern fully matches it.

    Rows are tried in order. For the first row whose compiled pattern (first column)
    fully matches ``s``, each string-valued target of that row is used as a
    substitution template on the stripped input. Non-string targets (e.g. missing
    values) are skipped.

    Args:
        s: The string to look up.

    Returns:
        A list with one substituted string per string-valued target of the
        matching row, or ``[s]`` if no pattern matches.

    Raises:
        ValueError: If a value in the first column has no ``fullmatch`` method,
                    i.e. it is not a compiled pattern.
    """
    compiled_patterns = self[self.columns[0]].values
    for irow, pattern in enumerate(compiled_patterns):
        try:
            match = pattern.fullmatch(s)
        except AttributeError:
            raise ValueError(f"Failed on regex {pattern}") from None
        if match is not None:
            # ``irow`` is a position, not an index label, so look the row up
            # positionally; this also avoids transposing the whole frame.
            targets = self.iloc[irow, 1:]
            # NOTE: matching uses ``s`` as given, substitution uses the stripped string.
            stripped = s.strip()
            return [pattern.sub(t, stripped) for t in targets if isinstance(t, str)]
    # Always hand back a list so callers can index/iterate the result uniformly.
    return [s]
30
|
|
|
|
31
|
|
|
|
32
|
|
|
MappingFrame.__doc__ = r"""
    A list of regex patterns and replacements.
    The first column is the pattern, and the next n columns are the targets.
    Has an important function, ``MappingFrame.get``, described below.
    These DataFrames are used in a few places to clean up, simplify, or otherwise process
    predicate and object names.

    Example:

        For the input string "cyp450 2A3", consider we have these two rows:
        row 1: ``['^Juggle protein ([xy])', 'Juggle \1', 'J\1']``
        row 2: ``['^CYP *450 (\d+)[A-Z]\d*$', 'Cytochrome P450 \1', 'CYP\1']``
        First, we try to match against the first pattern. It doesn't match, so we try the next.
        This one does match our input string, so we return ``["Cytochrome P450 2", "CYP2"]``.
        The first returned element (here "Cytochrome P450 2") is considered the primary,
        while the rest are -- for most usages -- considered optional extras.
    """
# Attach the module-level helpers as methods of MappingFrame.
# NOTE(review): if MappingFrame subclasses pd.DataFrame, ``get`` shadows
# ``DataFrame.get(key, default)`` on these frames -- confirm that is intended.
MappingFrame.targets = _targets
MappingFrame.patterns = _patterns
MappingFrame.get = _get
52
|
|
|
|
53
|
|
|
|
54
|
|
|
class _Compiler:
    """
    Compiles multiple regex patterns, providing nice error messages.
    All patterns are global (i.e. ^ and $ are affixed) and case-insensitive.
    """

    def __init__(self):
        # Number of patterns passed to ``compile`` so far; reported in error messages.
        self._i = 0

    def compile(self, s: str) -> regex.Pattern:
        """
        Compiles ``s`` as an anchored, case-insensitive pattern.

        Surrounding whitespace is stripped and the pattern is wrapped as ``^(?:...)$``
        so that the anchors apply to the whole expression, including every branch of a
        top-level alternation.

        Args:
            s: The raw pattern text.

        Returns:
            The compiled pattern (``regex.V1 | regex.IGNORECASE``).

        Raises:
            ValueError: If ``s`` cannot be stripped or compiled; the message contains
                        the pattern and the running count of patterns seen so far.
        """
        self._i += 1  # header is the first
        flags = regex.V1 | regex.IGNORECASE
        try:
            # ``s.strip()`` is inside the ``try`` so non-string cells also get the nice error.
            body = s.strip()
            # Validate the raw pattern on its own first, so that the wrapping group
            # cannot mask unbalanced parentheses or a trailing backslash.
            regex.compile(body, flags=flags)
            return regex.compile("^(?:" + body + ")$", flags=flags)
        except Exception:
            raise ValueError(
                f"Failed to parse '{s}' on line {self._i} (excluding comments and blank lines)"
            ) from None
71
|
|
|
|
72
|
|
|
|
73
|
|
|
class Mappings:
    """
    Creates MappingFrames.
    See that documentation.
    """

    @classmethod
    def from_resource(cls, name: str) -> MappingFrame:
        """
        Loads the mapping called ``name``.

        The path is resolved with ``MandosResources.a_path("mappings", name)``
        and then read with ``from_path``.

        Args:
            name: Name passed to ``MandosResources.a_path`` under ``mappings``.

        Returns:
            The MappingFrame, with its first column compiled to patterns.
        """
        path = MandosResources.a_path("mappings", name)
        return cls.from_path(path)

    @classmethod
    def from_path(cls, path: Path) -> MappingFrame:
        """
        Reads a mapping from a CSV-like file or ``.regexes`` file.
        Feather and Parquet are fine, too.
        The ``.regexes`` suffix is a simple extension of CSV that uses ``--->`` as the delimiter
        and ignores empty lines and lines beginning with ``#``.
        It's just nice for easily editing in a text editor.

        Args:
            path: The file to read with ``MappingFrame.read_file``.

        Returns:
            The MappingFrame, with every value of its first column replaced
            by the corresponding compiled pattern.

        Raises:
            ValueError: If a value in the first column cannot be compiled.
        """
        df = MappingFrame.read_file(path)
        pattern_column = df.columns[0]
        # One compiler per file, so its running count matches this file's rows.
        compiler = _Compiler()
        df[pattern_column] = df[pattern_column].map(compiler.compile)
        return df
97
|
|
|
|
98
|
|
|
|
99
|
|
|
# Public API of this module; the underscore-prefixed helpers stay private.
__all__ = ["MappingFrame", "Mappings"]
100
|
|
|
|