| 1 |  |  | from pathlib import Path | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | from typing import Sequence | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | import regex | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | import pandas as pd | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | from typeddfs import TypedDfs | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | from mandos.model import MandosResources | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                # DataFrame type for pattern/target tables; its docstring and helper methods
                # (``targets``, ``patterns``, ``get``) are assigned further down in this module.
                MappingFrame = TypedDfs.untyped("MappingFrame")
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _patterns(self: pd.DataFrame) -> Sequence[str]:
                    """
                    Lists the entries of the first column, in row order.

                    The first column is the one that holds the patterns.

                    Returns:
                        A plain Python list with one item per row.
                    """
                    pattern_column = self.columns[0]
                    column_values = self[pattern_column].values
                    return column_values.tolist()
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _targets(self: pd.DataFrame) -> Sequence[str]:
                    """
                    Lists the labels of every column after the first.

                    Returns:
                        A plain Python list of column labels; empty when there is only one column.
                    """
                    labels_after_first = self.columns[1:]
                    return labels_after_first.values.tolist()
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _get(self: pd.DataFrame, s: str) -> Sequence[str]:
                    """
                    Maps ``s`` through the first row whose pattern fully matches it.

                    Rows are tried in order. For the first row whose pattern (the value in the
                    first column) matches ``s`` via ``fullmatch``, every string-valued cell of that
                    row is used as a replacement template and applied with ``pattern.sub``.

                    Args:
                        s: The string to look up.

                    Returns:
                        The substituted strings for the first matching row, in column order.
                        If no row matches, ``s`` itself is returned unchanged
                        (a plain string, not a list).

                    Raises:
                        ValueError: If calling ``fullmatch`` on a first-column value raises
                            ``AttributeError`` (for example, the value was never compiled).
                    """
                    for irow, pattern in enumerate(self[self.columns[0]].values):
                        try:
                            match: regex.Match = pattern.fullmatch(s)
                        except AttributeError:
                            raise ValueError(f"Failed on regex {pattern}") from None
                        if match is not None:
                            # ``irow`` is a position produced by ``enumerate``, so the row must be
                            # selected by position. A label lookup only works when the index
                            # happens to be the default 0..n-1 range.
                            row = self.iloc[irow]
                            # NOTE(review): matching uses ``s`` as given, while substitution uses
                            # ``s.strip()``; kept as-is because callers may rely on it.
                            # Non-string cells (such as the compiled pattern in the first column)
                            # are not replacement templates and are skipped.
                            return [pattern.sub(t, s.strip()) for t in row if isinstance(t, str)]
                    return s
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                MappingFrame.__doc__ = r"""
                A list of regex patterns and replacements.
                The first column is the pattern, and the next n columns are the targets.
                Has an important function, ``MappingFrame.get``, described below.
                These DataFrames are used in a few places to clean up, simplify, or otherwise process
                predicate and object names.

                Example:

                    For the input string "cyp450 2A3", consider we have these two rows:
                    row 1: ``['^Juggle protein ([xy])', 'Juggle \1', 'J\1']``
                    row 2: ``['^CYP *450 (\d+)[A-Z]\d*$', 'Cytochrome P450 \1', 'CYP\1']``
                    First, we try to match against the first pattern. It doesn't match, so we try the next.
                    This one does match our input string, so we return ``["Cytochrome P450 2", "CYP2"]``.
                    The first returned element (here "Cytochrome P450 2"), is considered the primary,
                    while the remaining ones are -- for most usages -- considered optional extras.
                """
                # Attach the module-level helper functions to the class so that they can be
                # called as methods on any MappingFrame instance.
                MappingFrame.targets = _targets
                MappingFrame.patterns = _patterns
                MappingFrame.get = _get
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class _Compiler:
                    """
                    Compiles regex patterns one at a time, reporting which one failed.

                    Each pattern is stripped, anchored with ``^`` and ``$``, and compiled
                    case-insensitively with the ``regex`` V1 flag.
                    A running count of ``compile`` calls is kept so that an error message
                    can say which pattern was at fault.
                    """

                    def __init__(self):
                        # Number of patterns handed to ``compile`` so far.
                        self._i = 0

                    def compile(self, s: str) -> regex.Pattern:
                        """
                        Compiles a single pattern.

                        Args:
                            s: The pattern text; surrounding whitespace is stripped before anchoring.

                        Returns:
                            The compiled pattern.

                        Raises:
                            ValueError: If anything goes wrong while building or compiling the
                                pattern; the message includes ``s`` and the running count.
                        """
                        # Count this pattern before compiling so that a failure reports its own number.
                        self._i = self._i + 1
                        try:
                            flags = regex.V1 | regex.IGNORECASE
                            anchored = "^" + s.strip() + "$"
                            compiled = regex.compile(anchored, flags=flags)
                        except Exception:
                            raise ValueError(
                                f"Failed to parse '{s}' on line {self._i} (excluding comments and blank lines)"
                            ) from None
                        return compiled
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class Mappings:
                    """
                    Factory for ``MappingFrame`` instances.
                    See the documentation of ``MappingFrame``.
                    """

                    @classmethod
                    def from_resource(cls, name: str) -> MappingFrame:
                        """
                        Loads the mapping called ``name`` from the ``mappings`` resources.

                        The path is resolved with ``MandosResources.a_path("mappings", name)``
                        and then read with ``from_path``.
                        """
                        return cls.from_path(MandosResources.a_path("mappings", name))

                    @classmethod
                    def from_path(cls, path: Path) -> MappingFrame:
                        """
                        Reads a mapping from a CSV-like file or ``.regexes`` file.
                        Feather and Parquet are fine, too.
                        The ``.regexes`` suffix is a simple extension of CSV that uses ``--->`` as the
                        delimiter and ignores empty lines and lines beginning with ``#``.
                        It's just nice for easily editing in a text editor.

                        After reading, every value in the first column is replaced by its compiled
                        pattern, so a malformed pattern raises ``ValueError`` here.
                        """
                        frame = MappingFrame.read_file(path)
                        pattern_column = frame.columns[0]
                        frame[pattern_column] = frame[pattern_column].map(_Compiler().compile)
                        return frame
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                # Names exported by a star-import of this module.
                __all__ = ["MappingFrame", "Mappings"]
            
                                                        
            
                                    
            
            
                | 100 |  |  |  |