| 1 |  |  | # Copyright 2019-2020 by Christopher C. Little. | 
            
                                                        
            
                                    
            
            
                | 2 |  |  | # This file is part of Abydos. | 
            
                                                        
            
                                    
            
            
                | 3 |  |  | # | 
            
                                                        
            
                                    
            
            
                | 4 |  |  | # Abydos is free software: you can redistribute it and/or modify | 
            
                                                        
            
                                    
            
            
                | 5 |  |  | # it under the terms of the GNU General Public License as published by | 
            
                                                        
            
                                    
            
            
                | 6 |  |  | # the Free Software Foundation, either version 3 of the License, or | 
            
                                                        
            
                                    
            
            
                | 7 |  |  | # (at your option) any later version. | 
            
                                                        
            
                                    
            
            
                | 8 |  |  | # | 
            
                                                        
            
                                    
            
            
                | 9 |  |  | # Abydos is distributed in the hope that it will be useful, | 
            
                                                        
            
                                    
            
            
                | 10 |  |  | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
            
                                                        
            
                                    
            
            
                | 11 |  |  | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 
            
                                                        
            
                                    
            
            
                | 12 |  |  | # GNU General Public License for more details. | 
            
                                                        
            
                                    
            
            
                | 13 |  |  | # | 
            
                                                        
            
                                    
            
            
                | 14 |  |  | # You should have received a copy of the GNU General Public License | 
            
                                                        
            
                                    
            
            
                | 15 |  |  | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. | 
            
                                                        
            
                                    
            
            
                | 16 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 17 |  |  | """abydos.distance._ncd_paq9a. | 
            
                                                        
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 19 | 1 |  | NCD using PAQ9A | 
            
                                                        
            
                                    
            
            
                | 20 |  |  | """ | 
            
                                                        
            
                                    
            
            
                | 21 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 22 |  |  | from ._distance import _Distance | 
            
                                                        
            
                                    
            
            
                | 23 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 24 | 1 |  | try: | 
            
                                                        
            
                                    
            
            
                | 25 |  |  |     import paq | 
            
                                                        
            
                                    
            
            
                | 26 |  |  | except ImportError:  # pragma: no cover | 
            
                                                        
            
                                    
            
            
                | 27 |  |  |     # If the system lacks the paq9a library, that's fine, but PAQ9A compression | 
            
                                                        
            
                                    
            
            
                | 28 |  |  |     # similarity won't be supported. | 
            
                                                        
            
                                    
            
            
                | 29 |  |  |     paq = None  # type: ignore | 
            
                                                        
            
                                    
            
            
                | 30 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 31 | 1 |  | __all__ = ['NCDpaq9a'] | 
            
                                                        
            
                                    
            
            
                | 32 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 33 | 1 |  |  | 
            
                                                        
            
                                    
            
            
                | 34 | 1 |  | class NCDpaq9a(_Distance): | 
            
                                                        
            
                                    
            
            
                | 35 |  |  |     """Normalized Compression Distance using PAQ9A compression. | 
            
                                                        
            
                                    
            
            
                | 36 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 37 |  |  |     Cf. http://mattmahoney.net/dc/#paq9a | 
            
                                                        
            
                                    
            
            
                | 38 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 39 |  |  |     Normalized compression distance (NCD) :cite:`Cilibrasi:2005`. | 
            
                                                        
            
                                    
            
            
                | 40 | 1 |  |  | 
            
                                                        
            
                                    
            
            
                | 41 |  |  |     .. versionadded:: 0.4.0 | 
            
                                                        
            
                                    
            
            
                | 42 |  |  |     """ | 
            
                                                        
            
                                    
            
            
                | 43 | 1 |  |  | 
            
                                                        
            
                                    
            
            
                | 44 |  |  |     def dist(self, src: str, tar: str) -> float: | 
            
                                                        
            
                                    
            
            
                | 45 |  |  |         """Return the NCD between two strings using PAQ9A compression. | 
            
                                                        
            
                                    
            
            
                | 46 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 47 |  |  |         Parameters | 
            
                                                        
            
                                    
            
            
                | 48 |  |  |         ---------- | 
            
                                                        
            
                                    
            
            
                | 49 |  |  |         src : str | 
            
                                                        
            
                                    
            
            
                | 50 |  |  |             Source string for comparison | 
            
                                                        
            
                                    
            
            
                | 51 |  |  |         tar : str | 
            
                                                        
            
                                    
            
            
                | 52 |  |  |             Target string for comparison | 
            
                                                        
            
                                    
            
            
                | 53 | 1 |  |  | 
            
                                                        
            
                                    
            
            
                | 54 |  |  |         Returns | 
            
                                                        
            
                                    
            
            
                | 55 |  |  |         ------- | 
            
                                                        
            
                                    
            
            
                | 56 |  |  |         float | 
            
                                                        
            
                                    
            
            
                | 57 |  |  |             Compression distance | 
            
                                                        
            
                                    
            
            
                | 58 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 59 |  |  |         Raises | 
            
                                                        
            
                                    
            
            
                | 60 |  |  |         ------ | 
            
                                                        
            
                                    
            
            
                | 61 |  |  |         ValueError | 
            
                                                        
            
                                    
            
            
                | 62 |  |  |             Install the paq module in order to use PAQ9A | 
            
                                                        
            
                                    
            
            
                | 63 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 64 |  |  |         Examples | 
            
                                                        
            
                                    
            
            
                | 65 |  |  |         -------- | 
            
                                                        
            
                                    
            
            
                | 66 |  |  |         >>> cmp = NCDpaq9a() | 
            
                                                        
            
                                    
            
            
                | 67 |  |  |         >>> cmp.dist('cat', 'hat') | 
            
                                                        
            
                                    
            
            
                | 68 |  |  |         0.42857142857142855 | 
            
                                                        
            
                                    
            
            
                | 69 |  |  |         >>> cmp.dist('Niall', 'Neil') | 
            
                                                        
            
                                    
            
            
                | 70 |  |  |         0.5555555555555556 | 
            
                                                        
            
                                    
            
            
                | 71 |  |  |         >>> cmp.dist('aluminum', 'Catalan') | 
            
                                                        
            
                                    
            
            
                | 72 |  |  |         0.5833333333333334 | 
            
                                                        
            
                                    
            
            
                | 73 |  |  |         >>> cmp.dist('ATCG', 'TAGC') | 
            
                                                        
            
                                    
            
            
                | 74 |  |  |         0.5 | 
            
                                                        
            
                                    
            
            
                | 75 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 76 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 77 |  |  |         .. versionadded:: 0.4.0 | 
            
                                                        
            
                                    
            
            
                | 78 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 79 |  |  |         """ | 
            
                                                        
            
                                    
            
            
                | 80 |  |  |         if src == tar: | 
            
                                                        
            
                                    
            
            
                | 81 |  |  |             return 0.0 | 
            
                                                        
            
                                    
            
            
                | 82 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 83 |  |  |         src_b = src.encode('utf-8') | 
            
                                                        
            
                                    
            
            
                | 84 |  |  |         tar_b = tar.encode('utf-8') | 
            
                                                        
            
                                    
            
            
                | 85 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 86 |  |  |         if paq is not None: | 
            
                                                        
            
                                    
            
            
                | 87 |  |  |             src_comp = paq.compress(src_b) | 
            
                                                        
            
                                    
            
            
                | 88 |  |  |             tar_comp = paq.compress(tar_b) | 
            
                                                        
            
                                    
            
            
                | 89 |  |  |             concat_comp = paq.compress(src_b + tar_b) | 
            
                                                        
            
                                    
            
            
                | 90 |  |  |             concat_comp2 = paq.compress(tar_b + src_b) | 
            
                                                        
            
                                    
            
            
                | 91 |  |  |         else:  # pragma: no cover | 
            
                                                        
            
                                    
            
            
                | 92 |  |  |             raise ValueError('Install the paq module in order to use PAQ9A') | 
            
                                                        
            
                                    
            
            
                | 93 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 94 |  |  |         # Each string returned by PAQ9A's compressor has 4 header bytes | 
            
                                                        
            
                                    
            
            
                | 95 |  |  |         # followed by a byte of information then 3 null bytes. And it is | 
            
                                                        
            
                                    
            
            
                | 96 |  |  |         # concluded with 3 bytes of \xff. So 4+3+3 invariant bytes are | 
            
                                                        
            
                                    
            
            
                | 97 |  |  |         # subtracted here. | 
            
                                                        
            
                                    
            
            
                | 98 |  |  |         return ( | 
            
                                                        
            
                                    
            
            
                | 99 |  |  |             (min(len(concat_comp), len(concat_comp2)) - 10) | 
            
                                                        
            
                                    
            
            
                | 100 |  |  |             - (min(len(src_comp), len(tar_comp)) - 10) | 
            
                                                        
            
                                    
            
            
                | 101 |  |  |         ) / (max(len(src_comp), len(tar_comp)) - 10) | 
            
                                                        
            
                                    
            
            
                | 102 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 103 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 104 |  |  | if __name__ == '__main__': | 
            
                                                        
            
                                    
            
            
                | 105 |  |  |     import doctest | 
            
                                                        
            
                                    
            
            
                | 106 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 107 |  |  |     doctest.testmod() | 
            
                                                        
            
                                    
            
            
                | 108 |  |  |  |