| 1 |  |  | # -*- coding: utf-8 -*- | 
            
                                                        
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 3 |  |  | # Copyright 2014-2018 by Christopher C. Little. | 
            
                                                        
            
                                    
            
            
                | 4 |  |  | # This file is part of Abydos. | 
            
                                                        
            
                                    
            
            
                | 5 |  |  | # | 
            
                                                        
            
                                    
            
            
                | 6 |  |  | # Abydos is free software: you can redistribute it and/or modify | 
            
                                                        
            
                                    
            
            
                | 7 |  |  | # it under the terms of the GNU General Public License as published by | 
            
                                                        
            
                                    
            
            
                | 8 |  |  | # the Free Software Foundation, either version 3 of the License, or | 
            
                                                        
            
                                    
            
            
                | 9 |  |  | # (at your option) any later version. | 
            
                                                        
            
                                    
            
            
                | 10 |  |  | # | 
            
                                                        
            
                                    
            
            
                | 11 |  |  | # Abydos is distributed in the hope that it will be useful, | 
            
                                                        
            
                                    
            
            
                | 12 |  |  | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
            
                                                        
            
                                    
            
            
                | 13 |  |  | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 
            
                                                        
            
                                    
            
            
                | 14 |  |  | # GNU General Public License for more details. | 
            
                                                        
            
                                    
            
            
                | 15 |  |  | # | 
            
                                                        
            
                                    
            
            
                | 16 |  |  | # You should have received a copy of the GNU General Public License | 
            
                                                        
            
                                    
            
            
                | 17 |  |  | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. | 
            
                                                        
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 19 | 1 |  | """abydos.stemmer._caumanns. | 
            
                                                        
            
                                    
            
            
                | 20 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 21 |  |  | Caumanns German stemmer | 
            
                                                        
            
                                    
            
            
                | 22 |  |  | """ | 
            
                                                        
            
                                    
            
            
                | 23 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 24 | 1 |  | from __future__ import ( | 
            
                                                        
            
                                    
            
            
                | 25 |  |  |     absolute_import, | 
            
                                                        
            
                                    
            
            
                | 26 |  |  |     division, | 
            
                                                        
            
                                    
            
            
                | 27 |  |  |     print_function, | 
            
                                                        
            
                                    
            
            
                | 28 |  |  |     unicode_literals, | 
            
                                                        
            
                                    
            
            
                | 29 |  |  | ) | 
            
                                                        
            
                                    
            
            
                | 30 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 31 | 1 |  | from unicodedata import normalize | 
            
                                                        
            
                                    
            
            
                | 32 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 33 | 1 |  | from six import text_type | 
            
                                                        
            
                                    
            
            
                | 34 | 1 |  | from six.moves import range | 
            
                                                        
            
                                    
            
            
                | 35 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 36 | 1 |  | from ._stemmer import _Stemmer | 
            
                                                        
            
                                    
            
            
                | 37 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 38 | 1 |  | __all__ = ['Caumanns', 'caumanns'] | 
            
                                                        
            
                                    
            
            
                | 39 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 40 |  |  |  | 
            
                                                        
            
                                    
            
            
                class Caumanns(_Stemmer):
                    """Caumanns stemmer.

                    A stemmer for German following the algorithm of Jörg Caumanns, which
                    is described in his article :cite:`Caumanns:1999`.

                    The code here is modelled on the GermanStemFilter described at
                    :cite:`Lang:2013`.
                    """

                    # Table for translate(): lowercase umlaut code point -> plain vowel
                    _umlauts = {ord(uml): base for uml, base in zip('äöü', 'aou')}

                    def stem(self, word):
                        """Return Caumanns German stem.

                        The word is lowercased and NFC-normalized, umlauts and ß are
                        folded, doubled letters and a few letter clusters are masked with
                        single placeholder symbols, suffixes are stripped repeatedly, and
                        finally the placeholders are expanded again.

                        Parameters
                        ----------
                        word : str
                            The word to stem

                        Returns
                        -------
                        str
                            Word stem (the empty string if `word` is empty)

                        Examples
                        --------
                        >>> stmr = Caumanns()
                        >>> stmr.stem('lesen')
                        'les'
                        >>> stmr.stem('graues')
                        'grau'
                        >>> stmr.stem('buchstabieren')
                        'buchstabier'

                        """
                        if not word:
                            return ''

                        # Record the capitalisation before lowercasing discards it; the
                        # stripping stage only removes a final t/st from words that did
                        # not start with an uppercase letter.
                        starts_upper = word[0].isupper()
                        text = normalize('NFC', text_type(word.lower()))

                        # --- Substitution stage ---
                        # 1. Fold umlauts to their plain vowels and ß to ss
                        text = text.translate(self._umlauts).replace('ß', 'ss')

                        # 2. Mask the second of two equal neighbours with *. Each
                        #    character is compared with the previously emitted one (which
                        #    may itself be a *), so 'aaa' becomes 'a*a'.
                        marked = [text[0]]
                        for char in text[1:]:
                            marked.append('*' if char == marked[-1] else char)
                        text = ''.join(marked)

                        # 3. Collapse letter clusters into one-character placeholders.
                        #    Order matters: 'sch' must be handled before 'ch', and 'ei'
                        #    before 'ie'.
                        clusters = (
                            ('sch', '$'),
                            ('ch', '§'),
                            ('ei', '%'),
                            ('ie', '&'),
                            ('ig', '#'),
                            ('st', '!'),
                        )
                        for cluster, symbol in clusters:
                            text = text.replace(cluster, symbol)

                        # --- Stripping stage ---
                        # Keep removing suffixes until none applies or the word has
                        # shrunk to three characters.
                        while len(text) > 3:
                            size = len(text)
                            if (size > 4 and text.endswith(('em', 'er'))) or (
                                size > 5 and text.endswith('nd')
                            ):
                                cut = 2
                            elif text.endswith(('e', 's', 'n')) or (
                                not starts_upper and text.endswith(('t', '!'))
                            ):
                                cut = 1
                            else:
                                break
                            text = text[:-cut]

                        # --- Additional optimizations ---
                        # Drop the doubling marker from a trailing 'erin*'
                        if len(text) > 5 and text.endswith('erin*'):
                            text = text[:-1]
                        # A final z is rewritten as x
                        if text.endswith('z'):
                            text = text[:-1] + 'x'

                        # --- Undo the substitutions ---
                        for cluster, symbol in clusters:
                            text = text.replace(symbol, cluster)

                        # Expand each * into a copy of the character in front of it (as
                        # found in the still-masked string)
                        text = text[0] + ''.join(
                            before if current == '*' else current
                            for before, current in zip(text, text[1:])
                        )

                        # Finally, reduce the first 'gege' to 'ge' in words longer than 4
                        if len(text) > 4:
                            text = text.replace('gege', 'ge', 1)

                        return text
            
                                                        
            
                                    
            
            
                | 147 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 148 |  |  |  | 
            
                                                        
            
                                    
            
            
                def caumanns(word):
                    """Stem a German word using the Caumanns algorithm.

                    Convenience function: it instantiates :py:class:`Caumanns` and
                    delegates to :py:meth:`Caumanns.stem`.

                    Parameters
                    ----------
                    word : str
                        The word to stem

                    Returns
                    -------
                    str
                        Word stem

                    Examples
                    --------
                    >>> caumanns('lesen')
                    'les'
                    >>> caumanns('graues')
                    'grau'
                    >>> caumanns('buchstabieren')
                    'buchstabier'

                    """
                    # A fresh stemmer instance is created on every call.
                    stemmer = Caumanns()
                    return stemmer.stem(word)
            
                                                        
            
                                    
            
            
                | 175 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 176 |  |  |  | 
            
                                                        
            
                                    
            
            
                if __name__ == '__main__':
                    # When executed as a script, run the doctest examples embedded in
                    # this module's docstrings.
                    from doctest import testmod

                    testmod()
            
                                                        
            
                                    
            
            
                | 181 |  |  |  |