PiedWeb /
TextAnalyzer
| 1 | <?php |
||
| 2 | |||
| 3 | namespace PiedWeb\TextAnalyzer; |
||
| 4 | |||
| 5 | use ForceUTF8\Encoding; |
||
| 6 | |||
/**
 * Static helpers that turn raw HTML / scraped text into plain text suitable
 * for word and expression analysis: encoding repair, tag stripping, sentence
 * extraction, punctuation / date / stop-word removal.
 *
 * Stop-word matching is case-sensitive: the list is lower-case and nothing in
 * this class changes the case of the input.
 */
class CleanText
{
    /**
     * A sentence: an upper-case letter, at least four characters that are
     * neither a line break nor a sentence terminator, then one terminator.
     *
     * The `u` modifier is required: without it the multi-byte `…` inside the
     * character classes is read as three independent bytes, which lets a
     * match end in the middle of another multi-byte character.
     */
    const REGEX_SENTENCE = '/\p{Lu}[^\n\.\!\?…]{4,}[\.\!\?…]/u';

    /**
     * Lower-case English and French words ignored by the stop-word helpers.
     * A few entries are listed twice ('me', 'on', 'as'); this is harmless.
     */
    const STOP_WORDS = [
        // English stop words
        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are',
        'as', 'at', 'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
        'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers',
        'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
        'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often',
        'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
        'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to',
        'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who',
        'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

        // French stop words
        'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la',
        'le', 'leur', 'lui', 'plus', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous',
        'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes',
        'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi', 'comme',

        // Single-letter words, mostly what is left of elisions (l', d', ...)
        // once the helpers below have turned apostrophes into spaces
        'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',

        'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles',
        'sans', 'soi', 'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',

        // Conjugated forms of "être" / "avoir" and a few other frequent verbs
        'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
        'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était',
        'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez',
        'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus',
        'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait',
        'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut', 'eûmes', 'eûtes',
        'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
        'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',

        'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
        'icone', 'flèche',
        'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',

        // Odd tokens seen in practice
        'http//www', 'https//www',
    ];

    /**
     * Normalises a text to UTF-8 with HTML entities decoded, typographic
     * apostrophes turned into "'" and non-breaking spaces turned into " ".
     *
     * @param string $text raw text, in any encoding Encoding::toUTF8() handles
     *
     * @return string
     */
    public static function fixEncoding(string $text)
    {
        $text = Encoding::toUTF8($text);

        // Encode once, decode twice: entities already present in the input
        // come out decoded as well.
        $text = html_entity_decode(html_entity_decode(htmlentities($text)));
        $text = htmlspecialchars_decode($text, ENT_QUOTES);

        $text = str_replace('’', "'", $text); // Unify '

        // U+00A0 (no-break space) => regular space
        $text = str_replace("\xC2\xA0", ' ', $text);

        return $text;
    }

    /**
     * Extracts the sentences (see REGEX_SENTENCE) found in a text.
     *
     * Sentences made of 30 or more space-separated chunks are dropped; in the
     * kept ones every run of whitespace is collapsed to a single space.
     *
     * @param string $text
     *
     * @return string[] empty when nothing matches or when the text is not valid UTF-8
     */
    public static function getSentences(string $text)
    {
        // 0 (no match) and false (PCRE error) both mean "no sentence"
        if (!preg_match_all(self::REGEX_SENTENCE, $text, $matches)) {
            return [];
        }

        $sentences = [];
        foreach ($matches[0] as $sentence) {
            if (count(explode(' ', $sentence)) < 30) { // We keep only sentence with less than 30 words
                $sentences[] = preg_replace('/\s+/', ' ', $sentence);
            }
        }

        return $sentences;
    }

    /**
     * Returns the sentences of a text (see getSentences()) joined by a space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function keepOnlySentence(string $text)
    {
        return implode(' ', self::getSentences($text));
    }

    /**
     * Replaces each punctuation mark, and each " - " separator, with a space.
     *
     * NOTE(review): the last alternative of the pattern, ` + `, is a
     * quantifier and not a literal plus sign: it matches a run of two or more
     * spaces. It is kept as is because it is unclear whether a literal " + "
     * was intended - confirm before changing.
     *
     * @param string $text
     *
     * @return string|null null when PCRE fails
     */
    public static function removePunctuation(string $text)
    {
        return preg_replace('/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | + /', ' ', $text);
    }

    /**
     * Replaces "[day] month year" dates (French word order, French or English
     * month names, full or abbreviated) with a single space.
     *
     * The year must be two digits in 00-39, optionally prefixed by "20".
     * Month and year are matched on word boundaries, so "dismay 2020" and
     * "mai 1999" are left untouched instead of being partially eaten.
     *
     * @param string $text
     *
     * @return string|null null when PCRE fails
     */
    public static function removeDate(string $text)
    {
        $month = 'janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august'
            .'|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui'
            .'|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre';

        // french format
        $pattern = '/\b(?:[0-3]?[0-9]\s+)?(?:'.$month.')\s+(?:20)?[0-3][0-9]\b/i';

        // "u" makes the case-insensitive match work on accented letters (FÉVRIER, Août)
        $cleaned = preg_replace($pattern.'u', ' ', $text);
        if (null === $cleaned) {
            // Most likely invalid UTF-8: retry byte-wise rather than give up
            $cleaned = preg_replace($pattern, ' ', $text);
        }

        return $cleaned;
    }

    /**
     * Removes every stop word that stands between two spaces.
     *
     * Apostrophes are first replaced by spaces so elided forms ("l'avion")
     * are split. A stop word at the very start or very end of the text is
     * not removed (see removeStopWordsAtExtremity()).
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWords(string $text)
    {
        $text = str_replace("'", ' ', $text);

        // The following space is only looked at, not consumed, so it can
        // also serve as the leading space of the next stop word: consecutive
        // (even repeated) stop words are all removed in one pass.
        $cleaned = preg_replace('@ (?:'.self::stopWordsAlternation().')(?= )@', '', $text);

        return trim(null === $cleaned ? $text : $cleaned);
    }

    /**
     * Removes at most one stop word at the beginning and one at the end of a
     * text, when separated from the rest by a space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWordsAtExtremity(string $text)
    {
        $text = str_replace("'", ' ', trim($text));

        $alternation = self::stopWordsAlternation();
        $text = preg_replace('@^(?:'.$alternation.') @', '', $text);
        // The leading space applies to every alternative, so a text merely
        // ending with the letters of a stop word ("pizza") is left intact.
        $text = preg_replace('@ (?:'.$alternation.')$@', '', $text);

        return trim($text);
    }

    /**
     * Strips HTML tags with regular expressions only.
     *
     * The content of non-visible elements (script, style, head, ...) is
     * dropped, closing div/p and br tags become line breaks, everything else
     * is passed through strip_tags(). Lines are trimmed and separated by "\n".
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTagsOldWay(string $html)
    {
        // Often error because of limitation of JIT
        $regex = '@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si';
        $textWithoutInvisible = preg_replace($regex, ' ', $html);
        if (PREG_NO_ERROR === preg_last_error()) {
            $html = $textWithoutInvisible;
        } // else: keep the input and let strip_tags() do what it can

        // Line breaks present in the markup are not meaningful...
        $html = preg_replace('/\s+/', ' ', $html);
        // ...the ones derived from block-level tags are.
        $html = preg_replace('@</(div|p)>@si', "\$0 \n\n", $html);
        $html = preg_replace('@<br[^>]*>@si', "\$0 \n", $html);
        $html = strip_tags($html);
        $html = preg_replace("/[\t\n\r]+/", "\n", $html);

        // Collapse whitespace inside lines only: collapsing "\n" as well
        // would throw away the line structure built just above.
        $html = preg_replace('/[^\S\n]+/', ' ', $html);

        return trim(implode("\n", array_map('trim', explode("\n", $html))));
    }

    /**
     * Converts HTML to plain text using simple_html_dom, falling back to
     * stripHtmlTagsOldWay() when the document cannot be loaded.
     *
     * NOTE(review): the fallback relies on simple_html_dom::load() returning
     * false on failure - confirm against the version of the library in use.
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTags(string $html)
    {
        // Permit to avoid stick words when span are used like block
        $html = str_replace('<', ' <', $html);
        $html = self::removeSrOnly($html);

        $dom = new \simple_html_dom();
        if (false === $dom->load($html)) { // If we failed to load the html in dom
            return self::stripHtmlTagsOldWay($html);
        }

        return self::stipHtmlTagsFromDom($dom);
    }

    /**
     * Returns the plain text of a loaded DOM with runs of spaces collapsed.
     *
     * The misspelled name ("stip") is part of the public API and is kept.
     *
     * @param \simple_html_dom $dom
     *
     * @return string
     */
    public static function stipHtmlTagsFromDom(\simple_html_dom $dom)
    {
        return preg_replace('/ +/', ' ', $dom->plaintext);
    }

    /**
     * Replaces `<span>` elements whose class attribute contains "sr-only" or
     * "screen-reader-only" (and which hold no nested tag) with a space.
     *
     * Not very good... avoid Jit error.
     *
     * @param string $html
     *
     * @return string the input unchanged when PCRE fails
     */
    public static function removeSrOnly(string $html)
    {
        $regex = '/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si';

        return preg_replace($regex, ' ', $html) ?? $html;
    }

    /**
     * Builds, once per request, the `w1|w2|...` regex alternation of all stop
     * words, quoted for use inside an `@`-delimited pattern.
     *
     * @return string
     */
    private static function stopWordsAlternation()
    {
        static $alternation = null;

        if (null === $alternation) {
            $quoted = array_map(
                static function ($word) {
                    return preg_quote($word, '@');
                },
                array_unique(self::STOP_WORDS)
            );
            $alternation = implode('|', $quoted);
        }

        return $alternation;
    }
}
||
| 177 |