RobinDev /
ExtractExpression
| 1 | <?php |
||
| 2 | |||
| 3 | namespace rOpenDev\ExtractExpression; |
||
| 4 | |||
| 5 | use ForceUTF8\Encoding; |
||
| 6 | |||
| 7 | class CleanText |
||
| 8 | { |
||
/**
 * Pattern used to detect one sentence: an ASCII uppercase letter, at least
 * four characters that are neither a line break nor a sentence terminator,
 * then one terminator (".", "!", "?" or "…").
 *
 * The `u` modifier makes PCRE read "…" as a single character. Without it the
 * three bytes of "…" are separate members of both character classes, so any
 * character sharing one of those bytes (for example "“", "—", "€" or "æ")
 * ends the sentence early and can leave a truncated multi-byte sequence in
 * the match. With `u`, a subject that is not valid UTF-8 makes the preg_*
 * call fail instead of matching.
 *
 * @var string
 */
public static $regexSentence = '/[A-Z][^\n\.\!\?…]{4,}[\.\!\?…]/u';
||
| 10 | |||
/**
 * Lower-case words treated as noise by removeStopWords() and
 * removeStopWordsAtExtremity().
 *
 * English and French entries live in the same list, so a few words appear
 * twice. The order is significant for callers that build search lists or
 * regex alternations from it.
 *
 * @var string[]
 */
public static $stopWords = [
    // --- English ---
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever',
    'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
    'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must',
    'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said',
    'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
    'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where',
    'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

    // --- French: articles, pronouns, prepositions, elided forms ---
    'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'plus',
    'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui',
    'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi',
    'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',
    'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi',
    'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',

    // --- French: conjugated forms of common verbs ---
    'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons',
    'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient',
    'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions',
    'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons',
    'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut',
    'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
    'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',

    // --- Words that look like web-page interface labels ---
    'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
    'icone', 'flèche',
    'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',

    // --- Odd tokens encountered in real-world input ---
    'http//www', 'https//www',
];
||
| 45 | |||
/**
 * Converts a text to UTF-8, decodes the HTML entities it contains and
 * normalises two characters: the typographic apostrophe "’" becomes "'" and
 * non-breaking spaces become plain spaces.
 *
 * @param string $text Raw text, possibly mis-encoded or entity-encoded
 *
 * @return string
 */
public static function fixEncoding(string $text)
{
    // fix encoding
    $text = Encoding::toUTF8($text);
    // One htmlentities() pass followed by two decodes: a net single level of
    // entity decoding, applied to every named entity.
    $text = html_entity_decode(html_entity_decode(htmlentities($text)));
    $text = htmlspecialchars_decode($text, ENT_QUOTES);
    $text = str_replace('’', "'", $text); // Unify '

    // htmlentities() turns each non-breaking space into the "&nbsp;" entity,
    // so the entity is what must be searched for here; looking for a raw
    // space character in the encoded text never changes anything.
    $text = html_entity_decode(str_replace('&nbsp;', ' ', htmlentities($text)));

    return $text;
}
||
| 57 | |||
/**
 * Extracts the sentences found in a text.
 *
 * A sentence is whatever self::$regexSentence matches. Matches that split
 * into 30 or more pieces on a single space are discarded; the others are
 * returned with every whitespace run collapsed to one space.
 *
 * @param string $text
 *
 * @return string[] Empty when nothing matches or when the regex call fails
 */
public static function getSentences(string $text)
{
    if (!preg_match_all(self::$regexSentence, $text, $found)) {
        return [];
    }

    $kept = [];
    foreach ($found[0] as $candidate) {
        // The size check runs on the raw match, before whitespace is collapsed.
        if (count(explode(' ', $candidate)) >= 30) {
            continue;
        }

        $kept[] = preg_replace('/\s+/', ' ', $candidate);
    }

    return $kept;
}
||
| 71 | |||
/**
 * Reduces a text to the sentences getSentences() finds in it, joined by a
 * single space.
 *
 * @param string $text
 *
 * @return string Empty string when no sentence is found
 */
public static function keepOnlySentence(string $text)
{
    $sentences = self::getSentences($text);

    return implode(' ', $sentences);
}
||
| 76 | |||
/**
 * Replaces each punctuation mark, bracket, quote or separator listed in the
 * pattern, as well as a dash surrounded by spaces, with one space.
 *
 * Note: the last alternative, " + ", is a quantified space (two or more
 * consecutive spaces), not a literal plus sign.
 *
 * @param string $text
 *
 * @return string|null Null if the regex engine reports an error
 */
public static function removePunctuation(string $text)
{
    $punctuation = '/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | + /';

    return preg_replace($punctuation, ' ', $text);
}
||
| 81 | |||
/**
 * Replaces dates written as "[day] month year" with a single space.
 *
 * Month names and abbreviations are recognised in French and English; the
 * year is either two digits (00-39) or four digits starting with "20".
 *
 * - The `u` modifier lets the case-insensitive match cover accented capitals
 *   ("FÉVRIER", "DÉCEMBRE").
 * - The leading look-behind stops a month abbreviation from matching the end
 *   of a longer word or number ("Ivanov 2014" is left alone).
 * - The trailing look-ahead refuses a year that continues with more digits,
 *   so "mai 1968" is left alone instead of being cut down to "68".
 *
 * @param string $text
 *
 * @return string The input is returned unchanged if the regex engine fails
 *                (for instance on invalid UTF-8)
 */
public static function removeDate(string $text)
{
    $month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';

    // french format
    $pattern = '/(?<![\p{L}\p{N}])([0-3]?[0-9]\s+)?'.$month.'\s+(20)?[0-3][0-9](?!\d)/iu';
    $cleaned = preg_replace($pattern, ' ', $text);

    return $cleaned ?? $text;
}
||
| 96 | |||
/**
 * Removes the stop words that stand between two spaces in a text.
 *
 * Apostrophes are turned into spaces first. Each entry of self::$stopWords
 * is then searched surrounded by one space on each side and replaced by a
 * single space. The match is case-sensitive, and a stop word at the very
 * start or end of the text has no surrounding spaces, so it is not matched.
 *
 * @param string $text
 *
 * @return string Trimmed result
 */
public static function removeStopWords(string $text)
{
    $withoutApostrophes = str_replace("'", ' ', $text);

    // " a | able | ... " split on "|" yields " a ", " able ", ...
    $joined = ' '.implode(' | ', self::$stopWords).' ';
    $needles = explode('|', $joined);

    $filtered = str_replace($needles, ' ', $withoutApostrophes);

    return trim($filtered);
}
||
| 104 | |||
/**
 * Removes one stop word from the beginning and one from the end of a text.
 *
 * A leading stop word must be followed by a space and a trailing one must be
 * preceded by a space, so only whole words are removed; a text ending in the
 * letter "a" (such as "pizza") is left untouched. Only one word is removed
 * per side. Entries of self::$stopWords are inserted verbatim into the
 * pattern, so regex metacharacters in them would be interpreted.
 *
 * @param string $text
 *
 * @return string Trimmed result
 */
public static function removeStopWordsAtExtremity(string $text)
{
    // Replace apostrophes before trimming so that a leading or trailing
    // apostrophe does not leave a space that hides the word from the anchors.
    $text = trim(str_replace("'", ' ', $text));

    $alternatives = implode('|', self::$stopWords);

    $text = preg_replace('@^(?:'.$alternatives.') @', '', $text);
    // The space sits outside the group so that every alternative, including
    // the first one, requires it.
    $text = preg_replace('@ (?:'.$alternatives.')$@', '', $text);

    return trim($text);
}
||
| 114 | |||
/**
 * Regex-based HTML to text conversion.
 *
 * Steps: drop script/style/head/iframe/... elements together with their
 * content, collapse whitespace, add line breaks after closing div/p tags and
 * after br tags, strip the remaining tags, then normalise whitespace.
 *
 * NOTE(review): the last step collapses every whitespace run, newlines
 * included, into one space before splitting on "\n", so the line breaks
 * inserted earlier do not survive and the result is a single line. Confirm
 * whether that is intended before changing it.
 *
 * @param string $html
 *
 * @return string
 */
public static function stripHtmlTagsOldWay(string $html)
{
    // Per the original author's note, this pattern often errors out because
    // of JIT limitations; in that case the input is kept as it is.
    $textWithoutInvisible = preg_replace('@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si', ' ', $html);
    // preg_last_error() returns an integer code, never false: success is
    // PREG_NO_ERROR. Comparing with false discarded the result every time.
    if (null !== $textWithoutInvisible && PREG_NO_ERROR === preg_last_error()) {
        $html = $textWithoutInvisible;
    }

    $html = preg_replace('/\s+/', ' ', $html);
    $html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
    $html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
    $html = strip_tags($html);
    $html = preg_replace("/[\t\n\r]+/", "\n", $html);
    $html = trim(implode("\n", array_map('trim', explode("\n", preg_replace('/\s+/', ' ', $html)))));

    return $html;
}
||
| 132 | |||
/**
 * Converts HTML to text with simple_html_dom, falling back to
 * stripHtmlTagsOldWay() when the document cannot be loaded.
 *
 * @param string $html
 *
 * @return string
 */
public static function stripHtmlTags(string $html)
{
    // A space in front of every "<" keeps words from sticking together when
    // inline tags such as span are used like blocks.
    $spaced = str_replace('<', ' <', $html);
    $spaced = self::removeSrOnly($spaced);

    $parser = new \simple_html_dom();
    if (false !== $parser->load($spaced)) {
        // Collapse runs of spaces in the parser's plain-text output.
        return preg_replace('/ +/s', ' ', $parser->plaintext);
    }

    // If we failed to load the html in dom
    return self::stripHtmlTagsOldWay($spaced);
}
||
| 149 | |||
/**
 * Replaces span elements whose class attribute contains "screen-reader-only"
 * or "sr-only" with a single space, provided the span holds no nested tag.
 *
 * Not very good (original author's note), but it avoids a JIT error.
 *
 * @param string $html
 *
 * @return string|null Null if the regex engine reports an error
 */
public static function removeSrOnly(string $html)
{
    $screenReaderSpan = '/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si';

    return preg_replace($screenReaderSpan, ' ', $html);
}
||
| 157 | } |
||
| 158 |