1 | <?php |
||
2 | |||
3 | namespace rOpenDev\ExtractExpression; |
||
4 | |||
5 | use ForceUTF8\Encoding; |
||
6 | |||
/**
 * Stateless helpers that turn raw HTML or scraped text into plain text
 * suitable for expression extraction: encoding repair, HTML stripping,
 * sentence detection, and removal of punctuation, dates and stop words.
 */
class CleanText
{
    /**
     * PCRE pattern describing one sentence: an uppercase letter, at least four
     * characters that are neither a line break nor a sentence terminator, then
     * one terminator (. ! ? …).
     *
     * The "u" modifier is required: without it the multi-byte "…" inside the
     * character classes is read as three independent bytes, which cuts
     * sentences at unrelated characters sharing those bytes (’ — € æ) and can
     * end a match in the middle of a character.
     *
     * @var string
     */
    public static $regexSentence = '/\p{Lu}[^\n\.\!\?…]{4,}[\.\!\?…]/u';

    /**
     * Lowercase words ignored when extracting expressions.
     *
     * @var string[]
     */
    public static $stopWords = [
        // English stop words
        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
        'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever',
        'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
        'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must',
        'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said',
        'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
        'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where',
        'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

        // French stop words
        'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'plus',
        'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui',
        'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi',
        'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',
        'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi',
        'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',

        'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons',
        'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient',
        'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions',
        'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons',
        'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut',
        'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
        'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',

        'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
        'icone', 'flèche',
        'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',

        // URL residue left once punctuation has been stripped
        'http//www', 'https//www',
    ];

    /**
     * Converts the text to UTF-8, decodes (possibly double-encoded) HTML
     * entities, unifies the typographic apostrophe and turns non-breaking
     * spaces into regular spaces.
     *
     * @param string $text Text in an arbitrary encoding, possibly containing HTML entities
     *
     * @return string
     */
    public static function fixEncoding(string $text)
    {
        $text = Encoding::toUTF8($text);

        // Encode once, decode twice: also resolves entities that were encoded twice upstream.
        $text = html_entity_decode(html_entity_decode(htmlentities($text)));
        $text = htmlspecialchars_decode($text, ENT_QUOTES);

        // Unify the typographic apostrophe with the ASCII one.
        $text = str_replace('’', "'", $text);

        // htmlentities() turns U+00A0 into "&nbsp;", so the entity (not a raw
        // space) is what has to be searched for. U+202F (narrow no-break space)
        // has no HTML 4.01 named entity and is therefore matched raw.
        // NOTE(review): the search literals of the reviewed copy were
        // indistinguishable from plain spaces, which made this step a no-op;
        // confirm the intended list against the upstream repository.
        $text = html_entity_decode(str_replace(['&nbsp;', "\u{202F}"], ' ', htmlentities($text)));

        return $text;
    }

    /**
     * Extracts the sentences matched by self::$regexSentence, keeping only
     * those made of fewer than 30 space-separated words.
     *
     * @param string $text Plain text (line breaks delimit blocks: a sentence never spans two lines)
     *
     * @return string[] Sentences with every whitespace run collapsed to a single space;
     *                  empty when nothing matches or the pattern fails (e.g. invalid UTF-8)
     */
    public static function getSentences(string $text)
    {
        if (!preg_match_all(self::$regexSentence, $text, $found)) {
            return [];
        }

        $sentences = [];
        foreach ($found[0] as $candidate) {
            // Overlong matches are usually navigation or boilerplate rather than prose.
            if (count(explode(' ', $candidate)) < 30) {
                $sentences[] = preg_replace('/\s+/', ' ', $candidate);
            }
        }

        return $sentences;
    }

    /**
     * Reduces a text to its detected sentences, joined by a single space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function keepOnlySentence(string $text)
    {
        return implode(' ', self::getSentences($text));
    }

    /**
     * Replaces punctuation marks and isolated dashes with a space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removePunctuation(string $text)
    {
        // NOTE(review): the last alternative " + " is a quantified space (it
        // collapses runs of two or more spaces), not a literal plus sign;
        // confirm whether " \+ " was intended. Pattern left untouched.
        $punctuation = '/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | + /';

        return preg_replace($punctuation, ' ', $text);
    }

    /**
     * Replaces dates written as "[day] month year" (French or English month
     * names and abbreviations, year as "YY" or "20YY") with a space.
     *
     * The month must start a word and the year must not be followed by another
     * digit, so "Omar 30 ans" or "mai 1999" are left untouched instead of being
     * partially eaten.
     *
     * @param string $text
     *
     * @return string The text without dates, or the text unchanged when it cannot be processed
     */
    public static function removeDate(string $text)
    {
        $month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';

        // "u" makes the case-insensitive match work on accented month names too.
        $pattern = '/(?<![\p{L}\p{N}])([0-3]?[0-9]\s+)?'.$month.'\s+(20)?[0-3][0-9](?!\d)/iu';

        $withoutDate = preg_replace($pattern, ' ', $text);

        // preg_replace() yields null on malformed UTF-8: keep the input in that case.
        return $withoutDate ?? $text;
    }

    /**
     * Removes every stop word that is surrounded by spaces. Apostrophes are
     * turned into spaces first so elided forms ("l'arbre") are split.
     *
     * Words at the very beginning or end of the text are not touched (see
     * removeStopWordsAtExtremity()). Matching is case-sensitive.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWords(string $text)
    {
        $text = str_replace("'", ' ', $text);

        $alternation = self::stopWordsAlternation('/');
        if ('' === $alternation) {
            return trim($text);
        }

        // The trailing space is only asserted, not consumed, so it can also
        // serve as the leading space of the next stop word ("de de", "a a").
        $withoutStopWords = preg_replace('/ (?:'.$alternation.')(?= )/', '', $text);

        return trim($withoutStopWords ?? $text);
    }

    /**
     * Removes at most one stop word at the beginning and one at the end of the
     * text. Apostrophes are turned into spaces first.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWordsAtExtremity(string $text)
    {
        $text = str_replace("'", ' ', trim($text));

        $alternation = self::stopWordsAlternation('@');
        if ('' === $alternation) {
            return trim($text);
        }

        $text = preg_replace('@^(?:'.$alternation.') @', '', $text) ?? $text;
        // The leading space applies to every alternative: a text merely ending
        // with the letters of a stop word ("pizza") must stay intact.
        $text = preg_replace('@ (?:'.$alternation.')$@', '', $text) ?? $text;

        return trim($text);
    }

    /**
     * Regex-only HTML to text conversion, used as a fallback when the DOM
     * parser cannot load the document.
     *
     * @param string $html
     *
     * @return string One trimmed line per block (closing div/p, br), without empty lines
     */
    public static function stripHtmlTagsOldWay(string $html)
    {
        // Drop elements whose content is never displayed. On large documents
        // this pattern can hit the PCRE JIT stack limit; the result is then
        // null and the untouched markup is kept.
        $withoutInvisible = preg_replace('@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si', ' ', $html);
        if (null !== $withoutInvisible && PREG_NO_ERROR === preg_last_error()) {
            $html = $withoutInvisible;
        }

        // Source line breaks carry no meaning in HTML: flatten them, then mark block ends.
        $html = preg_replace('/\s+/', ' ', $html);
        $html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
        $html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
        $html = strip_tags($html);
        $html = preg_replace("/[\t\n\r]+/", "\n", $html);

        // Collapse horizontal whitespace only, so the block separators survive.
        $lines = array_map('trim', explode("\n", preg_replace('/[^\S\n]+/', ' ', $html)));
        $lines = array_filter($lines, static function (string $line): bool {
            return '' !== $line;
        });

        return implode("\n", $lines);
    }

    /**
     * Converts HTML to plain text with simple_html_dom, falling back to
     * stripHtmlTagsOldWay() when the document cannot be loaded.
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTags(string $html)
    {
        // A space before every tag prevents words from sticking together when
        // inline elements (span...) are laid out like blocks.
        $html = self::removeSrOnly(str_replace('<', ' <', $html));

        $dom = new \simple_html_dom();
        if (false === $dom->load($html)) {
            return self::stripHtmlTagsOldWay($html);
        }

        return preg_replace('/ +/s', ' ', $dom->plaintext);
    }

    /**
     * Replaces span elements reserved to screen readers (class containing
     * "sr-only" or "screen-reader-only") with a space.
     *
     * Deliberately simple (no nested markup inside the span) to stay clear of
     * PCRE JIT errors.
     *
     * @param string $html
     *
     * @return string The filtered markup, or the markup unchanged if the pattern fails
     */
    public static function removeSrOnly(string $html)
    {
        $filtered = preg_replace('/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si', ' ', $html);

        return $filtered ?? $html;
    }

    /**
     * Builds the "word1|word2|..." alternation of all stop words, escaped for
     * use inside a pattern enclosed by the given delimiter.
     */
    private static function stopWordsAlternation(string $delimiter): string
    {
        $escaped = [];
        foreach (self::$stopWords as $word) {
            $escaped[] = preg_quote($word, $delimiter);
        }

        return implode('|', $escaped);
    }
}
||
158 |