1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace rOpenDev\ExtractExpression; |
4
|
|
|
|
5
|
|
|
use ForceUTF8\Encoding; |
6
|
|
|
|
7
|
|
|
/**
 * Static helpers to turn raw text or HTML into cleaned text: encoding fixes,
 * sentence extraction, and removal of punctuation, dates, stop words and tags.
 */
class CleanText
{
    /**
     * Pattern describing one sentence: an ASCII capital letter, at least four
     * characters that are neither a line feed nor a terminator, then a
     * terminator (., !, ? or …).
     *
     * The `u` modifier is required because `…` is a multi-byte character used
     * inside character classes; without it the classes work on single bytes.
     *
     * @var string
     */
    public static $regexSentence = '/[A-Z][^\n\.\!\?…]{4,}[\.\!\?…]/u';

    /**
     * Words removed by removeStopWords() and removeStopWordsAtExtremity().
     *
     * @var string[]
     */
    public static $stopWords = [
        // English stop words
        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
        'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever',
        'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
        'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must',
        'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said',
        'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
        'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where',
        'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

        // French stop words
        'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'plus',
        'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui',
        'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi',
        'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',
        'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi',
        'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',

        'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons',
        'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient',
        'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions',
        'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons',
        'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut',
        'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
        'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',

        'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
        'icone', 'flèche',
        'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',

        // URL fragments left over once punctuation has been stripped
        'http//www', 'https//www',
    ];

    /**
     * Converts the text to UTF-8 (via ForceUTF8), decodes HTML entities,
     * unifies apostrophes and turns non-breaking spaces into regular spaces.
     *
     * @param string $text Raw text
     *
     * @return string
     */
    public static function fixEncoding(string $text)
    {
        $text = Encoding::toUTF8($text);

        // Encode everything once, then decode twice so that double-encoded
        // entities (e.g. "&amp;eacute;") end up as plain characters.
        $text = html_entity_decode(html_entity_decode(htmlentities($text)));
        $text = htmlspecialchars_decode($text, ENT_QUOTES);

        // Unify apostrophes
        $text = str_replace('’', "'", $text);

        // After htmlentities() a non-breaking space is the "&nbsp;" entity, so
        // that entity is what has to be replaced (a whitespace literal would
        // never match here).
        $text = html_entity_decode(str_replace('&nbsp;', ' ', htmlentities($text)));

        return $text;
    }

    /**
     * Extracts every sentence matching self::$regexSentence.
     *
     * @param string $text
     *
     * @return string[] Sentences in order of appearance, each with its inner
     *                  whitespace runs collapsed to a single space
     */
    public static function getSentences(string $text)
    {
        if (!preg_match_all(self::$regexSentence, $text, $found)) {
            return [];
        }

        return array_map(static function ($sentence) {
            return preg_replace('/\s+/', ' ', $sentence);
        }, $found[0]);
    }

    /**
     * Keeps only the parts of the text matching self::$regexSentence.
     *
     * @param string $text
     *
     * @return string The matched sentences, each followed by one space, with
     *                whitespace runs collapsed; '' when nothing matches
     */
    public static function keepOnlySentence(string $text)
    {
        if (!preg_match_all(self::$regexSentence, $text, $found)) {
            return '';
        }

        $textFiltered = '';
        foreach ($found[0] as $sentence) {
            $textFiltered .= $sentence.' ';
        }

        // Normalise the filtered text (not the original input).
        return preg_replace('/\s+/', ' ', $textFiltered);
    }

    /**
     * Replaces each punctuation mark with a single space.
     *
     * NOTE(review): the last alternative, ` + `, is a quantified space (it
     * matches two or more consecutive spaces), not a literal plus sign. It is
     * kept unchanged because callers may rely on the space collapsing.
     *
     * @param string $text
     *
     * @return string|null null when PCRE fails
     */
    public static function removePunctuation(string $text)
    {
        $punctuation = '/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | + /';

        return preg_replace($punctuation, ' ', $text);
    }

    /**
     * Replaces "[day] month year" dates (French or English month names, full
     * or abbreviated, case-insensitive) with a single space.
     *
     * NOTE(review): the month alternation has no word boundary, so a month
     * abbreviation ending another word and followed by a number is matched
     * too (e.g. "Omar 2019").
     *
     * @param string $text
     *
     * @return string|null null when PCRE fails
     */
    public static function removeDate(string $text)
    {
        $month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';

        // Optional day, month name, then a two-digit year or a 20xx year.
        $datePattern = '/([0-3]?[0-9]\s+)?'.$month.'\s+(20)?[0-3][0-9]/i';

        return preg_replace($datePattern, ' ', $text);
    }

    /**
     * Removes stop words surrounded by spaces; apostrophes are first turned
     * into spaces so elided forms ("l'arbre") are split.
     *
     * Words at the very beginning or end of the text are not surrounded by
     * spaces and are therefore left in place.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWords(string $text)
    {
        $needles = [];
        foreach (self::$stopWords as $word) {
            $needles[] = ' '.$word.' ';
        }

        $withoutApostrophe = str_replace("'", ' ', $text);

        return trim(str_replace($needles, ' ', $withoutApostrophe));
    }

    /**
     * Removes one leading and one trailing stop word, when present.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWordsAtExtremity(string $text)
    {
        $text = str_replace("'", ' ', trim($text));

        $quoted = [];
        foreach (self::$stopWords as $word) {
            $quoted[] = preg_quote($word, '@');
        }
        $anyStopWord = '(?:'.implode('|', $quoted).')';

        // The separating space is part of both patterns so that only whole
        // words are stripped ("pizza" must not lose its final "a").
        $text = preg_replace('@^'.$anyStopWord.' @', '', $text);
        $text = preg_replace('@ '.$anyStopWord.'$@', '', $text);

        return trim($text);
    }

    /**
     * Regex-based HTML to text conversion, used by stripHtmlTags() when the
     * DOM parser cannot load the markup.
     *
     * NOTE(review): the final `\s+` collapse also replaces the line feeds
     * inserted after </div>, </p> and <br>, so the result is a single line.
     * Kept unchanged to preserve the current output.
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTagsOldWay(string $html)
    {
        // Drop elements whose content is never displayed. This pattern may hit
        // PCRE/JIT limits on large documents; in that case keep the input.
        $textWithoutInvisible = preg_replace('@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si', ' ', $html);
        if (null !== $textWithoutInvisible && PREG_NO_ERROR === preg_last_error()) {
            $html = $textWithoutInvisible;
        }

        $html = preg_replace('/\s+/', ' ', $html);
        $html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
        $html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
        $html = strip_tags($html);
        $html = preg_replace("/[\t\n\r]+/", "\n", $html);

        $lines = explode("\n", preg_replace('/\s+/', ' ', $html));

        return trim(implode("\n", array_map('trim', $lines)));
    }

    /**
     * Converts HTML to plain text with simple_html_dom, falling back to
     * stripHtmlTagsOldWay() when the document cannot be loaded.
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTags(string $html)
    {
        // Put a space before every tag so words do not stick together when
        // inline elements (e.g. span) are used like blocks.
        $html = str_replace('<', ' <', $html);
        $html = self::removeSrOnly($html);

        $dom = new \simple_html_dom();
        if (false === $dom->load($html)) {
            return self::stripHtmlTagsOldWay($html);
        }

        return preg_replace('/ +/s', ' ', $dom->plaintext);
    }

    /**
     * Replaces <span> elements carrying a "screen-reader-only" or "sr-only"
     * class (and containing no nested tag) with a space.
     *
     * @param string $html
     *
     * @return string The input is returned untouched when PCRE fails
     */
    public static function removeSrOnly(string $html)
    {
        $stripped = preg_replace('/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si', ' ', $html);

        return $stripped ?? $html;
    }
}
164
|
|
|
|