Issues (1)

src/CleanText.php (1 issue)

Severity
1
<?php
2
3
namespace PiedWeb\TextAnalyzer;
4
5
use ForceUTF8\Encoding;
6
7
/**
 * Static helpers used to normalise raw text / HTML before analysing it:
 * encoding clean-up, sentence extraction, punctuation / date / stop word
 * removal and HTML tag stripping.
 */
class CleanText
{
    /**
     * Pattern matching one "sentence": an ASCII uppercase letter, at least four characters which are
     * neither a line break nor a sentence terminator, then one terminator (. ! ? …).
     *
     * The `u` modifier is required: `…` is a multi-byte character and, without it, the character classes
     * would be built from its individual bytes, cutting sentences in the middle of unrelated multi-byte
     * characters (€, —, À, æ...).
     */
    const REGEX_SENTENCE = '/[A-Z][^\n\.\!\?…]{4,}[\.\!\?…]/u';

    /**
     * Lowercase words ignored by the stop word removal helpers.
     */
    const STOP_WORDS = [
        // English stop words
        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are',
        'as', 'at', 'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
        'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers',
        'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
        'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often',
        'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
        'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to',
        'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who',
        'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

        // French stop words
        'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la',
        'le', 'leur', 'lui', 'plus', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous',
        'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes',
        'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi', 'comme',

        // Single letters (what remains of an elided word once the apostrophe became a space)
        'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',

        'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles',
        'sans', 'soi', 'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',

        'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
        'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était',
        'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez',
        'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus',
        'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait',
        'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut', 'eûmes', 'eûtes',
        'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
        'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',

        'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
        'icone', 'flèche',
        'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',

        // Weird thing happen every day
        'http//www', 'https//www',
    ];

    /**
     * Normalise the encoding of a text: pass it through ForceUTF8, decode HTML entities, turn the
     * typographic apostrophe into a straight one and replace double spaces / `&nbsp;` by one space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function fixEncoding(string $text)
    {
        $text = Encoding::toUTF8($text);
        // Entities are decoded twice so that double-encoded entities (&amp;eacute;) are resolved too
        $text = html_entity_decode(html_entity_decode(htmlentities($text)));
        $text = htmlspecialchars_decode($text, ENT_QUOTES);
        $text = str_replace('’', "'", $text); // Unify '
        // Re-encoding exposes non-breaking spaces as `&nbsp;` so they can be replaced like double spaces
        $text = html_entity_decode(str_replace(['  ', '&nbsp;'], ' ', htmlentities($text)));

        return $text;
    }

    /**
     * Extract every chunk of the text matching self::REGEX_SENTENCE and made of less than 30
     * space-separated words. Whitespace runs inside a kept sentence are collapsed to one space.
     *
     * @param string $text
     *
     * @return string[] empty when nothing matches (or when the text is not valid UTF-8)
     */
    public static function getSentences(string $text)
    {
        if (!preg_match_all(self::REGEX_SENTENCE, $text, $matches, PREG_SET_ORDER, 0)) {
            return [];
        }

        $sentences = [];
        foreach ($matches as $match) {
            // We keep only sentence with less than 30 words
            if (count(explode(' ', $match[0])) >= 30) {
                continue;
            }

            $sentences[] = preg_replace('/\s+/', ' ', $match[0]);
        }

        return $sentences;
    }

    /**
     * Reduce a text to its sentences (see getSentences()), joined with one space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function keepOnlySentence(string $text)
    {
        return implode(' ', self::getSentences($text));
    }

    /**
     * Replace each punctuation mark by a space. An isolated hyphen or plus sign (surrounded by spaces)
     * and runs of two or more spaces are replaced by one space too.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removePunctuation(string $text)
    {
        // ` \+ ` must be escaped: a bare ` + ` is a quantifier and never matched a literal plus sign.
        // ` {2,}` keeps what the unescaped version was doing in practice (collapsing space runs).
        $regex = '/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | \+ | {2,}/';

        return preg_replace($regex, ' ', $text);
    }

    /**
     * Replace by a space the dates written `[day] month year` where month is a French or English
     * month name (or abbreviation) and year is written with two or four (20xx) digits.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeDate(string $text)
    {
        $month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august'
                .'|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui'
                .'|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';

        // Optional day, month name, then the year
        $text = preg_replace('/([0-3]?[0-9]\s+)?'.$month.'\s+(20)?[0-3][0-9]/i', ' ', $text);

        return $text;
    }

    /**
     * Remove the stop words surrounded by spaces (apostrophes are first turned into spaces).
     * The comparison is case sensitive; a stop word starting or ending the text is left untouched.
     *
     * @param string $text
     *
     * @return string the trimmed text
     */
    public static function removeStopWords(string $text)
    {
        $needles = [];
        foreach (self::STOP_WORDS as $stopWord) {
            $needles[] = ' '.$stopWord.' ';
        }

        $withoutApostrophe = str_replace("'", ' ', $text);

        return trim(str_replace($needles, ' ', $withoutApostrophe));
    }

    /**
     * Remove one stop word at the beginning and one at the end of the text
     * (apostrophes are first turned into spaces). The comparison is case sensitive.
     *
     * @param string $text
     *
     * @return string the trimmed text
     */
    public static function removeStopWordsAtExtremity(string $text)
    {
        $text = trim($text);
        $text = str_replace("'", ' ', $text);

        // Leading stop word: `^word ` for each stop word
        $text = preg_replace('@^'.implode(' |^', self::STOP_WORDS).' @', '', $text);

        // Trailing stop word: ` word$` for each stop word. The pattern must start with a space, else
        // its first alternative is `a$` and the last letter of any text ending with an "a" is eaten.
        $text = preg_replace('@ '.implode('$| ', self::STOP_WORDS).'$@', '', $text);

        return trim($text);
    }

    /**
     * Regex based HTML to text conversion, used as a fallback by stripHtmlTags().
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTagsOldWay(string $html)
    {
        // Drop the elements whose content is not visible. Often error because of limitation of JIT,
        // in which case the html is kept as is.
        $regex = '@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si';
        $textWithoutInvisible = preg_replace($regex, ' ', $html);
        if (PREG_NO_ERROR === preg_last_error()) {
            $html = $textWithoutInvisible;
        }

        $html = preg_replace('/\s+/', ' ', $html);
        $html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
        $html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
        $html = strip_tags($html);
        $html = preg_replace("/[\t\n\r]+/", "\n", $html);

        // NOTE(review): `\s+` also matches the line breaks inserted above, so the result is a single
        // line. Confirm whether only spaces were meant to be collapsed here.
        $singleSpaced = preg_replace('/\s+/', ' ', $html);
        $lines = array_map('trim', explode("\n", $singleSpaced));

        return trim(implode("\n", $lines));
    }

    /**
     * Convert HTML to text with simple_html_dom, falling back on stripHtmlTagsOldWay()
     * when the document can't be loaded.
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTags(string $html)
    {
        // Permit to avoid stick words when span are used like block
        $html = str_replace('<', ' <', $html);
        $html = self::removeSrOnly($html);

        $dom = new \simple_html_dom();
        // NOTE(review): static analysis reports this condition as always false, which would make the
        // fallback unreachable. Confirm against the return value of simple_html_dom::load().
        if (false === $dom->load($html)) {
            return self::stripHtmlTagsOldWay($html);
        }

        return self::stipHtmlTagsFromDom($dom);
    }

    /**
     * Return the plain text of a loaded DOM, runs of spaces being collapsed to one space.
     *
     * @param \simple_html_dom $dom
     *
     * @return string
     */
    public static function stipHtmlTagsFromDom(\simple_html_dom $dom)
    {
        $text = $dom->plaintext;
        $text = preg_replace('/ +/s', ' ', $text);

        return $text;
    }

    /**
     * Replace by a space the `<span>` whose class attribute contains `sr-only` or `screen-reader-only`
     * (only the span without child element are matched).
     *
     * Not very good... avoid Jit error.
     *
     * @param string $html
     *
     * @return string the cleaned html, or the html as received if the regex engine failed
     */
    public static function removeSrOnly(string $html)
    {
        $regex = '/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si';
        $cleaned = preg_replace($regex, ' ', $html);

        // preg_replace returns null on failure (backtrack / JIT limit): never hand null to the caller
        return null === $cleaned ? $html : $cleaned;
    }
}
177