CleanText   A
last analyzed

Complexity

Total Complexity 15

Size/Duplication

Total Lines 149
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 73
dl 0
loc 149
rs 10
c 0
b 0
f 0
wmc 15

10 Methods

Rating   Name   Duplication   Size   Complexity  
A fixEncoding() 0 10 1
A removeDate() 0 13 1
A keepOnlySentence() 0 3 1
A stripHtmlTagsOldWay() 0 16 2
A getSentences() 0 12 4
A stripHtmlTags() 0 15 2
A removeStopWordsAtExtremity() 0 8 1
A removeSrOnly() 0 3 1
A removeStopWords() 0 6 1
A removePunctuation() 0 3 1
1
<?php
2
3
namespace rOpenDev\ExtractExpression;
4
5
use ForceUTF8\Encoding;
6
7
/**
 * Collection of static text-cleaning helpers: encoding repair, sentence
 * extraction, punctuation / date / stop-word removal and HTML stripping.
 */
class CleanText
{
    /**
     * Pattern used by getSentences() to recognise a sentence: an ASCII uppercase
     * letter, at least 4 characters that are neither a newline nor a terminator,
     * then one terminator (. ! ? or …).
     *
     * The `u` modifier is required: without it the multi-byte ellipsis inside the
     * character classes is handled byte by byte, which truncates sentences in the
     * middle of other multi-byte characters (e.g. the typographic apostrophe).
     *
     * @var string
     */
    public static $regexSentence = '/[A-Z][^\n\.\!\?…]{4,}[\.\!\?…]/u';

    /**
     * Lower-case words removed by removeStopWords() and removeStopWordsAtExtremity().
     *
     * @var string[]
     */
    public static $stopWords = [
        // English stop words
        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
        'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever',
        'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
        'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must',
        'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said',
        'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
        'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where',
        'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

        // French Stop words
        'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'plus',
        'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui',
        'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi',
        'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',
        'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi',
        'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',

        'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons',
        'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient',
        'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions',
        'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons',
        'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut',
        'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
        'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',

        'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
        'icone', 'flèche',
        'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',

        // Weird thing happen every day
        'http//www', 'https//www',
    ];

    /**
     * Normalise the encoding of a text and decode its HTML entities.
     *
     * @param string $text Raw text, possibly containing HTML entities
     *
     * @return string Text converted with ForceUTF8 and with entities decoded
     */
    public static function fixEncoding(string $text)
    {
        $text = Encoding::toUTF8($text);
        // Encode then decode twice; presumably meant to also unwrap
        // double-encoded entities such as "&amp;eacute;".
        $text = html_entity_decode(html_entity_decode(htmlentities($text)));
        $text = htmlspecialchars_decode($text, ENT_QUOTES);
        $text = str_replace('’', "'", $text); // Unify '
        // Re-encode so non-breaking spaces show up as "&nbsp;", turn them (and
        // double spaces) into a single space, then decode again.
        $text = html_entity_decode(str_replace(['  ', '&nbsp;'], ' ', htmlentities($text)));

        return $text;
    }

    /**
     * Extract the sentences of a text using self::$regexSentence.
     *
     * Only sentences made of fewer than 30 space-separated words are kept, and
     * whitespace runs inside a kept sentence are collapsed to one space.
     * Because the default pattern uses the `u` modifier, $text must be valid
     * UTF-8 (see fixEncoding()); otherwise no sentence is returned.
     *
     * @param string $text
     *
     * @return string[] The matching sentences, in order of appearance
     */
    public static function getSentences(string $text)
    {
        $sentences = [];
        if (preg_match_all(self::$regexSentence, $text, $matches, PREG_SET_ORDER, 0)) {
            foreach ($matches as $m) {
                if (count(explode(' ', $m[0])) < 30) { // We keep only sentence with less than 30 words
                    $sentences[] = preg_replace('/\s+/', ' ', $m[0]);
                }
            }
        }

        return $sentences;
    }

    /**
     * Reduce a text to its sentences (see getSentences()), joined by a space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function keepOnlySentence(string $text)
    {
        return implode(' ', self::getSentences($text));
    }

    /**
     * Replace punctuation marks by a space.
     *
     * Handles , . ( ) [ ] { } ! ? ; … " « » : * / | > < as well as the
     * space-surrounded operators " - " and " + ". Runs of two or more spaces
     * present in the input are collapsed to one; spaces produced by the
     * replacement itself are not collapsed again.
     *
     * @param string $text
     *
     * @return string|null Null if the PCRE engine fails
     */
    public static function removePunctuation(string $text)
    {
        // " \+ " must be escaped: a bare " + " is a quantifier matching space runs.
        // " {2,}" keeps that historical space-collapsing behaviour explicitly.
        return preg_replace('/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | \+ | {2,}/', ' ', $text);
    }

    /**
     * Replace dates written with a month name (French or English, full or
     * abbreviated) by a space, e.g. "12 janvier 2019", "march 19".
     *
     * The day is optional; the year is two digits (00-39), optionally prefixed
     * by "20". Stand-alone years and stand-alone month names are left untouched.
     *
     * @param string $text
     *
     * @return string|null Null if the PCRE engine fails
     */
    public static function removeDate(string $text)
    {
        $month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';
        // french format
        $text = preg_replace('/([0-3]?[0-9]\s+)?'.$month.'\s+(20)?[0-3][0-9]/i', ' ', $text);

        return $text;
    }

    /**
     * Remove the stop words found inside a text.
     *
     * Apostrophes are first replaced by a space, then every stop word surrounded
     * by spaces is replaced by a single space. The match is case-sensitive and a
     * stop word sitting at the very start or end of the text is not removed
     * (see removeStopWordsAtExtremity()).
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWords(string $text)
    {
        $text = str_replace("'", ' ', $text);

        $needles = array_map(static function ($word) {
            return ' '.$word.' ';
        }, self::$stopWords);
        $text = str_replace($needles, ' ', $text);

        return trim($text);
    }

    /**
     * Remove one stop word at the beginning and one at the end of a text.
     *
     * Apostrophes are first replaced by a space. A stop word is only removed
     * when it is a whole word, i.e. separated from the rest of the text by a
     * space; a text consisting of a single word is returned untouched.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWordsAtExtremity(string $text)
    {
        $text = trim($text);
        $text = str_replace("'", ' ', $text);

        // $stopWords is public and mutable: quote each word so it cannot inject regex syntax.
        $alternatives = implode('|', array_map(static function ($word) {
            return preg_quote((string) $word, '@');
        }, self::$stopWords));

        $text = preg_replace('@^(?:'.$alternatives.') @', '', $text);
        // The leading space applies to every alternative, so only whole words
        // are removed (previously the first stop word matched as a bare suffix).
        $text = preg_replace('@ (?:'.$alternatives.')$@', '', $text);

        return trim($text);
    }

    /**
     * Regex-based HTML to text conversion, used as a fallback by stripHtmlTags().
     *
     * Invisible elements (script, style, head, ...) are dropped with their
     * content, a line break is inserted after each closing div / p and each br,
     * the remaining tags are stripped and every resulting line is trimmed.
     *
     * @param string $html
     *
     * @return string Plain text, one line per block-level break
     */
    public static function stripHtmlTagsOldWay(string $html)
    {
        // Often error because of limitation of JIT
        $textWithoutInvisible = preg_replace('@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si', ' ', $html);
        // preg_last_error() returns an int, never false: compare with PREG_NO_ERROR.
        // On failure we keep the original markup and let strip_tags() do its best.
        if (null !== $textWithoutInvisible && PREG_NO_ERROR === preg_last_error()) {
            $html = $textWithoutInvisible;
        }

        $html = preg_replace('/\s+/', ' ', $html);
        $html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
        $html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
        $html = strip_tags($html);
        $html = preg_replace("/[\t\n\r]+/", "\n", $html);
        // Collapse horizontal whitespace only, so the line breaks inserted above survive.
        $html = preg_replace('/[^\S\n]+/', ' ', $html);
        $html = trim(implode("\n", array_map('trim', explode("\n", $html))));

        return $html;
    }

    /**
     * Convert HTML to plain text using simple_html_dom, falling back on
     * stripHtmlTagsOldWay() when the document cannot be loaded.
     *
     * @param string $html
     *
     * @return string|null
     */
    public static function stripHtmlTags(string $html)
    {
        // Permit to avoid stick words when span are used like block
        $html = str_replace('<', ' <', $html);
        $html = self::removeSrOnly($html);

        $dom = new \simple_html_dom();
        // NOTE(review): whether load() can actually return false depends on the
        // simple_html_dom version in use — confirm the fallback is reachable.
        if (false === $dom->load($html)) { // If we failed to load the html in dom
            $text = self::stripHtmlTagsOldWay($html);
        } else {
            $text = $dom->plaintext;
            $text = preg_replace('/ +/s', ' ', $text);
        }

        return $text;
    }

    /**
     * Replace by a space the span elements whose class attribute contains
     * "screen-reader-only" or "sr-only" and whose content holds no nested tag.
     *
     * Not very good... avoid Jit error.
     *
     * @param string $html
     *
     * @return string|null Null if the PCRE engine fails
     */
    public static function removeSrOnly(string $html)
    {
        return preg_replace('/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si', ' ', $html);
    }
}
158