Completed
Push — master ( e37ee7...012fd7 )
by Dev
01:59
created

CleanText   A

Complexity

Total Complexity 16

Size/Duplication

Total Lines 161
Duplicated Lines 0 %

Test Coverage

Coverage 92.98%

Importance

Changes 0
Metric Value
eloc 78
dl 0
loc 161
ccs 53
cts 57
cp 0.9298
rs 10
c 0
b 0
f 0
wmc 16

11 Methods

Rating   Name   Duplication   Size   Complexity  
A fixEncoding() 0 10 1
A stripHtmlTags() 0 14 2
A getSentences() 0 12 4
A removeStopWordsAtExtremity() 0 8 1
A stripHtmlTagsOldWay() 0 16 2
A stipHtmlTagsFromDom() 0 6 1
A removeDate() 0 13 1
A removeSrOnly() 0 3 1
A removeStopWords() 0 6 1
A removePunctuation() 0 3 1
A keepOnlySentence() 0 3 1
1
<?php
2
3
namespace PiedWeb\TextAnalyzer;
4
5
use ForceUTF8\Encoding;
6
7
class CleanText
8
{
9
    /**
     * Pattern matching one "sentence": an ASCII capital letter, then at least
     * four characters that are neither a line feed nor a sentence terminator,
     * then one terminator (".", "!", "?" or "…").
     *
     * The `u` modifier is required: "…" is a multi-byte UTF-8 character, and
     * without it each of its bytes is treated as a separate member of the
     * character classes. That truncates sentences in the middle of unrelated
     * multi-byte characters and can end a match on a lone "\xE2" byte.
     */
    const REGEX_SENTENCE = '/[A-Z][^\n\.\!\?…]{4,}[\.\!\?…]/u';
10
11
    /**
     * Lower-case words removed by removeStopWords() and
     * removeStopWordsAtExtremity().
     *
     * The list mixes English and French; a few entries ('as', 'me', 'on')
     * appear in both sections.
     */
    const STOP_WORDS = [
        // --- English ---
        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are',
        'as', 'at', 'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
        'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers',
        'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
        'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often',
        'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
        'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to',
        'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who',
        'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

        // --- French ---
        'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la',
        'le', 'leur', 'lui', 'plus', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous',
        'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes',
        'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi', 'comme',

        // Single-letter tokens (presumably what remains of elisions such as "l'" once apostrophes become spaces)
        'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',

        // More French determiners, pronouns and adverbs
        'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles',
        'sans', 'soi', 'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',

        // Conjugated forms of "être" and "avoir", plus a few very common verbs
        'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
        'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était',
        'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez',
        'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus',
        'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait',
        'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut', 'eûmes', 'eûtes',
        'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
        'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',

        // Presumably web-page boilerplate (comment widgets, navigation labels)
        'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
        'icone', 'flèche',
        'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',

        // URL residue that shows up in scraped text
        'http//www', 'https//www',
    ];
50
51 3
    /**
     * Normalise the encoding of a text: convert it to UTF-8, decode HTML
     * entities, unify apostrophes and replace double spaces and non-breaking
     * spaces with a single space.
     *
     * @param string $text Raw text
     *
     * @return string The normalised text
     */
    public static function fixEncoding(string $text)
    {
        $utf8 = Encoding::toUTF8($text);

        // Encode once, decode twice: entities already present in the input
        // are double-encoded by htmlentities() and need both decoding passes.
        $decoded = htmlentities($utf8);
        $decoded = html_entity_decode($decoded);
        $decoded = html_entity_decode($decoded);
        $decoded = htmlspecialchars_decode($decoded, ENT_QUOTES);

        // Unify the typographic apostrophe with the plain one.
        $unified = str_replace('’', "'", $decoded);

        // Re-encode so non-breaking spaces show up as "&nbsp;", swap them
        // (and double spaces) for a plain space, then decode again.
        $encoded = htmlentities($unified);
        $encoded = str_replace(['  ', '&nbsp;'], ' ', $encoded);

        return html_entity_decode($encoded);
    }
62
63 3
    /**
     * Extract the sentences matched by self::REGEX_SENTENCE.
     *
     * Only matches that split into fewer than 30 space-separated pieces are
     * kept; runs of whitespace inside a kept sentence become a single space.
     *
     * @param string $text Text to scan
     *
     * @return array List of sentences, in order of appearance
     */
    public static function getSentences(string $text)
    {
        $found = [];
        if (!preg_match_all(self::REGEX_SENTENCE, $text, $found)) {
            return [];
        }

        $kept = [];
        foreach ($found[0] as $candidate) {
            // Number of space-separated pieces = number of spaces + 1.
            $pieces = substr_count($candidate, ' ') + 1;
            if ($pieces >= 30) {
                continue;
            }

            $kept[] = preg_replace('/\s+/', ' ', $candidate);
        }

        return $kept;
    }
76
77
    /**
     * Reduce a text to the sentences found by getSentences(), joined by a
     * single space.
     *
     * @param string $text Text to filter
     *
     * @return string The kept sentences, or an empty string when none match
     */
    public static function keepOnlySentence(string $text)
    {
        $sentences = self::getSentences($text);

        return implode(' ', $sentences);
    }
81
82 3
    /**
     * Replace punctuation marks with a space.
     *
     * Handles , . ( ) [ ] { } ! ? ; : … " « » * / | < > as well as " - " and
     * " + " when surrounded by spaces. Runs of two or more spaces in the
     * input are also reduced to one space. The replacement is a single pass,
     * so spaces produced by adjacent replacements are not merged.
     *
     * @param string $text Text to clean
     *
     * @return string|null The cleaned text (null if the regex engine fails)
     */
    public static function removePunctuation(string $text)
    {
        // The original last alternative was " + " with an unescaped "+": it
        // matched two or more spaces and never a literal plus sign. Both
        // behaviours are now spelled out explicitly.
        $pattern = '/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | \+ | {2,}/';

        return preg_replace($pattern, ' ', $text);
    }
86
87 3
    /**
     * Replace dates written as "[day] month year" with a space.
     *
     * Month names and abbreviations are recognised in French and English,
     * case-insensitively. The year part is two digits in the range 00-39,
     * optionally prefixed by "20".
     *
     * @param string $text Text to clean
     *
     * @return string|null The text without the matched dates
     */
    public static function removeDate(string $text)
    {
        $month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';

        // "\b" keeps a month abbreviation from matching the tail of a longer
        // word (e.g. "Myanmar 2019").
        $pattern = '/([0-3]?[0-9]\s+)?\b'.$month.'\s+(20)?[0-3][0-9]/i';

        // The "u" modifier makes the case-insensitive match work on accented
        // letters too ("DÉCEMBRE", "AOÛT").
        $cleaned = preg_replace($pattern.'u', ' ', $text);
        if (null === $cleaned) {
            // The subject is not valid UTF-8: fall back to byte-wise matching.
            $cleaned = preg_replace($pattern, ' ', $text);
        }

        return $cleaned;
    }
101
102 6
    /**
     * Remove the words of self::STOP_WORDS that are surrounded by spaces.
     *
     * Apostrophes are first turned into spaces. Each stop word is searched
     * as " word " (case-sensitive) and replaced with a single space, so a
     * stop word at the very start or end of the text is left in place.
     *
     * @param string $text Text to clean
     *
     * @return string The trimmed result
     */
    public static function removeStopWords(string $text)
    {
        $needles = [];
        foreach (self::STOP_WORDS as $stopWord) {
            $needles[] = ' '.$stopWord.' ';
        }

        $withoutApostrophes = str_replace("'", ' ', $text);

        return trim(str_replace($needles, ' ', $withoutApostrophes));
    }
109
110 9
    /**
     * Remove one stop word from the start and one from the end of a text.
     *
     * Apostrophes are first turned into spaces. A leading stop word is only
     * removed when followed by a space, a trailing one only when preceded by
     * a space; matching is case-sensitive.
     *
     * @param string $text Text to clean
     *
     * @return string The trimmed result
     */
    public static function removeStopWordsAtExtremity(string $text)
    {
        $text = str_replace("'", ' ', trim($text));

        $alternatives = implode('|', array_map(static function ($stopWord) {
            return preg_quote($stopWord, '@');
        }, self::STOP_WORDS));

        $text = preg_replace('@^(?:'.$alternatives.') @', '', $text);
        // The space belongs in front of EVERY alternative: the previous
        // pattern started with a bare "a$", which stripped the final letter
        // of any text ending in "a" ("pizza" became "pizz").
        $text = preg_replace('@ (?:'.$alternatives.')$@', '', $text);

        return trim($text);
    }
119
120 3
    /**
     * Regex-based HTML-to-text conversion.
     *
     * Drops the content of invisible elements (script, style, head, ...),
     * strips the remaining tags and normalises whitespace.
     *
     * @param string $html HTML source
     *
     * @return string The extracted text
     */
    public static function stripHtmlTagsOldWay(string $html)
    {
        // This pattern often fails on large documents because of PCRE/JIT
        // limits; when it does, keep the untouched HTML.
        $textWithoutInvisible = preg_replace(
            '@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si',
            ' ',
            $html
        );
        // preg_last_error() returns an int, never false: the previous check
        // `false === preg_last_error()` could not succeed, so the stripped
        // text was always thrown away.
        if (null !== $textWithoutInvisible && PREG_NO_ERROR === preg_last_error()) {
            $html = $textWithoutInvisible;
        }

        $html = preg_replace('/\s+/', ' ', $html);
        // Mark block ends and line breaks before the tags disappear.
        $html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
        $html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
        $html = strip_tags($html);
        $html = preg_replace("/[\t\n\r]+/", "\n", $html);
        // NOTE(review): "\s+" below also matches "\n", so the line breaks
        // inserted above do not survive this step — confirm this is intended.
        $html = trim(implode("\n", array_map('trim', explode("\n", preg_replace('/\s+/', ' ', $html)))));

        return $html;
    }
137
138 6
    /**
     * Convert HTML to plain text.
     *
     * The HTML is parsed with simple_html_dom; stripHtmlTagsOldWay() is the
     * fallback when loading fails.
     *
     * @param string $html HTML source
     *
     * @return string|null The extracted text
     */
    public static function stripHtmlTags(string $html)
    {
        // A space before every tag keeps words from sticking together when
        // inline elements such as <span> are used like blocks.
        $prepared = self::removeSrOnly(str_replace('<', ' <', $html));

        $dom = new \simple_html_dom();

        // NOTE(review): static analysis reports that `false === $dom->load(...)`
        // is always false, so the fallback below is presumably unreachable —
        // confirm against the simple_html_dom version in use.
        if (false !== $dom->load($prepared)) {
            return self::stipHtmlTagsFromDom($dom);
        }

        return self::stripHtmlTagsOldWay($prepared);
    }
153
154 6
    /**
     * Return the plain text of a loaded DOM, with runs of spaces reduced to
     * a single space.
     *
     * @param \simple_html_dom $dom Document already loaded with HTML
     *
     * @return string|null The document text
     */
    public static function stipHtmlTagsFromDom(\simple_html_dom $dom)
    {
        return preg_replace('/ +/s', ' ', $dom->plaintext);
    }
161
162
    /**
     * Replace <span> elements whose class attribute contains
     * "screen-reader-only" or "sr-only" with a space.
     *
     * Only spans without nested tags are matched. Not very good, but it
     * avoids JIT errors.
     *
     * @param string $html HTML source
     *
     * @return string|null The HTML without the matched spans
     */
    public static function removeSrOnly(string $html)
    {
        $srOnlySpan = '/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si';

        return preg_replace($srOnlySpan, ' ', $html);
    }
169
}
170