Passed
Push — master ( 474905...ba9227 )
by Dev
01:17
created

CleanText::removeSrOnly()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace rOpenDev\ExtractExpression;
4
5
use ForceUTF8\Encoding;
6
7
/**
 * Static helpers that turn raw HTML / scraped text into cleaner plain text:
 * encoding repair, HTML stripping, sentence extraction, and removal of
 * punctuation, dates and stop words.
 */
class CleanText
{
    /**
     * Pattern matching one "sentence": an ASCII capital letter, at least four
     * characters that are neither a line break nor a sentence terminator, then
     * one terminator (. ! ? …).
     *
     * The `u` modifier is required because the character classes contain the
     * multibyte "…": in byte mode its three bytes would be excluded/matched
     * individually, cutting sentences in the middle of characters such as "€".
     *
     * @var string
     */
    public static $regexSentence = '/[A-Z][^\n\.\!\?…]{4,}[\.\!\?…]/u';

    /**
     * Lower-case stop words (English, French, plus forum/UI noise words).
     *
     * @var string[]
     */
    public static $stopWords = [
        // English stop words
        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
        'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever',
        'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
        'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must',
        'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said',
        'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
        'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where',
        'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

        // French Stop words
        'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'plus',
        'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui',
        'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi',
        'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',
        'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi',
        'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',

        'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons',
        'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient',
        'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions',
        'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons',
        'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut',
        'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
        'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',

        'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
        'icone', 'flèche',
        'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',

        // Weird thing happen every day
        'http//www', 'https//www',
    ];

    /**
     * Converts the text to UTF-8, decodes (possibly double-encoded) HTML
     * entities, unifies the typographic apostrophe to "'" and replaces
     * non-breaking spaces and double spaces with a single space.
     *
     * @param string $text text in an arbitrary encoding, possibly containing HTML entities
     *
     * @return string
     */
    public static function fixEncoding(string $text)
    {
        $text = Encoding::toUTF8($text);

        // Encode once, decode twice: resolves entities that were encoded twice upstream.
        $text = html_entity_decode(html_entity_decode(htmlentities($text)));
        // Also decode quote entities.
        $text = htmlspecialchars_decode($text, ENT_QUOTES);
        $text = str_replace('’', "'", $text); // Unify '

        // Re-encoding exposes non-breaking spaces as "&nbsp;" so they can be swapped for a plain space.
        $encoded = str_replace(['  ', '&nbsp;'], ' ', htmlentities($text));

        return html_entity_decode($encoded);
    }

    /**
     * Extracts every sentence matched by {@see self::$regexSentence}, with
     * inner whitespace runs collapsed to a single space.
     *
     * @param string $text
     *
     * @return string[] sentences in order of appearance (empty when none match)
     */
    public static function getSentences(string $text)
    {
        if (!preg_match_all(self::$regexSentence, $text, $matches, PREG_SET_ORDER, 0)) {
            return [];
        }

        $sentences = [];
        foreach ($matches as $match) {
            $sentences[] = preg_replace('/\s+/', ' ', $match[0]);
        }

        return $sentences;
    }

    /**
     * Keeps only the parts of the text that look like sentences, joined by a
     * single space.
     *
     * Fix: the previous implementation collapsed whitespace on the *input*
     * instead of the filtered buffer, so it returned the whole text unfiltered.
     *
     * @param string $text
     *
     * @return string the sentences separated by one space, or '' when none match
     */
    public static function keepOnlySentence(string $text)
    {
        return implode(' ', self::getSentences($text));
    }

    /**
     * Replaces punctuation marks, and the " - " / " + " separators, with a space.
     *
     * Fix: " + " used to be written unescaped, which PCRE reads as "two or
     * more spaces" rather than a literal plus sign. The plus is now escaped;
     * the former side effect (a run of two or more spaces becomes one space)
     * is kept as an explicit alternative so existing output does not change.
     *
     * @param string $text
     *
     * @return string|null
     */
    public static function removePunctuation(string $text)
    {
        return preg_replace(
            '/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | \+ | {2,}/',
            ' ',
            $text
        );
    }

    /**
     * Replaces "[day] month year" dates (French and English month names and
     * abbreviations, case-insensitive) with a space.
     *
     * @param string $text
     *
     * @return string|null
     */
    public static function removeDate(string $text)
    {
        $month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';

        // Optional day number, month name, then a two-digit year optionally prefixed by "20".
        return preg_replace('/([0-3]?[0-9]\s+)?'.$month.'\s+(20)?[0-3][0-9]/i', ' ', $text);
    }

    /**
     * Replaces apostrophes with spaces, then replaces every stop word that is
     * surrounded by single spaces with one space.
     *
     * Matching is case-sensitive and a stop word at the very start or end of
     * the text is left in place (it has no surrounding space on that side).
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWords(string $text)
    {
        $needles = array_map(
            static function ($word) {
                return ' '.$word.' ';
            },
            self::$stopWords
        );

        $text = str_replace("'", ' ', $text);

        return trim(str_replace($needles, ' ', $text));
    }

    /**
     * Removes one stop word from the beginning and one from the end of the
     * text (apostrophes are first replaced by spaces).
     *
     * Fix: the suffix pattern used to start with "a$" (no leading space), so
     * any text ending with the letter "a" lost its last character. Both
     * patterns now require the stop word to be a whole, space-delimited word.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWordsAtExtremity(string $text)
    {
        $text = str_replace("'", ' ', trim($text));

        $alternatives = implode('|', array_map(
            static function ($word) {
                return preg_quote($word, '@');
            },
            self::$stopWords
        ));

        $text = preg_replace('@^(?:'.$alternatives.') @', '', $text);
        $text = preg_replace('@ (?:'.$alternatives.')$@', '', $text);

        return trim($text);
    }

    /**
     * Regex-based HTML to text conversion, used as a fallback by
     * {@see self::stripHtmlTags()}.
     *
     * Drops invisible elements (script, style, head, ...), turns the end of
     * div/p blocks and br tags into line breaks, strips the remaining tags and
     * returns one trimmed line per block.
     *
     * Fixes:
     *  - preg_last_error() returns an int, so comparing it to false never
     *    succeeded and the invisible-element removal was always discarded;
     *  - the final clean-up collapsed *all* whitespace before splitting on
     *    "\n", which erased the line breaks inserted just above.
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTagsOldWay(string $html)
    {
        // Often error because of limitation of JIT: on failure keep the original markup (best effort).
        $withoutInvisible = preg_replace('@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si', ' ', $html);
        if (null !== $withoutInvisible && PREG_NO_ERROR === preg_last_error()) {
            $html = $withoutInvisible;
        }

        // Flatten the source whitespace so that only the line breaks added below are meaningful.
        $html = preg_replace('/\s+/', ' ', $html);
        $html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
        $html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
        $html = strip_tags($html);
        $html = preg_replace("/[\t\n\r]+/", "\n", $html);

        // Collapse horizontal whitespace only, then trim every line.
        $lines = explode("\n", preg_replace('/[^\S\n]+/', ' ', $html));

        return trim(implode("\n", array_map('trim', $lines)));
    }

    /**
     * Converts HTML to plain text with simple_html_dom, falling back to
     * {@see self::stripHtmlTagsOldWay()} when the document cannot be loaded.
     *
     * @param string $html
     *
     * @return string|null
     */
    public static function stripHtmlTags(string $html)
    {
        // Permit to avoid stick words when span are used like block
        $html = str_replace('<', ' <', $html);
        $html = self::removeSrOnly($html);

        $dom = new \simple_html_dom();
        // NOTE(review): static analysis reports that load() never returns false,
        // which would make this fallback unreachable - confirm against the library.
        if (false === $dom->load($html)) {
            return self::stripHtmlTagsOldWay($html);
        }

        return preg_replace('/ +/', ' ', $dom->plaintext);
    }

    /**
     * Replaces <span> elements whose class attribute contains
     * "screen-reader-only" or "sr-only" (and which contain no nested tag) with
     * a space.
     *
     * Not very good... avoid Jit error.
     *
     * @param string $html
     *
     * @return string|null
     */
    public static function removeSrOnly(string $html)
    {
        return preg_replace('/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si', ' ', $html);
    }
}
164