CleanText::keepOnlySentence()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace rOpenDev\ExtractExpression;
4
5
use ForceUTF8\Encoding;
6
7
class CleanText
8
{
9
    /**
     * Pattern used by getSentences() to spot a sentence: an ASCII capital letter,
     * at least four characters that are neither a line break nor a sentence
     * terminator, then one terminator (".", "!", "?" or "…").
     *
     * The "u" modifier is required: "…" is a multi-byte UTF-8 character, and
     * without it the character classes operate on single bytes, which cuts
     * sentences in the middle of characters such as "€" or "“".
     * As a consequence the subject must be valid UTF-8 (see fixEncoding()).
     *
     * @var string
     */
    public static $regexSentence = '/[A-Z][^\n\.\!\?…]{4,}[\.\!\?…]/u';
10
11
    /**
     * Lower-case words considered as noise, read by removeStopWords() and
     * removeStopWordsAtExtremity().
     *
     * The list mixes English and French entries; a few words ('as', 'me', 'on')
     * appear in both groups.
     *
     * @var string[]
     */
    public static $stopWords = [
12
        // English stop words
13
        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
14
        'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever',
15
        'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
16
        'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must',
17
        'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said',
18
        'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
19
        'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where',
20
        'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',
21
22
        // French stop words
23
        'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'plus',
24
        'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui',
25
        'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi',
26
        'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',
27
        'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi',
28
        'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',
29
30
        // French conjugated forms, mostly of "être" and "avoir", plus a few other common verb forms
        'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons',
31
        'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient',
32
        'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions',
33
        'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons',
34
        'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut',
35
        'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
36
        'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',
37
38
        // NOTE(review): the entries below look like web-page interface labels (reply, delete, report...) — confirm
        'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
39
        'icone', 'flèche',
40
        'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',
41
42
        // Odd tokens that show up in real-world input (presumably mangled URL prefixes — confirm)
43
        'http//www', 'https//www',
44
    ];
45
46
    /**
     * Normalise a text: convert it to UTF-8, decode HTML entities, unify the
     * typographic apostrophe and replace double spaces / non-breaking spaces
     * with a single regular space.
     *
     * @param string $text raw text
     *
     * @return string the cleaned text
     */
    public static function fixEncoding(string $text)
    {
        $utf8 = Encoding::toUTF8($text);

        // Encode once, decode twice: the net effect is one level of entity decoding
        $encoded = htmlentities($utf8);
        $decoded = html_entity_decode(html_entity_decode($encoded));
        $decoded = htmlspecialchars_decode($decoded, ENT_QUOTES);

        // Unify '
        $unified = str_replace('’', "'", $decoded);

        // Work on the entity-encoded form so that "&nbsp;" can be targeted as a plain string
        $reEncoded = htmlentities($unified);
        $singleSpaced = str_replace(['  ', '&nbsp;'], ' ', $reEncoded);

        return html_entity_decode($singleSpaced);
    }
57
58
    /**
     * Extract the sentences found in a text.
     *
     * A sentence is whatever self::$regexSentence matches. Each match has its
     * whitespace runs collapsed to a single space, and only sentences made of
     * fewer than 30 words are kept.
     *
     * @param string $text
     *
     * @return string[] the normalised sentences, in order of appearance
     */
    public static function getSentences(string $text)
    {
        if (!preg_match_all(self::$regexSentence, $text, $matches)) {
            return [];
        }

        $sentences = [];
        foreach ($matches[0] as $rawSentence) {
            // Normalise BEFORE counting: otherwise consecutive spaces inflate the
            // word count and tab-separated words are counted as a single word.
            $sentence = preg_replace('/\s+/', ' ', $rawSentence);

            if (count(explode(' ', $sentence)) < 30) { // We keep only sentences with less than 30 words
                $sentences[] = $sentence;
            }
        }

        return $sentences;
    }
71
72
    /**
     * Reduce a text to the sentences detected by getSentences(), joined with
     * a single space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function keepOnlySentence(string $text)
    {
        $sentences = self::getSentences($text);

        return implode(' ', $sentences);
    }
76
77
    /**
     * Replace punctuation marks with a space.
     *
     * Targets: , . ( ) [ ] { } ! ? ; … " « » : * / | > < as well as a dash or
     * a plus sign surrounded by spaces. Runs of two or more spaces are also
     * reduced to a single space.
     *
     * @param string $text
     *
     * @return string|null null only if the PCRE engine fails
     */
    public static function removePunctuation(string $text)
    {
        // The "+" must be escaped to match a literal plus sign. The previous
        // unescaped " + " was a quantifier that only collapsed runs of spaces;
        // that side effect is kept on purpose through the explicit " {2,}".
        return preg_replace('/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | \+ | {2,}/', ' ', $text);
    }
81
82
    /**
     * Replace dates written as "[day] month year" with a space.
     *
     * The month is a French or English name or abbreviation (case-insensitive),
     * the day is optional and the year is either two digits (00-39) or four
     * digits (2000-2039).
     *
     * @param string $text
     *
     * @return string|null null only if the PCRE engine fails
     */
    public static function removeDate(string $text)
    {
        $month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';

        // Word boundaries on both sides prevent matching a month abbreviation
        // inside another word ("Omar 12 ans") and prevent eating only the first
        // two digits of an out-of-range year ("mars 1999").
        return preg_replace('/\b([0-3]?[0-9]\s+)?'.$month.'\s+(20)?[0-3][0-9]\b/i', ' ', $text);
    }
96
97
    /**
     * Replace every stop word surrounded by spaces with a single space.
     *
     * Apostrophes are turned into spaces first. Matching is case-sensitive and
     * needs a space on both sides of the word, so a stop word at the very
     * beginning or end of the text is left in place.
     *
     * @param string $text
     *
     * @return string the trimmed result
     */
    public static function removeStopWords(string $text)
    {
        $withoutApostrophes = str_replace("'", ' ', $text);

        // Build the needles " word " by joining with " | " and splitting on "|"
        $paddedList = ' '.implode(' | ', self::$stopWords).' ';
        $needles = explode('|', $paddedList);

        $filtered = str_replace($needles, ' ', $withoutApostrophes);

        return trim($filtered);
    }
104
105
    /**
     * Remove one stop word at the beginning and one at the end of a text.
     *
     * Apostrophes are turned into spaces first. A leading stop word is removed
     * only when followed by a space, a trailing one only when preceded by a
     * space, so a text made of a single word is never emptied.
     *
     * @param string $text
     *
     * @return string|null the trimmed result
     */
    public static function removeStopWordsAtExtremity(string $text)
    {
        $text = trim($text);
        $text = str_replace("'", ' ', $text);

        // Escape each word so that none of them can alter the pattern
        $quote = static function (string $word): string {
            return preg_quote($word, '@');
        };
        $alternation = implode('|', array_map($quote, self::$stopWords));

        $text = preg_replace('@^(?:'.$alternation.') @', '', $text);
        // The leading space applies to EVERY alternative: previously the first
        // stop word ("a") had none, so any text ending with "a" lost its last letter.
        $text = preg_replace('@ (?:'.$alternation.')$@', '', $text);

        return trim($text);
    }
114
115
    /**
     * Convert HTML to plain text using regular expressions and strip_tags().
     *
     * Elements whose content is not meant to be read (script, style, head...)
     * are dropped with their content. Closing </div>, </p> and <br> tags become
     * line breaks. Each resulting line is trimmed and has its space runs
     * reduced to a single space.
     *
     * @param string $html
     *
     * @return string plain text, one line per block
     */
    public static function stripHtmlTagsOldWay(string $html)
    {
        // Often error because of limitation of JIT
        $textWithoutInvisible = preg_replace('@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si', ' ', $html);
        // preg_last_error() returns an int, never false: compare with PREG_NO_ERROR,
        // otherwise the stripped version is never used. On failure keep the raw html.
        if (null !== $textWithoutInvisible && PREG_NO_ERROR === preg_last_error()) {
            $html = $textWithoutInvisible;
        }

        $html = preg_replace('/\s+/', ' ', $html);
        $html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
        $html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
        $html = strip_tags($html);
        $html = preg_replace("/[\t\n\r]+/", "\n", $html);
        // Collapse spaces only: collapsing every whitespace here would erase the
        // line breaks inserted above and make the per-line trim below pointless.
        $html = preg_replace('/ +/', ' ', $html);
        $lines = array_map('trim', explode("\n", $html));

        return trim(implode("\n", $lines));
    }
132
133
    /**
     * Convert HTML to plain text with simple_html_dom, falling back on
     * stripHtmlTagsOldWay() when the document cannot be loaded.
     *
     * @param string $html
     *
     * @return string|null
     */
    public static function stripHtmlTags(string $html)
    {
        // A space before every tag avoids sticking words together when span are used like block
        $spaced = str_replace('<', ' <', $html);
        $spaced = self::removeSrOnly($spaced);

        $dom = new \simple_html_dom();

        // NOTE(review): static analysis reports that load() never returns false,
        // which would make the fallback below unreachable — confirm against the
        // simple_html_dom version in use.
        if (false !== $dom->load($spaced)) {
            return preg_replace('/ +/s', ' ', $dom->plaintext);
        }

        // If we failed to load the html in dom
        return self::stripHtmlTagsOldWay($spaced);
    }
149
150
    /**
     * Replace with a space every <span> whose class attribute contains
     * "screen-reader-only" or "sr-only", as long as the span holds no nested tag.
     *
     * Not very good... but avoids JIT errors.
     *
     * @param string $html
     *
     * @return string|null null only if the PCRE engine fails
     */
    public static function removeSrOnly(string $html)
    {
        $srOnlySpan = '/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si';

        return preg_replace($srOnlySpan, ' ', $html);
    }
157
}
158