CleanText - Code Metrics - RobinDev/ExtractExpression - Measure and Improve Code Quality continuously with Scrutinizer

CleanText A
last analyzed 2019-01-06 07:55 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	149
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	73
dl	0
loc	149
rs	10
c	0
b	0
f	0
wmc	15

10 Methods

Rating	Name	Size	Complexity
A	fixEncoding()	10	1
A	removeDate()	13	1
A	keepOnlySentence()	3	1
A	stripHtmlTagsOldWay()	16	2
A	getSentences()	12	4
A	stripHtmlTags()	15	2
A	removeStopWordsAtExtremity()	8	1
A	removeSrOnly()	3	1
A	removeStopWords()	6	1
A	removePunctuation()	3	1

<?php

namespace rOpenDev\ExtractExpression;

use ForceUTF8\Encoding;

/**
 * Collection of static helpers used to clean raw text / HTML before
 * expressions are extracted from it: encoding normalisation, sentence
 * extraction, punctuation / date / stop-word removal and HTML stripping.
 */
class CleanText
{
    /**
     * Pattern matching one "sentence": an uppercase letter, at least four
     * characters that are neither a newline nor a sentence terminator, then a
     * terminator (. ! ? …).
     *
     * The `u` modifier is required: without it the multibyte `…` inside the
     * character classes is handled as three independent bytes, which cuts
     * sentences in the middle of characters sharing one of those bytes
     * (€, –, À, æ, ...) and yields invalid UTF-8. `\p{Lu}` also lets a sentence
     * start with an accented capital (É, À, ...).
     *
     * @var string
     */
    public static $regexSentence = '/\p{Lu}[^\n\.\!\?…]{4,}[\.\!\?…]/u';

    /**
     * Lower-case English and French stop words, plus a few site-chrome words
     * (reply, delete, ...) and URL residues.
     *
     * @var string[]
     */
    public static $stopWords = [
        // English stop words
        'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
        'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever',
        'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
        'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must',
        'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said',
        'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
        'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where',
        'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

        // French Stop words
        'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'plus',
        'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui',
        'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi',
        'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',
        'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi',
        'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',

        'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons',
        'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient',
        'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions',
        'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons',
        'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut',
        'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
        'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',

        'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
        'icone', 'flèche',
        'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',

        // URL residues left once punctuation has been stripped
        'http//www', 'https//www',
    ];

    /**
     * Converts the text to UTF-8, decodes HTML entities (including
     * double-encoded ones), unifies the typographic apostrophe to `'` and
     * replaces non-breaking spaces / double spaces with a single space.
     *
     * @param string $text raw text in an unknown encoding
     *
     * @return string
     */
    public static function fixEncoding(string $text)
    {
        $text = Encoding::toUTF8($text);
        // Decoded twice on purpose: handles entities that were encoded twice upstream.
        $text = html_entity_decode(html_entity_decode(htmlentities($text)));
        $text = htmlspecialchars_decode($text, ENT_QUOTES);
        $text = str_replace('’', "'", $text); // Unify '
        // Re-encode so that non-breaking spaces show up as `&nbsp;` and can be replaced.
        $text = html_entity_decode(str_replace(['  ', '&nbsp;'], ' ', htmlentities($text)));

        return $text;
    }

    /**
     * Extracts the sentences matched by self::$regexSentence, keeping only
     * those made of fewer than 30 space-separated words. Whitespace runs inside
     * a kept sentence are collapsed to a single space.
     *
     * @param string $text
     *
     * @return string[] empty when nothing matches (or when the pattern cannot
     *                  be applied, e.g. on invalid UTF-8 input)
     */
    public static function getSentences(string $text)
    {
        $sentences = [];
        if (!preg_match_all(self::$regexSentence, $text, $matches, PREG_SET_ORDER, 0)) {
            return $sentences;
        }

        foreach ($matches as $match) {
            // We keep only sentences with less than 30 words
            if (count(explode(' ', $match[0])) < 30) {
                $sentences[] = preg_replace('/\s+/', ' ', $match[0]);
            }
        }

        return $sentences;
    }

    /**
     * Returns the sentences found by getSentences() joined with a space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function keepOnlySentence(string $text)
    {
        return implode(' ', self::getSentences($text));
    }

    /**
     * Replaces each punctuation sign (and the space-surrounded ` - ` / ` + `
     * operators) with a single space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removePunctuation(string $text)
    {
        // The plus sign must be escaped: an unescaped ` + ` is a quantifier on the
        // preceding space and never matches a literal " + ".
        return preg_replace('/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | \+ /', ' ', $text);
    }

    /**
     * Replaces dates written as "[day] month year" (French or English month
     * name or abbreviation, 2 or 4 digit year) with a space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeDate(string $text)
    {
        $month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';
        // french format
        $text = preg_replace('/([0-3]?[0-9]\s+)?'.$month.'\s+(20)?[0-3][0-9]/i', ' ', $text);

        // Remove Year
        //$text = preg_replace('/20[0-3][0-9]/', ' ', $text);

        // Remove Month
        //$text = preg_replace('/'.$month.'/', ' ', $text);

        return $text;
    }

    /**
     * Removes every stop word that is surrounded by spaces (apostrophes are
     * first turned into spaces so that "l'arbre" exposes "l"). Words at the
     * very beginning or end of the text are left in place; see
     * removeStopWordsAtExtremity() for those.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWords(string $text)
    {
        $text = str_replace("'", ' ', $text);

        $needles = array_map(static function ($word) {
            return ' '.$word.' ';
        }, self::$stopWords);

        return trim(str_replace($needles, ' ', $text));
    }

    /**
     * Removes one stop word at the beginning and one at the end of the text.
     *
     * @param string $text
     *
     * @return string
     */
    public static function removeStopWordsAtExtremity(string $text)
    {
        $text = str_replace("'", ' ', trim($text));

        // Quote each word and group the alternation so that the anchors and the
        // separating space apply to EVERY word (an ungrouped alternation left
        // the first word without its leading space: "pizza" lost its final "a").
        $alternatives = implode('|', array_map(static function ($word) {
            return preg_quote($word, '@');
        }, self::$stopWords));

        // preg_replace() returns null on a PCRE error: keep the text untouched then.
        $text = preg_replace('@^(?:'.$alternatives.') @', '', $text) ?? $text;
        $text = preg_replace('@ (?:'.$alternatives.')$@', '', $text) ?? $text;

        return trim($text);
    }

    /**
     * Regex / strip_tags() based HTML to text conversion, used as a fallback by
     * stripHtmlTags().
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTagsOldWay(string $html)
    {
        // Drop invisible elements together with their content. This pattern often
        // hits the PCRE backtrack / JIT limits on large documents, in which case
        // we carry on with the untouched markup.
        $textWithoutInvisible = preg_replace('@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si', ' ', $html);
        // preg_last_error() returns an int, never false: compare with PREG_NO_ERROR.
        if (null !== $textWithoutInvisible && PREG_NO_ERROR === preg_last_error()) {
            $html = $textWithoutInvisible;
        }

        $html = preg_replace('/\s+/', ' ', $html);
        $html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
        $html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
        $html = strip_tags($html);
        $html = preg_replace("/[\t\n\r]+/", "\n", $html);
        // NOTE(review): the inner `\s+` collapse also removes the newlines added
        // above, so the result is always a single line - confirm this is intended.
        $html = trim(implode("\n", array_map('trim', explode("\n", preg_replace('/\s+/', ' ', $html)))));

        return $html;
    }

    /**
     * Converts HTML to plain text with simple_html_dom, falling back on
     * stripHtmlTagsOldWay() when the document cannot be loaded.
     *
     * @param string $html
     *
     * @return string
     */
    public static function stripHtmlTags(string $html)
    {
        // Permit to avoid stick words when span are used like block
        $html = str_replace('<', ' <', $html);
        $html = self::removeSrOnly($html);

        $dom = new \simple_html_dom();
        // NOTE(review): static analysis reports this condition as always false,
        // i.e. the fallback may be unreachable - verify against the library's
        // load() return value.
        if (false === $dom->load($html)) {
            return self::stripHtmlTagsOldWay($html);
        }

        return preg_replace('/ +/s', ' ', $dom->plaintext);
    }

    /**
     * Replaces `<span>` elements flagged "screen-reader-only" / "sr-only" (and
     * their text) with a space. Only spans without nested tags are handled,
     * which keeps the pattern cheap enough to avoid PCRE JIT errors.
     *
     * @param string $html
     *
     * @return string the input itself if the pattern fails
     */
    public static function removeSrOnly(string $html)
    {
        $cleaned = preg_replace('/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si', ' ', $html);

        // null means a PCRE error occurred: do not lose the document.
        return $cleaned ?? $html;
    }
}


1			<?php
2
3			namespace rOpenDev\ExtractExpression;
4
5			use ForceUTF8\Encoding;
6
7			class CleanText
8			{
9			public static $regexSentence = '/[A-Z][^\n\.\!\?…]{4,}[\.\!\?…]/'; // '/([^\n\.\!\?]{10,}[\.\!\?…])*/';
10
11			public static $stopWords = [
12			// English stop words
13			'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
14			'be', 'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever',
15			'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
16			'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must',
17			'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said',
18			'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
19			'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where',
20			'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',
21
22			// French Stop words
23			'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'plus',
24			'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui',
25			'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi',
26			'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',
27			'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi',
28			'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',
29
30			'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons',
31			'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient',
32			'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions',
33			'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons',
34			'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut',
35			'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez',
36			'eussent', 'dit', 'fait', 'peut', 'faire', 'fais',
37
38			'répondre', 'repondre', 'réponses', 'reply', 'bonjour', 'merci', 'supprimer', 'anonyme', 'signaler',
39			'icone', 'flèche',
40			'similaires', 'fiches', 'voir', 'articles', 'favoris', 'ajouter',
41
42			// Weird thing happen every day
43			'http//www', 'https//www',
44			];
45
46			public static function fixEncoding(string $text)
47			{
48			// fix encoding
49			$text = Encoding::toUTF8($text);
50			$text = html_entity_decode(html_entity_decode(htmlentities($text)));
51			$text = htmlspecialchars_decode($text, ENT_QUOTES);
52			$text = str_replace('’', "'", $text); // Unify '
53			$text = html_entity_decode(str_replace(['  ', '&nbsp;'], ' ', htmlentities($text)));
54
55			return $text;
56			}
57
58			public static function getSentences(string $text)
59			{
60			$sentences = [];
61			if (preg_match_all(self::$regexSentence, $text, $matches, PREG_SET_ORDER, 0)) {
62			foreach ($matches as $m) {
63			if (count(explode(' ', $m[0])) < 30) { // We keep only sentence with less than 30 words
64			$sentences[] = preg_replace('/\s+/', ' ', $m[0]);
65			}
66			}
67			}
68
69			return $sentences;
70			}
71
72			public static function keepOnlySentence(string $text)
73			{
74			return implode(' ', self::getSentences($text));
75			}
76
77			public static function removePunctuation(string $text)
78			{
79			return preg_replace('/,|\.|\(|\[|\]|\)|!|\?|;|…|\{|\}|"|«|»|:|\*|\/|\||>|<| - | + /', ' ', $text);
80			}
81
82			public static function removeDate(string $text)
83			{
84			$month = '(janvier|january|février|february|mars|march|avril|april|mai|may|juin|june|juillet|july|août|august|septembre|september|octobre|october|novembre|november|décembre|december|jan|fev|feb|mar|avr|apr|jui|jun|juil|jul|aoû|aug|aout|aou|sept|oct|nov|dec|decembre)';
85			// french format
86			$text = preg_replace('/([0-3]?[0-9]\s+)?'.$month.'\s+(20)?[0-3][0-9]/i', ' ', $text);
87
88			// Remove Year
89			//$text = preg_replace('/20[0-3][0-9]/', ' ', $text);
90
91			// Remove Month
92			//$text = preg_replace('/'.$month.'/', ' ', $text);
93
94			return $text;
95			}
96
97			public static function removeStopWords(string $text)
98			{
99			$text = str_replace("'", ' ', $text);
100			$text = str_replace(explode('|', ' '.implode(' | ', self::$stopWords).' '), ' ', $text);
101
102			return trim($text);
103			}
104
105			public static function removeStopWordsAtExtremity(string $text)
106			{
107			$text = trim($text);
108			$text = str_replace("'", ' ', $text);
109			$text = preg_replace('@^'.implode(' |^', self::$stopWords).' @', '', $text);
110			$text = preg_replace('@'.implode('$| ', self::$stopWords).'$@', '', $text);
111
112			return trim($text);
113			}
114
115			public static function stripHtmlTagsOldWay(string $html)
116			{
117			// Often error because of limitation of JIT
118			$textWithoutInvisible = preg_replace('@<(script|style|head|iframe|noframe|noscript|object|embed|noembed)[^>]*?>((?!<\1).)*<\/\1>@si', ' ', $html);
119			if (false === preg_last_error()) { // var_dump(array_flip(get_defined_constants(true)['pcre'])[preg_last_error()]); die();
120			$html = $textWithoutInvisible;
121			}
122
123			$html = preg_replace('/\s+/', ' ', $html);
124			$html = preg_replace('@</(div|p)>@si', "$0 \n\n", $html);
125			$html = preg_replace('@<br[^>]*>@si', "$0 \n", $html);
126			$html = strip_tags($html);
127			$html = preg_replace("/[\t\n\r]+/", "\n", $html);
128			$html = trim(implode("\n", array_map('trim', explode("\n", preg_replace('/\s+/', ' ', $html)))));
129
130			return $html;
131			}
132
133			public static function stripHtmlTags(string $html)
134			{
135			// Permit to avoid stick words when span are used like block
136			$html = str_replace('<', ' <', $html);
137			$html = self::removeSrOnly($html);
138
139			$dom = new \simple_html_dom();
140			if (false === $dom->load($html)) { // If we failed to load the html in dom
			0 ignored issues – show introduced 2019-01-05 18:51 UTC by Report Bug Copy Issue Report The condition `false === $dom->load($html)` is always `false`. Loading history...
141			$text = self::stripHtmlTagsOldWay($html);
142			} else {
143			$text = $dom->plaintext;
144			$text = preg_replace('/ +/s', ' ', $text);
145			}
146
147			return $text;
148			}
149
150			/**
151			* Not very good... avoid Jit error.
152			*/
153			public static function removeSrOnly(string $html)
154			{
155			return preg_replace('/<span[^>]+class="[^>]*(screen-reader-only|sr-only)[^>]*"[^>]*>[^<]*<\/span>/si', ' ', $html);
156			}
157			}
158

RobinDev / ExtractExpression

CleanText A last analyzed 2019-01-06 07:55 UTC

Complexity

Size/Duplication

Importance

10 Methods

Duplication Side-by-Side

Filter issues like

CleanText A
last analyzed 2019-01-06 07:55 UTC