Passed
Push — master ( f60a2c...2d7c5c )
by Dev
10:34 queued 19s
created

Helper::removeAccent()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
cc 2
eloc 5
nc 2
nop 1
dl 0
loc 9
ccs 0
cts 6
cp 0
crap 6
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use ForceUTF8\Encoding;
6
use simple_html_dom;
7
8
/**
 * Stateless text helpers used while harvesting pages: whitespace/encoding
 * clean-up, HTML to plain-text conversion, accent stripping and replacing
 * <img> tags with a Markdown-like textual form.
 */
class Helper
{
    /**
     * Converts a string to UTF-8 and tidies its whitespace.
     *
     * The value is passed through ForceUTF8's Encoding::toUTF8(), every run of
     * two or more whitespace characters is replaced by a single space, and the
     * result is trimmed. A lone whitespace character (e.g. a single "\n") is
     * left as it is.
     *
     * @param string $source Text to clean.
     *
     * @return string The cleaned text.
     */
    public static function clean(string $source)
    {
        return trim(preg_replace('/\s{2,}/', ' ', Encoding::toUTF8($source)));
    }

    /**
     * Converts an HTML fragment to plain text.
     *
     * <style>/<script> blocks are dropped, <br>, <p> and <div> tags (opening,
     * closing, self-closing or carrying attributes) become newlines, and every
     * other tag becomes a space. When $keepN is false all whitespace, newlines
     * included, is collapsed to single spaces; when it is true only runs of
     * spaces are collapsed so the line structure survives.
     *
     * The result is not trimmed: it may start or end with a space or newline.
     *
     * @param string $str   HTML source.
     * @param bool   $keepN Keep newlines in the output.
     *
     * @return string Plain-text version of $str.
     */
    public static function htmlToPlainText($str, $keepN = false)
    {
        // preg_replace() returns null on failure (for instance when the PCRE
        // backtrack limit is hit on a very large page); in that case keep the
        // text as it was instead of wiping it.
        $str = preg_replace('#<(style|script).*</(style|script)>#siU', ' ', $str) ?? $str;

        // Block-level breaks. The optional attribute part and the optional
        // trailing slash make `<br/>`, `<br />` and `<p class="x">` count too;
        // the mandatory whitespace before attributes keeps `<pre>` out.
        $str = preg_replace('#</?(br|p|div)(\s[^>]*)?/?>#i', "\n", $str) ?? $str;

        // Any other closing tag separates words.
        $str = preg_replace('/<\/[a-z]+>/siU', ' ', $str) ?? $str;
        $str = str_replace(["\r", "\t"], ' ', $str);

        // Remaining tags become spaces; strip_tags() removes what is left of
        // malformed markup.
        $str = strip_tags(preg_replace('/<[^<]+?>/', ' ', $str) ?? $str);

        $collapsePattern = $keepN ? '/ {2,}/' : '/\s+/';

        return preg_replace($collapsePattern, ' ', $str) ?? $str;
    }

    /**
     * Replaces accented letters and ligatures with their unaccented base
     * letters (e.g. "é" gives "e", "œ" gives "oe").
     *
     * The text is turned into HTML entities, the accent suffix of entities
     * such as `&eacute;` or `&oelig;` is dropped so only the base letter(s)
     * remain, and the remaining entities are decoded again so characters
     * without an accent (`&`, `<`, `€`, ...) come back unchanged rather than
     * HTML-escaped.
     *
     * @param string $str Text to process, expected to be UTF-8.
     *
     * @return string The text without accents.
     */
    public static function removeAccent($str)
    {
        if (!mb_check_encoding($str, 'UTF-8')) {
            // Not valid UTF-8: re-encode so htmlentities() does not reject the
            // whole string. No source encoding is passed, so mbstring falls
            // back to its internal encoding.
            $str = mb_convert_encoding($str, 'UTF-8');
        }

        $str = htmlentities($str, ENT_NOQUOTES, 'UTF-8');
        $str = preg_replace('`&([a-z]{1,2})(acute|uml|circ|grave|ring|cedil|slash|tilde|caron|lig);`i', '$1', $str);

        // Same flags as the htmlentities() call above, so exactly one level of
        // encoding is undone and entities that were literally present in the
        // input are preserved.
        return html_entity_decode($str, ENT_NOQUOTES, 'UTF-8');
    }

    /**
     * Replaces every <img> tag found in $txt by a Markdown-like textual form:
     * `![alt](src)`.
     *
     * A missing alt attribute is rendered as "-", an alt text longer than 300
     * characters is cut and suffixed with "~", and the "(src)" part is omitted
     * when the tag has no src attribute. The replacement is a plain string
     * substitution of the parser's outertext for each image.
     *
     * @param string $txt HTML source.
     *
     * @return string $txt with its images replaced.
     */
    public static function imageToTxt($txt)
    {
        $html = new simple_html_dom();
        $html->load($txt);

        foreach ($html->find('img') as $img) {
            $alt = isset($img->alt) ? (string) $img->alt : '-';

            // Count and cut by characters, not bytes, so a multi-byte UTF-8
            // character is never split in half.
            if (mb_strlen($alt, 'UTF-8') > 300) {
                $alt = mb_substr($alt, 0, 300, 'UTF-8').'~';
            }

            $src = isset($img->src) ? '('.$img->src.')' : '';
            $txt = str_replace($img->outertext, '!['.$alt.']'.$src, $txt);
        }

        // Release the parsed tree once the nodes are no longer needed.
        $html->clear();

        return $txt;
    }
}
56