<?php

namespace PiedWeb\UrlHarvester;

use ForceUTF8\Encoding;
use simple_html_dom;

/**
 * Stateless string helpers: whitespace/encoding clean-up, HTML to plain text,
 * accent removal and <img> to Markdown-style image notation.
 */
class Helper
{
    /**
     * Pass $source through ForceUTF8's Encoding::toUTF8(), replace every run of
     * two or more whitespace characters with one space, then trim both ends.
     *
     * An isolated whitespace character (e.g. a lone line break inside the
     * text) does not match the pattern and is therefore left untouched.
     *
     * @param string $source
     *
     * @return string
     */
    public static function clean(string $source)
    {
        $utf8 = Encoding::toUTF8($source);

        return trim(preg_replace('/\s{2,}/', ' ', $utf8));
    }

    /**
     * Reduce an HTML fragment to plain text.
     *
     * <style>/<script> elements are dropped with their content, <br>, <p> and
     * <div> tags become line breaks, every other tag becomes a space. The
     * result is NOT trimmed and HTML entities are NOT decoded.
     *
     * @param string $str   HTML markup
     * @param bool   $keepN true: keep line breaks and only squeeze runs of
     *                      spaces; false (default): squeeze every whitespace
     *                      run, line breaks included, into one space
     *
     * @return string
     */
    public static function htmlToPlainText($str, $keepN = false)
    {
        // Remove <style>/<script> blocks. The back-reference forces the closing
        // tag to match the opening one, and \b keeps tags that merely start
        // with "style"/"script" out of the match.
        $str = preg_replace('#<(style|script)\b.*?</\1\s*>#si', ' ', $str);

        // Block-level separators become line breaks. Attributes and the
        // self-closed forms (<br/>, <br />, <p class="x">) are accepted so they
        // are not silently downgraded to a plain space further down.
        $str = preg_replace('#</?(?:br|p|div)(?:\s[^>]*)?/?>#i', "\n", $str);

        // Closing tags turn into a space so adjacent words do not get glued.
        $str = preg_replace('/<\/[a-z]+>/siU', ' ', $str);
        $str = str_replace(["\r", "\t"], ' ', $str);

        // Any remaining tag becomes a space; strip_tags() then removes what the
        // pattern could not match (e.g. an unterminated tag).
        $str = strip_tags(preg_replace('/<[^<]+?>/', ' ', $str));

        if ($keepN) {
            // Only runs of plain spaces are squeezed, so "\n" survives.
            return preg_replace('/ {2,}/', ' ', $str);
        }

        return preg_replace('/\s+/', ' ', $str);
    }

    /**
     * Replace accented letters and ligatures by their unaccented base letters
     * (é -> e, Ç -> C, œ -> oe, ß -> sz).
     *
     * The text is HTML-encoded, accent entities such as "&eacute;" are
     * collapsed to the letter(s) they start with, then every remaining entity
     * is decoded again so characters like "&", "<" or "©" are returned as they
     * were given instead of as "&amp;", "&lt;" or "&copy;".
     *
     * @param string $str text, expected to be UTF-8
     *
     * @return string
     */
    public static function removeAccent($str)
    {
        if (!mb_check_encoding($str, 'UTF-8')) {
            // Not valid UTF-8: re-encode from mbstring's internal encoding so
            // htmlentities() below does not reject the whole string.
            $str = mb_convert_encoding($str, 'UTF-8');
        }

        $str = htmlentities($str, ENT_NOQUOTES, 'UTF-8');
        $str = preg_replace(
            '`&([a-z]{1,2})(acute|uml|circ|grave|ring|cedil|slash|tilde|caron|lig);`i',
            '$1',
            $str
        );

        // Undo the encoding of everything that was not an accent entity. The
        // flags mirror the htmlentities() call so the round trip is lossless.
        return html_entity_decode($str, ENT_NOQUOTES, 'UTF-8');
    }

    /**
     * Replace each <img> element found in $txt by "![alt](src)".
     *
     * A missing alt attribute is rendered as "-"; an alt longer than 300
     * characters is cut to 300 and suffixed with "~"; without a src attribute
     * the "(src)" part is omitted.
     *
     * @param string $txt HTML markup
     *
     * @return string $txt with the image tags substituted
     */
    public static function imageToTxt($txt)
    {
        $html = new simple_html_dom();
        $html->load($txt);

        foreach ($html->find('img') as $img) {
            $alt = isset($img->alt) ? $img->alt : '-';
            // Multibyte-aware truncation: a byte-wise cut could split a UTF-8
            // character in two and leave an invalid sequence in the output.
            if (mb_strlen($alt, 'UTF-8') > 300) {
                $alt = mb_substr($alt, 0, 300, 'UTF-8').'~';
            }
            $src = isset($img->src) ? '('.$img->src.')' : '';

            $txt = str_replace($img->outertext, '!['.$alt.']'.$src, $txt);
        }

        return $txt;
    }
}