|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace PiedWeb\UrlHarvester; |
|
4
|
|
|
|
|
5
|
|
|
use ForceUTF8\Encoding; |
|
6
|
|
|
use simple_html_dom; |
|
7
|
|
|
|
|
8
|
|
|
/**
 * Static string helpers used to normalise harvested HTML/text content.
 *
 * Every method is stateless: it takes a string and returns the transformed string.
 */
class Helper
{
    /**
     * Converts the input to UTF-8, collapses whitespace runs and trims the result.
     *
     * Any run of two or more whitespace characters (spaces, tabs, newlines...)
     * is replaced by a single space; a lone whitespace character is left as is.
     *
     * @param string $source Raw text in any encoding handled by Encoding::toUTF8()
     *
     * @return string
     */
    public static function clean(string $source)
    {
        return trim(preg_replace('/\s{2,}/', ' ', Encoding::toUTF8($source)));
    }

    /**
     * Reduces an HTML fragment to plain text.
     *
     * The result is not trimmed: leading/trailing separators introduced by the
     * tag replacements are kept.
     *
     * @param string $str   HTML markup
     * @param bool   $keepN When true, newlines are preserved and only runs of
     *                      spaces are collapsed; when false, every whitespace
     *                      run (newlines included) becomes a single space
     *
     * @return string|null null only if one of the underlying preg_replace() calls fails
     */
    public static function htmlToPlainText($str, $keepN = false)
    {
        // Drop <style>/<script> elements together with their content.
        $str = preg_replace('#<(style|script).*</(style|script)>#siU', ' ', $str);
        // Attribute-less <br>, <p>, <div> (opening or closing) become line breaks.
        // Variants carrying attributes or a self-closing slash are not matched here
        // and are handled by the generic tag removal below (replaced by a space).
        $str = preg_replace('#</?(br|p|div)>#siU', "\n", $str);
        // Any other closing tag becomes a space so adjacent words do not get glued.
        $str = preg_replace('/<\/[a-z]+>/siU', ' ', $str);
        $str = str_replace(["\r", "\t"], ' ', $str);
        // Replace the remaining tags by a space, then let strip_tags() remove leftovers.
        $str = strip_tags(preg_replace('/<[^<]+?>/', ' ', $str));

        if ($keepN) {
            $str = preg_replace('/ {2,}/', ' ', $str);
        } else {
            $str = preg_replace('/\s+/', ' ', $str);
        }

        return $str;
    }

    /**
     * Replaces accented/ligature characters by their base ASCII letter(s).
     *
     * The string is run through htmlentities() and entities such as `&eacute;`
     * or `&oelig;` are reduced to the letter(s) preceding the accent name.
     * Note that other characters having a named entity (e.g. `<`, `>`, `&`)
     * stay entity-encoded in the returned string; quotes are left untouched
     * (ENT_NOQUOTES).
     *
     * @param string $str Text, expected to be UTF-8
     *
     * @return string
     */
    public static function removeAccent($str)
    {
        // A UTF-8 -> UTF-32 -> UTF-8 round trip only returns the same bytes for
        // valid UTF-8; otherwise re-encode so htmlentities() gets usable input.
        if ($str !== mb_convert_encoding(mb_convert_encoding($str, 'UTF-32', 'UTF-8'), 'UTF-8', 'UTF-32')) {
            $str = mb_convert_encoding($str, 'UTF-8');
        }
        $str = htmlentities($str, ENT_NOQUOTES, 'UTF-8');
        $str = preg_replace('`&([a-z]{1,2})(acute|uml|circ|grave|ring|cedil|slash|tilde|caron|lig);`i', '$1', $str);

        return $str;
    }

    /**
     * Replaces every <img> element of an HTML fragment by a Markdown-like
     * `![alt](src)` token.
     *
     * A missing alt attribute is rendered as `-`; an alt longer than 300
     * characters is cut and suffixed with `~`. When the image has no src,
     * the `(src)` part is omitted.
     *
     * @param string $txt HTML markup
     *
     * @return string The markup with images substituted; everything else is untouched
     */
    public static function imageToTxt($txt)
    {
        $html = new simple_html_dom();
        $html->load($txt);
        foreach ($html->find('img') as $img) {
            $alt = isset($img->alt) ? $img->alt : '-';
            // Multibyte-aware truncation so a UTF-8 character is never cut in half.
            $alt = mb_substr($alt, 0, 300, 'UTF-8').(mb_strlen($alt, 'UTF-8') > 300 ? '~' : '');
            $src = isset($img->src) ? '('.$img->src.')' : '';
            // Substitute on the original markup using the element's exact outer HTML.
            $txt = str_replace($img->outertext, '!['.$alt.']'.$src, $txt);
        }

        return $txt;
    }
}
|
56
|
|
|
|