|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace Kevintweber\HtmlTokenizer; |
|
4
|
|
|
|
|
5
|
|
|
use Kevintweber\HtmlTokenizer\Tokens\Token; |
|
6
|
|
|
use Kevintweber\HtmlTokenizer\Tokens\TokenCollection; |
|
7
|
|
|
use Kevintweber\HtmlTokenizer\Tokens\TokenFactory; |
|
8
|
|
|
|
|
9
|
|
|
/**
 * Entry point of the tokenizer: converts an HTML string into a TokenCollection.
 *
 * The HTML most recently passed to parse() is kept in a static property so
 * that getPosition() can be called statically to locate a fragment inside it.
 * Because that state is static, it is shared by every instance of this class:
 * a later parse() call overwrites the document seen by getPosition().
 */
class HtmlTokenizer
{
    /** @var bool Forwarded unchanged to TokenFactory::buildFromHtml(). */
    private $throwOnError;

    /** @var string The complete, untrimmed HTML most recently given to parse(). */
    private static $allHtml = '';

    /**
     * Constructor
     *
     * @param bool $throwOnError Stored and handed to the token factory on
     *                           every call; presumably selects between
     *                           throwing and failing silently on invalid
     *                           markup (confirm against TokenFactory).
     */
    public function __construct(bool $throwOnError = true)
    {
        $this->throwOnError = $throwOnError;
    }

    /**
     * Will parse html into tokens.
     *
     * Tokens are built one at a time from the front of the remaining HTML.
     * Each token's parse() call returns the HTML still left to process. The
     * loop ends when nothing is left or when the factory does not return a
     * Token, in which case the tokens collected so far are returned.
     *
     * @param string $html The HTML to tokenize.
     *
     * @return TokenCollection
     */
    public function parse(string $html) : TokenCollection
    {
        // Keep the untrimmed document so getPosition() reports coordinates
        // relative to what the caller actually supplied.
        self::$allHtml = $html;
        $tokens = new TokenCollection();
        $remainingHtml = trim($html);
        while (mb_strlen($remainingHtml) > 0) {
            $token = TokenFactory::buildFromHtml(
                $remainingHtml,
                null,
                $this->throwOnError
            );
            if (!$token instanceof Token) {
                // Error has occurred, so we stop.
                break;
            }

            $remainingHtml = $token->parse($remainingHtml);
            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Locates the last occurrence of a fragment in the HTML most recently
     * passed to parse().
     *
     * 'line' is the zero-based count of "\n" characters before the fragment.
     * On line 0, 'position' is the zero-based character offset from the start
     * of the document. On later lines, 'position' is the number of characters
     * from the preceding "\n" (inclusive) up to the fragment, so the first
     * character of such a line is reported as position 1.
     *
     * When the fragment does not occur in the stored HTML, line 0 and
     * position 0 are returned so that callers always receive integers.
     *
     * @param string $partialHtml The fragment to locate.
     *
     * @return array Array with integer keys 'line' and 'position'.
     */
    public static function getPosition(string $partialHtml) : array
    {
        $offset = mb_strrpos(self::$allHtml, $partialHtml);
        if ($offset === false) {
            // Not found: do not let `false` leak into mb_substr() or the result.
            return [
                'line' => 0,
                'position' => 0,
            ];
        }

        $parsedHtml = mb_substr(self::$allHtml, 0, $offset);
        $line = mb_substr_count($parsedHtml, "\n");
        if ($line === 0) {
            return [
                'line' => 0,
                'position' => $offset,
            ];
        }

        // $line > 0 guarantees that a newline exists in $parsedHtml.
        $lastNewLinePosition = mb_strrpos($parsedHtml, "\n");

        return [
            'line' => $line,
            // Length of the tail starting at (and including) the last newline.
            'position' => mb_strlen($parsedHtml) - $lastNewLinePosition,
        ];
    }
}
|
75
|
|
|
|