1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Kevintweber\HtmlTokenizer; |
4
|
|
|
|
5
|
|
|
use Kevintweber\HtmlTokenizer\Tokens\Token; |
6
|
|
|
use Kevintweber\HtmlTokenizer\Tokens\TokenCollection; |
7
|
|
|
use Kevintweber\HtmlTokenizer\Tokens\TokenFactory; |
8
|
|
|
|
9
|
|
|
/**
 * Splits an HTML string into a collection of tokens.
 *
 * The most recently parsed document is kept in static state so that
 * getPosition() can translate an HTML fragment back into a line/column
 * location. Because that state is static, it is shared by every instance
 * and overwritten by each call to parse().
 */
class HtmlTokenizer
{
    /** @var bool Forwarded to TokenFactory::buildFromHtml() on every call. */
    private $throwOnError;

    /** @var string The HTML passed to the most recent parse() call. */
    private static $allHtml = '';

    /**
     * Constructor
     *
     * @param bool $throwOnError Flag handed to the token factory; presumably
     *                           selects between throwing and returning a
     *                           non-token on malformed input (confirm in
     *                           TokenFactory).
     */
    public function __construct(bool $throwOnError = true)
    {
        $this->throwOnError = $throwOnError;
    }

    /**
     * Will parse html into tokens.
     *
     * The input is trimmed, then consumed token by token: the factory builds
     * a token from the remaining HTML, and the token's own parse() returns
     * whatever HTML is left after it. If the factory does not return a Token,
     * parsing stops and the tokens collected so far are returned.
     *
     * @param string $html The HTML to tokenize.
     *
     * @return TokenCollection
     * @throws \Kevintweber\HtmlTokenizer\Exceptions\TokenMatchingException
     */
    public function parse(string $html) : TokenCollection
    {
        // Remember the untrimmed document for later getPosition() lookups.
        self::$allHtml = $html;
        $tokens = new TokenCollection();
        $remainingHtml = trim($html);

        // A plain emptiness check avoids re-measuring the whole remaining
        // string with mb_strlen() on every iteration.
        while ($remainingHtml !== '') {
            $token = TokenFactory::buildFromHtml(
                $remainingHtml,
                null,
                $this->throwOnError
            );
            if (!$token instanceof Token) {
                // Error has occurred, so we stop.
                break;
            }

            $remainingHtml = $token->parse($remainingHtml);
            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Locates an HTML fragment inside the most recently parsed document.
     *
     * The LAST occurrence of the fragment is used. The returned array has:
     *  - 'line':     number of newline characters before the fragment
     *                (so the first line is 0);
     *  - 'position': on line 0, the character offset from the start of the
     *                document; on later lines, the number of characters from
     *                the preceding newline (inclusive) up to the fragment, so
     *                a fragment directly after a newline reports 1.
     *
     * If the fragment does not occur in the document, line 0 / position 0
     * is returned.
     *
     * @param string $partialHtml The fragment to locate.
     *
     * @return array Array with integer keys 'line' and 'position'.
     */
    public static function getPosition(string $partialHtml) : array
    {
        $position = mb_strrpos(self::$allHtml, $partialHtml);
        if ($position === false) {
            // Fragment not found: report the document start instead of
            // leaking `false` into the result.
            return [
                'line' => 0,
                'position' => 0,
            ];
        }

        $parsedHtml = mb_substr(self::$allHtml, 0, $position);
        $line = mb_substr_count($parsedHtml, "\n");
        if ($line === 0) {
            return [
                'line' => 0,
                'position' => $position,
            ];
        }

        $lastNewLinePosition = mb_strrpos($parsedHtml, "\n");

        return [
            'line' => $line,
            // $parsedHtml is exactly $position characters long, so this is
            // the length of the text from the last newline (inclusive) on.
            'position' => $position - $lastNewLinePosition,
        ];
    }
}
78
|
|
|
|