|
1
|
|
|
<?php |
|
2
|
|
|
namespace Aoe\Asdis\Content\Scraper\Extractor; |
|
3
|
|
|
|
|
4
|
|
|
/** |
|
5
|
|
|
* Service which extracts paths from attributes in HTML tags. |
|
6
|
|
|
*/ |
|
7
|
|
|
class XmlTagAttribute
{
    /**
     * Finds attributes in HTML tags.
     *
     * Every occurrence of the tag that carries the requested attribute is
     * matched with a regular expression. If required attributes are given,
     * only tags whose markup also contains each of them as name="value" (or
     * name='value') are kept.
     *
     * @param string $tagName                 The name of the tag. E.g. 'img'. Inserted into
     *                                        the regular expression verbatim (not escaped).
     * @param string $attributeName           The attribute's name. Inserted into the regular
     *                                        expression verbatim (not escaped).
     * @param string $content                 The content to parse.
     * @param array  $requiredOtherAttributes An array of other attributes the
     *                                        tag must contain. This has to be
     *                                        an associative array where the key
     *                                        of an element is the attribute's
     *                                        name and the element's value is
     *                                        the attribute's value. This param
     *                                        is optional.
     * @return array Two index-aligned lists: 'paths' holds the attribute values that
     *               were found, 'masks' holds the quote character (' or ") that
     *               enclosed the corresponding value.
     */
    public function getAttributeFromTag($tagName, $attributeName, $content, array $requiredOtherAttributes = [])
    {
        // Group 1 captures the opening quote and group 2 the attribute value.
        // The back reference \1 forces the closing quote to be the same
        // character as the opening one, so a value such as "it's.png" is not
        // cut off at the apostrophe.
        $pattern = '~<' . $tagName . '\b[^>]*\040\b' . $attributeName
            . '\s?=\s?([\'"])(.*?)\1[^>]*>~is';

        $matches = [];
        $count = preg_match_all($pattern, $content, $matches, PREG_PATTERN_ORDER);

        // false signals a PCRE error, 0 means no tag matched.
        if ($count === false || $count === 0) {
            return [
                'paths' => [],
                'masks' => []
            ];
        }

        if (count($requiredOtherAttributes) < 1) {
            return [
                'paths' => $matches[2],
                'masks' => $matches[1]
            ];
        }

        $paths = [];
        $masks = [];
        foreach ($matches[2] as $index => $path) {
            // $matches[0] holds the complete tag markup of each hit.
            if ($this->tagContainsAttributes($matches[0][$index], $requiredOtherAttributes)) {
                $paths[] = $path;
                $masks[] = $matches[1][$index];
            }
        }

        return [
            'paths' => $paths,
            'masks' => $masks
        ];
    }

    /**
     * Checks whether the given tag markup contains every required attribute.
     *
     * @param string $tagMarkup          The complete markup of a single tag.
     * @param array  $requiredAttributes Attribute name => attribute value.
     * @return boolean
     */
    private function tagContainsAttributes($tagMarkup, array $requiredAttributes)
    {
        foreach ($requiredAttributes as $name => $value) {
            // The delimiter is passed to preg_quote() so that a '~' inside the
            // name or value cannot terminate the pattern early.
            $attributePattern = '~' . preg_quote((string) $name, '~')
                . '=["\']' . preg_quote((string) $value, '~') . '["\']~is';

            // preg_match() returns 1 on a hit, 0 on a miss and false on error.
            // Anything but a hit counts as "attribute missing".
            if (preg_match($attributePattern, $tagMarkup) !== 1) {
                return false;
            }
        }

        return true;
    }
}