1
|
|
|
<?php |
2
|
|
|
namespace Aoe\Asdis\Content\Scraper\Extractor; |
3
|
|
|
|
4
|
|
|
/**
 * Service which extracts attribute values (typically paths) from tags in
 * HTML/XML markup using regular expressions.
 */
class XmlTagAttribute
{
    /**
     * Finds the values of a given attribute in all occurrences of a given tag.
     *
     * @param string $tagName                The name of the tag. E.g. 'img'. Inserted into the
     *                                       regular expression verbatim (not quoted).
     * @param string $attributeName          The attribute's name. Inserted into the regular
     *                                       expression verbatim (not quoted).
     * @param string $content                The content to parse.
     * @param array  $requiredOtherAttributes An array of other attributes the tag must contain.
     *                                       This has to be an associative array where the key
     *                                       of an element is the attribute's name and the
     *                                       element's value is the attribute's value. This
     *                                       param is optional.
     * @return array Array with two lists of equal length: 'paths' holds the attribute values
     *               that were found, 'masks' holds the quote character (' or ") that enclosed
     *               the respective value.
     */
    public function getAttributeFromTag($tagName, $attributeName, $content, array $requiredOtherAttributes = [])
    {
        // Group 1 captures the opening quote, group 2 the attribute value. The closing quote
        // is matched through the back-reference \1 so that a value containing the other
        // quote character (e.g. src="it's.png") is not cut off prematurely.
        $pattern = '~<' . $tagName . '\b[^>]*\040\b' . $attributeName
            . '\s?=\s?([\'"])(.*?)\1[^>]*>~is';

        $matches = [];
        $count = preg_match_all($pattern, $content, $matches, PREG_PATTERN_ORDER);

        // false signals a PCRE error, 0 means that nothing was found.
        if ($count === false || $count === 0) {
            return [
                'paths' => [],
                'masks' => []
            ];
        }

        if (count($requiredOtherAttributes) === 0) {
            return [
                'paths' => $matches[2],
                'masks' => $matches[1]
            ];
        }

        $paths = [];
        $masks = [];
        foreach ($matches[2] as $index => $path) {
            // $matches[0][$index] is the complete tag the value was found in.
            if ($this->tagContainsAttributes($matches[0][$index], $requiredOtherAttributes)) {
                $paths[] = $path;
                $masks[] = $matches[1][$index];
            }
        }

        return [
            'paths' => $paths,
            'masks' => $masks
        ];
    }

    /**
     * Checks whether a tag contains every one of the given attribute/value pairs.
     *
     * @param string $tag                The complete tag markup.
     * @param array  $requiredAttributes Attribute names (keys) mapped to the required values.
     * @return boolean
     */
    private function tagContainsAttributes($tag, array $requiredAttributes)
    {
        foreach ($requiredAttributes as $name => $value) {
            // The delimiter has to be passed to preg_quote(), otherwise a "~" inside the
            // name or value would terminate the pattern.
            $attrPattern = '~' . preg_quote((string) $name, '~')
                . '=["\']' . preg_quote((string) $value, '~') . '["\']~is';

            // Anything but exactly one match (no match or a PCRE error) counts as missing;
            // the remaining attributes do not need to be checked in that case.
            if (preg_match($attrPattern, $tag) !== 1) {
                return false;
            }
        }

        return true;
    }
}