yusufkandemir /
microdata-parser
| 1 | <?php |
||
| 2 | |||
| 3 | namespace YusufKandemir\MicrodataParser; |
||
| 4 | |||
/**
 * DOM element with helpers for reading HTML microdata (itemscope / itemprop / itemref).
 *
 * Instances are expected to live in a document whose `xpath` property exposes a query
 * object for that document; getReferenceNodes() relies on it to resolve `itemref` ids.
 */
class MicrodataDOMElement extends \DOMElement
{
    /** @var array "tag name" to "attribute name" mapping: the attribute that carries the property value */
    private static $tagNameLookup = [
        'audio' => 'src',
        'embed' => 'src',
        'iframe' => 'src',
        'img' => 'src',
        'source' => 'src',
        'track' => 'src',
        'video' => 'src',
        'a' => 'href',
        'area' => 'href',
        'link' => 'href',
        'object' => 'data',
        'data' => 'value',
        'meter' => 'value',
        'time' => 'datetime',
    ];

    /** @var array Attributes whose (possibly relative) values are passed to the absolute URI handler */
    private static $absoluteAttributes = ['src', 'href', 'data'];

    /**
     * Collects the elements that contribute properties to this item.
     *
     * Walks this element's child elements plus the elements referenced through `itemref`,
     * descending into every element that does not open a nested `itemscope`, and keeps
     * those elements that carry at least one `itemprop` token. Each node is visited once.
     *
     * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-item-properties for details of algorithm
     *
     * @return array Property elements, in the reverse of the order in which they were discovered
     */
    public function getProperties() : array
    {
        $results = [];
        $memory = [$this];
        $pending = array_merge($this->getChildElementNodes(), $this->getReferenceNodes());

        while (!empty($pending)) {
            $current = array_pop($pending);

            foreach ($memory as $memory_item) {
                if ($current->isSameNode($memory_item)) {
                    continue 2; // Already visited: go straight to the next pending node
                }
            }

            $memory[] = $current;

            // A nested item owns its own children, so only descend into plain elements
            if (!$current->hasAttribute('itemscope')) {
                $pending = array_merge($pending, $current->getChildElementNodes());
            }

            if ($current->hasAttribute('itemprop') && $current->hasPropertyNames()) {
                $results[] = $current;
            }
        }

        return array_reverse($results);
    }

    /**
     * Tells whether the `itemprop` attribute holds at least one token.
     *
     * @return bool
     */
    public function hasPropertyNames() : bool
    {
        return !empty($this->tokenizeAttribute('itemprop'));
    }

    /**
     * Returns the distinct `itemprop` tokens of this element, exactly as written in the markup.
     *
     * Tokens are not prefixed with a vocabulary identifier. Keys of the first occurrences
     * are preserved, so the result may have gaps when a token is repeated.
     *
     * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-property-name
     *
     * @return array
     */
    public function getPropertyNames() : array
    {
        return array_unique($this->tokenizeAttribute('itemprop'));
    }

    /**
     * Computes the value of the property this element represents.
     *
     * Resolution order: the element itself when it opens an `itemscope`; the `content`
     * attribute when present; the tag-specific attribute from the lookup table; and
     * finally the element's text content when that attribute is missing or empty.
     *
     * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-property-value for details of algorithm
     *
     * @param callable $absoluteUriHandler Optional. Called as handler($value, $documentUri) to resolve
     *                                     a non-absolute src/href/data value. When omitted, the value
     *                                     is returned as written.
     *
     * @return $this|string
     */
    public function getPropertyValue(callable $absoluteUriHandler = null)
    {
        if ($this->hasAttribute('itemscope')) {
            return $this;
        }

        if ($this->hasAttribute('content')) {
            return $this->getAttribute('content');
        }

        $value = '';

        if (\array_key_exists($this->tagName, self::$tagNameLookup)) {
            $attribute = self::$tagNameLookup[$this->tagName];
            $value = $this->getAttribute($attribute);

            // Compare against '' explicitly: a value of "0" is a real value, not a missing one
            $needsResolving = $value !== ''
                && \in_array($attribute, self::$absoluteAttributes, true)
                && !$this->isAbsoluteUri($value);

            // The handler is optional, so never invoke it when it was not supplied
            if ($needsResolving && $absoluteUriHandler !== null) {
                $value = $absoluteUriHandler($value, $this->ownerDocument->documentURI);
            }
        }

        return $value !== '' ? $value : $this->textContent;
    }

    /**
     * Checks whether a string starts with a URI scheme ("scheme:").
     * Note: only the scheme prefix is inspected; the rest of the string is not validated.
     *
     * @see \preg_match() for return values
     *
     * @param string $uri
     *
     * @return false|int
     */
    protected function isAbsoluteUri(string $uri)
    {
        // Scheme grammar: a letter followed by letters, digits, "+", "-" or "."
        return preg_match('/^[a-z][a-z0-9+.\-]*:/i', trim($uri));
    }

    /**
     * Filters out TextNodes etc. and returns child ElementNodes as array
     *
     * @return array Result array which contains child ElementNodes
     */
    protected function getChildElementNodes()
    {
        $childNodes = [];

        foreach ($this->childNodes as $childNode) {
            if ($childNode->nodeType === XML_ELEMENT_NODE) {
                $childNodes[] = $childNode;
            }
        }

        return $childNodes;
    }

    /**
     * Tokenizes value of given attribute
     *
     * @param string $attributeName Name of the attribute
     *
     * @return string[] Tokens of the attribute; empty when the attribute is absent or blank
     */
    public function tokenizeAttribute(string $attributeName)
    {
        if (!$this->hasAttribute($attributeName)) {
            return [];
        }

        return $this->tokenize($this->getAttribute($attributeName));
    }

    /**
     * Splits given attribute value on runs of whitespace into an array of tokens
     *
     * @see https://www.w3.org/TR/2018/WD-microdata-20180426/#dfn-split-a-string-on-spaces for definition of tokens
     *
     * @param string $attribute
     *
     * @return string[] Non-empty tokens; an empty array for a blank value or when splitting fails
     */
    protected function tokenize(string $attribute)
    {
        // Without PREG_SPLIT_NO_EMPTY a blank value would yield [''] and count as one token
        $tokens = preg_split('/\s+/', trim($attribute), -1, \PREG_SPLIT_NO_EMPTY);

        return $tokens === false ? [] : $tokens;
    }

    /**
     * Finds the nodes that this node references through the document
     *
     * For every `itemref` token, the first element in the document whose `id` equals the
     * token is collected; tokens that match nothing are ignored.
     *
     * @see https://www.w3.org/TR/microdata/#dfn-item-properties 4th step
     *
     * @return array
     */
    protected function getReferenceNodes(): array
    {
        $referenceNodes = [];

        foreach ($this->tokenizeAttribute('itemref') as $token) {
            $references = $this->ownerDocument->xpath->query('//*[@id=' . self::toXPathLiteral($token) . ']');

            // query() yields false for an expression it cannot evaluate
            if ($references === false) {
                continue;
            }

            $first = $references->item(0);

            if ($first !== null) {
                $referenceNodes[] = $first;
            }
        }

        return $referenceNodes;
    }

    /**
     * Encodes an arbitrary string as an XPath 1.0 string literal.
     *
     * XPath 1.0 has no escape sequences, so a value containing both quote characters
     * is expressed as a concat() of double-quoted chunks joined by '"'.
     *
     * @param string $value
     *
     * @return string
     */
    private static function toXPathLiteral(string $value) : string
    {
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }

        return 'concat("' . implode('", \'"\', "', explode('"', $value)) . '")';
    }
}
||
| 218 |
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using
`empty(...)` or `!empty(...)` instead.