Passed
Pull Request — master (#34)
by
unknown
18:31
created

getTitle()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 4
c 0
b 0
f 0
nc 2
nop 1
dl 0
loc 8
rs 10
1
<?php
2
3
/**
 * This class overcomes a few common annoyances with the DOMDocument class,
 * such as saving partial HTML without automatically adding extra tags
 * and properly recognizing various encodings, specifically UTF-8.
 *
 * It also offers CSS-selector lookups (querySelector / querySelectorAll)
 * by translating a subset of CSS selectors to XPath.
 *
 * @author Artem Russakovskii
 *
 * @version 0.4.2
 *
 * @see http://beerpla.net
 * @see http://www.php.net/manual/en/class.domdocument.php
 *
 * @author Dimas Lanjaka <[email protected]>
 */
class SmartDOMDocument extends DOMDocument
{
  /**
   * Random hex token regenerated on every loadHTML() call.
   *
   * @var string
   */
  public $load_uid = '';

  /**
   * Random hex token generated once, in the constructor.
   *
   * @var string
   */
  public $root_uid = '';

  /**
   * XPath evaluator, created lazily by querySelectorAll().
   *
   * @var DOMXPath|null
   */
  protected $xpath;

  /**
   * Create the document, assign its root token and register the element class.
   *
   * The class name 'JSLikeHTMLElement' is not defined in this file; it must be
   * loadable by the time a document is constructed.
   *
   * @param string $version  passed through to DOMDocument
   * @param string $encoding passed through to DOMDocument
   */
  public function __construct(string $version = '', string $encoding = '')
  {
    parent::__construct($version, $encoding);
    $this->root_uid = $this->genHash(10);
    $this->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
  }

  /**
   * Generate a random hexadecimal token.
   *
   * @param int $bytes number of random bytes; the result has twice as many hex characters
   *
   * @return string
   */
  public function genHash(int $bytes)
  {
    return bin2hex(openssl_random_pseudo_bytes($bytes));
  }

  /**
   * Adds an ability to use the SmartDOMDocument object as a string in a string context.
   * For example, echo "Here is the HTML: $dom";.
   */
  public function __toString()
  {
    return $this->saveHTMLExact();
  }

  /**
   * Load HTML with a proper encoding fix/hack.
   *
   * Characters outside ASCII are turned into numeric character references
   * before parsing, so the parser never has to guess the encoding.
   *
   * @see http://www.php.net/manual/en/domdocument.loadhtml.php
   *
   * @param string $html
   * @param string $encoding encoding of $html
   *
   * @return bool result of DOMDocument::loadHTML()
   */
  #[\ReturnTypeWillChange] // second parameter differs from the parent, so no native return type is declared
  public function loadHTML($html, $encoding = 'UTF-8')
  {
    // Remember the caller's libxml error mode so it can be restored afterwards
    // instead of being forced to "off".
    $previous = libxml_use_internal_errors(true);

    // The 'HTML-ENTITIES' target of mb_convert_encoding() is deprecated since
    // PHP 8.2; numeric references for everything >= U+0080 parse to the same DOM.
    $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], $encoding);
    $load = @parent::loadHTML($html); // suppress warnings

    // Discard the parser diagnostics collected during this load.
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $this->load_uid = $this->genHash(5);
    // Drop the cached evaluator so the next query is built against the
    // freshly loaded markup.
    $this->xpath = null;

    return $load;
  }

  /**
   * Return HTML while stripping the annoying auto-added <html>, <body>, and doctype.
   *
   * @see http://php.net/manual/en/migration52.methods.php
   *
   * @return string
   */
  public function saveHTMLExact()
  {
    $content = preg_replace(
      [
        "/^\<\!DOCTYPE.*?<html><body>/si",
        '!</body></html>$!si',
      ],
      '',
      $this->saveHTML()
    );

    return $content;
  }

  /**
   * This test functions shows an example of SmartDOMDocument in action.
   * A sample HTML fragment is loaded.
   * Then, the first image in the document is cut out and saved separately.
   * It also shows that Russian characters are parsed correctly.
   */
  public static function testHTML()
  {
    $content = <<<CONTENT
<div class='class1'>
  <img src='http://www.google.com/favicon.ico' />
  Some Text
  <p>русский</p>
</div>
CONTENT;

    echo "Before removing the image, the content is: \n" . htmlspecialchars($content) . "<br>\n";

    $content_doc = new SmartDOMDocument();
    $content_doc->loadHTML($content);

    // Defined up front so the final echo is valid even when no image is
    // found or the extraction fails part-way.
    $image = '';

    try {
      $first_image = $content_doc->getElementsByTagName('img')->item(0);

      if ($first_image) {
        $first_image->parentNode->removeChild($first_image);

        $content = $content_doc->saveHTMLExact();

        $image_doc = new SmartDOMDocument();
        $image_doc->appendChild($image_doc->importNode($first_image, true));
        $image = $image_doc->saveHTMLExact();
      }
    } catch (Exception $e) {
      // This is a demo: report the failure and still print what we have.
      echo "Could not extract the image: \n" . htmlspecialchars($e->getMessage()) . "<br>\n";
    }

    echo "After removing the image, the content is: \n" . htmlspecialchars($content) . "<br>\n";
    echo "The image is: \n" . htmlspecialchars($image);
  }

  /**
   * Query nodes by css selector.
   *
   * @param string $selector
   * @param bool   $first    return only first node
   *
   * @return DOMNodeList|DOMElement|null a node list, or (when $first is true)
   *                                     the first match / null when nothing matched
   */
  public function querySelectorAll($selector, $first = false)
  {
    if (null === $this->xpath) {
      $this->xpath = new DOMXPath($this);
    }

    $elements = $this->xpath->query(static::selector2XPath($selector));

    // DOMXPath::query() yields false for an expression it cannot evaluate;
    // map that to "no match" so the documented return type always holds.
    if (false === $elements) {
      return $first ? null : new DOMNodeList();
    }

    if ($first) {
      return $elements->length ? $elements->item(0) : null;
    }

    return $elements;
  }

  /**
   * Query node by css selector.
   *
   * @param string $selector
   *
   * @return DOMElement|null
   */
  public function querySelector($selector)
  {
    return $this->querySelectorAll($selector, true);
  }

  /**
   * Convert selector to XPath string.
   *
   * The selector is split on descendant whitespace, every piece is rewritten
   * by an ordered series of textual substitutions, and the pieces are joined
   * back with the descendant axis. The order of the substitutions matters.
   *
   * @see https://github.com/tj/php-selector/blob/master/selector.inc
   *
   * @param string $selector CSS selector
   *
   * @return string XPath expression
   */
  public static function selector2XPath($selector)
  {
    // remove spaces around operators
    $selector = preg_replace('/\s*>\s*/', '>', $selector);
    $selector = preg_replace('/\s*~\s*/', '~', $selector);
    $selector = preg_replace('/\s*\+\s*/', '+', $selector);
    $selector = preg_replace('/\s*,\s*/', ',', $selector);

    // split on whitespace that is not inside an attribute bracket
    $selectors = preg_split('/\s+(?![^\[]+\])/', $selector);
    if (false === $selectors) {
      // preg_split() failed: treat the whole selector as a single step.
      $selectors = [$selector];
    }

    // Iterate by index rather than by reference, so neither the parameter nor
    // the array is left bound to a dangling reference after the loop.
    foreach ($selectors as $index => $step) {
      // ,
      $step = preg_replace('/,/', '|descendant-or-self::', $step);
      // input:checked, :disabled, etc.
      $step = preg_replace('/(.+)?:(checked|disabled|required|autofocus)/', '\1[@\2="\2"]', $step);
      // input:autocomplete, :autocomplete
      $step = preg_replace('/(.+)?:(autocomplete)/', '\1[@\2="on"]', $step);
      // input:button, input:submit, etc.
      $step = preg_replace('/:(text|password|checkbox|radio|button|submit|reset|file|hidden|image|datetime|datetime-local|date|month|time|week|number|range|email|url|search|tel|color)/', 'input[@type="\1"]', $step);
      // foo[id]
      $step = preg_replace('/(\w+)\[([_\w-]+[_\w\d-]*)\]/', '\1[@\2]', $step);
      // [id]
      $step = preg_replace('/\[([_\w-]+[_\w\d-]*)\]/', '*[@\1]', $step);
      // foo[id=foo]
      $step = preg_replace('/\[([_\w-]+[_\w\d-]*)=[\'"]?(.*?)[\'"]?\]/', '[@\1="\2"]', $step);
      // [id=foo]
      $step = preg_replace('/^\[/', '*[', $step);
      // div#foo
      $step = preg_replace('/([_\w-]+[_\w\d-]*)\#([_\w-]+[_\w\d-]*)/', '\1[@id="\2"]', $step);
      // #foo
      $step = preg_replace('/\#([_\w-]+[_\w\d-]*)/', '*[@id="\1"]', $step);
      // div.foo
      $step = preg_replace('/([_\w-]+[_\w\d-]*)\.([_\w-]+[_\w\d-]*)/', '\1[contains(concat(" ",@class," ")," \2 ")]', $step);
      // .foo
      $step = preg_replace('/\.([_\w-]+[_\w\d-]*)/', '*[contains(concat(" ",@class," ")," \1 ")]', $step);
      // div:first-child
      $step = preg_replace('/([_\w-]+[_\w\d-]*):first-child/', '*/\1[position()=1]', $step);
      // div:last-child
      $step = preg_replace('/([_\w-]+[_\w\d-]*):last-child/', '*/\1[position()=last()]', $step);
      // :first-child
      $step = str_replace(':first-child', '*/*[position()=1]', $step);
      // :last-child
      $step = str_replace(':last-child', '*/*[position()=last()]', $step);
      // :nth-last-child
      $step = preg_replace('/:nth-last-child\((\d+)\)/', '[position()=(last() - (\1 - 1))]', $step);
      // div:nth-child
      $step = preg_replace('/([_\w-]+[_\w\d-]*):nth-child\((\d+)\)/', '*/*[position()=\2 and self::\1]', $step);
      // :nth-child
      $step = preg_replace('/:nth-child\((\d+)\)/', '*/*[position()=\1]', $step);
      // :contains(Foo)
      $step = preg_replace('/([_\w-]+[_\w\d-]*):contains\((.*?)\)/', '\1[contains(string(.),"\2")]', $step);
      // >
      $step = preg_replace('/>/', '/', $step);
      // ~
      $step = preg_replace('/~/', '/following-sibling::', $step);
      // +
      $step = preg_replace('/\+([_\w-]+[_\w\d-]*)/', '/following-sibling::\1[position()=1]', $step);
      // tidy up wildcards left directly behind a predicate
      $step = str_replace(']*', ']', $step);
      $step = str_replace(']/*', ']', $step);

      $selectors[$index] = $step;
    }

    // ' '
    $xpath = implode('/descendant::', $selectors);
    $xpath = 'descendant-or-self::' . $xpath;
    // :scope
    $xpath = preg_replace('/(((\|)?descendant-or-self::):scope)/', '.\3', $xpath);

    // $element
    $sub_selectors = explode(',', $xpath);
    foreach ($sub_selectors as $key => $sub_selector) {
      $parts = explode('$', $sub_selector);
      $sub_selector = array_shift($parts);
      if (count($parts) && preg_match_all('/((?:[^\/]*\/?\/?)|$)/', $parts[0], $matches)) {
        $results = $matches[0];
        // One "/.." per step following the marked element; never a negative
        // count (str_repeat() throws on that in PHP 8).
        $results[] = str_repeat('/..', max(0, count($results) - 2));
        $sub_selector .= implode('', $results);
      }
      $sub_selectors[$key] = $sub_selector;
    }

    return implode(',', $sub_selectors);
  }
}
249
250
/**
 * Get pure InnerHTML.
 *
 * Serializes every child of the given node with DOMDocument::saveHTML()
 * and concatenates the results in document order.
 *
 * @param \DOMNode $element node whose children are serialized; may be a
 *                          DOMDocument itself
 *
 * @return string markup of the children, '' when there are none
 */
function innerHTML(\DOMNode $element)
{
  // A DOMDocument has no ownerDocument (it is null), so use the node itself
  // as the serializer in that case.
  $document = $element instanceof \DOMDocument ? $element : $element->ownerDocument;
  if (null === $document) {
    // Node not attached to any document: nothing can be serialized.
    return '';
  }

  $innerHTML = '';
  foreach ($element->childNodes as $child) {
    $innerHTML .= $document->saveHTML($child);
  }

  return $innerHTML;
}
268
269
270
/**
 * Get HTML Title.
 *
 * Reads the text of the first <title> element of the document. When the
 * document has no <title> element, the name of this function (the literal
 * string "getTitle") is returned as a placeholder.
 *
 * @param \SmartDOMDocument $dom document to inspect
 *
 * @return string title text, or "getTitle" when no <title> exists
 */
function getTitle(\SmartDOMDocument $dom)
{
  $titles = $dom->getElementsByTagName('title');

  // No <title> at all: fall back to the placeholder.
  if (0 === $titles->length) {
    return __FUNCTION__;
  }

  $firstTitle = $titles->item(0);

  return $firstTitle->textContent;
}
286
287
/**
 * Get function which called current function
 *
 * "Current function" is the function that calls this helper, so the frame
 * reported by default is two levels above this helper in the call stack.
 *
 * @param boolean $completeTrace when true, describe every frame of the
 *                               backtrace (starting with this helper itself)
 *                               instead of only the caller's caller
 * @return string "Called by <function>[ From Class <class>]"; in complete
 *                mode each frame is prefixed with " -- ". When the stack is
 *                too shallow to have a caller's caller, "Called by " with an
 *                empty name is returned.
 */
function getCallingFunctionName($completeTrace = false)
{
  // Only the 'function' and 'class' keys are read, so skip collecting
  // arguments and object references.
  $trace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS);

  // Render one backtrace frame.
  $describe = static function (array $frame) {
    $text = "Called by {$frame['function']}";
    if (isset($frame['class'])) {
      $text .= " From Class {$frame['class']}";
    }

    return $text;
  };

  if ($completeTrace) {
    $str = '';
    foreach ($trace as $frame) {
      $str .= ' -- ' . $describe($frame);
    }

    return $str;
  }

  // Frame 0 is this helper, frame 1 its caller, frame 2 the caller's caller.
  // Guard against a stack that is not that deep (e.g. a call from top level).
  if (!isset($trace[2])) {
    return 'Called by ';
  }

  return $describe($trace[2]);
}
311