Total Complexity | 166 |
Total Lines | 820 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like simple_html_dom often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use simple_html_dom, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
1119 | class simple_html_dom |
||
1120 | { |
||
1121 | /** @var simple_html_dom_node $root */ |
||
1122 | public $root = null; |
||
1123 | public $nodes = []; |
||
1124 | public $callback = null; |
||
1125 | public $lowercase = false; |
||
1126 | // Used to keep track of how large the text was when we started. |
||
1127 | public $original_size; |
||
1128 | public $size; |
||
1129 | protected $pos; |
||
1130 | protected $doc; |
||
1131 | protected $char; |
||
1132 | protected $cursor; |
||
1133 | protected $parent; |
||
1134 | protected $noise = []; |
||
1135 | protected $token_blank = " \t\r\n"; |
||
1136 | protected $token_equal = ' =/>'; |
||
1137 | protected $token_slash = " />\r\n\t"; |
||
1138 | protected $token_attr = ' >'; |
||
1139 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. |
||
1140 | public $_charset = ''; |
||
1141 | public $_target_charset = ''; |
||
1142 | protected $default_br_text = ''; |
||
1143 | public $default_span_text = ''; |
||
1144 | |||
1145 | // use isset instead of in_array, performance boost about 30%... |
||
1146 | protected $self_closing_tags = ['img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1]; |
||
1147 | protected $block_tags = ['root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1]; |
||
1148 | // Known sourceforge issue #2977341 |
||
1149 | // B tags that are not closed cause us to return everything to the end of the document. |
||
1150 | protected $optional_closing_tags = [ |
||
1151 | 'tr' => ['tr'=>1, 'td'=>1, 'th'=>1], |
||
1152 | 'th' => ['th'=>1], |
||
1153 | 'td' => ['td'=>1], |
||
1154 | 'li' => ['li'=>1], |
||
1155 | 'dt' => ['dt'=>1, 'dd'=>1], |
||
1156 | 'dd' => ['dd'=>1, 'dt'=>1], |
||
1157 | 'dl' => ['dd'=>1, 'dt'=>1], |
||
1158 | 'p' => ['p'=>1], |
||
1159 | 'nobr' => ['nobr'=>1], |
||
1160 | 'b' => ['b'=>1], |
||
1161 | 'option'=> ['option'=>1], |
||
1162 | ]; |
||
1163 | |||
1164 | public function __construct($str = null, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) |
||
1165 | { |
||
1166 | if ($str) { |
||
1167 | if (preg_match("/^http:\/\//i", $str) || is_file($str)) { |
||
1168 | $this->load_file($str); |
||
1169 | } else { |
||
1170 | $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); |
||
1171 | } |
||
1172 | } |
||
1173 | // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html. |
||
1174 | if (!$forceTagsClosed) { |
||
1175 | $this->optional_closing_array = []; |
||
1176 | } |
||
1177 | $this->_target_charset = $target_charset; |
||
1178 | } |
||
1179 | |||
1180 | public function __destruct() |
||
1181 | { |
||
1182 | $this->clear(); |
||
1183 | } |
||
1184 | |||
1185 | // load html from string |
||
1186 | public function load($str, $lowercase = true, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) |
||
1187 | { |
||
1188 | global $debugObject; |
||
1189 | |||
1190 | // prepare |
||
1191 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); |
||
1192 | // strip out comments |
||
1193 | $this->remove_noise("'<!--(.*?)-->'is"); |
||
1194 | // strip out cdata |
||
1195 | $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); |
||
1196 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 |
||
1197 | // Script tags removal now preceeds style tag removal. |
||
1198 | // strip out <script> tags |
||
1199 | $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); |
||
1200 | $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); |
||
1201 | // strip out <style> tags |
||
1202 | $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); |
||
1203 | $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); |
||
1204 | // strip out preformatted tags |
||
1205 | $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); |
||
1206 | // strip out server side scripts |
||
1207 | $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); |
||
1208 | // strip smarty scripts |
||
1209 | $this->remove_noise("'(\{\w)(.*?)(\})'s", true); |
||
1210 | |||
1211 | // parsing |
||
1212 | while ($this->parse()); |
||
1213 | // end |
||
1214 | $this->root->_[HDOM_INFO_END] = $this->cursor; |
||
1215 | $this->parse_charset(); |
||
1216 | |||
1217 | // make load function chainable |
||
1218 | return $this; |
||
1219 | } |
||
1220 | |||
1221 | // load html from file |
||
1222 | public function load_file() |
||
1223 | { |
||
1224 | $args = func_get_args(); |
||
1225 | $this->load(call_user_func_array('file_get_contents', $args), true); |
||
1226 | // Throw an error if we can't properly load the dom. |
||
1227 | if (($error = error_get_last()) !== null) { |
||
1228 | $this->clear(); |
||
1229 | |||
1230 | return false; |
||
1231 | } |
||
1232 | } |
||
1233 | |||
1234 | // set callback function |
||
1235 | public function set_callback($function_name) |
||
1236 | { |
||
1237 | $this->callback = $function_name; |
||
1238 | } |
||
1239 | |||
1240 | // remove callback function |
||
1241 | public function remove_callback() |
||
1242 | { |
||
1243 | $this->callback = null; |
||
1244 | } |
||
1245 | |||
1246 | // save dom as string |
||
1247 | public function save($filepath = '') |
||
1248 | { |
||
1249 | $ret = $this->root->innertext(); |
||
1250 | if ($filepath !== '') { |
||
1251 | file_put_contents($filepath, $ret, LOCK_EX); |
||
1252 | } |
||
1253 | |||
1254 | return $ret; |
||
1255 | } |
||
1256 | |||
1257 | // find dom node by css selector |
||
1258 | // Paperg - allow us to specify that we want case insensitive testing of the value of the selector. |
||
1259 | public function find($selector, $idx = null, $lowercase = false) |
||
1260 | { |
||
1261 | return $this->root->find($selector, $idx, $lowercase); |
||
1262 | } |
||
1263 | |||
1264 | // clean up memory due to php5 circular references memory leak... |
||
1265 | public function clear() |
||
1266 | { |
||
1267 | foreach ($this->nodes as $n) { |
||
1268 | $n->clear(); |
||
1269 | $n = null; |
||
1270 | } |
||
1271 | // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear. |
||
1272 | if (isset($this->children)) { |
||
1273 | foreach ($this->children as $n) { |
||
1274 | $n->clear(); |
||
1275 | $n = null; |
||
1276 | } |
||
1277 | } |
||
1278 | if (isset($this->parent)) { |
||
1279 | $this->parent->clear(); |
||
1280 | unset($this->parent); |
||
1281 | } |
||
1282 | if (isset($this->root)) { |
||
1283 | $this->root->clear(); |
||
1284 | unset($this->root); |
||
1285 | } |
||
1286 | unset($this->doc); |
||
1287 | unset($this->noise); |
||
1288 | } |
||
1289 | |||
1290 | public function dump($show_attr = true) |
||
1291 | { |
||
1292 | $this->root->dump($show_attr); |
||
1293 | } |
||
1294 | |||
1295 | // prepare HTML data and init everything |
||
1296 | protected function prepare($str, $lowercase = true, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) |
||
1297 | { |
||
1298 | $this->clear(); |
||
1299 | |||
1300 | // set the length of content before we do anything to it. |
||
1301 | $this->size = strlen($str); |
||
1302 | // Save the original size of the html that we got in. It might be useful to someone. |
||
1303 | $this->original_size = $this->size; |
||
1304 | |||
1305 | //before we save the string as the doc... strip out the \r \n's if we are told to. |
||
1306 | if ($stripRN) { |
||
1307 | $str = str_replace("\r", ' ', $str); |
||
1308 | $str = str_replace("\n", ' ', $str); |
||
1309 | |||
1310 | // set the length of content since we have changed it. |
||
1311 | $this->size = strlen($str); |
||
1312 | } |
||
1313 | |||
1314 | $this->doc = $str; |
||
1315 | $this->pos = 0; |
||
1316 | $this->cursor = 1; |
||
1317 | $this->noise = []; |
||
1318 | $this->nodes = []; |
||
1319 | $this->lowercase = $lowercase; |
||
1320 | $this->default_br_text = $defaultBRText; |
||
1321 | $this->default_span_text = $defaultSpanText; |
||
1322 | $this->root = new simple_html_dom_node($this); |
||
1323 | $this->root->tag = 'root'; |
||
1324 | $this->root->_[HDOM_INFO_BEGIN] = -1; |
||
1325 | $this->root->nodetype = HDOM_TYPE_ROOT; |
||
1326 | $this->parent = $this->root; |
||
1327 | if ($this->size > 0) { |
||
1328 | $this->char = $this->doc[0]; |
||
1329 | } |
||
1330 | } |
||
1331 | |||
1332 | // parse html content |
||
1333 | protected function parse() |
||
1334 | { |
||
1335 | if (($s = $this->copy_until_char('<')) === '') { |
||
1336 | return $this->read_tag(); |
||
1337 | } |
||
1338 | |||
1339 | // text |
||
1340 | $node = new simple_html_dom_node($this); |
||
1341 | $this->cursor++; |
||
1342 | $node->_[HDOM_INFO_TEXT] = $s; |
||
1343 | $this->link_nodes($node, false); |
||
1344 | |||
1345 | return true; |
||
1346 | } |
||
1347 | |||
1348 | // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later. |
||
1349 | // NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec |
||
1350 | // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism. |
||
1351 | protected function parse_charset() |
||
1352 | { |
||
1353 | global $debugObject; |
||
1354 | |||
1355 | $charset = null; |
||
1356 | |||
1357 | if (function_exists('get_last_retrieve_url_contents_content_type')) { |
||
1358 | $contentTypeHeader = get_last_retrieve_url_contents_content_type(); |
||
1359 | $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); |
||
1360 | if ($success) { |
||
1361 | $charset = $matches[1]; |
||
1362 | if (is_object($debugObject)) { |
||
1363 | $debugObject->debugLog(2, 'header content-type found charset of: '.$charset); |
||
1364 | } |
||
1365 | } |
||
1366 | } |
||
1367 | |||
1368 | if (empty($charset)) { |
||
1369 | $el = $this->root->find('meta[http-equiv=Content-Type]', 0); |
||
1370 | if (!empty($el)) { |
||
1371 | $fullvalue = $el->content; |
||
1372 | if (is_object($debugObject)) { |
||
1373 | $debugObject->debugLog(2, 'meta content-type tag found'.$fullvalue); |
||
1374 | } |
||
1375 | |||
1376 | if (!empty($fullvalue)) { |
||
1377 | $success = preg_match('/charset=(.+)/', $fullvalue, $matches); |
||
1378 | if ($success) { |
||
1379 | $charset = $matches[1]; |
||
1380 | } else { |
||
1381 | // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1 |
||
1382 | if (is_object($debugObject)) { |
||
1383 | $debugObject->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.'); |
||
1384 | } |
||
1385 | $charset = 'ISO-8859-1'; |
||
1386 | } |
||
1387 | } |
||
1388 | } |
||
1389 | } |
||
1390 | |||
1391 | // If we couldn't find a charset above, then lets try to detect one based on the text we got... |
||
1392 | if (empty($charset)) { |
||
1393 | // Have php try to detect the encoding from the text given to us. |
||
1394 | $charset = (function_exists('mb_detect_encoding')) ? mb_detect_encoding($this->root->plaintext.'ascii', $encoding_list = ['UTF-8', 'CP1252']) : false; |
||
1395 | if (is_object($debugObject)) { |
||
1396 | $debugObject->debugLog(2, 'mb_detect found: '.$charset); |
||
1397 | } |
||
1398 | |||
1399 | // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need... |
||
1400 | if ($charset === false) { |
||
1401 | if (is_object($debugObject)) { |
||
1402 | $debugObject->debugLog(2, 'since mb_detect failed - using default of utf-8'); |
||
1403 | } |
||
1404 | $charset = 'UTF-8'; |
||
1405 | } |
||
1406 | } |
||
1407 | |||
1408 | // Since CP1252 is a superset, if we get one of it's subsets, we want it instead. |
||
1409 | if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) { |
||
1410 | if (is_object($debugObject)) { |
||
1411 | $debugObject->debugLog(2, 'replacing '.$charset.' with CP1252 as its a superset'); |
||
1412 | } |
||
1413 | $charset = 'CP1252'; |
||
1414 | } |
||
1415 | |||
1416 | if (is_object($debugObject)) { |
||
1417 | $debugObject->debugLog(1, 'EXIT - '.$charset); |
||
1418 | } |
||
1419 | |||
1420 | return $this->_charset = $charset; |
||
1421 | } |
||
1422 | |||
1423 | // read tag info |
||
1424 | protected function read_tag() |
||
1425 | { |
||
1426 | if ($this->char !== '<') { |
||
1427 | $this->root->_[HDOM_INFO_END] = $this->cursor; |
||
1428 | |||
1429 | return false; |
||
1430 | } |
||
1431 | $begin_tag_pos = $this->pos; |
||
1432 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1433 | |||
1434 | // end tag |
||
1435 | if ($this->char === '/') { |
||
1436 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1437 | // This represents the change in the simple_html_dom trunk from revision 180 to 181. |
||
1438 | // $this->skip($this->token_blank_t); |
||
1439 | $this->skip($this->token_blank); |
||
1440 | $tag = $this->copy_until_char('>'); |
||
1441 | |||
1442 | // skip attributes in end tag |
||
1443 | if (($pos = strpos($tag, ' ')) !== false) { |
||
1444 | $tag = substr($tag, 0, $pos); |
||
1445 | } |
||
1446 | |||
1447 | $parent_lower = strtolower($this->parent->tag); |
||
1448 | $tag_lower = strtolower($tag); |
||
1449 | |||
1450 | if ($parent_lower !== $tag_lower) { |
||
1451 | if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) { |
||
1452 | $this->parent->_[HDOM_INFO_END] = 0; |
||
1453 | $org_parent = $this->parent; |
||
1454 | |||
1455 | while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) { |
||
1456 | $this->parent = $this->parent->parent; |
||
1457 | } |
||
1458 | |||
1459 | if (strtolower($this->parent->tag) !== $tag_lower) { |
||
1460 | $this->parent = $org_parent; // restore origonal parent |
||
1461 | if ($this->parent->parent) { |
||
1462 | $this->parent = $this->parent->parent; |
||
1463 | } |
||
1464 | $this->parent->_[HDOM_INFO_END] = $this->cursor; |
||
1465 | |||
1466 | return $this->as_text_node($tag); |
||
1467 | } |
||
1468 | } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) { |
||
1469 | $this->parent->_[HDOM_INFO_END] = 0; |
||
1470 | $org_parent = $this->parent; |
||
1471 | |||
1472 | while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) { |
||
1473 | $this->parent = $this->parent->parent; |
||
1474 | } |
||
1475 | |||
1476 | if (strtolower($this->parent->tag) !== $tag_lower) { |
||
1477 | $this->parent = $org_parent; // restore origonal parent |
||
1478 | $this->parent->_[HDOM_INFO_END] = $this->cursor; |
||
1479 | |||
1480 | return $this->as_text_node($tag); |
||
1481 | } |
||
1482 | } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) { |
||
1483 | $this->parent->_[HDOM_INFO_END] = 0; |
||
1484 | $this->parent = $this->parent->parent; |
||
1485 | } else { |
||
1486 | return $this->as_text_node($tag); |
||
1487 | } |
||
1488 | } |
||
1489 | |||
1490 | $this->parent->_[HDOM_INFO_END] = $this->cursor; |
||
1491 | if ($this->parent->parent) { |
||
1492 | $this->parent = $this->parent->parent; |
||
1493 | } |
||
1494 | |||
1495 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1496 | return true; |
||
1497 | } |
||
1498 | |||
1499 | $node = new simple_html_dom_node($this); |
||
1500 | $node->_[HDOM_INFO_BEGIN] = $this->cursor; |
||
1501 | $this->cursor++; |
||
1502 | $tag = $this->copy_until($this->token_slash); |
||
1503 | $node->tag_start = $begin_tag_pos; |
||
1504 | |||
1505 | // doctype, cdata & comments... |
||
1506 | if (isset($tag[0]) && $tag[0] === '!') { |
||
1507 | $node->_[HDOM_INFO_TEXT] = '<'.$tag.$this->copy_until_char('>'); |
||
1508 | |||
1509 | if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { |
||
1510 | $node->nodetype = HDOM_TYPE_COMMENT; |
||
1511 | $node->tag = 'comment'; |
||
1512 | } else { |
||
1513 | $node->nodetype = HDOM_TYPE_UNKNOWN; |
||
1514 | $node->tag = 'unknown'; |
||
1515 | } |
||
1516 | if ($this->char === '>') { |
||
1517 | $node->_[HDOM_INFO_TEXT] .= '>'; |
||
1518 | } |
||
1519 | $this->link_nodes($node, true); |
||
1520 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1521 | return true; |
||
1522 | } |
||
1523 | |||
1524 | // text |
||
1525 | if ($pos = strpos($tag, '<') !== false) { |
||
1526 | $tag = '<'.substr($tag, 0, -1); |
||
1527 | $node->_[HDOM_INFO_TEXT] = $tag; |
||
1528 | $this->link_nodes($node, false); |
||
1529 | $this->char = $this->doc[--$this->pos]; // prev |
||
1530 | return true; |
||
1531 | } |
||
1532 | |||
1533 | if (!preg_match("/^[\w-:]+$/", $tag)) { |
||
1534 | $node->_[HDOM_INFO_TEXT] = '<'.$tag.$this->copy_until('<>'); |
||
1535 | if ($this->char === '<') { |
||
1536 | $this->link_nodes($node, false); |
||
1537 | |||
1538 | return true; |
||
1539 | } |
||
1540 | |||
1541 | if ($this->char === '>') { |
||
1542 | $node->_[HDOM_INFO_TEXT] .= '>'; |
||
1543 | } |
||
1544 | $this->link_nodes($node, false); |
||
1545 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1546 | return true; |
||
1547 | } |
||
1548 | |||
1549 | // begin tag |
||
1550 | $node->nodetype = HDOM_TYPE_ELEMENT; |
||
1551 | $tag_lower = strtolower($tag); |
||
1552 | $node->tag = ($this->lowercase) ? $tag_lower : $tag; |
||
1553 | |||
1554 | // handle optional closing tags |
||
1555 | if (isset($this->optional_closing_tags[$tag_lower])) { |
||
1556 | while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { |
||
1557 | $this->parent->_[HDOM_INFO_END] = 0; |
||
1558 | $this->parent = $this->parent->parent; |
||
1559 | } |
||
1560 | $node->parent = $this->parent; |
||
1561 | } |
||
1562 | |||
1563 | $guard = 0; // prevent infinity loop |
||
1564 | $space = [$this->copy_skip($this->token_blank), '', '']; |
||
1565 | |||
1566 | // attributes |
||
1567 | do { |
||
1568 | if ($this->char !== null && $space[0] === '') { |
||
1569 | break; |
||
1570 | } |
||
1571 | $name = $this->copy_until($this->token_equal); |
||
1572 | if ($guard === $this->pos) { |
||
1573 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1574 | continue; |
||
1575 | } |
||
1576 | $guard = $this->pos; |
||
1577 | |||
1578 | // handle endless '<' |
||
1579 | if ($this->pos >= $this->size - 1 && $this->char !== '>') { |
||
1580 | $node->nodetype = HDOM_TYPE_TEXT; |
||
1581 | $node->_[HDOM_INFO_END] = 0; |
||
1582 | $node->_[HDOM_INFO_TEXT] = '<'.$tag.$space[0].$name; |
||
1583 | $node->tag = 'text'; |
||
1584 | $this->link_nodes($node, false); |
||
1585 | |||
1586 | return true; |
||
1587 | } |
||
1588 | |||
1589 | // handle mismatch '<' |
||
1590 | if ($this->doc[$this->pos - 1] == '<') { |
||
1591 | $node->nodetype = HDOM_TYPE_TEXT; |
||
1592 | $node->tag = 'text'; |
||
1593 | $node->attr = []; |
||
1594 | $node->_[HDOM_INFO_END] = 0; |
||
1595 | $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos - $begin_tag_pos - 1); |
||
1596 | $this->pos -= 2; |
||
1597 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1598 | $this->link_nodes($node, false); |
||
1599 | |||
1600 | return true; |
||
1601 | } |
||
1602 | |||
1603 | if ($name !== '/' && $name !== '') { |
||
1604 | $space[1] = $this->copy_skip($this->token_blank); |
||
1605 | $name = $this->restore_noise($name); |
||
1606 | if ($this->lowercase) { |
||
1607 | $name = strtolower($name); |
||
1608 | } |
||
1609 | if ($this->char === '=') { |
||
1610 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1611 | $this->parse_attr($node, $name, $space); |
||
1612 | } else { |
||
1613 | //no value attr: nowrap, checked selected... |
||
1614 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; |
||
1615 | $node->attr[$name] = true; |
||
1616 | if ($this->char != '>') { |
||
1617 | $this->char = $this->doc[--$this->pos]; |
||
1618 | } // prev |
||
1619 | } |
||
1620 | $node->_[HDOM_INFO_SPACE][] = $space; |
||
1621 | $space = [$this->copy_skip($this->token_blank), '', '']; |
||
1622 | } else { |
||
1623 | break; |
||
1624 | } |
||
1625 | } while ($this->char !== '>' && $this->char !== '/'); |
||
1626 | |||
1627 | $this->link_nodes($node, true); |
||
1628 | $node->_[HDOM_INFO_ENDSPACE] = $space[0]; |
||
1629 | |||
1630 | // check self closing |
||
1631 | if ($this->copy_until_char_escape('>') === '/') { |
||
1632 | $node->_[HDOM_INFO_ENDSPACE] .= '/'; |
||
1633 | $node->_[HDOM_INFO_END] = 0; |
||
1634 | } else { |
||
1635 | // reset parent |
||
1636 | if (!isset($this->self_closing_tags[strtolower($node->tag)])) { |
||
1637 | $this->parent = $node; |
||
1638 | } |
||
1639 | } |
||
1640 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1641 | |||
1642 | // If it's a BR tag, we need to set it's text to the default text. |
||
1643 | // This way when we see it in plaintext, we can generate formatting that the user wants. |
||
1644 | // since a br tag never has sub nodes, this works well. |
||
1645 | if ($node->tag == 'br') { |
||
1646 | $node->_[HDOM_INFO_INNER] = $this->default_br_text; |
||
1647 | } |
||
1648 | |||
1649 | return true; |
||
1650 | } |
||
1651 | |||
1652 | // parse attributes |
||
1653 | protected function parse_attr($node, $name, &$space) |
||
1654 | { |
||
1655 | // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037 |
||
1656 | // If the attribute is already defined inside a tag, only pay atetntion to the first one as opposed to the last one. |
||
1657 | if (isset($node->attr[$name])) { |
||
1658 | return; |
||
1659 | } |
||
1660 | |||
1661 | $space[2] = $this->copy_skip($this->token_blank); |
||
1662 | switch ($this->char) { |
||
1663 | case '"': |
||
1664 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; |
||
1665 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1666 | $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"')); |
||
1667 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1668 | break; |
||
1669 | case '\'': |
||
1670 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE; |
||
1671 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1672 | $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\'')); |
||
1673 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1674 | break; |
||
1675 | default: |
||
1676 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; |
||
1677 | $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr)); |
||
1678 | } |
||
1679 | // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace. |
||
1680 | $node->attr[$name] = str_replace("\r", '', $node->attr[$name]); |
||
1681 | $node->attr[$name] = str_replace("\n", '', $node->attr[$name]); |
||
1682 | // PaperG: If this is a "class" selector, lets get rid of the preceeding and trailing space since some people leave it in the multi class case. |
||
1683 | if ($name == 'class') { |
||
1684 | $node->attr[$name] = trim($node->attr[$name]); |
||
1685 | } |
||
1686 | } |
||
1687 | |||
1688 | // link node's parent |
||
1689 | protected function link_nodes(&$node, $is_child) |
||
1690 | { |
||
1691 | $node->parent = $this->parent; |
||
1692 | $this->parent->nodes[] = $node; |
||
1693 | if ($is_child) { |
||
1694 | $this->parent->children[] = $node; |
||
1695 | } |
||
1696 | } |
||
1697 | |||
1698 | // as a text node |
||
1699 | protected function as_text_node($tag) |
||
1700 | { |
||
1701 | $node = new simple_html_dom_node($this); |
||
1702 | $this->cursor++; |
||
1703 | $node->_[HDOM_INFO_TEXT] = '</'.$tag.'>'; |
||
1704 | $this->link_nodes($node, false); |
||
1705 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1706 | return true; |
||
1707 | } |
||
1708 | |||
1709 | protected function skip($chars) |
||
1710 | { |
||
1711 | $this->pos += strspn($this->doc, $chars, $this->pos); |
||
1712 | $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1713 | } |
||
1714 | |||
1715 | protected function copy_skip($chars) |
||
1716 | { |
||
1717 | $pos = $this->pos; |
||
1718 | $len = strspn($this->doc, $chars, $pos); |
||
1719 | $this->pos += $len; |
||
1720 | $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1721 | if ($len === 0) { |
||
1722 | return ''; |
||
1723 | } |
||
1724 | |||
1725 | return substr($this->doc, $pos, $len); |
||
1726 | } |
||
1727 | |||
1728 | protected function copy_until($chars) |
||
1729 | { |
||
1730 | $pos = $this->pos; |
||
1731 | $len = strcspn($this->doc, $chars, $pos); |
||
1732 | $this->pos += $len; |
||
1733 | $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||
1734 | return substr($this->doc, $pos, $len); |
||
1735 | } |
||
1736 | |||
1737 | protected function copy_until_char($char) |
||
1738 | { |
||
1739 | if ($this->char === null) { |
||
1740 | return ''; |
||
1741 | } |
||
1742 | |||
1743 | if (($pos = strpos($this->doc, $char, $this->pos)) === false) { |
||
1744 | $ret = substr($this->doc, $this->pos, $this->size - $this->pos); |
||
1745 | $this->char = null; |
||
1746 | $this->pos = $this->size; |
||
1747 | |||
1748 | return $ret; |
||
1749 | } |
||
1750 | |||
1751 | if ($pos === $this->pos) { |
||
1752 | return ''; |
||
1753 | } |
||
1754 | $pos_old = $this->pos; |
||
1755 | $this->char = $this->doc[$pos]; |
||
1756 | $this->pos = $pos; |
||
1757 | |||
1758 | return substr($this->doc, $pos_old, $pos - $pos_old); |
||
1759 | } |
||
1760 | |||
1761 | protected function copy_until_char_escape($char) |
||
1762 | { |
||
1763 | if ($this->char === null) { |
||
1764 | return ''; |
||
1765 | } |
||
1766 | |||
1767 | $start = $this->pos; |
||
1768 | while (1) { |
||
1769 | if (($pos = strpos($this->doc, $char, $start)) === false) { |
||
1770 | $ret = substr($this->doc, $this->pos, $this->size - $this->pos); |
||
1771 | $this->char = null; |
||
1772 | $this->pos = $this->size; |
||
1773 | |||
1774 | return $ret; |
||
1775 | } |
||
1776 | |||
1777 | if ($pos === $this->pos) { |
||
1778 | return ''; |
||
1779 | } |
||
1780 | |||
1781 | if ($this->doc[$pos - 1] === '\\') { |
||
1782 | $start = $pos + 1; |
||
1783 | continue; |
||
1784 | } |
||
1785 | |||
1786 | $pos_old = $this->pos; |
||
1787 | $this->char = $this->doc[$pos]; |
||
1788 | $this->pos = $pos; |
||
1789 | |||
1790 | return substr($this->doc, $pos_old, $pos - $pos_old); |
||
1791 | } |
||
1792 | } |
||
1793 | |||
1794 | // remove noise from html content |
||
1795 | // save the noise in the $this->noise array. |
||
1796 | protected function remove_noise($pattern, $remove_tag = false) |
||
1797 | { |
||
1798 | global $debugObject; |
||
1799 | if (is_object($debugObject)) { |
||
1800 | $debugObject->debugLogEntry(1); |
||
1801 | } |
||
1802 | |||
1803 | $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); |
||
1804 | |||
1805 | for ($i = $count - 1; $i > -1; $i--) { |
||
1806 | $key = '___noise___'.sprintf('% 5d', count($this->noise) + 1000); |
||
1807 | if (is_object($debugObject)) { |
||
1808 | $debugObject->debugLog(2, 'key is: '.$key); |
||
1809 | } |
||
1810 | $idx = ($remove_tag) ? 0 : 1; |
||
1811 | $this->noise[$key] = $matches[$i][$idx][0]; |
||
1812 | $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); |
||
1813 | } |
||
1814 | |||
1815 | // reset the length of content |
||
1816 | $this->size = strlen($this->doc); |
||
1817 | if ($this->size > 0) { |
||
1818 | $this->char = $this->doc[0]; |
||
1819 | } |
||
1820 | } |
||
1821 | |||
1822 | // restore noise to html content |
||
1823 | public function restore_noise($text) |
||
1824 | { |
||
1825 | global $debugObject; |
||
1826 | if (is_object($debugObject)) { |
||
1827 | $debugObject->debugLogEntry(1); |
||
1828 | } |
||
1829 | |||
1830 | while (($pos = strpos($text, '___noise___')) !== false) { |
||
1831 | // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us... |
||
1832 | if (strlen($text) > $pos + 15) { |
||
1833 | $key = '___noise___'.$text[$pos + 11].$text[$pos + 12].$text[$pos + 13].$text[$pos + 14].$text[$pos + 15]; |
||
1834 | if (is_object($debugObject)) { |
||
1835 | $debugObject->debugLog(2, 'located key of: '.$key); |
||
1836 | } |
||
1837 | |||
1838 | if (isset($this->noise[$key])) { |
||
1839 | $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos + 16); |
||
1840 | } else { |
||
1841 | // do this to prevent an infinite loop. |
||
1842 | $text = substr($text, 0, $pos).'UNDEFINED NOISE FOR KEY: '.$key.substr($text, $pos + 16); |
||
1843 | } |
||
1844 | } else { |
||
1845 | // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem. |
||
1846 | $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY'.substr($text, $pos + 11); |
||
1847 | } |
||
1848 | } |
||
1849 | |||
1850 | return $text; |
||
1851 | } |
||
1852 | |||
1853 | // Sometimes we NEED one of the noise elements. |
||
1854 | public function search_noise($text) |
||
1855 | { |
||
1856 | global $debugObject; |
||
1857 | if (is_object($debugObject)) { |
||
1858 | $debugObject->debugLogEntry(1); |
||
1859 | } |
||
1860 | |||
1861 | foreach ($this->noise as $noiseElement) { |
||
1862 | if (strpos($noiseElement, $text) !== false) { |
||
1863 | return $noiseElement; |
||
1864 | } |
||
1865 | } |
||
1866 | } |
||
1867 | |||
1868 | public function __toString() |
||
1869 | { |
||
1870 | return $this->root->innertext(); |
||
1871 | } |
||
1872 | |||
1873 | public function __get($name) |
||
1886 | } |
||
1887 | } |
||
1888 | |||
1889 | // camel naming conventions |
||
1890 | public function childNodes($idx = -1) |
||
1891 | { |
||
1892 | return $this->root->childNodes($idx); |
||
1893 | } |
||
1894 | |||
1895 | public function firstChild() |
||
1896 | { |
||
1897 | return $this->root->first_child(); |
||
1898 | } |
||
1899 | |||
1900 | public function lastChild() |
||
1901 | { |
||
1902 | return $this->root->last_child(); |
||
1903 | } |
||
1904 | |||
1905 | public function createElement($name, $value = null) |
||
1906 | { |
||
1907 | return @str_get_html("<$name>$value</$name>")->first_child(); |
||
1908 | } |
||
1909 | |||
1910 | public function createTextNode($value) |
||
1911 | { |
||
1912 | return @end(str_get_html($value)->nodes); |
||
1913 | } |
||
1914 | |||
1915 | public function getElementById($id) |
||
1916 | { |
||
1917 | return $this->find("#$id", 0); |
||
1918 | } |
||
1919 | |||
1920 | public function getElementsById($id, $idx = null) |
||
1921 | { |
||
1922 | return $this->find("#$id", $idx); |
||
1923 | } |
||
1924 | |||
1925 | public function getElementByTagName($name) |
||
1928 | } |
||
1929 | |||
1930 | public function getElementsByTagName($name, $idx = -1) |
||
1931 | { |
||
1932 | return $this->find($name, $idx); |
||
1933 | } |
||
1934 | |||
1935 | public function loadFile() |
||
1936 | { |
||
1939 | } |
||
1940 | } |
||
1941 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.