| Total Complexity | 166 |
| Total Lines | 820 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like simple_html_dom often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use simple_html_dom, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 1119 | class simple_html_dom |
||
| 1120 | { |
||
| 1121 | /** @var simple_html_dom_node $root */ |
||
| 1122 | public $root = null; |
||
| 1123 | public $nodes = []; |
||
| 1124 | public $callback = null; |
||
| 1125 | public $lowercase = false; |
||
| 1126 | // Used to keep track of how large the text was when we started. |
||
| 1127 | public $original_size; |
||
| 1128 | public $size; |
||
| 1129 | protected $pos; |
||
| 1130 | protected $doc; |
||
| 1131 | protected $char; |
||
| 1132 | protected $cursor; |
||
| 1133 | protected $parent; |
||
| 1134 | protected $noise = []; |
||
| 1135 | protected $token_blank = " \t\r\n"; |
||
| 1136 | protected $token_equal = ' =/>'; |
||
| 1137 | protected $token_slash = " />\r\n\t"; |
||
| 1138 | protected $token_attr = ' >'; |
||
| 1139 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. |
||
| 1140 | public $_charset = ''; |
||
| 1141 | public $_target_charset = ''; |
||
| 1142 | protected $default_br_text = ''; |
||
| 1143 | public $default_span_text = ''; |
||
| 1144 | |||
| 1145 | // use isset instead of in_array, performance boost about 30%... |
||
| 1146 | protected $self_closing_tags = ['img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1]; |
||
| 1147 | protected $block_tags = ['root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1]; |
||
| 1148 | // Known sourceforge issue #2977341 |
||
| 1149 | // B tags that are not closed cause us to return everything to the end of the document. |
||
| 1150 | protected $optional_closing_tags = [ |
||
| 1151 | 'tr' => ['tr'=>1, 'td'=>1, 'th'=>1], |
||
| 1152 | 'th' => ['th'=>1], |
||
| 1153 | 'td' => ['td'=>1], |
||
| 1154 | 'li' => ['li'=>1], |
||
| 1155 | 'dt' => ['dt'=>1, 'dd'=>1], |
||
| 1156 | 'dd' => ['dd'=>1, 'dt'=>1], |
||
| 1157 | 'dl' => ['dd'=>1, 'dt'=>1], |
||
| 1158 | 'p' => ['p'=>1], |
||
| 1159 | 'nobr' => ['nobr'=>1], |
||
| 1160 | 'b' => ['b'=>1], |
||
| 1161 | 'option'=> ['option'=>1], |
||
| 1162 | ]; |
||
| 1163 | |||
// Create a DOM object and, when $str is non-empty, parse it right away.
//
// $str              HTML markup, a path accepted by is_file(), or an http:// URL.
//                   Paths and URLs are handed to load_file(); anything else to load().
// $lowercase        forwarded to load() when $str is treated as markup.
// $forceTagsClosed  when false, the optional-closing-tag table is emptied.
// $target_charset   stored in $this->_target_charset.
// $stripRN, $defaultBRText, $defaultSpanText  forwarded to load().
public function __construct($str = null, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // Configure the parser BEFORE any parsing happens; read_tag() consults
    // $this->optional_closing_tags while the document is being loaded.
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = [];
    }
    $this->_target_charset = $target_charset;

    if ($str) {
        if (preg_match("/^http:\/\//i", $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }
}
||
| 1179 | |||
// Release everything held by this object (delegates to clear()) when it is destroyed.
public function __destruct()
{
    $this->clear();
}
||
| 1184 | |||
// Load html from a string.
//
// Resets the parser state via prepare(), replaces comments, CDATA, script,
// style, code, server-side and smarty sections with noise placeholders, then
// parses the remaining markup and determines the character set.
//
// Returns $this so that calls can be chained.
public function load($str, $lowercase = true, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // prepare
    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
    // strip out comments
    $this->remove_noise("'<!--(.*?)-->'is");
    // strip out cdata
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // Script tags removal now precedes style tag removal.
    // strip out <script> tags
    $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
    $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
    // strip out <style> tags
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
    // strip out preformatted tags
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
    // strip out server side scripts
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
    // strip smarty scripts
    $this->remove_noise("'(\{\w)(.*?)(\})'s", true);

    // parsing: parse() returns false once the input is exhausted
    while ($this->parse());
    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    // make load function chainable
    return $this;
}
||
| 1220 | |||
// Load html from a file or URL.
//
// All arguments are passed straight through to file_get_contents().
// Returns false (after clearing this object) when the content could not be
// read; returns nothing on success.
public function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // Judge failure by the read result itself. error_get_last() also reports
    // stale errors raised long before this call, which made successful loads
    // look like failures.
    if ($contents === false) {
        $this->clear();

        return false;
    }

    $this->load($contents, true);
}
||
| 1233 | |||
// Remember $function_name in $this->callback.
public function set_callback($function_name)
{
    $this->callback = $function_name;
}
||
| 1239 | |||
// Forget any callback previously stored with set_callback().
public function remove_callback()
{
    $this->callback = null;
}
||
| 1245 | |||
// Serialise the DOM to a string.
// When $filepath is not the empty string the markup is also written to that
// file (with an exclusive lock). The markup is returned either way.
public function save($filepath = '')
{
    $markup = $this->root->innertext();

    if ($filepath === '') {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
}
||
| 1256 | |||
// Find dom node(s) by css selector, searching from the root node.
// Paperg - $lowercase lets the caller ask for case insensitive testing of the selector value.
public function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
}
||
| 1263 | |||
// Clean up memory (php5 circular references leak): clears every node this
// object references, then drops the parent, root, document text and noise table.
public function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // The following loop is documented in the sourceforge repository (2977248) as a fix for
    // ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc);
    unset($this->noise);
}
||
| 1289 | |||
// Dump the tree by delegating to the root node's dump(); $show_attr is passed through unchanged.
public function dump($show_attr = true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
||
| 1294 | |||
// Prepare HTML data and initialise all parser state for a fresh parse.
//
// $str        the markup to parse.
// $lowercase  stored in $this->lowercase.
// $stripRN    when true, every "\r" and "\n" in $str is replaced by a space.
// $defaultBRText / $defaultSpanText  stored for later use by nodes.
protected function prepare($str, $lowercase = true, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // set the length of content before we do anything to it.
    $this->size = strlen($str);
    // Save the original size of the html that we got in. It might be useful to someone.
    $this->original_size = $this->size;

    // before we save the string as the doc... strip out the \r \n's if we are told to.
    if ($stripRN) {
        $str = str_replace(["\r", "\n"], ' ', $str);

        // set the length of content since we have changed it.
        $this->size = strlen($str);
    }

    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = [];
    $this->nodes = [];
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    $this->root = new simple_html_dom_node($this);
    $this->root->tag = 'root';
    $this->root->_[HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    // Always reset the current character: for an empty document it must be
    // null rather than whatever a previous parse left behind.
    $this->char = ($this->size > 0) ? $this->doc[0] : null;
}
||
| 1331 | |||
// Parse one unit of html content.
// If there is text before the next '<' it becomes a text node and true is
// returned; otherwise the result of read_tag() is returned.
protected function parse()
{
    $text = $this->copy_until_char('<');

    if ($text === '') {
        return $this->read_tag();
    }

    // text
    $textNode = new simple_html_dom_node($this);
    $this->cursor++;
    $textNode->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($textNode, false);

    return true;
}
||
| 1347 | |||
// PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.
// NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
// (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
//
// Lookup order: content-type header helper (if defined), then the
// meta[http-equiv=Content-Type] element, then mb_detect_encoding(), then UTF-8.
// The result is stored in $this->_charset and returned.
protected function parse_charset()
{
    global $debugObject;

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        // The "charset" parameter name is case-insensitive, hence the /i flag.
        $success = preg_match('/charset=(.+)/i', $contentTypeHeader, $matches);
        if ($success) {
            $charset = $matches[1];
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, 'header content-type found charset of: '.$charset);
            }
        }
    }

    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0);
        if (!empty($el)) {
            $fullvalue = $el->content;
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, 'meta content-type tag found'.$fullvalue);
            }

            if (!empty($fullvalue)) {
                $success = preg_match('/charset=(.+)/i', $fullvalue, $matches);
                if ($success) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debugObject)) {
                        $debugObject->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');
                    }
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty($charset)) {
        // Have php try to detect the encoding from the text given to us.
        $charset = (function_exists('mb_detect_encoding')) ? mb_detect_encoding($this->root->plaintext.'ascii', ['UTF-8', 'CP1252']) : false;
        if (is_object($debugObject)) {
            $debugObject->debugLog(2, 'mb_detect found: '.$charset);
        }

        // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
        if ($charset === false) {
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, 'since mb_detect failed - using default of utf-8');
            }
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
    if (in_array(strtolower($charset), ['iso-8859-1', 'latin1', 'latin-1'], true)) {
        if (is_object($debugObject)) {
            $debugObject->debugLog(2, 'replacing '.$charset.' with CP1252 as its a superset');
        }
        $charset = 'CP1252';
    }

    if (is_object($debugObject)) {
        $debugObject->debugLog(1, 'EXIT - '.$charset);
    }

    return $this->_charset = $charset;
}
||
| 1422 | |||
// Read tag info.
//
// Consumes one tag that starts at the current '<' and links the resulting
// node under the current parent. Handles, in order: end tags, "<!..." nodes
// (comments / unknown), stray '<' text, names that are not valid tag names,
// and finally real begin tags together with their attributes.
//
// Returns false when the current character is not '<' (the root's end cursor
// is recorded first); returns true in every other case.
protected function read_tag()
{
    if ($this->char !== '<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;

        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181
        // (previously the skip used $this->token_blank_t).
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' ')) !== false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower !== $tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // walk up looking for the element this end tag closes
                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;

                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;

                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else {
                return $this->as_text_node($tag);
            }
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    $this->cursor++;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0] === '!') {
        $node->_[HDOM_INFO_TEXT] = '<'.$tag.$this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char === '>') {
            $node->_[HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text: the "tag name" itself contains another '<'
    if (strpos($tag, '<') !== false) {
        $tag = '<'.substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // The hyphen is escaped on purpose: an unescaped "\w-:" is rejected by
    // PCRE2 as an invalid range, so the pattern would never compile.
    if (!preg_match('/^[\w\-:]+$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<'.$tag.$this->copy_until('<>');
        if ($this->char === '<') {
            $this->link_nodes($node, false);

            return true;
        }

        if ($this->char === '>') {
            $node->_[HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinite loop
    $space = [$this->copy_skip($this->token_blank), '', ''];

    // attributes
    do {
        if ($this->char !== null && $space[0] === '') {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard === $this->pos) {
            // no progress since the previous iteration: force one character forward
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag.$space[0].$name;
            $node->tag = 'text';
            $this->link_nodes($node, false);

            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos - 1] == '<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = [];
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos - $begin_tag_pos - 1);
            $this->pos -= 2;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);

            return true;
        }

        if ($name !== '/' && $name !== '') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) {
                $name = strtolower($name);
            }
            if ($this->char === '=') {
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            } else {
                // no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char != '>') {
                    $this->char = $this->doc[--$this->pos]; // prev
                }
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = [$this->copy_skip($this->token_blank), '', ''];
        } else {
            break;
        }
    } while ($this->char !== '>' && $this->char !== '/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>') === '/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    } else {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == 'br') {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
||
| 1651 | |||
// Parse one attribute value (the part after '=') and store it on $node.
//
// $node   node receiving the attribute.
// $name   attribute name.
// $space  whitespace triple for this attribute; slot 2 is filled in here.
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
    if (isset($node->attr[$name])) {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    $quote = $this->char;
    if ($quote === '"' || $quote === '\'') {
        // quoted value: record the quote style, then read up to the matching unescaped quote
        $node->_[HDOM_INFO_QUOTE][] = ($quote === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        $value = $this->restore_noise($this->copy_until_char_escape($quote));
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    } else {
        // bare value: runs until a character from $this->token_attr
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $value = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $value = str_replace(["\r", "\n"], '', $value);

    // PaperG: If this is a "class" selector, lets get rid of the preceding and trailing space since some people leave it in the multi class case.
    if ($name == 'class') {
        $value = trim($value);
    }

    $node->attr[$name] = $value;
}
||
| 1687 | |||
// Attach $node to the current parent: it is always appended to the parent's
// node list, and to the parent's children list as well when $is_child is truthy.
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $owner->children[] = $node;
}
||
| 1697 | |||
// Record an end tag as plain text: creates a node whose text is "</$tag>",
// links it under the current parent, advances one character and returns true.
protected function as_text_node($tag)
{
    $textNode = new simple_html_dom_node($this);
    $this->cursor++;
    $textNode->_[HDOM_INFO_TEXT] = '</'.$tag.'>';
    $this->link_nodes($textNode, false);

    // next
    ++$this->pos;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;

    return true;
}
||
| 1708 | |||
// Move the read position past the run of characters from $chars that starts
// at the current position, then refresh the current character.
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos = $this->pos + $run;

    // next
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
||
| 1714 | |||
// Like skip(), but returns the skipped run of characters ('' when nothing was skipped).
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run = strspn($this->doc, $chars, $start);

    $this->pos = $start + $run;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return ($run === 0) ? '' : substr($this->doc, $start, $run);
}
||
| 1727 | |||
// Return the text from the current position up to (not including) the first
// character contained in $chars, and move the read position there.
protected function copy_until($chars)
{
    $start = $this->pos;
    $span = strcspn($this->doc, $chars, $start);

    $this->pos = $start + $span;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return substr($this->doc, $start, $span);
}
||
| 1736 | |||
// Return the text from the current position up to the next occurrence of
// $char and move the read position onto that occurrence.
// Returns '' when the input is exhausted or $char is at the current position.
// When $char does not occur again, the rest of the document is returned and
// the input is marked as exhausted.
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;

        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;

    return substr($this->doc, $from, $hit - $from);
}
||
| 1760 | |||
// Same contract as copy_until_char(), except that an occurrence of $char
// directly preceded by a backslash is skipped over.
protected function copy_until_char_escape($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $searchFrom = $from;

    while (true) {
        $hit = strpos($this->doc, $char, $searchFrom);

        if ($hit === false) {
            // no terminator left: hand back the remainder and mark the input as exhausted
            $rest = substr($this->doc, $from, $this->size - $from);
            $this->char = null;
            $this->pos = $this->size;

            return $rest;
        }

        if ($hit === $from) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\') {
            $this->char = $this->doc[$hit];
            $this->pos = $hit;

            return substr($this->doc, $from, $hit - $from);
        }

        // escaped occurrence: keep looking after it
        $searchFrom = $hit + 1;
    }
}
||
| 1793 | |||
// Remove noise from html content.
// Every match of $pattern is swapped for a placeholder key and the original
// text is saved in the $this->noise array under that key. With $remove_tag
// the whole match is replaced, otherwise only capture group 1.
protected function remove_noise($pattern, $remove_tag = false)
{
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    // which capture to replace: 0 = entire match, 1 = first group
    $idx = ($remove_tag) ? 0 : 1;

    // Work from the last match to the first so earlier offsets stay valid.
    $i = $count - 1;
    while ($i > -1) {
        $key = '___noise___'.sprintf('% 5d', count($this->noise) + 1000);
        if (is_object($debugObject)) {
            $debugObject->debugLog(2, 'key is: '.$key);
        }

        $captured = $matches[$i][$idx][0];
        $offset = $matches[$i][$idx][1];

        $this->noise[$key] = $captured;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($captured));

        $i--;
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
||
| 1821 | |||
// Restore noise to html content: every placeholder key found in $text is
// replaced by the text saved for it, and the resulting string is returned.
public function restore_noise($text)
{
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    while (($pos = strpos($text, '___noise___')) !== false) {
        $head = substr($text, 0, $pos);

        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) <= $pos + 15) {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $text = $head.'NO NUMERIC NOISE KEY'.substr($text, $pos + 11);
            continue;
        }

        // the marker plus the five characters that follow it
        $key = substr($text, $pos, 16);
        if (is_object($debugObject)) {
            $debugObject->debugLog(2, 'located key of: '.$key);
        }

        $tail = substr($text, $pos + 16);
        if (isset($this->noise[$key])) {
            $text = $head.$this->noise[$key].$tail;
        } else {
            // do this to prevent an infinite loop.
            $text = $head.'UNDEFINED NOISE FOR KEY: '.$key.$tail;
        }
    }

    return $text;
}
||
| 1852 | |||
// Sometimes we NEED one of the noise elements.
// Returns the first saved noise entry that contains $text; nothing is
// returned when no entry matches.
public function search_noise($text)
{
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    foreach ($this->noise as $saved) {
        if (strpos($saved, $text) === false) {
            continue;
        }

        return $saved;
    }
}
||
| 1867 | |||
// String conversion yields the root node's inner text.
public function __toString()
{
    $root = $this->root;

    return $root->innertext();
}
||
| 1872 | |||
| 1873 | public function __get($name) |
||
| 1886 | } |
||
| 1887 | } |
||
| 1888 | |||
| 1889 | // camel naming conventions |
||
// camelCase alias: forwards to the root node's childNodes() with the same $idx.
public function childNodes($idx = -1)
{
    $root = $this->root;

    return $root->childNodes($idx);
}
||
| 1894 | |||
// camelCase alias: forwards to the root node's first_child().
public function firstChild()
{
    $root = $this->root;

    return $root->first_child();
}
||
| 1899 | |||
// camelCase alias: forwards to the root node's last_child().
public function lastChild()
{
    $root = $this->root;

    return $root->last_child();
}
||
| 1904 | |||
// Build "<$name>$value</$name>", parse it with str_get_html() and return the
// first child of the parsed result. Errors are suppressed with @.
public function createElement($name, $value = null)
{
    return @str_get_html("<{$name}>{$value}</{$name}>")->first_child();
}
||
| 1909 | |||
// Parse $value with str_get_html() and return the last entry of the parsed
// object's node list. Errors are suppressed with @.
public function createTextNode($value)
{
    return @end(str_get_html($value)->nodes);
}
||
| 1914 | |||
// Return the first match (index 0) of the "#id" selector.
public function getElementById($id)
{
    return $this->find("#{$id}", 0);
}
||
| 1919 | |||
// Run the "#id" selector through find(), passing $idx along unchanged.
public function getElementsById($id, $idx = null)
{
    return $this->find("#{$id}", $idx);
}
||
| 1924 | |||
| 1925 | public function getElementByTagName($name) |
||
| 1928 | } |
||
| 1929 | |||
// Use the tag name as the selector for find(), passing $idx along unchanged.
// NOTE(review): $idx defaults to -1 here while getElementsById() defaults to
// null - confirm against the node's find() whether -1 really means "all matches".
public function getElementsByTagName($name, $idx = -1)
{
    $selector = $name;

    return $this->find($selector, $idx);
}
||
| 1934 | |||
| 1935 | public function loadFile() |
||
| 1936 | { |
||
| 1939 | } |
||
| 1940 | } |
||
| 1941 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.