Total Complexity | 166 |
Total Lines | 794 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like simple_html_dom often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use simple_html_dom, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
1075 | class simple_html_dom |
||
1076 | { |
||
1077 | public $root = null; |
||
1078 | public $nodes = []; |
||
1079 | public $callback = null; |
||
1080 | public $lowercase = false; |
||
1081 | // Used to keep track of how large the text was when we started. |
||
1082 | public $original_size; |
||
1083 | public $size; |
||
1084 | protected $pos; |
||
1085 | protected $doc; |
||
1086 | protected $char; |
||
1087 | protected $cursor; |
||
1088 | protected $parent; |
||
1089 | protected $noise = []; |
||
1090 | protected $token_blank = " \t\r\n"; |
||
1091 | protected $token_equal = ' =/>'; |
||
1092 | protected $token_slash = " />\r\n\t"; |
||
1093 | protected $token_attr = ' >'; |
||
1094 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. |
||
1095 | public $_charset = ''; |
||
1096 | public $_target_charset = ''; |
||
1097 | protected $default_br_text = ""; |
||
1098 | public $default_span_text = ""; |
||
1099 | |||
1100 | // use isset instead of in_array, performance boost about 30%... |
||
1101 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); |
||
1102 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); |
||
1103 | // Known sourceforge issue #2977341 |
||
1104 | // B tags that are not closed cause us to return everything to the end of the document. |
||
1105 | protected $optional_closing_tags = array( |
||
1106 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), |
||
1107 | 'th'=>array('th'=>1), |
||
1108 | 'td'=>array('td'=>1), |
||
1109 | 'li'=>array('li'=>1), |
||
1110 | 'dt'=>array('dt'=>1, 'dd'=>1), |
||
1111 | 'dd'=>array('dd'=>1, 'dt'=>1), |
||
1112 | 'dl'=>array('dd'=>1, 'dt'=>1), |
||
1113 | 'p'=>array('p'=>1), |
||
1114 | 'nobr'=>array('nobr'=>1), |
||
1115 | 'b'=>array('b'=>1), |
||
1116 | 'option'=>array('option'=>1), |
||
1117 | ); |
||
1118 | |||
1119 | public function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) |
||
1133 | } |
||
1134 | |||
/**
 * Destructor: releases the parsed tree by delegating to clear().
 */
public function __destruct()
{
    $this->clear();
}
||
1139 | |||
1140 | // load html from string |
||
/**
 * Load HTML from a string and parse it into the node tree.
 *
 * Steps: prepare() resets state and stores the document, a fixed sequence of
 * remove_noise() passes replaces comments, CDATA, script/style/code bodies,
 * server-side script blocks and smarty blocks with placeholders, then parse()
 * is called until it returns false, and finally the charset is detected.
 *
 * @param string $str             HTML source.
 * @param bool   $lowercase       Forwarded to prepare().
 * @param bool   $stripRN         Forwarded to prepare().
 * @param string $defaultBRText   Forwarded to prepare().
 * @param string $defaultSpanText Forwarded to prepare().
 * @return $this Allows call chaining.
 */
public function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // Each entry is array(pattern, remove_tag). The order is significant:
    // script removal must come before style removal (sourceforge #2949097).
    $noise_patterns = array(
        // comments
        array("'<!--(.*?)-->'is", false),
        // cdata
        array("'<!\[CDATA\[(.*?)\]\]>'is", true),
        // <script> tags
        array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false),
        array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false),
        // <style> tags
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
        // preformatted <code> tags
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
        // server side scripts
        array("'(<\?)(.*?)(\?>)'s", true),
        // smarty scripts
        array("'(\{\w)(.*?)(\})'s", true),
    );
    foreach ($noise_patterns as $entry) {
        $this->remove_noise($entry[0], $entry[1]);
    }

    // Consume the document; parse() returns false once nothing is left.
    while ($this->parse()) {
        // intentionally empty
    }

    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    return $this;
}
||
1175 | |||
1176 | // load html from file |
||
/**
 * Load HTML from a file or URL.
 *
 * All arguments are forwarded unchanged to file_get_contents().
 *
 * Failure is detected from the return value of file_get_contents() rather
 * than from error_get_last(): the latter reports the most recent error of
 * the whole request, so an unrelated earlier warning would cause a perfectly
 * valid document to be discarded.
 *
 * @return false|null false when the contents could not be read (the DOM is
 *                    cleared in that case); null on success.
 */
public function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
||
1187 | |||
1188 | // set callback function |
||
1189 | public function set_callback($function_name) |
||
1192 | } |
||
1193 | |||
1194 | // remove callback function |
||
/**
 * Drop the currently registered callback by resetting it to null.
 */
public function remove_callback()
{
    $this->callback = null;
}
||
1199 | |||
1200 | // save dom as string |
||
/**
 * Serialize the DOM to a string and optionally write it to a file.
 *
 * @param string $filepath Target path; an empty string means "do not write".
 * @return string The root node's inner text.
 */
public function save($filepath='')
{
    $html = $this->root->innertext();

    if ($filepath === '') {
        return $html;
    }

    // Exclusive lock while writing.
    file_put_contents($filepath, $html, LOCK_EX);
    return $html;
}
||
1209 | |||
1210 | // find dom node by css selector |
||
1211 | // Paperg - allow us to specify that we want case insensitive testing of the value of the selector. |
||
/**
 * Find nodes by CSS selector, starting from the root node.
 *
 * @param string   $selector  Selector expression.
 * @param int|null $idx       Forwarded to the root node's find().
 * @param bool     $lowercase Forwarded to the root node's find()
 *                            (case-insensitive value matching, per the original note).
 * @return mixed Whatever the root node's find() returns.
 */
public function find($selector, $idx=null, $lowercase=false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
||
1216 | |||
1217 | // clean up memory due to php5 circular references memory leak... |
||
/**
 * Release every node held by this DOM so memory can be reclaimed
 * (works around circular-reference leaks in older PHP versions).
 *
 * The node list is emptied after each node has been cleared; the original
 * `$n = null` inside the loop only reset the loop variable. Declared
 * properties are reset instead of unset(), so later reads (for example
 * search_noise() iterating $this->noise) see a defined value rather than
 * falling through to __get().
 */
public function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
    }
    $this->nodes = [];

    // Kept from the sourceforge fix #2977248. This class declares no
    // $children property, so this only runs if one was set dynamically.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
    }
    $this->parent = null;

    if (isset($this->root)) {
        $this->root->clear();
    }
    $this->root = null;

    $this->doc = null;
    $this->noise = [];
}
||
1242 | |||
/**
 * Dump the tree by delegating to the root node's dump().
 *
 * @param bool $show_attr Forwarded to the root node.
 */
public function dump($show_attr=true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
||
1247 | |||
1248 | // prepare HTML data and init everything |
||
/**
 * Reset parser state and install a new document.
 *
 * @param string $str             HTML source.
 * @param bool   $lowercase       Stored in $this->lowercase.
 * @param bool   $stripRN         When true, every "\r" and "\n" is replaced by a space.
 * @param string $defaultBRText   Stored in $this->default_br_text.
 * @param string $defaultSpanText Stored in $this->default_span_text.
 */
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Remember the untouched input length; size tracks the working document.
    $this->original_size = strlen($str);
    $this->size = $this->original_size;

    if ($stripRN) {
        $str = str_replace(array("\r", "\n"), " ", $str);
        $this->size = strlen($str);
    }

    // Scanner state.
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = [];
    $this->nodes = [];
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // Synthetic root node; it also becomes the initial insertion parent.
    $root = new simple_html_dom_node($this);
    $this->root = $root;
    $root->tag = 'root';
    $root->_[HDOM_INFO_BEGIN] = -1;
    $root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $root;

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
||
1284 | |||
1285 | // parse html content |
||
/**
 * Parse one unit of the document: either a run of text up to the next '<'
 * or, when no text precedes it, a tag.
 *
 * @return bool true after a text node was linked; otherwise the result of read_tag().
 */
protected function parse()
{
    $text = $this->copy_until_char('<');

    if ($text !== '') {
        // Plain text before the next tag becomes a text node.
        $text_node = new simple_html_dom_node($this);
        ++$this->cursor;
        $text_node->_[HDOM_INFO_TEXT] = $text;
        $this->link_nodes($text_node, false);
        return true;
    }

    return $this->read_tag();
}
||
1299 | |||
1300 | // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later. |
||
1301 | // NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec |
||
1302 | // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism. |
||
/**
 * Determine the character set of the parsed document and store it in $this->_charset.
 *
 * Sources are tried in this order:
 *  1. The content-type header returned by the optional user-supplied function
 *     get_last_retrieve_url_contents_content_type().
 *  2. A <meta http-equiv=Content-Type> element; if it has content but no
 *     charset, ISO-8859-1 is assumed.
 *  3. mb_detect_encoding() over the plain text, when mbstring is available.
 *  4. A default of UTF-8.
 * ISO-8859-1 / Latin1 / Latin-1 results are replaced by CP1252.
 *
 * mbstring is an optional extension, so the detection call is guarded; the
 * original unguarded call is a fatal error on builds without it.
 *
 * @return string The detected charset.
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        // Cast guards against a null/false header being handed to preg_match().
        $contentTypeHeader = (string) get_last_retrieve_url_contents_content_type();
        if (preg_match('/charset=(.+)/', $contentTypeHeader, $matches)) {
            $charset = $matches[1];
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);
            }
        }
    }

    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0);
        if (!empty($el)) {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);
            }

            if (!empty($fullvalue)) {
                if (preg_match('/charset=(.+)/', $fullvalue, $matches)) {
                    $charset = $matches[1];
                } else {
                    // A meta tag without an explicit charset is typically ISO-8859-1.
                    if (is_object($debug_object)) {
                        $debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');
                    }
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // Nothing declared: try to detect the encoding from the text itself.
    if (empty($charset)) {
        $charset = false;

        if (function_exists('mb_detect_encoding')) {
            $charset = mb_detect_encoding($this->root->plaintext . "ascii", array("UTF-8", "CP1252"));
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'mb_detect found: ' . $charset);
            }
        }

        // Detection unavailable or failed: assume UTF-8 so processing can continue.
        if ($charset === false) {
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');
            }
            $charset = 'UTF-8';
        }
    }

    // CP1252 is a superset of these encodings, so prefer it.
    if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true)) {
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');
        }
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {
        $debug_object->debugLog(1, 'EXIT - ' . $charset);
    }

    return $this->_charset = $charset;
}
||
1374 | |||
1375 | // read tag info |
||
/**
 * Read one tag at the current position and link the resulting node into the tree.
 *
 * Handles, in order: closing tags (including recovery when the closing tag
 * does not match the current parent), "<!...>" constructs (comments and
 * doctype/unknown), malformed tags that are kept as text, and regular
 * opening tags with their attributes.
 *
 * Fixes relative to the original:
 *  - The tag-name pattern was "/^[\w-:]+$/". Under PCRE2 (PHP 7.3+) a hyphen
 *    following a class escape such as \w is rejected as an invalid range, so
 *    preg_match() returned false and every tag was treated as text. The hyphen
 *    is now escaped.
 *  - `$pos=strpos($tag, '<')!==false` assigned a boolean to an unused
 *    variable because of operator precedence; the assignment is removed.
 *
 * NOTE(review): HDOM_INFO_END = 0 appears to mark a node without an explicit
 * closing tag - confirm against simple_html_dom_node.
 *
 * @return bool false when the current character is not '<' (the root's end
 *              is recorded); true after a node has been processed.
 */
protected function read_tag()
{
    if ($this->char!=='<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/') {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower!==$tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                // Current parent may be closed implicitly; walk up to find the matching block tag.
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                // The closing tag matches the grandparent: close the parent implicitly.
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else {
                return $this->as_text_node($tag);
            }
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char==='>') {
            $node->_[HDOM_INFO_TEXT].='>';
        }
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text: another '<' inside the tag name means this was not a tag at all
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Tag names may contain word characters, '-' and ':'; anything else is kept as text.
    if (!preg_match('/^[\w\-:]+$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') {
            $node->_[HDOM_INFO_TEXT].='>';
        }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinite loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        if ($this->char!==null && $space[0]==='') {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard===$this->pos) {
            // No progress since the last iteration: force the scanner forward.
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = [];
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) {
                $name = strtolower($name);
            }
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            } else {
                // no value attr: nowrap, checked, selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') {
                    $this->char = $this->doc[--$this->pos]; // prev
                }
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        } else {
            break;
        }
    } while ($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    } else {
        // Subsequent nodes nest under this one, unless the tag is in self_closing_tags.
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // A BR tag never has sub nodes, so its inner text is set to the configured
    // default; this lets plaintext output render the line break as the user wants.
    if ($node->tag == "br") {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
||
1597 | |||
1598 | // parse attributes |
||
/**
 * Parse the value of one attribute and store it on the node.
 *
 * If the attribute is already set on the node the call returns immediately,
 * so the first occurrence wins (sourceforge #3061408).
 *
 * @param object $node  Node receiving the attribute.
 * @param string $name  Attribute name.
 * @param array  $space Passed by reference; index 2 receives the blanks skipped before the value.
 */
protected function parse_attr($node, $name, &$space)
{
    if (isset($node->attr[$name])) {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    $opening = $this->char;
    if ($opening === '"' || $opening === '\'') {
        // Quoted value: record the quote style, step past the opening quote,
        // read up to the matching unescaped quote, then step past it.
        $node->_[HDOM_INFO_QUOTE][] = ($opening === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape($opening));
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    } else {
        // Unquoted value: read up to the next character in token_attr.
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: \r and \n count as html whitespace and are removed from attribute values.
    $node->attr[$name] = str_replace(array("\r", "\n"), "", $node->attr[$name]);

    // PaperG: class values are trimmed because some authors leave stray
    // leading/trailing spaces in multi-class attributes.
    if ($name == "class") {
        $node->attr[$name] = trim($node->attr[$name]);
    }
}
||
1633 | |||
1634 | // link node's parent |
||
/**
 * Attach a node to the current parent.
 *
 * @param object $node     Node to attach (by reference).
 * @param bool   $is_child When true the node is also appended to the parent's children list.
 */
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }
    $owner->children[] = $node;
}
||
1643 | |||
1644 | // as a text node |
||
1645 | protected function as_text_node($tag) |
||
1653 | } |
||
1654 | |||
/**
 * Advance past every consecutive character found in $chars and refresh the current character.
 *
 * @param string $chars Set of characters to skip.
 */
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
||
1660 | |||
/**
 * Advance past every consecutive character found in $chars and return the skipped text.
 *
 * @param string $chars Set of characters to skip.
 * @return string The skipped characters, or '' when none matched.
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run = strspn($this->doc, $chars, $start);

    $this->pos += $run;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    return ($run === 0) ? '' : substr($this->doc, $start, $run);
}
||
1672 | |||
/**
 * Read forward until any character from $chars is reached and return the text read.
 *
 * @param string $chars Set of stop characters.
 * @return string Text between the old position and the stop character.
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $run = strcspn($this->doc, $chars, $start);
    $copied = substr($this->doc, $start, $run);

    $this->pos += $run;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return $copied;
}
||
1681 | |||
/**
 * Read forward until the given character and return the text read.
 *
 * If the character does not occur again, the rest of the document is
 * returned and the scanner is placed at the end (current char = null).
 *
 * @param string $char Stop character.
 * @return string Text before the stop character; '' at end of input or when
 *                the stop character is at the current position.
 */
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        // Not found: consume everything that is left.
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
||
1703 | |||
/**
 * Like copy_until_char(), but occurrences of the stop character that are
 * immediately preceded by a backslash are ignored.
 *
 * @param string $char Stop character.
 * @return string Text before the first unescaped stop character; the rest of
 *                the document when there is none; '' at end of input or when
 *                the stop character is at the current position.
 */
protected function copy_until_char_escape($char)
{
    if ($this->char === null) {
        return '';
    }

    $search_from = $this->pos;
    for (;;) {
        $hit = strpos($this->doc, $char, $search_from);

        if ($hit === false) {
            // No unescaped occurrence: consume everything that is left.
            $rest = substr($this->doc, $this->pos, $this->size - $this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $this->pos) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\') {
            $begin = $this->pos;
            $this->char = $this->doc[$hit];
            $this->pos = $hit;
            return substr($this->doc, $begin, $hit - $begin);
        }

        // Escaped occurrence: keep searching after it.
        $search_from = $hit + 1;
    }
}
||
1734 | |||
1735 | // remove noise from html content |
||
1736 | // save the noise in the $this->noise array. |
||
/**
 * Replace every match of $pattern in the document with a placeholder key and
 * remember the removed text in $this->noise under that key.
 *
 * @param string $pattern    PCRE pattern; group 1 is the inner content.
 * @param bool   $remove_tag When true the whole match (group 0) is replaced,
 *                           otherwise only group 1.
 */
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    $found = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
    $group = ($remove_tag) ? 0 : 1;

    // Work from the last match backwards so earlier offsets stay valid
    // while the document is being rewritten.
    for ($i = $found - 1; $i > -1; --$i) {
        $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, 'key is: ' . $key);
        }

        $captured = $matches[$i][$group]; // array(text, offset)
        $this->noise[$key] = $captured[0];
        $this->doc = substr_replace($this->doc, $key, $captured[1], strlen($captured[0]));
    }

    // The document length may have changed; resync size and current character.
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
||
1762 | |||
1763 | // restore noise to html content |
||
1764 | public function restore_noise($text) |
||
1791 | } |
||
1792 | |||
1793 | // Sometimes we NEED one of the noise elements. |
||
/**
 * Return the first stored noise element that contains $text.
 *
 * @param string $text Substring to look for.
 * @return string|null The matching noise element, or null when none contains $text.
 */
public function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    foreach ($this->noise as $candidate) {
        if (strpos($candidate, $text) === false) {
            continue;
        }
        return $candidate;
    }

    return null;
}
||
/**
 * String conversion yields the root node's inner text.
 *
 * @return string
 */
public function __toString()
{
    $root = $this->root;
    return $root->innertext();
}
||
1811 | |||
/**
 * Magic read access for virtual properties.
 *
 * Supported names: outertext and innertext (both return the root's inner
 * text), plaintext, charset, target_charset. Any other name yields null.
 *
 * @param string $name Property name.
 * @return mixed
 */
public function __get($name)
{
    if ($name === 'outertext' || $name === 'innertext') {
        return $this->root->innertext();
    }
    if ($name === 'plaintext') {
        return $this->root->text();
    }
    if ($name === 'charset') {
        return $this->_charset;
    }
    if ($name === 'target_charset') {
        return $this->_target_charset;
    }

    return null;
}
||
1827 | |||
1828 | // camel naming conventions |
||
/**
 * Camel-case alias: delegates to the root node's childNodes().
 *
 * @param int $idx Forwarded to the root node.
 * @return mixed Whatever the root node's childNodes() returns.
 */
public function childNodes($idx=-1)
{
    $root = $this->root;
    return $root->childNodes($idx);
}
||
/**
 * Camel-case alias: delegates to the root node's first_child().
 *
 * @return mixed Whatever the root node's first_child() returns.
 */
public function firstChild()
{
    $root = $this->root;
    return $root->first_child();
}
||
1837 | public function lastChild() |
||
1840 | } |
||
/**
 * Build a detached element by parsing "<name>value</name>" with str_get_html()
 * and returning the first child of the resulting document.
 *
 * The original called first_child() directly on the parse result; if
 * str_get_html() does not return an object, that is a fatal error which the
 * `@` operator cannot suppress. The result is now checked first.
 * NOTE(review): str_get_html() is defined outside this listing - its exact
 * failure value (presumably false) should be confirmed.
 *
 * @param string      $name  Tag name.
 * @param string|null $value Inner content.
 * @return mixed The new element, or null when the markup could not be parsed.
 */
public function createElement($name, $value=null)
{
    // `@` kept so diagnostics raised while parsing stay suppressed as before.
    $dom = @str_get_html("<$name>$value</$name>");
    if (!is_object($dom)) {
        return null;
    }
    return $dom->first_child();
}
||
/**
 * Build a detached text node by parsing $value with str_get_html() and
 * returning the last node of the resulting document.
 *
 * The original read ->nodes from the parse result without checking it and
 * passed the outcome to end(); on PHP 8, end() throws a TypeError for a
 * non-array argument and `@` does not suppress exceptions. The result is now
 * checked first.
 * NOTE(review): str_get_html() is defined outside this listing - its exact
 * failure value (presumably false, e.g. for empty input) should be confirmed.
 *
 * @param string $value Text content.
 * @return mixed The last parsed node; false when the node list is empty;
 *               null when the text could not be parsed.
 */
public function createTextNode($value)
{
    // `@` kept so diagnostics raised while parsing stay suppressed as before.
    $dom = @str_get_html($value);
    if (!is_object($dom) || !is_array($dom->nodes)) {
        return null;
    }
    return end($dom->nodes);
}
||
/**
 * Return the first element matching the selector "#<id>".
 *
 * @param string $id Element id.
 * @return mixed Result of find() with index 0.
 */
public function getElementById($id)
{
    $selector = "#$id";
    return $this->find($selector, 0);
}
||
/**
 * Return the elements matching the selector "#<id>".
 *
 * @param string   $id  Element id.
 * @param int|null $idx Forwarded to find().
 * @return mixed Result of find().
 */
public function getElementsById($id, $idx=null)
{
    $selector = "#$id";
    return $this->find($selector, $idx);
}
||
/**
 * Return the first element with the given tag name.
 *
 * @param string $name Tag name, used directly as the selector.
 * @return mixed Result of find() with index 0.
 */
public function getElementByTagName($name)
{
    $first = $this->find($name, 0);
    return $first;
}
||
1861 | public function getElementsByTagName($name, $idx=-1) |
||
1864 | } |
||
/**
 * Camel-case alias of load_file().
 *
 * The original passed the collected argument array as a single argument, so
 * load_file() handed an array to file_get_contents() as the filename and the
 * load always failed. The arguments are now spread back into individual
 * parameters, and load_file()'s result is returned to the caller.
 *
 * @return mixed Whatever load_file() returns.
 */
public function loadFile()
{
    $args = func_get_args();
    return call_user_func_array(array($this, 'load_file'), $args);
}
||
1870 | } |
||
1871 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.