| Total Complexity | 166 |
| Total Lines | 794 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like simple_html_dom often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use simple_html_dom, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 1075 | class simple_html_dom |
||
| 1076 | { |
||
| 1077 | public $root = null; |
||
| 1078 | public $nodes = []; |
||
| 1079 | public $callback = null; |
||
| 1080 | public $lowercase = false; |
||
| 1081 | // Used to keep track of how large the text was when we started. |
||
| 1082 | public $original_size; |
||
| 1083 | public $size; |
||
| 1084 | protected $pos; |
||
| 1085 | protected $doc; |
||
| 1086 | protected $char; |
||
| 1087 | protected $cursor; |
||
| 1088 | protected $parent; |
||
| 1089 | protected $noise = []; |
||
| 1090 | protected $token_blank = " \t\r\n"; |
||
| 1091 | protected $token_equal = ' =/>'; |
||
| 1092 | protected $token_slash = " />\r\n\t"; |
||
| 1093 | protected $token_attr = ' >'; |
||
| 1094 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. |
||
| 1095 | public $_charset = ''; |
||
| 1096 | public $_target_charset = ''; |
||
| 1097 | protected $default_br_text = ""; |
||
| 1098 | public $default_span_text = ""; |
||
| 1099 | |||
| 1100 | // use isset instead of in_array, performance boost about 30%... |
||
| 1101 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); |
||
| 1102 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); |
||
| 1103 | // Known sourceforge issue #2977341 |
||
| 1104 | // B tags that are not closed cause us to return everything to the end of the document. |
||
| 1105 | protected $optional_closing_tags = array( |
||
| 1106 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), |
||
| 1107 | 'th'=>array('th'=>1), |
||
| 1108 | 'td'=>array('td'=>1), |
||
| 1109 | 'li'=>array('li'=>1), |
||
| 1110 | 'dt'=>array('dt'=>1, 'dd'=>1), |
||
| 1111 | 'dd'=>array('dd'=>1, 'dt'=>1), |
||
| 1112 | 'dl'=>array('dd'=>1, 'dt'=>1), |
||
| 1113 | 'p'=>array('p'=>1), |
||
| 1114 | 'nobr'=>array('nobr'=>1), |
||
| 1115 | 'b'=>array('b'=>1), |
||
| 1116 | 'option'=>array('option'=>1), |
||
| 1117 | ); |
||
| 1118 | |||
| 1119 | public function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) |
||
| 1133 | } |
||
| 1134 | |||
/**
 * Destructor: releases the node tree by delegating to clear().
 */
public function __destruct()
{
    // All teardown logic lives in clear(); nothing else is done here.
    $this->clear();
}
||
| 1139 | |||
| 1140 | // load html from string |
||
/**
 * Parses an HTML string into this document object.
 *
 * The string is first handed to prepare(), then a fixed sequence of
 * "noise" patterns is passed to remove_noise() before the tokenizer runs.
 *
 * @param string $str             HTML source to parse.
 * @param bool   $lowercase       Forwarded to prepare().
 * @param bool   $stripRN         Forwarded to prepare().
 * @param string $defaultBRText   Forwarded to prepare().
 * @param string $defaultSpanText Forwarded to prepare().
 *
 * @return $this The same instance, so calls can be chained.
 */
public function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    global $debug_object;

    // Reset state and store the document text.
    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // Each entry is (pattern, remove_tag). Order matters: per sourceforge
    // tracker item 2949097, script removal precedes style removal.
    $noise_rules = array(
        // comments
        array("'<!--(.*?)-->'is", false),
        // cdata
        array("'<!\[CDATA\[(.*?)\]\]>'is", true),
        // <script> tags
        array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false),
        array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false),
        // <style> tags
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
        // preformatted tags
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
        // server side scripts
        array("'(<\?)(.*?)(\?>)'s", true),
        // smarty scripts
        array("'(\{\w)(.*?)(\})'s", true),
    );
    foreach ($noise_rules as $rule) {
        $this->remove_noise($rule[0], $rule[1]);
    }

    // Tokenize until parse() reports there is nothing left.
    while ($this->parse()) {
        // parse() does all the work; the loop body is intentionally empty.
    }

    // Close the root node and work out the document charset.
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    return $this;
}
||
| 1175 | |||
| 1176 | // load html from file |
||
/**
 * Loads HTML from a file or URL.
 *
 * All arguments are forwarded unchanged to file_get_contents().
 *
 * The read result is checked directly. The previous implementation looked at
 * error_get_last(), which also reports errors raised earlier in the request
 * and could therefore discard a successfully loaded document.
 *
 * @return false|null false when the contents could not be read (the document
 *                    is cleared in that case); null after a successful load.
 */
public function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // file_get_contents() signals failure with boolean false.
    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
||
| 1187 | |||
| 1188 | // set callback function |
||
| 1189 | public function set_callback($function_name) |
||
| 1192 | } |
||
| 1193 | |||
| 1194 | // remove callback function |
||
/**
 * Removes any registered callback by resetting the callback property to null.
 */
public function remove_callback()
{
    $this->callback = null;
}
||
| 1199 | |||
| 1200 | // save dom as string |
||
/**
 * Serializes the document and optionally writes it to disk.
 *
 * @param string $filepath Target path; when it is the empty string nothing is written.
 *
 * @return string The root node's inner text.
 */
public function save($filepath='')
{
    $markup = $this->root->innertext();

    // Nothing to persist: just hand back the serialized document.
    if ($filepath==='') {
        return $markup;
    }

    // Exclusive lock while writing.
    file_put_contents($filepath, $markup, LOCK_EX);
    return $markup;
}
||
| 1209 | |||
| 1210 | // find dom node by css selector |
||
| 1211 | // Paperg - allow us to specify that we want case insensitive testing of the value of the selector. |
||
/**
 * Finds nodes by CSS selector; a thin wrapper around the root node's find().
 *
 * @param string   $selector  Selector expression, passed through unchanged.
 * @param int|null $idx       Passed through unchanged to the root node.
 * @param bool     $lowercase Passed through unchanged to the root node.
 *
 * @return mixed Whatever the root node's find() returns.
 */
public function find($selector, $idx=null, $lowercase=false)
{
    $matches = $this->root->find($selector, $idx, $lowercase);
    return $matches;
}
||
| 1216 | |||
| 1217 | // clean up memory due to php5 circular references memory leak... |
||
/**
 * Tears down the document: clears every known node and drops the references
 * this object holds (parent, root, doc, noise).
 *
 * The original comments describe this as a workaround for circular-reference
 * memory leaks in PHP 5.
 */
public function clear()
{
    // Clear every node registered on this document.
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // Added per sourceforge item 2977248 as a fix for leaks that remained
    // even when clear() was used.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    // Clear and detach the current parent node, if any.
    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    // Clear and detach the root node, if any.
    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    // Drop the raw document text and the stored noise fragments.
    unset($this->doc);
    unset($this->noise);
}
||
| 1242 | |||
/**
 * Dumps the node tree by delegating to the root node's dump().
 *
 * @param bool $show_attr Passed through unchanged to the root node.
 */
public function dump($show_attr=true)
{
    $root_node = $this->root;
    $root_node->dump($show_attr);
}
||
| 1247 | |||
| 1248 | // prepare HTML data and init everything |
||
/**
 * Resets the parser state and stores the HTML text to be parsed.
 *
 * @param string $str             HTML source.
 * @param bool   $lowercase       Stored in $this->lowercase.
 * @param bool   $stripRN         When true, every "\r" and "\n" is replaced by a space.
 * @param string $defaultBRText   Stored in $this->default_br_text.
 * @param string $defaultSpanText Stored in $this->default_span_text.
 */
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Record the incoming length before the text is touched; original_size
    // keeps that value even if the text is rewritten below.
    $incoming_length = strlen($str);
    $this->size = $incoming_length;
    $this->original_size = $incoming_length;

    if ($stripRN) {
        // Replace carriage returns and line feeds with spaces, then
        // re-measure because the text has been rewritten.
        $str = str_replace(array("\r", "\n"), " ", $str);
        $this->size = strlen($str);
    }

    // Fresh tokenizer state.
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = [];
    $this->nodes = [];
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // Build the synthetic root node and make it the current parent.
    $root_node = new simple_html_dom_node($this);
    $this->root = $root_node;
    $this->root->tag = 'root';
    $this->root->_[HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    // Prime the current character when there is any text at all.
    if ($this->size>0) {
        $this->char = $this->doc[0];
    }
}
||
| 1284 | |||
| 1285 | // parse html content |
||
/**
 * Performs one parsing step.
 *
 * Text up to the next '<' becomes a text node. When there is no such text,
 * the step is handed to read_tag().
 *
 * @return bool true after a text node was added, otherwise read_tag()'s result.
 */
protected function parse()
{
    $text = $this->copy_until_char('<');

    // No leading text: the next thing is a tag (or the end of input).
    if ($text==='') {
        return $this->read_tag();
    }

    // Wrap the text in a node and attach it to the current parent.
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($text_node, false);

    return true;
}
||
| 1299 | |||
| 1300 | // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later. |
||
| 1301 | // NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec |
||
| 1302 | // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism. |
||
/**
 * Determines the character set of the parsed document and stores it in
 * $this->_charset.
 *
 * Sources are consulted in this order, stopping at the first non-empty result:
 *  1. get_last_retrieve_url_contents_content_type(), when such a function exists;
 *  2. the content attribute of a meta[http-equiv=Content-Type] element;
 *  3. mb_detect_encoding() over the document's plain text, falling back to UTF-8.
 * ISO-8859-1 / Latin1 / Latin-1 are finally replaced by CP1252.
 *
 * @return string The charset that was stored.
 */
protected function parse_charset()
{
    global $debug_object;

    $detected = null;

    // 1) Content-Type header supplied by an optional user-defined function.
    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $header = get_last_retrieve_url_contents_content_type();
        if (preg_match('/charset=(.+)/', $header, $matches)) {
            $detected = $matches[1];
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'header content-type found charset of: ' . $detected);
            }
        }
    }

    // 2) <meta http-equiv="Content-Type" content="..."> inside the document.
    if (empty($detected)) {
        $meta = $this->root->find('meta[http-equiv=Content-Type]', 0);
        if (!empty($meta)) {
            $content = $meta->content;
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'meta content-type tag found' . $content);
            }

            if (!empty($content)) {
                if (preg_match('/charset=(.+)/', $content, $matches)) {
                    $detected = $matches[1];
                } else {
                    // A meta tag without a charset: the original author's
                    // research says ISO-8859-1 is the typical case.
                    if (is_object($debug_object)) {
                        $debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');
                    }
                    $detected = 'ISO-8859-1';
                }
            }
        }
    }

    // 3) Let mbstring guess from the text itself.
    if (empty($detected)) {
        $candidates = array( "UTF-8", "CP1252" );
        $detected = mb_detect_encoding($this->root->plaintext . "ascii", $candidates);
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, 'mb_detect found: ' . $detected);
        }

        // Detection failed: assume UTF-8 so processing can continue.
        if ($detected === false) {
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');
            }
            $detected = 'UTF-8';
        }
    }

    // CP1252 is a superset of these, so prefer it.
    $normalized = strtolower($detected);
    if ($normalized == 'iso-8859-1' || $normalized == 'latin1' || $normalized == 'latin-1') {
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, 'replacing ' . $detected . ' with CP1252 as its a superset');
        }
        $detected = 'CP1252';
    }

    if (is_object($debug_object)) {
        $debug_object->debugLog(1, 'EXIT - ' . $detected);
    }

    $this->_charset = $detected;
    return $detected;
}
||
| 1374 | |||
| 1375 | // read tag info |
||
/**
 * Reads one tag starting at the current position and updates the node tree.
 *
 * Handles, in this order: end of input, closing tags, "<!...>" constructs
 * (comments and other declarations), stray '<' characters that turn out to be
 * text, and finally opening tags together with their attributes.
 *
 * Changes compared with the earlier implementation:
 *  - the tag-name pattern escapes the hyphen in its character class
 *    ("[\w\-:]"); the unescaped form "[\w-:]" is rejected as an invalid range
 *    by the PCRE2 library used by PHP 7.3 and later;
 *  - the stray-'<' check no longer assigns to an unused variable through an
 *    operator-precedence accident.
 *
 * @return bool false when the current character is not '<' (the root node is
 *              closed in that case); otherwise true or the result of
 *              as_text_node().
 */
protected function read_tag()
{
    if ($this->char!=='<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/') {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
        // $this->skip($this->token_blank_t);
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower!==$tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Walk up looking for the element this end tag closes.
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Walk up looking for the element this end tag closes.
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else {
                return $this->as_text_node($tag);
            }
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char==='>') {
            $node->_[HDOM_INFO_TEXT].='>';
        }
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text
    // Another '<' inside the would-be tag name: treat what was read as text.
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Not a valid tag name. The hyphen is escaped so the class is valid under PCRE2.
    if (!preg_match('/^[\w\-:]+$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') {
            $node->_[HDOM_INFO_TEXT].='>';
        }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinite loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        if ($this->char!==null && $space[0]==='') {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard===$this->pos) {
            // No progress since the previous iteration: step over one character.
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = [];
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) {
                $name = strtolower($name);
            }
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            } else {
                // no value attr: nowrap, checked, selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') {
                    $this->char = $this->doc[--$this->pos]; // prev
                }
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        } else {
            break;
        }
    } while ($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    } else {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // Since a br tag never has sub nodes, this works well.
    if ($node->tag == "br") {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
||
| 1597 | |||
| 1598 | // parse attributes |
||
/**
 * Reads one attribute value at the current position and stores it on $node.
 *
 * @param object $node  Node receiving the attribute (its attr and _ arrays are written).
 * @param string $name  Attribute name.
 * @param array  $space Passed by reference; index 2 receives the blanks skipped before the value.
 */
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge tracker item 3061408: when an attribute is repeated
    // inside a tag, only the first occurrence is kept.
    if (isset($node->attr[$name])) {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    $opening = $this->char;
    if ($opening === '"' || $opening === '\'') {
        // Quoted value: remember the quote style, step past the opening
        // quote, read up to the matching quote, then step past it.
        $node->_[HDOM_INFO_QUOTE][] = ($opening === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape($opening));
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    } else {
        // Unquoted value: read up to one of the token_attr characters.
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: attributes should not contain \r or \n; that counts as html whitespace.
    $node->attr[$name] = str_replace(array("\r", "\n"), "", $node->attr[$name]);

    // PaperG: for "class", drop leading and trailing spaces, since some
    // people leave them in the multi-class case.
    if ($name == "class") {
        $node->attr[$name] = trim($node->attr[$name]);
    }
}
||
| 1633 | |||
| 1634 | // link node's parent |
||
/**
 * Attaches $node to the current parent node.
 *
 * @param object $node     Node to attach (taken by reference).
 * @param bool   $is_child When true the node is also appended to the parent's children list.
 */
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }
    $owner->children[] = $node;
}
||
| 1643 | |||
| 1644 | // as a text node |
||
| 1645 | protected function as_text_node($tag) |
||
| 1653 | } |
||
| 1654 | |||
/**
 * Advances the position past any run of characters listed in $chars and
 * refreshes the current character (null once past the end of the document).
 *
 * @param string $chars Characters to skip over.
 */
protected function skip($chars)
{
    $run_length = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run_length;

    if ($this->pos<$this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
||
| 1660 | |||
/**
 * Like skip(), but also returns the characters that were skipped.
 *
 * @param string $chars Characters to skip over.
 *
 * @return string The skipped run, or '' when nothing matched.
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run_length = strspn($this->doc, $chars, $start);

    $this->pos += $run_length;
    if ($this->pos<$this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return ($run_length===0) ? '' : substr($this->doc, $start, $run_length);
}
||
| 1672 | |||
/**
 * Copies characters from the current position up to (not including) the first
 * character found in $chars, advancing the position accordingly.
 *
 * @param string $chars Set of stop characters.
 *
 * @return string The copied span, as returned by substr().
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $span = strcspn($this->doc, $chars, $start);

    $this->pos += $span;
    if ($this->pos<$this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $span);
}
||
| 1681 | |||
/**
 * Copies text from the current position up to the next occurrence of $char.
 *
 * @param string $char Stop character.
 *
 * @return string '' when already at the end of input or when $char is at the
 *                current position; the remainder of the document when $char
 *                does not occur again (the position then moves to the end);
 *                otherwise the text before $char.
 */
protected function copy_until_char($char)
{
    // Already past the end of the document.
    if ($this->char===null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    // No further occurrence: consume everything that is left.
    if ($hit===false) {
        $rest = substr($this->doc, $from, $this->size-$from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    // The stop character is right here: nothing to copy.
    if ($hit===$from) {
        return '';
    }

    // Move onto the stop character and return what precedes it.
    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit-$from);
}
||
| 1703 | |||
/**
 * Same as copy_until_char(), except that an occurrence of $char directly
 * preceded by a backslash is skipped over rather than treated as the stop.
 *
 * @param string $char Stop character.
 *
 * @return string '' when already at the end of input or when $char is at the
 *                current position; the remainder of the document when no
 *                unescaped $char follows; otherwise the text before it.
 */
protected function copy_until_char_escape($char)
{
    if ($this->char===null) {
        return '';
    }

    $search_from = $this->pos;
    for (;;) {
        $hit = strpos($this->doc, $char, $search_from);

        // Nothing left to find: consume the rest of the document.
        if ($hit===false) {
            $rest = substr($this->doc, $this->pos, $this->size-$this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        // The stop character sits at the current position.
        if ($hit===$this->pos) {
            return '';
        }

        // Found an unescaped stop character: leave the search loop.
        if ($this->doc[$hit-1]!=='\\') {
            break;
        }

        // Escaped occurrence: keep looking after it.
        $search_from = $hit+1;
    }

    $from = $this->pos;
    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit-$from);
}
||
| 1734 | |||
| 1735 | // remove noise from html content |
||
| 1736 | // save the noise in the $this->noise array. |
||
/**
 * Replaces every match of $pattern in the document with a placeholder key and
 * stores the replaced text in $this->noise under that key.
 *
 * @param string $pattern    PCRE pattern.
 * @param bool   $remove_tag When true the whole match (group 0) is replaced;
 *                           otherwise only capture group 1.
 */
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    $total = preg_match_all($pattern, $this->doc, $found, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);

    // Which capture group gets swapped out.
    $group = ($remove_tag) ? 0 : 1;

    // Go from the last match to the first so earlier offsets stay valid
    // while the document text is being rewritten.
    for ($i=$total-1; $i>=0; $i--) {
        $placeholder = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, 'key is: ' . $placeholder);
        }

        $captured = $found[$i][$group][0];
        $offset = $found[$i][$group][1];

        $this->noise[$placeholder] = $captured;
        $this->doc = substr_replace($this->doc, $placeholder, $offset, strlen($captured));
    }

    // The text changed, so re-measure and re-prime the current character.
    $this->size = strlen($this->doc);
    if ($this->size>0) {
        $this->char = $this->doc[0];
    }
}
||
| 1762 | |||
| 1763 | // restore noise to html content |
||
| 1764 | public function restore_noise($text) |
||
| 1791 | } |
||
| 1792 | |||
| 1793 | // Sometimes we NEED one of the noise elements. |
||
/**
 * Looks through the stored noise fragments for one that contains $text.
 *
 * @param string $text Text to look for.
 *
 * @return string|null The first stored fragment containing $text; null when none does.
 */
public function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    foreach ($this->noise as $fragment) {
        // Skip fragments that do not contain the text.
        if (strpos($fragment, $text)===false) {
            continue;
        }
        return $fragment;
    }
}
||
/**
 * String conversion: returns the root node's inner text.
 *
 * @return string
 */
public function __toString()
{
    $markup = $this->root->innertext();
    return $markup;
}
||
| 1811 | |||
/**
 * Magic accessor for a handful of virtual properties.
 *
 * @param string $name One of 'outertext', 'innertext', 'plaintext',
 *                     'charset' or 'target_charset'.
 *
 * @return mixed The corresponding value; null for any other name.
 */
public function __get($name)
{
    // 'outertext' and 'innertext' both map to the root node's inner text.
    if ($name === 'outertext' || $name === 'innertext') {
        return $this->root->innertext();
    }
    if ($name === 'plaintext') {
        return $this->root->text();
    }
    if ($name === 'charset') {
        return $this->_charset;
    }
    if ($name === 'target_charset') {
        return $this->_target_charset;
    }
}
||
| 1827 | |||
| 1828 | // camel naming conventions |
||
/**
 * Camel-case alias that forwards to the root node's childNodes().
 *
 * @param int $idx Passed through unchanged.
 *
 * @return mixed Whatever the root node returns.
 */
public function childNodes($idx=-1)
{
    $result = $this->root->childNodes($idx);
    return $result;
}
||
/**
 * Camel-case alias that forwards to the root node's first_child().
 *
 * @return mixed Whatever the root node returns.
 */
public function firstChild()
{
    $first = $this->root->first_child();
    return $first;
}
||
| 1837 | public function lastChild() |
||
| 1840 | } |
||
/**
 * Builds an element by parsing "<name>value</name>" with str_get_html() and
 * returning the first child of the resulting document.
 *
 * @param string      $name  Tag name, interpolated into the markup as-is.
 * @param string|null $value Content, interpolated into the markup as-is.
 *
 * @return mixed The first child of the parsed fragment.
 */
public function createElement($name, $value=null)
{
    $markup = "<$name>$value</$name>";

    // Errors raised while parsing or fetching the child are suppressed.
    return @str_get_html($markup)->first_child();
}
||
/**
 * Parses $value with str_get_html() and returns the last entry of the
 * resulting document's nodes list.
 *
 * @param string $value Text to turn into a node.
 *
 * @return mixed The last node of the parsed fragment.
 */
public function createTextNode($value)
{
    // Errors are suppressed both while parsing and while reading the list.
    $fragment = @str_get_html($value);
    return @end($fragment->nodes);
}
||
/**
 * Finds the first element matching the selector "#<id>".
 *
 * @param string $id Element id, interpolated into the selector as-is.
 *
 * @return mixed The result of find() with index 0.
 */
public function getElementById($id)
{
    $selector = "#$id";
    return $this->find($selector, 0);
}
||
/**
 * Finds elements matching the selector "#<id>".
 *
 * @param string   $id  Element id, interpolated into the selector as-is.
 * @param int|null $idx Passed through unchanged to find().
 *
 * @return mixed The result of find().
 */
public function getElementsById($id, $idx=null)
{
    $selector = "#$id";
    return $this->find($selector, $idx);
}
||
/**
 * Finds the first element matching the given tag name.
 *
 * @param string $name Tag name, used directly as the selector.
 *
 * @return mixed The result of find() with index 0.
 */
public function getElementByTagName($name)
{
    $first_match = $this->find($name, 0);
    return $first_match;
}
||
| 1861 | public function getElementsByTagName($name, $idx=-1) |
||
| 1864 | } |
||
/**
 * Camel-case alias of load_file().
 *
 * The received arguments are spread onto load_file() one by one. The previous
 * implementation passed the whole argument array as a single parameter, so
 * the underlying file_get_contents() call received an array as the filename.
 *
 * @return mixed Whatever load_file() returns.
 */
public function loadFile()
{
    $args = func_get_args();
    return call_user_func_array(array($this, 'load_file'), $args);
}
||
| 1870 | } |
||
| 1871 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.