| @@ 395-446 (lines=52) @@ | ||
| 392 | * |
|
| 393 | * @return mixed |
|
| 394 | */ |
|
public function html2text($document)
{
    // Converts an HTML fragment to plain text: removes <script> sections,
    // <img> tags and all remaining tags, collapses whitespace that follows a
    // line break, and decodes a handful of common HTML entities.
    // Based on the preg_replace example from the PHP manual (credits: newbb2).
    //
    // $document is the HTML to convert.
    // Returns the converted text, or null if PCRE reports an error.
    //
    // The patterns are applied in order, so tags are stripped before entities
    // are decoded; text such as "&lt;b&gt;" therefore survives as "<b>".
    $search = array(
        "'<script[^>]*?>.*?</script>'si", // Strip out javascript
        "'<img.*?/>'si", // Strip out img tags
        "'<[\/\!]*?[^<>]*?>'si", // Strip out HTML tags
        "'([\r\n])[\s]+'", // Strip out white space
        "'&(quot|#34);'i", // Replace HTML entities
        "'&(amp|#38);'i",
        "'&(lt|#60);'i",
        "'&(gt|#62);'i",
        "'&(nbsp|#160);'i",
        "'&(iexcl|#161);'i",
        "'&(cent|#162);'i",
        "'&(pound|#163);'i",
        "'&(copy|#169);'i"
    );

    // Replacements, index-aligned with $search. chr() yields single-byte
    // characters here, matching the numeric-entity handling below.
    $replace = array(
        '',
        '',
        '',
        "\\1",
        "\"",
        '&',
        '<',
        '>',
        ' ',
        chr(161),
        chr(162),
        chr(163),
        chr(169),
    );

    $text = preg_replace($search, $replace, $document);
    if ($text === null) {
        // PCRE failure: keep returning null as before instead of feeding
        // null into the next step.
        return null;
    }

    // Decode the remaining numeric entities. This must run on $text (the
    // already stripped output) and its result must be kept; previously the
    // call ran on $document and its return value was thrown away.
    $decoded = preg_replace_callback(
        '/&#(\d+);/',
        function ($matches) {
            $code = (int) $matches[1];

            // chr() only covers 0..255 and wraps larger values modulo 256,
            // so entities outside that range are left untouched.
            return $code <= 255 ? chr($code) : $matches[0];
        },
        $text
    );

    return $decoded === null ? $text : $decoded;
}
|
| 447 | } |
|
| 448 | ||
| @@ 105-157 (lines=53) @@ | ||
| 102 | * @param string $document |
|
| 103 | * @return mixed |
|
| 104 | */ |
|
function publisherHtml2text($document)
{
    // Converts an HTML fragment to plain text: removes <script> sections,
    // <img> tags and all remaining tags, collapses whitespace that follows a
    // line break, and decodes a handful of common HTML entities.
    // Based on the preg_replace example from the PHP manual (credits: newbb2).
    //
    // Returns the converted text, or null if PCRE reports an error.
    //
    // The patterns are applied in order, so tags are stripped before entities
    // are decoded; text such as "&lt;b&gt;" therefore survives as "<b>".
    $search = array(
        "'<script[^>]*?>.*?</script>'si", // Strip out javascript
        "'<img.*?/>'si", // Strip out img tags
        "'<[\/\!]*?[^<>]*?>'si", // Strip out HTML tags
        "'([\r\n])[\s]+'", // Strip out white space
        "'&(quot|#34);'i", // Replace HTML entities
        "'&(amp|#38);'i",
        "'&(lt|#60);'i",
        "'&(gt|#62);'i",
        "'&(nbsp|#160);'i",
        "'&(iexcl|#161);'i",
        "'&(cent|#162);'i",
        "'&(pound|#163);'i",
        "'&(copy|#169);'i"
    );

    // Replacements, index-aligned with $search. chr() yields single-byte
    // characters here, matching the numeric-entity handling below.
    $replace = array(
        '',
        '',
        '',
        "\\1",
        "\"",
        '&',
        '<',
        '>',
        ' ',
        chr(161),
        chr(162),
        chr(163),
        chr(169),
    );

    $text = preg_replace($search, $replace, $document);
    if ($text === null) {
        // PCRE failure: keep returning null as before instead of feeding
        // null into the next step.
        return null;
    }

    // Decode the remaining numeric entities. This must run on $text (the
    // already stripped output) and its result must be kept; previously the
    // call ran on $document and its return value was thrown away.
    $decoded = preg_replace_callback(
        '/&#(\d+);/',
        function ($matches) {
            $code = (int) $matches[1];

            // chr() only covers 0..255 and wraps larger values modulo 256,
            // so entities outside that range are left untouched.
            return $code <= 255 ? chr($code) : $matches[0];
        },
        $text
    );

    return $decoded === null ? $text : $decoded;
}
|
| 158 | ||
| 159 | /** |
|
| 160 | * @return array |
|