Html2Text::_convert_blockquotes()   B
last analyzed

Complexity

Conditions 9
Paths 8

Size

Total Lines 49

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 9
nc 8
nop 1
dl 0
loc 49
rs 7.5571
c 0
b 0
f 0
1
<?php
2
/*************************************************************************
3
 *                                                                       *
4
 * Converts HTML to formatted plain text                                 *
5
 *                                                                       *
6
 * Portions Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>    *
7
 *                                                                       *
8
 * This script is free software; you can redistribute it and/or modify   *
9
 * it under the terms of the GNU General Public License as published by  *
10
 * the Free Software Foundation; either version 2 of the License, or     *
11
 * (at your option) any later version.                                   *
12
 *                                                                       *
13
 * The GNU General Public License can be found at                        *
14
 * http://www.gnu.org/copyleft/gpl.html.                                 *
15
 *                                                                       *
16
 * This script is distributed in the hope that it will be useful,        *
17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
18
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
19
 * GNU General Public License for more details.                          *
20
 *                                                                       *
21
 *************************************************************************/
22
23
/**
24
 * Converts HTML to formatted plain text
25
 */
26
class Html2Text
27
{
28
    /**
29
     * Contains the HTML content to convert.
30
     *
31
     * @type string
32
     */
33
    protected $html;
34
35
    /**
36
     * Contains the converted, formatted text.
37
     *
38
     * @type string
39
     */
40
    protected $text;
41
42
    /**
43
     * Maximum width of the formatted text, in columns.
44
     *
45
     * Set this value to 0 (or less) to ignore word wrapping
46
     * and not constrain text to a fixed-width column.
47
     *
48
     * @type int
49
     */
50
    protected $width = 70;
51
52
    /**
53
     * List of preg* regular expression patterns to search for,
54
     * used in conjunction with $replace.
55
     *
56
     * @type array
57
     * @see $replace
58
     */
59
    protected $search = [
60
        "/\r/",                                  // Non-legal carriage return
61
        "/[\n\t]+/",                             // Newlines and tabs
62
        '/<head[^>]*>.*?<\/head>/i',             // <head>
63
        '/<script[^>]*>.*?<\/script>/i',         // <script>s -- which strip_tags supposedly has problems with
64
        '/<style[^>]*>.*?<\/style>/i',           // <style>s -- which strip_tags supposedly has problems with
65
        '/<p[^>]*>/i',                           // <P>
66
        '/<br[^>]*>/i',                          // <br>
67
        '/<i[^>]*>(.*?)<\/i>/i',                 // <i>
68
        '/<em[^>]*>(.*?)<\/em>/i',               // <em>
69
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
70
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
71
        '/(<dl[^>]*>|<\/dl>)/i',                 // <dl> and </dl>
72
        '/<li[^>]*>(.*?)<\/li>/i',               // <li> and </li>
73
        '/<dd[^>]*>(.*?)<\/dd>/i',               // <dd> and </dd>
74
        '/<dt[^>]*>(.*?)<\/dt>/i',               // <dt> and </dt>
75
        '/<li[^>]*>/i',                          // <li>
76
        '/<hr[^>]*>/i',                          // <hr>
77
        '/<div[^>]*>/i',                         // <div>
78
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
79
        '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
80
        '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
81
        '/<span class="_html2text_ignore">.+?<\/span>/i',  // <span class="_html2text_ignore">...</span>
82
    ];
83
84
    /**
85
     * List of pattern replacements corresponding to patterns searched.
86
     *
87
     * @type array
88
     * @see $search
89
     */
90
    protected $replace = [
91
        '',                                     // Non-legal carriage return
92
        ' ',                                    // Newlines and tabs
93
        '',                                     // <head>
94
        '',                                     // <script>s -- which strip_tags supposedly has problems with
95
        '',                                     // <style>s -- which strip_tags supposedly has problems with
96
        "\n\n",                                 // <P>
97
        "\n",                                   // <br>
98
        '_\\1_',                                // <i>
99
        '_\\1_',                                // <em>
100
        "\n\n",                                 // <ul> and </ul>
101
        "\n\n",                                 // <ol> and </ol>
102
        "\n\n",                                 // <dl> and </dl>
103
        "\t* \\1\n",                            // <li> and </li>
104
        " \\1\n",                               // <dd> and </dd>
105
        "\t* \\1",                              // <dt> and </dt>
106
        "\n\t* ",                               // <li>
107
        "\n-------------------------\n",        // <hr>
108
        "<div>\n",                              // <div>
109
        "\n\n",                                 // <table> and </table>
110
        "\n",                                   // <tr> and </tr>
111
        "\t\t\\1\n",                            // <td> and </td>
112
        '',                                      // <span class="_html2text_ignore">...</span>
113
    ];
114
115
    /**
     * List of preg* regular expression patterns matching HTML entities
     * (plus, as the last entry, runs of spaces), used in conjunction
     * with $ent_replace.
     *
     * Entries are positional: pattern N is replaced by $ent_replace[N],
     * so both arrays must keep the same length and order.
     *
     * @type array
     * @see $ent_replace
     */
    protected $ent_search = [
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
        // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes
        '/&gt;/i',                               // Greater-than
        '/&lt;/i',                               // Less-than
        '/&(copy|#169);/i',                      // Copyright
        '/&(trade|#8482|#153);/i',               // Trademark
        '/&(reg|#174);/i',                       // Registered
        '/&(mdash|#151|#8212);/i',               // mdash
        '/&(ndash|minus|#8211|#8722);/i',        // ndash
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/&(amp|#38);/i',                        // Ampersand: see _converter()
        '/[ ]{2,}/',                             // Runs of spaces, post-handling
    ];
140
141
    /**
142
     * List of pattern replacements corresponding to patterns searched.
143
     *
144
     * @type array
145
     * @see $ent_search
146
     */
147
    protected $ent_replace = [
148
        ' ',                                    // Non-breaking space
149
        '"',                                    // Double quotes
150
        "'",                                    // Single quotes
151
        '>',
152
        '<',
153
        '(c)',
154
        '(tm)',
155
        '(R)',
156
        '--',
157
        '-',
158
        '*',
159
        '£',
160
        'EUR',                                  // Euro sign. € ?
161
        '|+|amp|+|',                            // Ampersand: see _converter()
162
        ' ',                                    // Runs of spaces, post-handling
163
    ];
164
165
    /**
166
     * List of preg* regular expression patterns to search for
167
     * and replace using callback function.
168
     *
169
     * @type array
170
     */
171
    protected $callback_search = [
172
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i', // <a href="">
173
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
174
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
175
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
176
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
177
    ];
178
179
    /**
180
     * List of preg* regular expression patterns to search for in PRE body,
181
     * used in conjunction with $pre_replace.
182
     *
183
     * @type array
184
     * @see $pre_replace
185
     */
186
    protected $pre_search = [
187
        "/\n/",
188
        "/\t/",
189
        '/ /',
190
        '/<pre[^>]*>/',
191
        '/<\/pre>/',
192
    ];
193
194
    /**
195
     * List of pattern replacements corresponding to patterns searched for PRE body.
196
     *
197
     * @type array
198
     * @see $pre_search
199
     */
200
    protected $pre_replace = [
201
        '<br>',
202
        '&nbsp;&nbsp;&nbsp;&nbsp;',
203
        '&nbsp;',
204
        '',
205
        '',
206
    ];
207
208
    /**
209
     * Temporary workspace used during PRE processing.
210
     *
211
     * @type string
212
     */
213
    protected $pre_content = '';
214
215
    /**
216
     * Contains a list of HTML tags to allow in the resulting text.
217
     *
218
     * @type string
219
     * @see set_allowed_tags()
220
     */
221
    protected $allowed_tags = '';
222
223
    /**
224
     * Contains the base URL that relative links should resolve to.
225
     *
226
     * @type string
227
     */
228
    protected $url;
229
230
    /**
231
     * Indicates whether content in the $html variable has been converted yet.
232
     *
233
     * @type bool
234
     * @see $html, $text
235
     */
236
    protected $_converted = false;
237
238
    /**
239
     * Contains URL addresses from links to be rendered in plain text.
240
     *
241
     * @type array
242
     * @see _build_link_list()
243
     */
244
    protected $_link_list = [];
245
246
    /**
247
     * Various configuration options (able to be set in the constructor)
248
     *
249
     * @type array
250
     */
251
    protected $_options = [
252
        // 'none'
253
        // 'inline' (show links inline)
254
        // 'nextline' (show links on the next line)
255
        // 'table' (if a table of link URLs should be listed after the text.
256
        'do_links' => 'inline',
257
        //  Maximum width of the formatted text, in columns.
258
        //  Set this value to 0 (or less) to ignore word wrapping
259
        //  and not constrain text to a fixed-width column.
260
        'width'    => 70,
261
    ];
262
263
    /**
     * Constructor.
     *
     * Merges the given options over the defaults, optionally loads the
     * HTML source, and initialises the base URL with its default value.
     * When a source is supplied, calling get_text() is all that is left
     * to do.
     *
     * @param string $source    HTML content (or a file name, see $from_file)
     * @param bool   $from_file Indicates $source is a file to pull content from
     * @param array  $options   Configuration options overriding the defaults
     */
    public function __construct($source = '', $from_file = false, $options = [])
    {
        // Caller-supplied keys take precedence over the built-in defaults
        $merged         = array_merge($this->_options, $options);
        $this->_options = $merged;

        $hasSource = !empty($source);
        if ($hasSource) {
            $this->set_html($source, $from_file);
        }

        // No argument: let set_base_url() pick its default
        $this->set_base_url();
    }
284
285
    /**
     * Loads source HTML into memory, either from $source string or a file.
     *
     * The file is only read when $from_file is set AND the file exists;
     * otherwise $source itself is stored as the HTML. Loading new content
     * always marks the instance as not yet converted.
     *
     * @param string $source    HTML content, or a file name when $from_file is true
     * @param bool   $from_file Indicates $source is a file to pull content from
     */
    public function set_html($source, $from_file = false)
    {
        $readFromDisk = $from_file && file_exists($source);

        $this->html       = $readFromDisk ? file_get_contents($source) : $source;
        $this->_converted = false;
    }
301
302
    /**
     * Returns the text, converted from HTML.
     *
     * The conversion runs lazily: it is performed on the first call after
     * new HTML was loaded and the stored result is returned afterwards.
     *
     * @return string
     */
    public function get_text()
    {
        if ($this->_converted) {
            return $this->text;
        }

        $this->_convert();

        return $this->text;
    }
315
316
    /**
     * Writes the converted text (see get_text()) to the output.
     */
    public function print_text()
    {
        $converted = $this->get_text();

        echo $converted;
    }
323
324
    /**
     * Short alias of print_text(): writes the converted text to the output.
     *
     * @see print_text()
     */
    public function p()
    {
        $converted = $this->get_text();

        echo $converted;
    }
333
334
    /**
     * Sets the allowed HTML tags to pass through to the resulting text.
     *
     * Tags should be in the form "<p>", with no corresponding closing tag.
     * An empty argument is ignored, i.e. the current setting is kept.
     *
     * @param string $allowed_tags
     */
    public function set_allowed_tags($allowed_tags = '')
    {
        if (empty($allowed_tags)) {
            return;
        }

        $this->allowed_tags = $allowed_tags;
    }
347
348
    /**
     * Sets a base URL to handle relative links.
     *
     * With an explicit $url, all trailing slashes are stripped (relative
     * links may already start with a slash like "/file.html"). With an
     * empty $url the base falls back to "http://" plus the HTTP_HOST server
     * variable when that is available, or to an empty string otherwise.
     *
     * @param string $url Base URL; empty to derive it from the request
     */
    public function set_base_url($url = '')
    {
        if (!empty($url)) {
            // rtrim() removes every trailing slash, not just the last one,
            // so "http://host//" cannot produce "http://host//file.html"
            $this->url = rtrim($url, '/');

            return;
        }

        // NOTE(review): the scheme is hard-coded to http even when the
        // current request is served over https - confirm this is intended.
        if (\Xmf\Request::hasVar('HTTP_HOST', 'SERVER')) {
            $this->url = 'http://' . $_SERVER['HTTP_HOST'];
        } else {
            $this->url = '';
        }
    }
370
371
    /**
     * Drives a full conversion: runs _converter() on the loaded HTML,
     * appends the collected link table (if any) and stores the result
     * in $text, marking the instance as converted.
     */
    protected function _convert()
    {
        // Start with an empty link table; _build_link_list() fills it
        $this->_link_list = [];

        // $html is null until set_html() is called (and may be false after a
        // failed file read); cast so stripslashes() always receives a string.
        // NOTE(review): stripslashes() also removes legitimate backslashes
        // from the content - kept because callers may rely on it.
        $text = trim(stripslashes((string)$this->html));

        // Convert HTML to TXT
        $this->_converter($text);

        // Add link list
        if (!empty($this->_link_list)) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->_link_list as $idx => $url) {
                $text .= '[' . ($idx + 1) . '] ' . $url . "\n";
            }
        }

        $this->text       = $text;
        $this->_converted = true;
    }
396
397
    /**
     * Converts an HTML fragment to plain text, in place.
     *
     * Steps, in this order: BLOCKQUOTE and PRE handling, the $search /
     * $replace tag table, the callback-driven tag patterns, strip_tags()
     * for everything else, entity replacement and decoding, removal of
     * leftover entities, blank-line clean-up, and finally word wrapping
     * to $this->_options['width'] columns (skipped when that is 0 or less).
     *
     * @param string $text Reference to HTML content string
     */
    protected function _converter(&$text)
    {
        // Block-level containers first; BLOCKQUOTE has to precede PRE
        $this->_convert_blockquotes($text);
        $this->_convert_pre($text);

        // Static tag table, then the patterns that need a callback
        $text = preg_replace($this->search, $this->replace, $text);
        $text = preg_replace_callback($this->callback_search, [$this, '_preg_callback'], $text);

        // Whatever markup is left is dropped, except the allowed tags
        $text = strip_tags($text, $this->allowed_tags);

        // Entities: our own table, then the known HTML entities
        $text = preg_replace($this->ent_search, $this->ent_replace, $text);
        $text = html_entity_decode($text, ENT_QUOTES);

        // Unknown/unhandled entities are removed (cannot be part of the table above)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // The ampersand placeholder is resolved only now, after unknown
        // entities are gone, so that input like "&amp;quot;" survives
        $text = str_replace('|+|amp|+|', '&', $text);

        // At most two consecutive line breaks, and none at the very start
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
        $text = ltrim($text, "\n");

        // A width of 0 or less disables wrapping
        $columns = $this->_options['width'];
        if ($columns > 0) {
            $text = wordwrap($text, $columns);
        }
    }
451
452
    /**
     * Helper function called on link replacement.
     *
     * Renders a link according to the link method ('none', 'inline',
     * 'nextline' or 'table'). For 'table' the URL is recorded in the
     * internal link list and the display text gets a numeric reference.
     * Links starting with "javascript:", "mailto:" or "#" are left as
     * plain display text. Links without a scheme are resolved against
     * the base URL.
     *
     * @param string      $link          URL of the link
     * @param string      $display       Part of the text to associate number with
     * @param string|null $link_override Link method overriding the 'do_links' option
     *
     * @return string
     */
    protected function _build_link_list($link, $display, $link_override = null)
    {
        $mode = $link_override ?: $this->_options['do_links'];

        // Links switched off, or a link type that is never listed
        if ('none' === $mode || preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Has a scheme: already absolute
            $target = $link;
        } else {
            // Relative: prefix the base URL, adding the separating slash if missing
            $separator = ('/' === mb_substr($link, 0, 1)) ? '' : '/';
            $target    = $this->url . $separator . (string)$link;
        }

        if ('table' === $mode) {
            $position = array_search($target, $this->_link_list, true);
            if (false === $position) {
                $position           = count($this->_link_list);
                $this->_link_list[] = $target;
            }

            return $display . ' [' . ($position + 1) . ']';
        }

        if ('nextline' === $mode) {
            return $display . "\n[" . $target . ']';
        }

        // Any other method is rendered inline
        return $display . ' [' . $target . ']';
    }
501
502
    /**
     * Helper function for PRE body conversion.
     *
     * Each PRE element, one per loop pass, has its body run through the
     * callback tag patterns and the $pre_search / $pre_replace table and
     * is then substituted by a "<div><br>...<br></div>" wrapper.
     *
     * @param string $text HTML content
     */
    protected function _convert_pre(&$text)
    {
        // One PRE element per iteration, first match first
        while (1 === preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $found)) {
            // Callback-driven tag patterns on the PRE body
            $inner = preg_replace_callback($this->callback_search, [$this, '_preg_callback'], $found[1]);

            // Line breaks, tabs and spaces become markup that survives later steps
            $inner = preg_replace($this->pre_search, $this->pre_replace, $inner);

            // Hand the replacement over through the workspace property: a
            // callback is used because the content can contain "$0"-like sequences
            $this->pre_content = sprintf('<div><br>%s<br></div>', $inner);
            $text              = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU', [$this, '_preg_pre_callback'], $text, 1);

            // free memory
            $this->pre_content = '';
        }
    }
526
527
    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Every outermost BLOCKQUOTE element is replaced by a PRE block holding
     * its converted content, each line prefixed with the "> " citation
     * marker. Nested blockquotes are handled by the recursive _converter()
     * call on the content. The content is converted with a width reduced
     * by 2 columns to leave room for the marker.
     *
     * All offsets are handled as BYTE offsets (substr/strlen), because that
     * is what preg_match_all() reports with PREG_OFFSET_CAPTURE; mixing them
     * with mb_* character offsets cuts the text at the wrong place as soon
     * as multibyte characters are present. The result is assembled with a
     * cursor over the unmodified source instead of patching $text in place,
     * so any number of blockquotes keeps correct offsets.
     *
     * @param string $text HTML content
     */
    protected function _convert_blockquotes(&$text)
    {
        if (!preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            return;
        }

        $source = $text; // the reported offsets refer to this unmodified copy
        $output = '';    // converted text built so far
        $cursor = 0;     // byte offset in $source already copied to $output
        $start  = 0;     // byte offset of the current outermost opening tag
        $taglen = 0;     // byte length of that opening tag
        $level  = 0;     // current nesting depth

        foreach ($matches[0] as $m) {
            list($tag, $offset) = $m;

            // Every match starts with "<"; a following "/" marks a closing tag
            if ('/' !== $tag[1]) {
                if (0 === $level) {
                    $start  = $offset;
                    $taglen = strlen($tag);
                }
                $level++;
                continue;
            }

            $level--;
            if ($level < 0) {
                $level = 0; // malformed HTML: go to next blockquote
                continue;
            }
            if ($level > 0) {
                continue; // inner blockquote: handled by the recursive conversion
            }

            // Get blockquote content
            $body = substr($source, $start + $taglen, $offset - $start - $taglen);

            // Set text width, leaving room for the "> " marker
            $p_width = $this->_options['width'];
            if ($p_width > 0) {
                $this->_options['width'] = $p_width - 2;
            }

            // Convert blockquote content
            $body = trim($body);
            $this->_converter($body);

            // Add citation markers and create PRE block
            $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
            $body = '<pre>' . htmlspecialchars($body, ENT_QUOTES | ENT_HTML5) . '</pre>';

            // Re-set text width
            $this->_options['width'] = $p_width;

            // Copy the untouched text before the blockquote, then the replacement
            $output .= substr($source, $cursor, $start - $cursor) . $body;
            $cursor = $offset + strlen($tag);
        }

        // Everything after the last converted blockquote is kept as is
        $text = $output . substr($source, $cursor);
    }
581
582
    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured as group 1 by the patterns in
     * $callback_search: bold text, table headers and headings are
     * upper-cased; anchors are rendered through _build_link_list().
     * Any other tag name yields an empty string.
     *
     * @param array $matches PREG matches
     *
     * @return string
     */
    protected function _preg_callback($matches)
    {
        $tag = mb_strtolower($matches[1]);

        if ('b' === $tag || 'strong' === $tag) {
            return $this->_toupper($matches[3]);
        }

        if ('th' === $tag) {
            return $this->_toupper("\t\t" . $matches[3] . "\n");
        }

        if ('h' === $tag) {
            return $this->_toupper("\n\n" . $matches[3] . "\n\n");
        }

        if ('a' !== $tag) {
            return '';
        }

        // A "_html2text_link_<method>" marker among the remaining tag
        // attributes overrides the configured link method
        $link_override = preg_match('/_html2text_link_(\w+)/', $matches[4], $marker) ? $marker[1] : null;

        // Remove spaces in URL (#1487805)
        $href = str_replace(' ', '', $matches[3]);

        return $this->_build_link_list($href, $matches[5], $link_override);
    }
613
614
    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * Returns the replacement prepared in $pre_content by _convert_pre().
     * The matches are not needed; the parameter exists only because
     * preg_replace_callback() always passes it.
     *
     * @noinspection PhpUnusedParameterInspection
     *
     * @param array $matches PREG matches (unused)
     *
     * @return string
     */
    protected function _preg_pre_callback($matches)
    {
        $replacement = $this->pre_content;

        return $replacement;
    }
627
628
    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * The string is split at HTML tags; the tags are kept untouched and
     * only the text between them is upper-cased via _strtoupper().
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _toupper($str)
    {
        // string can contain HTML tags; -1 means "no limit" (passing null
        // for the integer $limit is deprecated as of PHP 8.1)
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags; chunks are never
        // empty thanks to PREG_SPLIT_NO_EMPTY, so index 0 always exists
        foreach ($chunks as $idx => $chunk) {
            if ('<' !== $chunk[0]) {
                $chunks[$idx] = $this->_strtoupper($chunk);
            }
        }

        return implode('', $chunks);
    }
649
650
    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded before the case conversion, so that their names
     * are not upper-cased, and the special characters are encoded again
     * afterwards. mb_strtoupper() is called with UTF-8 forced when the
     * mbstring extension provides it; otherwise the byte-based
     * strtoupper() is used.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _strtoupper($str)
    {
        $str = html_entity_decode($str, ENT_COMPAT);

        if (function_exists('mb_strtoupper')) {
            $str = mb_strtoupper($str, 'UTF-8');
        } else {
            // mb_strtoupper() does not exist on this branch, so calling it
            // here would be a fatal error; fall back to the core function
            $str = strtoupper($str);
        }

        return htmlspecialchars($str, ENT_COMPAT);
    }
672
}
673