Html2Text::_convert_blockquotes() - Code Metrics - XoopsModules25x/xnewsletter - Measure and Improve Code Quality continuously with Scrutinizer

Html2Text::_convert_blockquotes() B
last analyzed 2020-08-07 11:59 UTC

↳ Parent: Html2Text

Complexity

Conditions	9
Paths	8

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	9
nc	8
nop	1
dl	0
loc	49
rs	7.5571
c	0
b	0
f	0

<?php
/*************************************************************************
 *                                                                       *
 * Converts HTML to formatted plain text                                 *
 *                                                                       *
 * Portions Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>    *
 *                                                                       *
 * This script is free software; you can redistribute it and/or modify   *
 * it under the terms of the GNU General Public License as published by  *
 * the Free Software Foundation; either version 2 of the License, or     *
 * (at your option) any later version.                                   *
 *                                                                       *
 * The GNU General Public License can be found at                        *
 * http://www.gnu.org/copyleft/gpl.html.                                 *
 *                                                                       *
 * This script is distributed in the hope that it will be useful,        *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
 * GNU General Public License for more details.                          *
 *                                                                       *
 *************************************************************************/

/**
 * Converts HTML to formatted plain text
 */
class Html2Text
{
    /**
     * Contains the HTML content to convert.
     *
     * @var string
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string
     */
    protected $text;

    /**
     * Maximum width of the formatted text, in columns.
     *
     * Legacy property: no code in this class reads it. The width that is
     * actually applied is $_options['width'].
     *
     * @var int
     * @see $_options
     */
    protected $width = 70;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * The patterns are applied in order, so newlines and tabs are already
     * collapsed to spaces by the time the tag patterns run.
     *
     * @var array
     * @see $replace
     */
    protected $search = [
        "/\r/",                                  // Non-legal carriage return
        "/[\n\t]+/",                             // Newlines and tabs
        '/<head[^>]*>.*?<\/head>/i',             // <head>
        '/<script[^>]*>.*?<\/script>/i',         // <script>s -- which strip_tags supposedly has problems with
        '/<style[^>]*>.*?<\/style>/i',           // <style>s -- which strip_tags supposedly has problems with
        '/<p[^>]*>/i',                           // <P>
        '/<br[^>]*>/i',                          // <br>
        '/<i[^>]*>(.*?)<\/i>/i',                 // <i>
        '/<em[^>]*>(.*?)<\/em>/i',               // <em>
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
        '/(<dl[^>]*>|<\/dl>)/i',                 // <dl> and </dl>
        '/<li[^>]*>(.*?)<\/li>/i',               // <li> and </li>
        '/<dd[^>]*>(.*?)<\/dd>/i',               // <dd> and </dd>
        '/<dt[^>]*>(.*?)<\/dt>/i',               // <dt> and </dt>
        '/<li[^>]*>/i',                          // <li>
        '/<hr[^>]*>/i',                          // <hr>
        '/<div[^>]*>/i',                         // <div>
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
        '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
        '/<span class="_html2text_ignore">.+?<\/span>/i',  // <span class="_html2text_ignore">...</span>
    ];

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * Entries are positional: index N here replaces matches of $search[N].
     *
     * @var array
     * @see $search
     */
    protected $replace = [
        '',                                     // Non-legal carriage return
        ' ',                                    // Newlines and tabs
        '',                                     // <head>
        '',                                     // <script>s -- which strip_tags supposedly has problems with
        '',                                     // <style>s -- which strip_tags supposedly has problems with
        "\n\n",                                 // <P>
        "\n",                                   // <br>
        '_\\1_',                                // <i>
        '_\\1_',                                // <em>
        "\n\n",                                 // <ul> and </ul>
        "\n\n",                                 // <ol> and </ol>
        "\n\n",                                 // <dl> and </dl>
        "\t* \\1\n",                            // <li> and </li>
        " \\1\n",                               // <dd> and </dd>
        "\t* \\1",                              // <dt> and </dt>
        "\n\t* ",                               // <li>
        "\n-------------------------\n",        // <hr>
        "<div>\n",                              // <div>
        "\n\n",                                 // <table> and </table>
        "\n",                                   // <tr> and </tr>
        "\t\t\\1\n",                            // <td> and </td>
        '',                                     // <span class="_html2text_ignore">...</span>
    ];

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $ent_replace.
     *
     * @var array
     * @see $ent_replace
     */
    protected $ent_search = [
        '/&(nbsp|#160);/i',                              // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i', // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',           // Single quotes
        '/&gt;/i',                                       // Greater-than
        '/&lt;/i',                                       // Less-than
        '/&(copy|#169);/i',                              // Copyright
        '/&(trade|#8482|#153);/i',                       // Trademark
        '/&(reg|#174);/i',                               // Registered
        '/&(mdash|#151|#8212);/i',                       // mdash
        '/&(ndash|minus|#8211|#8722);/i',                // ndash
        '/&(bull|#149|#8226);/i',                        // Bullet
        '/&(pound|#163);/i',                             // Pound sign
        '/&(euro|#8364);/i',                             // Euro sign
        '/&(amp|#38);/i',                                // Ampersand: see _converter()
        '/[ ]{2,}/',                                     // Runs of spaces, post-handling
    ];

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * Entries are positional: index N here replaces matches of $ent_search[N].
     *
     * @var array
     * @see $ent_search
     */
    protected $ent_replace = [
        ' ',                                    // Non-breaking space
        '"',                                    // Double quotes
        "'",                                    // Single quotes
        '>',                                    // Greater-than
        '<',                                    // Less-than
        '(c)',                                  // Copyright
        '(tm)',                                 // Trademark
        '(R)',                                  // Registered
        '--',                                   // mdash
        '-',                                    // ndash
        '*',                                    // Bullet
        '£',                                    // Pound sign
        'EUR',                                  // Euro sign. € ?
        '|+|amp|+|',                            // Ampersand placeholder: see _converter()
        ' ',                                    // Runs of spaces, post-handling
    ];

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * Capture group 1 of every pattern is the tag name that
     * _preg_callback() dispatches on.
     *
     * @var array
     * @see _preg_callback()
     */
    protected $callback_search = [
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i', // <a href="">
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
    ];

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $pre_replace.
     *
     * @var array
     * @see $pre_replace
     */
    protected $pre_search = [
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/',
    ];

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is turned into <br> / &nbsp; so that it survives the
     * whitespace collapsing performed later by $search and $ent_search.
     *
     * @var array
     * @see $pre_search
     */
    protected $pre_replace = [
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    ];

    /**
     * Temporary workspace used during PRE processing.
     *
     * @var string
     */
    protected $pre_content = '';

    /**
     * Contains a list of HTML tags to allow in the resulting text.
     *
     * @var string
     * @see set_allowed_tags()
     */
    protected $allowed_tags = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string
     */
    protected $url;

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var bool
     * @see $html, $text
     */
    protected $_converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array
     * @see _build_link_list()
     */
    protected $_link_list = [];

    /**
     * Various configuration options (able to be set in the constructor)
     *
     * @var array
     */
    protected $_options = [
        // 'none'
        // 'inline' (show links inline)
        // 'nextline' (show links on the next line)
        // 'table' (if a table of link URLs should be listed after the text.
        'do_links' => 'inline',
        //  Maximum width of the formatted text, in columns.
        //  Set this value to 0 (or less) to ignore word wrapping
        //  and not constrain text to a fixed-width column.
        'width'    => 70,
    ];

    /**
     * Constructor.
     *
     * If the HTML source string (or file) is supplied, the class
     * will instantiate with that source propagated; all that has
     * to be done is to call get_text().
     *
     * @param string $source    HTML content
     * @param bool   $from_file Indicates $source is a file to pull content from
     * @param array  $options   Set configuration options (merged over the defaults in $_options)
     */
    public function __construct($source = '', $from_file = false, $options = [])
    {
        $this->_options = array_merge($this->_options, $options);

        if (!empty($source)) {
            $this->set_html($source, $from_file);
        }

        $this->set_base_url();
    }

    /**
     * Loads source HTML into memory, either from $source string or a file.
     *
     * When $from_file is true but the file does not exist, $source itself
     * is stored as the HTML content.
     *
     * @param string $source    HTML content
     * @param bool   $from_file Indicates $source is a file to pull content from
     */
    public function set_html($source, $from_file = false)
    {
        if ($from_file && file_exists($source)) {
            $this->html = file_get_contents($source);
        } else {
            $this->html = $source;
        }

        // Force a fresh conversion on the next get_text() call
        $this->_converted = false;
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion runs lazily on the first call after set_html() and
     * the result is cached until new HTML is loaded.
     *
     * @return string
     */
    public function get_text()
    {
        if (!$this->_converted) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     * Prints the text, converted from HTML.
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     * Alias to print_text(), operates identically.
     *
     * @see print_text()
     */
    public function p()
    {
        print $this->get_text();
    }

    /**
     * Sets the allowed HTML tags to pass through to the resulting text.
     *
     * Tags should be in the form "<p>", with no corresponding closing tag.
     * An empty argument leaves the current setting untouched.
     *
     * @param string $allowed_tags
     */
    public function set_allowed_tags($allowed_tags = '')
    {
        if (!empty($allowed_tags)) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * With an empty $url the base is derived from the HTTP_HOST server
     * variable (always with the "http://" scheme), or set to an empty
     * string when that variable is not available.
     *
     * @param string $url
     */
    public function set_base_url($url = '')
    {
        if (empty($url)) {
            if (\Xmf\Request::hasVar('HTTP_HOST', 'SERVER')) {
                $this->url = 'http://' . $_SERVER['HTTP_HOST'];
            } else {
                $this->url = '';
            }
        } else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html")
            if ('/' === mb_substr($url, -1)) {
                $url = mb_substr($url, 0, -1);
            }
            $this->url = $url;
        }
    }

    /**
     * Workhorse function that does actual conversion (calls _converter() method).
     *
     * Stores the result in $text and marks the instance as converted.
     */
    protected function _convert()
    {
        // Variables used for building the link list
        $this->_link_list = [];

        // NOTE(review): stripslashes() also removes backslashes that are
        // legitimately part of the content; kept for backward compatibility.
        $text = trim(stripslashes($this->html));

        // Convert HTML to TXT ($text is modified in place)
        $this->_converter($text);

        // Add link list
        if (!empty($this->_link_list)) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->_link_list as $idx => $url) {
                $text .= '[' . ($idx + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->_converted = true;
    }

    /**
     * Workhorse function that does actual conversion.
     *
     * First performs custom tag replacement specified by $search and
     * $replace arrays. Then strips any remaining HTML tags, reduces whitespace
     * and newlines to a readable format, and word wraps the text to
     * $this->_options['width'] characters.
     *
     * @param string $text Reference to HTML content string; converted in place
     */
    protected function _converter(&$text)
    {
        // Convert <BLOCKQUOTE> (before PRE!)
        $this->_convert_blockquotes($text);

        // Convert <PRE>
        $this->_convert_pre($text);

        // Run our defined tags search-and-replace
        $text = preg_replace($this->search, $this->replace, $text);

        // Run our defined tags search-and-replace with callback
        $text = preg_replace_callback($this->callback_search, [$this, '_preg_callback'], $text);

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Run our defined entities/characters search-and-replace
        $text = preg_replace($this->ent_search, $this->ent_replace, $text);

        // Replace known html entities
        $text = html_entity_decode($text, ENT_QUOTES);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        // Wrap the text to a readable format. Default width is 70.
        // If width is 0 or less, don't wrap the text.
        if ($this->_options['width'] > 0) {
            $text = wordwrap($text, $this->_options['width']);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param string      $link          URL of the link
     * @param string      $display       Part of the text to associate number with
     * @param string|null $link_override Link method to use instead of $_options['do_links']
     *
     * @return string The display text, decorated according to the link method
     */
    protected function _build_link_list($link, $display, $link_override = null)
    {
        $link_method = $link_override ?: $this->_options['do_links'];
        if ('none' === $link_method) {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Link carries its own scheme: use it as is
            $url = $link;
        } else {
            // Relative link: resolve against the base URL
            $url = $this->url;
            if ('/' !== mb_substr($link, 0, 1)) {
                $url .= '/';
            }
            $url .= (string)$link;
        }

        if ('table' === $link_method) {
            // Re-use the index of a URL that is already listed
            if (false === ($index = array_search($url, $this->_link_list, true))) {
                $index              = count($this->_link_list);
                $this->_link_list[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';
        } elseif ('nextline' === $link_method) {
            return $display . "\n[" . $url . ']';
        }   // link_method defaults to inline

        return $display . ' [' . $url . ']';
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Replaces every <pre> element, one at a time, with a <div> whose
     * whitespace has been encoded as <br> / &nbsp;.
     *
     * @param string $text HTML content; modified in place
     */
    protected function _convert_pre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            $this->pre_content = $matches[1];

            // Run our defined tags search-and-replace with callback
            $this->pre_content = preg_replace_callback($this->callback_search, [$this, '_preg_callback'], $this->pre_content);

            // convert the content
            $this->pre_content = sprintf('<div><br>%s<br></div>', preg_replace($this->pre_search, $this->pre_replace, $this->pre_content));

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU', [$this, '_preg_pre_callback'], $text, 1);

            // free memory
            $this->pre_content = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Every top-level <blockquote> element is converted on its own (nested
     * blockquotes are handled by the recursive _converter() call), prefixed
     * with "> " citation markers and wrapped in a <pre> block so that the
     * later processing keeps its line structure.
     *
     * All offsets and lengths here are BYTE based: preg_match_all() with
     * PREG_OFFSET_CAPTURE reports byte offsets, so substr()/strlen() must be
     * used rather than their mb_* counterparts, otherwise any multibyte
     * character in front of a blockquote shifts the slice.
     *
     * @param string $text HTML content; modified in place
     */
    protected function _convert_blockquotes(&$text)
    {
        if (!preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            return;
        }

        $start  = 0; // offset of the current top-level opening tag (in the original text)
        $taglen = 0; // byte length of that opening tag
        $level  = 0; // current nesting depth
        $diff   = 0; // total number of bytes $text has shrunk by so far

        foreach ($matches[0] as $m) {
            $isClosingTag = (0 === strpos($m[0], '</'));

            if (!$isClosingTag) {
                if (0 === $level) {
                    $start  = $m[1];
                    $taglen = strlen($m[0]);
                }
                $level++;
                continue;
            }

            $level--;
            if ($level < 0) {
                $level = 0; // malformed HTML: go to next blockquote
                continue;
            }
            if ($level > 0) {
                continue; // skip inner blockquote
            }

            $end      = $m[1];
            $closeLen = strlen($m[0]);
            $len      = $end - $taglen - $start;

            // Get blockquote content. Match offsets refer to the original
            // text, so shift them by what earlier replacements removed.
            $body = substr($text, $start + $taglen - $diff, $len);

            // Set text width (leave room for the "> " marker)
            $p_width = $this->_options['width'];
            if ($this->_options['width'] > 0) {
                $this->_options['width'] -= 2;
            }
            // Convert blockquote content
            $body = trim($body);
            $this->_converter($body);
            // Add citation markers and create PRE block
            $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
            $body = '<pre>' . htmlspecialchars($body, ENT_QUOTES | ENT_HTML5) . '</pre>';
            // Re-set text width
            $this->_options['width'] = $p_width;

            // Replace content
            $text = substr($text, 0, $start - $diff) . $body . substr($text, $end + $closeLen - $diff);

            // Accumulate the shift: every later match offset is off by the
            // sum of all size changes made so far, not just the last one.
            $diff += $len + $taglen + $closeLen - strlen($body);
            unset($body);
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured in group 1 of the
     * $callback_search patterns.
     *
     * @param array $matches PREG matches
     *
     * @return string Replacement text for the matched element
     */
    protected function _preg_callback($matches)
    {
        switch (mb_strtolower($matches[1])) {
            case 'b':
            case 'strong':
                return $this->_toupper($matches[3]);
            case 'th':
                return $this->_toupper("\t\t" . $matches[3] . "\n");
            case 'h':
                return $this->_toupper("\n\n" . $matches[3] . "\n\n");
            case 'a':
                // override the link method
                $link_override = null;
                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $link_override_match)) {
                    $link_override = $link_override_match[1];
                }
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);

                return $this->_build_link_list($url, $matches[5], $link_override);
        }

        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * Ignores the match and returns the prepared $pre_content, so that
     * sequences such as "$0" in the content are not interpreted as
     * back-references.
     *
     * @param array $matches PREG matches
     *
     * @return string
     */
    protected function _preg_pre_callback(
        /** @noinspection PhpUnusedParameterInspection */
        $matches)
    {
        return $this->pre_content;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _toupper($str)
    {
        // string can contain HTML tags; -1 means "no limit" (passing null is
        // deprecated since PHP 8.1)
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $idx => $chunk) {
            if ('<' !== $chunk[0]) {
                $chunks[$idx] = $this->_strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     * Forces mb_strtoupper-call to UTF-8 and falls back to the byte-wise
     * strtoupper() when the mbstring function is not available.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _strtoupper($str)
    {
        // Decode first so that entity names (e.g. "&eacute;") are not upper-cased
        $str = html_entity_decode($str, ENT_COMPAT);

        if (function_exists('mb_strtoupper')) {
            $str = mb_strtoupper($str, 'UTF-8');
        } else {
            $str = strtoupper($str);
        }

        return htmlspecialchars($str, ENT_COMPAT);
    }
}


1			<?php
2			/*************************************************************************
3			* *
4			* Converts HTML to formatted plain text *
5			* *
6			* Portions Copyright (c) 2005-2007 Jon Abernathy <[email protected]> *
7			* *
8			* This script is free software; you can redistribute it and/or modify *
9			* it under the terms of the GNU General Public License as published by *
10			* the Free Software Foundation; either version 2 of the License, or *
11			* (at your option) any later version. *
12			* *
13			* The GNU General Public License can be found at *
14			* http://www.gnu.org/copyleft/gpl.html. *
15			* *
16			* This script is distributed in the hope that it will be useful, *
17			* but WITHOUT ANY WARRANTY; without even the implied warranty of *
18			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
19			* GNU General Public License for more details. *
20			* *
21			*************************************************************************/
22
23			/**
24			* Converts HTML to formatted plain text
25			*/
26			class Html2Text
27			{
28			/**
29			* Contains the HTML content to convert.
30			*
31			* @type string
32			*/
33			protected $html;
34
35			/**
36			* Contains the converted, formatted text.
37			*
38			* @type string
39			*/
40			protected $text;
41
42			/**
43			* Maximum width of the formatted text, in columns.
44			*
45			* Set this value to 0 (or less) to ignore word wrapping
46			* and not constrain text to a fixed-width column.
47			*
48			* @type int
49			*/
50			protected $width = 70;
51
52			/**
53			* List of preg* regular expression patterns to search for,
54			* used in conjunction with $replace.
55			*
56			* @type array
57			* @see $replace
58			*/
59			protected $search = [
60			"/\r/", // Non-legal carriage return
61			"/[\n\t]+/", // Newlines and tabs
62			'/<head[^>]>.?<\/head>/i', // <head>
63			'/<script[^>]>.?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with
64			'/<style[^>]>.?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with
65			'/<p[^>]*>/i', // <P>
66			'/<br[^>]*>/i', // <br>
67			'/<i[^>]>(.?)<\/i>/i', // <i>
68			'/<em[^>]>(.?)<\/em>/i', // <em>
69			'/(<ul[^>]*>\|<\/ul>)/i', // <ul> and </ul>
70			'/(<ol[^>]*>\|<\/ol>)/i', // <ol> and </ol>
71			'/(<dl[^>]*>\|<\/dl>)/i', // <dl> and </dl>
72			'/<li[^>]>(.?)<\/li>/i', // <li> and </li>
73			'/<dd[^>]>(.?)<\/dd>/i', // <dd> and </dd>
74			'/<dt[^>]>(.?)<\/dt>/i', // <dt> and </dt>
75			'/<li[^>]*>/i', // <li>
76			'/<hr[^>]*>/i', // <hr>
77			'/<div[^>]*>/i', // <div>
78			'/(<table[^>]*>\|<\/table>)/i', // <table> and </table>
79			'/(<tr[^>]*>\|<\/tr>)/i', // <tr> and </tr>
80			'/<td[^>]>(.?)<\/td>/i', // <td> and </td>
81			'/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
82			];
83
84			/**
85			* List of pattern replacements corresponding to patterns searched.
86			*
87			* @type array
88			* @see $search
89			*/
90			protected $replace = [
91			'', // Non-legal carriage return
92			' ', // Newlines and tabs
93			'', // <head>
94			'', // <script>s -- which strip_tags supposedly has problems with
95			'', // <style>s -- which strip_tags supposedly has problems with
96			"\n\n", // <P>
97			"\n", // <br>
98			'_\\1_', // <i>
99			'_\\1_', // <em>
100			"\n\n", // <ul> and </ul>
101			"\n\n", // <ol> and </ol>
102			"\n\n", // <dl> and </dl>
103			"\t* \\1\n", // <li> and </li>
104			" \\1\n", // <dd> and </dd>
105			"\t* \\1", // <dt> and </dt>
106			"\n\t* ", // <li>
107			"\n-------------------------\n", // <hr>
108			"<div>\n", // <div>
109			"\n\n", // <table> and </table>
110			"\n", // <tr> and </tr>
111			"\t\t\\1\n", // <td> and </td>
112			'', // <span class="_html2text_ignore">...</span>
113			];
114
115			/**
116			* List of preg* regular expression patterns to search for,
117			* used in conjunction with $ent_replace.
118			*
119			* @type array
120			* @see $ent_replace
121			*/
122			protected $ent_search = [
123			'/&(nbsp\|#160);/i', // Non-breaking space
124			'/&(quot\|rdquo\|ldquo\|#8220\|#8221\|#147\|#148);/i',
125			// Double quotes
126			'/&(apos\|rsquo\|lsquo\|#8216\|#8217);/i', // Single quotes
127			'/>/i', // Greater-than
128			'/</i', // Less-than
129			'/&(copy\|#169);/i', // Copyright
130			'/&(trade\|#8482\|#153);/i', // Trademark
131			'/&(reg\|#174);/i', // Registered
132			'/&(mdash\|#151\|#8212);/i', // mdash
133			'/&(ndash\|minus\|#8211\|#8722);/i', // ndash
134			'/&(bull\|#149\|#8226);/i', // Bullet
135			'/&(pound\|#163);/i', // Pound sign
136			'/&(euro\|#8364);/i', // Euro sign
137			'/&(amp\|#38);/i', // Ampersand: see _converter()
138			'/[ ]{2,}/', // Runs of spaces, post-handling
139			];
140
141			/**
142			* List of pattern replacements corresponding to patterns searched.
143			*
144			* @type array
145			* @see $ent_search
146			*/
147			protected $ent_replace = [
148			' ', // Non-breaking space
149			'"', // Double quotes
150			"'", // Single quotes
151			'>',
152			'<',
153			'(c)',
154			'(tm)',
155			'(R)',
156			'--',
157			'-',
158			'*',
159			'£',
160			'EUR', // Euro sign. € ?
161			'\|+\|amp\|+\|', // Ampersand: see _converter()
162			' ', // Runs of spaces, post-handling
163			];
164
165			/**
166			* List of preg* regular expression patterns to search for
167			* and replace using callback function.
168			*
169			* @type array
170			*/
171			protected $callback_search = [
172			'/<(a) [^>]href=("\|\')([^"\']+)\2([^>])>(.*?)<\/a>/i', // <a href="">
173			'/<(h)[123456]( [^>])?>(.?)<\/h[123456]>/i', // h1 - h6
174			'/<(b)( [^>])?>(.?)<\/b>/i', // <b>
175			'/<(strong)( [^>])?>(.?)<\/strong>/i', // <strong>
176			'/<(th)( [^>])?>(.?)<\/th>/i', // <th> and </th>
177			];
178
179			/**
180			* List of preg* regular expression patterns to search for in PRE body,
181			* used in conjunction with $pre_replace.
182			*
183			* @type array
184			* @see $pre_replace
185			*/
186			protected $pre_search = [
187			"/\n/",
188			"/\t/",
189			'/ /',
190			'/<pre[^>]*>/',
191			'/<\/pre>/',
192			];
193
194			/**
195			* List of pattern replacements corresponding to patterns searched for PRE body.
196			*
197			* @type array
198			* @see $pre_search
199			*/
200			protected $pre_replace = [
201			'<br>',
202			'    ',
203			' ',
204			'',
205			'',
206			];
207
208			/**
209			* Temporary workspace used during PRE processing.
210			*
211			* @type string
212			*/
213			protected $pre_content = '';
214
215			/**
216			* Contains a list of HTML tags to allow in the resulting text.
217			*
218			* @type string
219			* @see set_allowed_tags()
220			*/
221			protected $allowed_tags = '';
222
223			/**
224			* Contains the base URL that relative links should resolve to.
225			*
226			* @type string
227			*/
228			protected $url;
229
230			/**
231			* Indicates whether content in the $html variable has been converted yet.
232			*
233			* @type bool
234			* @see $html, $text
235			*/
236			protected $_converted = false;
237
238			/**
239			* Contains URL addresses from links to be rendered in plain text.
240			*
241			* @type array
242			* @see _build_link_list()
243			*/
244			protected $_link_list = [];
245
246			/**
247			* Various configuration options (able to be set in the constructor)
248			*
249			* @type array
250			*/
251			protected $_options = [
252			// 'none'
253			// 'inline' (show links inline)
254			// 'nextline' (show links on the next line)
255			// 'table' (if a table of link URLs should be listed after the text.
256			'do_links' => 'inline',
257			// Maximum width of the formatted text, in columns.
258			// Set this value to 0 (or less) to ignore word wrapping
259			// and not constrain text to a fixed-width column.
260			'width' => 70,
261			];
262
/**
 * Constructor.
 *
 * Merges the supplied options over the defaults, loads the HTML source
 * when one is given, and initialises the base URL. Afterwards all that
 * has to be done is to call get_text().
 *
 * @param string $source    HTML content (or a file path when $from_file is true)
 * @param bool   $from_file Indicates $source is a file to pull content from
 * @param array  $options   Configuration options overriding the defaults
 */
public function __construct($source = '', $from_file = false, $options = [])
{
    // Caller-supplied keys win over the built-in defaults.
    $this->_options = array_merge($this->_options, $options);

    $hasSource = !empty($source);
    if ($hasSource) {
        $this->set_html($source, $from_file);
    }

    // No argument: let set_base_url() pick its own default.
    $this->set_base_url();
}
284
/**
 * Loads source HTML into memory, either from the $source string or a file.
 *
 * When $from_file is set but the file does not exist, $source itself is
 * stored as the HTML content. Any previous conversion result is invalidated.
 *
 * @param string $source    HTML content, or path of the file holding it
 * @param bool   $from_file Indicates $source is a file to pull content from
 */
public function set_html($source, $from_file = false)
{
    $readFromDisk = $from_file && file_exists($source);

    $this->html = $readFromDisk ? file_get_contents($source) : $source;

    // Force get_text() to run the conversion again.
    $this->_converted = false;
}
301
/**
 * Returns the text, converted from HTML.
 *
 * The conversion is executed on first use and whenever new HTML has been
 * loaded since the last conversion.
 *
 * @return string
 */
public function get_text()
{
    if ($this->_converted) {
        return $this->text;
    }

    $this->_convert();

    return $this->text;
}
315
/**
 * Prints the text, converted from HTML.
 */
public function print_text()
{
    $plain = $this->get_text();
    echo $plain;
}
323
/**
 * Short alias of print_text(); writes the converted text to the output.
 *
 * @see print_text()
 */
public function p()
{
    $plain = $this->get_text();
    echo $plain;
}
333
/**
 * Sets the allowed HTML tags to pass through to the resulting text.
 *
 * Tags should be in the form "<p>", with no corresponding closing tag.
 * An empty argument leaves the current setting untouched.
 *
 * @param string $allowed_tags
 */
public function set_allowed_tags($allowed_tags = '')
{
    if (empty($allowed_tags)) {
        return;
    }

    $this->allowed_tags = $allowed_tags;
}
347
/**
 * Sets a base URL to handle relative links.
 *
 * Without an argument the base URL is derived from the HTTP_HOST server
 * variable (with an "http://" prefix), or left empty when that variable
 * is not available.
 *
 * @param string $url
 */
public function set_base_url($url = '')
{
    if (!empty($url)) {
        // Drop one trailing slash for consistency (relative URLs may
        // already start with a slash like "/file.html").
        $endsWithSlash = '/' === mb_substr($url, -1);
        $this->url = $endsWithSlash ? mb_substr($url, 0, -1) : $url;

        return;
    }

    $this->url = \Xmf\Request::hasVar('HTTP_HOST', 'SERVER')
        ? 'http://' . $_SERVER['HTTP_HOST']
        : '';
}
370
/**
 * Runs the conversion of $this->html and stores the result in $this->text.
 *
 * Delegates the actual HTML-to-text work to _converter(), then appends the
 * numbered list of collected link URLs (if any) and marks the content as
 * converted.
 */
protected function _convert()
{
    // Start with an empty link list; _build_link_list() fills it.
    $this->_link_list = [];

    $plain = trim(stripslashes($this->html));

    // Convert HTML to TXT (works on $plain by reference)
    $this->_converter($plain);

    // Append the link table
    if (!empty($this->_link_list)) {
        $plain .= "\n\nLinks:\n------\n";
        foreach ($this->_link_list as $position => $address) {
            $plain .= '[' . ($position + 1) . '] ' . $address . "\n";
        }
    }

    $this->text       = $plain;
    $this->_converted = true;
}
396
/**
 * Workhorse function that does actual conversion.
 *
 * First converts BLOCKQUOTE and PRE sections, then performs the custom tag
 * replacement specified by the $search/$replace and $callback_search
 * properties, strips any remaining HTML tags, decodes entities, reduces
 * whitespace and newlines to a readable format, and word wraps the text to
 * $this->_options['width'] characters.
 *
 * @param string $text HTML content string; modified in place (passed by reference)
 */
protected function _converter(&$text)
{
    // Convert <BLOCKQUOTE> (before PRE!)
    $this->_convert_blockquotes($text);

    // Convert <PRE>
    $this->_convert_pre($text);

    // Run our defined tags search-and-replace
    $text = preg_replace($this->search, $this->replace, $text);

    // Run our defined tags search-and-replace with callback
    $text = preg_replace_callback($this->callback_search, [$this, '_preg_callback'], $text);

    // Strip any other HTML tags
    $text = strip_tags($text, $this->allowed_tags);

    // Run our defined entities/characters search-and-replace
    $text = preg_replace($this->ent_search, $this->ent_replace, $text);

    // Replace known html entities
    $text = html_entity_decode($text, ENT_QUOTES);

    // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
    $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

    // Convert "|+|amp|+|" into "&"; this has to happen after the handling of
    // unknown entities so that input such as "&amp;quot;" is handled properly.
    // NOTE(review): the placeholder is presumably emitted by the entity
    // replacement table ($ent_replace) declared elsewhere in this class.
    $text = str_replace('|+|amp|+|', '&', $text);

    // Bring down number of empty lines to 2 max
    $text = preg_replace("/\n\s+\n/", "\n\n", $text);
    $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

    // Remove leading empty lines (can be produced by e.g. a P tag at the beginning)
    $text = ltrim($text, "\n");

    // Wrap the text to a readable format. The class default width is 70;
    // if width is 0 or less, don't wrap the text.
    if ($this->_options['width'] > 0) {
        $text = wordwrap($text, $this->_options['width']);
    }
}
451
/**
 * Helper function called by preg_replace() on link replacement.
 *
 * Maintains an internal list of links to be displayed at the end of the
 * text, with numeric indices to the original point in the text they
 * appeared. Also makes an effort at identifying and handling absolute
 * and relative links.
 *
 * @param string      $link          URL of the link
 * @param string      $display       Part of the text to associate number with
 * @param string|null $link_override Link method to use instead of the
 *                                   configured 'do_links' option
 *
 * @return string Display text, decorated according to the link method
 */
protected function _build_link_list($link, $display, $link_override = null)
{
    $link_method = $link_override ?: $this->_options['do_links'];
    if ('none' === $link_method) {
        return $display;
    }

    // Ignored link types: script pseudo-links, mail links and page anchors
    if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
        return $display;
    }

    if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
        // The link carries its own scheme, so it is already absolute
        $url = $link;
    } else {
        // Relative link: prefix the base URL, inserting the separating
        // slash when the link does not bring its own
        $url = $this->url;
        if ('/' !== mb_substr($link, 0, 1)) {
            $url .= '/';
        }
        $url .= (string)$link;
    }

    if ('table' === $link_method) {
        // Re-use the index of a URL that has been listed before
        $index = array_search($url, $this->_link_list, true);
        if (false === $index) {
            $index              = count($this->_link_list);
            $this->_link_list[] = $url;
        }

        return $display . ' [' . ($index + 1) . ']';
    }

    if ('nextline' === $link_method) {
        return $display . "\n[" . $url . ']';
    }

    // link_method defaults to inline
    return $display . ' [' . $url . ']';
}
501
/**
 * Helper function for PRE body conversion.
 *
 * Replaces each PRE element, one at a time, by a DIV holding the converted
 * PRE content. The content is processed with the $callback_search patterns
 * and the $pre_search/$pre_replace tables declared elsewhere in this class.
 *
 * @param string $text HTML content; modified in place (passed by reference)
 */
protected function _convert_pre(&$text)
{
    // Get the content of the next PRE element (ungreedy, so the first
    // closing tag ends the match)
    while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
        $this->pre_content = $matches[1];

        // Run our defined tags search-and-replace with callback
        $this->pre_content = preg_replace_callback($this->callback_search, [$this, '_preg_callback'], $this->pre_content);

        // Convert the content
        $this->pre_content = sprintf('<div><br>%s<br></div>', preg_replace($this->pre_search, $this->pre_replace, $this->pre_content));

        // Replace the content (use a callback because the content can contain
        // text like "$0" that a plain replacement string would interpret)
        $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU', [$this, '_preg_pre_callback'], $text, 1);

        // Free memory
        $this->pre_content = '';
    }
}
526
/**
 * Helper function for BLOCKQUOTE body conversion.
 *
 * Every outermost BLOCKQUOTE element is converted to plain text with
 * _converter() (which takes care of nested blockquotes recursively),
 * prefixed with "> " citation markers and put back into $text as an
 * escaped PRE block.
 *
 * Tag positions come from PREG_OFFSET_CAPTURE and are therefore BYTE
 * offsets into the original text; all slicing below consequently uses the
 * byte-based substr()/strlen() rather than their mb_* counterparts.
 *
 * @param string $text HTML content; modified in place (passed by reference)
 */
protected function _convert_blockquotes(&$text)
{
    // Collect all opening and closing blockquote tags with their offsets
    if (!preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
        return;
    }

    $start  = 0; // offset (in the original text) of the outermost opening tag
    $taglen = 0; // byte length of that opening tag
    $level  = 0; // current nesting depth
    $diff   = 0; // bytes by which $text has shrunk through earlier replacements

    foreach ($matches[0] as $m) {
        $tag    = $m[0];
        $offset = $m[1];

        if ('/' !== $tag[1]) {
            // Opening tag: remember where the outermost blockquote starts
            if (0 === $level) {
                $start  = $offset;
                $taglen = strlen($tag);
            }
            $level++;
            continue;
        }

        // Closing tag
        $level--;
        if ($level < 0) {
            $level = 0; // malformed HTML: go to next blockquote
            continue;
        }
        if ($level > 0) {
            continue; // skip inner blockquote
        }

        $end = $offset;
        $len = $end - $taglen - $start;
        // Get blockquote content
        $body = substr($text, $start + $taglen - $diff, $len);

        // Narrow the text width to leave room for the "> " marker
        $p_width = $this->_options['width'];
        if ($this->_options['width'] > 0) {
            $this->_options['width'] -= 2;
        }
        // Convert blockquote content
        $body = trim($body);
        $this->_converter($body);
        // Add citation markers and create PRE block
        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
        $body = '<pre>' . htmlspecialchars($body, ENT_QUOTES | ENT_HTML5) . '</pre>';
        // Re-set text width
        $this->_options['width'] = $p_width;
        // Replace content
        $text = substr($text, 0, $start - $diff) . $body . substr($text, $end + strlen($tag) - $diff);

        // Accumulate the shift: the remaining offsets still refer to the
        // original text, so every earlier replacement has to be accounted for
        $diff += $len + $taglen + strlen($tag) - strlen($body);
        unset($body);
    }
}
581
/**
 * Callback function for preg_replace_callback use.
 *
 * Dispatches on the lower-cased tag name found in $matches[1]: bold and
 * heading-like tags have their content upper-cased, links are handed to
 * _build_link_list(), and anything else yields an empty string.
 * NOTE(review): the meaning of the individual capture groups is defined by
 * the $callback_search patterns declared elsewhere in this class.
 *
 * @param array $matches PREG matches
 *
 * @return string
 */
protected function _preg_callback($matches)
{
    $tag = mb_strtolower($matches[1]);

    if ('b' === $tag || 'strong' === $tag) {
        return $this->_toupper($matches[3]);
    }

    if ('th' === $tag) {
        return $this->_toupper("\t\t" . $matches[3] . "\n");
    }

    if ('h' === $tag) {
        return $this->_toupper("\n\n" . $matches[3] . "\n\n");
    }

    if ('a' !== $tag) {
        return '';
    }

    // A "_html2text_link_<method>" marker in the tag overrides the link method
    $method   = null;
    $override = [];
    if (preg_match('/_html2text_link_(\w+)/', $matches[4], $override)) {
        $method = $override[1];
    }

    // Remove spaces in URL (#1487805)
    $href = str_replace(' ', '', $matches[3]);

    return $this->_build_link_list($href, $matches[5], $method);
}
613
/**
 * Callback function for preg_replace_callback use in PRE content handler.
 *
 * Ignores the matched text and returns the already converted PRE content
 * stored in $this->pre_content instead.
 *
 * @param array $matches PREG matches (unused, required by the callback signature)
 *
 * @return string
 */
protected function _preg_pre_callback(
    /** @noinspection PhpUnusedParameterInspection */
    $matches
) {
    return $this->pre_content;
}
627
/**
 * Strtoupper function with HTML tags and entities handling.
 *
 * Splits the string into HTML tags and the text between them, and
 * upper-cases only the text parts; the tags are returned unchanged.
 *
 * @param string $str Text to convert
 *
 * @return string Converted text
 */
private function _toupper($str)
{
    // The string can contain HTML tags; keep them as separate chunks.
    // A limit of -1 means "no limit" (null is deprecated for this argument).
    $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

    // Convert to upper case only the text between HTML tags
    // (PREG_SPLIT_NO_EMPTY guarantees that every chunk has a first character)
    foreach ($chunks as $idx => $chunk) {
        if ('<' !== $chunk[0]) {
            $chunks[$idx] = $this->_strtoupper($chunk);
        }
    }

    return implode('', $chunks);
}
649
/**
 * Strtoupper multibyte wrapper function with HTML entities handling.
 *
 * Entities are decoded before the case conversion and the result is
 * escaped again, so that entity names themselves are not upper-cased.
 * Uses mb_strtoupper() forced to UTF-8 when the mbstring extension is
 * available and falls back to the byte-based strtoupper() otherwise.
 *
 * @param string $str Text to convert
 *
 * @return string Converted text
 */
private function _strtoupper($str)
{
    $str = html_entity_decode($str, ENT_COMPAT);

    if (function_exists('mb_strtoupper')) {
        $str = mb_strtoupper($str, 'UTF-8');
    } else {
        // mbstring is not available here, so calling mb_strtoupper()
        // would be a fatal error; use the single-byte variant instead
        $str = strtoupper($str);
    }

    $str = htmlspecialchars($str, ENT_COMPAT);

    return $str;
}
673

XoopsModules25x / xnewsletter

Html2Text::_convert_blockquotes() B last analyzed 2020-08-07 11:59 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

Html2Text::_convert_blockquotes() B
last analyzed 2020-08-07 11:59 UTC