Zend_Search_Lucene_Document_Html::__construct() - Code Metrics - Inspection of "created/updated travis/scrutinizer config files" - adamjakab/SuiteCRM - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — CI ( 0f01dd...c95a04 )

by Adam

created 2016-03-07 17:10 UTC

Zend_Search_Lucene_Document_Html::__construct() F

↳ Parent: Zend_Search_Lucene_Document_Html

Complexity

Conditions	18
Paths	2592

Size

Total Lines	98
Code Lines	56

Duplication

Lines	0
Ratio	0 %

Metric	Value
dl	0
loc	98
rs	2
cc	18
eloc	56
nc	2592
nop	4

How to fix Long Method Complexity

<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id: Html.php 24593 2012-01-05 20:35:02Z matthew $
 */


/** Zend_Search_Lucene_Document */
require_once 'Zend/Search/Lucene/Document.php';


/**
 * HTML document.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document

{
    /**
     * List of document links
     *
     * @var array
     */
    private $_links = array();

    /**
     * List of document header links
     *
     * @var array
     */
    private $_headerLinks = array();

    /**
     * Stored DOM representation
     *
     * @var DOMDocument
     */
    private $_doc;

    /**
     * Exclude nofollow links flag
     *
     * If true then links with rel='nofollow' attribute are not included into
     * document links.
     *
     * @var boolean
     */
    private static $_excludeNoFollowLinks = false;

    /**
     *
     * List of inline tags
     *
     * @var array
     */
    private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
                                'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
                                'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
                                'q', 'sub', 'sup');

    /**
     * Object constructor
     *
     * Parses the supplied HTML into the internal DOM document, then adds the
     * 'title' field, one field per named META tag and the 'contents' field,
     * and finally collects the body links (A and AREA tags) and the header
     * links (LINK tags).
     *
     * @param string  $data            HTML string (may be an HTML fragment), or a file name if $isFile is true
     * @param boolean $isFile          If true, $data is treated as a file name and read with file_get_contents()
     * @param boolean $storeContent    If true the body text is added as a Text field, otherwise as an UnStored field
     * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @throws Zend_Search_Lucene_Exception If the file given by $data cannot be read
     */
    private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
    {
        $this->_doc = new DOMDocument();
        $this->_doc->substituteEntities = true;

        if ($isFile) {
            $htmlData = file_get_contents($data);
            if ($htmlData === false) {
                // Do not silently build an empty document for an unreadable file
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception("Unable to read HTML file '$data'.");
            }
        } else {
            $htmlData = $data;
        }

        $this->_parseHtml($htmlData, $defaultEncoding);

        /** @todo Add correction of wrong HTML encoding recognition processing
         * The case is:
         * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
         * even $this->_doc->encoding demonstrates another recognized encoding
         */

        $xpath = new DOMXPath($this->_doc);

        $docTitle = '';
        foreach ($xpath->query('/html/head/title') as $titleNode) {
            // title should always have only one entry, but we process all nodeset entries
            $docTitle .= $titleNode->nodeValue . ' ';
        }
        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));

        foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
            $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
                                                           $metaNode->getAttribute('content'),
                                                           'UTF-8'));
        }

        $docBody = '';
        foreach ($xpath->query('/html/body') as $bodyNode) {
            // body should always have only one entry, but we process all nodeset entries
            $this->_retrieveNodeText($bodyNode, $docBody);
        }
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('contents', $docBody, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('contents', $docBody, 'UTF-8'));
        }

        // Body links: A tags first, then AREA tags, honouring the nofollow flag
        $this->_links = array_unique(array_merge(
            $this->_collectHrefs($this->_doc->getElementsByTagName('a'), self::$_excludeNoFollowLinks),
            $this->_collectHrefs($this->_doc->getElementsByTagName('area'), self::$_excludeNoFollowLinks)
        ));

        // Header links are never filtered by rel="nofollow"
        $this->_headerLinks = array_unique(
            $this->_collectHrefs($xpath->query('/html/head/link'), false)
        );
    }

    /**
     * Load HTML into $this->_doc, re-parsing it as UTF-8 when no encoding was recognized.
     *
     * @param string $htmlData        Raw HTML document or fragment
     * @param string $defaultEncoding Source encoding passed to iconv() when the document declares none
     */
    private function _parseHtml($htmlData, $defaultEncoding)
    {
        // Parser warnings are suppressed on purpose: malformed markup is ordinary input here.
        // Empty input is skipped and handled by the fragment branch below.
        if ((string) $htmlData !== '') {
            @$this->_doc->loadHTML($htmlData);
        }

        if ($this->_doc->encoding !== null) {
            // Document encoding is recognized, nothing more to do
            return;
        }

        $utf8Head = '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>';

        /** @todo improve HTML vs HTML fragment recognition */
        if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
            // It's an HTML document
            // Add additional HEAD section right after the opening HTML tag and recognize document
            $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

            @$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
                                 . $utf8Head
                                 . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));

            // Remove additional HEAD section (it may be missing if parsing failed)
            $xpath = new DOMXPath($this->_doc);
            $head  = $xpath->query('/html/head')->item(0);
            if ($head !== null) {
                $head->parentNode->removeChild($head);
            }
        } else {
            // It's an HTML fragment
            @$this->_doc->loadHTML('<html>' . $utf8Head . '<body>'
                                 . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                                 . '</body></html>');
        }
    }

    /**
     * Collect non-empty href attribute values from a node list, in document order.
     *
     * @param DOMNodeList $nodes
     * @param boolean     $skipNoFollow If true, nodes with rel="nofollow" (case-insensitive) are skipped
     * @return array
     */
    private function _collectHrefs($nodes, $skipNoFollow)
    {
        $hrefs = array();
        foreach ($nodes as $node) {
            $href = $node->getAttribute('href');
            if ($href == '') {
                continue;
            }
            if ($skipNoFollow  &&  strtolower($node->getAttribute('rel')) == 'nofollow') {
                continue;
            }
            $hrefs[] = $href;
        }

        return $hrefs;
    }

    /**
     * Change the class-wide "exclude nofollow links" flag.
     *
     * The flag is static, so the new value is shared by every document
     * constructed after this call.
     *
     * @param boolean $newValue
     */
    public static function setExcludeNoFollowLinks($newValue)
    {
        $flag = $newValue;

        self::$_excludeNoFollowLinks = $flag;
    }

    /**
     * Read the class-wide "exclude nofollow links" flag.
     *
     * @return boolean
     */
    public static function getExcludeNoFollowLinks()
    {
        $flag = self::$_excludeNoFollowLinks;

        return $flag;
    }

    /**
     * Append the text content of a node (recursively) to $text
     *
     * Text nodes contribute their value, followed by a space unless the parent
     * tag is one of the inline tags. Element nodes are traversed child by child,
     * except SCRIPT elements, which are skipped entirely. All other node types
     * (comments, CDATA sections, ...) are ignored.
     *
     * @param DOMNode $node
     * @param string &$text
     */
    private function _retrieveNodeText(DOMNode $node, &$text)
    {
        switch ($node->nodeType) {
            case XML_TEXT_NODE:
                $text .= $node->nodeValue;

                $parentIsInline = in_array($node->parentNode->tagName, $this->_inlineTags);
                if (!$parentIsInline) {
                    // Separate block-level text chunks from each other
                    $text .= ' ';
                }
                break;

            case XML_ELEMENT_NODE:
                if ($node->nodeName == 'script') {
                    break;
                }
                foreach ($node->childNodes as $child) {
                    $this->_retrieveNodeText($child, $text);
                }
                break;
        }
    }

    /**
     * Get the de-duplicated HREF values collected from the document's A and AREA tags
     *
     * @return array
     */
    public function getLinks()
    {
        $links = $this->_links;

        return $links;
    }

    /**
     * Get the de-duplicated HREF values collected from the LINK tags of the document head
     *
     * @return array
     */
    public function getHeaderLinks()
    {
        $headerLinks = $this->_headerLinks;

        return $headerLinks;
    }

    /**
     * Build an HTML document from a string
     *
     * Named constructor: the real constructor is private.
     *
     * @param string  $data              HTML document or fragment
     * @param boolean $storeContent      Passed through to the constructor
     * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
    {
        $isFile   = false;
        $document = new self($data, $isFile, $storeContent, $defaultEncoding);

        return $document;
    }

    /**
     * Build an HTML document from a file
     *
     * Named constructor: the real constructor is private.
     *
     * @param string  $file              Name of the file to read
     * @param boolean $storeContent      Passed through to the constructor
     * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
    {
        $isFile   = true;
        $document = new self($file, $isFile, $storeContent, $defaultEncoding);

        return $document;
    }


    /**
     * Highlight matching words inside a single text node
     *
     * The node text is tokenized with the default analyzer. Every token whose
     * term text is a key of $wordsToHighlight is cut out of the text node and
     * replaced by the DOM nodes parsed from the HTML returned by the callback.
     *
     * @param DOMText $node
     * @param array   $wordsToHighlight  Words to highlight, used as array keys
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters (first non-optional parameter is a text to transform)
     * @throws Zend_Search_Lucene_Exception
     */
    protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params)
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        $tokenizer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $tokenizer->setInput($node->nodeValue, 'UTF-8');

        // Gather the tokens that have to be highlighted
        $hits = array();
        for ($token = $tokenizer->nextToken(); $token !== null; $token = $tokenizer->nextToken()) {
            if (isset($wordsToHighlight[$token->getTermText()])) {
                $hits[] = $token;
            }
        }

        if (count($hits) == 0) {
            return;
        }

        // Go from the last match to the first one: each split shortens $node,
        // so offsets of the matches still to be processed stay valid
        for ($index = count($hits) - 1; $index >= 0; $index--) {
            $hit = $hits[$index];

            // Cut text after matched token
            $node->splitText($hit->getEndOffset());

            // Cut matched node
            $wordNode = $node->splitText($hit->getStartOffset());

            // Ask the callback for the HTML representation of the highlighted word
            $callbackArguments = $params;
            array_unshift($callbackArguments, $wordNode->nodeValue);
            $fragmentHtml = call_user_func_array($callback, $callbackArguments);

            // Parse the returned HTML; loadHTML() also normalizes it into a valid DOM tree
            $fragmentDocument = new DOMDocument('1.0', 'UTF-8');
            $loaded = @$fragmentDocument->
                                loadHTML('<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
                                       . $fragmentHtml
                                       . '</body></html>');
            if (!$loaded) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$fragmentHtml'.");
            }

            $fragmentXpath = new DOMXPath($fragmentDocument);
            $fragmentNodes = $fragmentXpath->query('/html/body')->item(0)->childNodes;

            // Copy every parsed node into this document, right before the matched word
            foreach ($fragmentNodes as $fragmentNode) {
                $imported = $this->_doc->importNode($fragmentNode, true /* deep copy */);
                $node->parentNode->insertBefore($imported, $wordNode);
            }

            // The plain-text word is now replaced by its highlighted version
            $node->parentNode->removeChild($wordNode);
        }
    }


    /**
     * Highlight words in the content of the specified node and all its descendants
     *
     * Child elements are processed recursively (SCRIPT nodes are skipped);
     * direct text children are highlighted afterwards.
     *
     * @param DOMNode $contextNode
     * @param array $wordsToHighlight
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters (first non-optional parameter is a text to transform)
     */
    protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params)
    {
        if (!$contextNode->hasChildNodes()) {
            return;
        }

        $pendingTextNodes = array();

        foreach ($contextNode->childNodes as $child) {
            if ($child->nodeType == XML_TEXT_NODE) {
                // Highlighting rewrites the child list, so text nodes are
                // queued and handled once the iteration is over
                $pendingTextNodes[] = $child;
                continue;
            }

            if ($child->nodeName == 'script') {
                // Script content is never highlighted
                continue;
            }

            $this->_highlightNodeRecursive($child, $wordsToHighlight, $callback, $params);
        }

        foreach ($pendingTextNodes as $pendingTextNode) {
            $this->_highlightTextNode($pendingTextNode, $wordsToHighlight, $callback, $params);
        }
    }

    /**
     * Standard callback method used to highlight words.
     *
     * Wraps the word into a B tag with the given background colour. Both
     * values are HTML-escaped, because the word comes from document text
     * (entities already decoded) and the colour may come from the caller's
     * input; unescaped they could inject markup or attributes.
     *
     * @param  string  $stringToHighlight  Plain text of the matched word
     * @param  string  $colour             CSS colour used as background
     * @return string  HTML fragment
     * @internal
     */
    public function applyColour($stringToHighlight, $colour)
    {
        $safeColour = htmlspecialchars($colour, ENT_QUOTES, 'UTF-8');
        $safeText   = htmlspecialchars($stringToHighlight, ENT_QUOTES, 'UTF-8');

        return '<b style="color:black;background-color:' . $safeColour . '">' . $safeText . '</b>';
    }

    /**
     * Highlight words using the standard colour callback
     *
     * Delegates to highlightExtended() with applyColour() as the callback and
     * returns whatever highlightExtended() returns.
     *
     * @param string|array $words
     * @param string $colour  Background colour handed to applyColour()
     * @return string
     */
    public function highlight($words, $colour = '#66ffff')
    {
        $callback       = array($this, 'applyColour');
        $callbackParams = array($colour);

        return $this->highlightExtended($words, $callback, $callbackParams);
    }



    /**
     * Highlight text using specified callback function.
     *
     * The words are tokenized with the default analyzer; every occurrence of a
     * resulting term inside the document body is replaced by the HTML returned
     * from the callback. The stored DOM document is modified in place.
     *
     * @param string|array $words  Words to highlight. Words could be organized using the array or string.
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters passed through into it
     *                             (first non-optional parameter is an HTML fragment for highlighting)
     * @return string  HTML of the whole document after highlighting
     * @throws Zend_Search_Lucene_Exception If there are words to highlight and $callback is not callable
     */
    public function highlightExtended($words, $callback, $params = array())
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        if (!is_array($words)) {
            $words = array($words);
        }

        // Merge token lists one by one: calling array_merge() through
        // call_user_func_array() fails when $words is an empty array
        $wordsToHighlight = array();
        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        foreach ($words as $wordString) {
            $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
        }

        if (count($wordsToHighlight) == 0) {
            // Nothing to highlight, return the document untouched
            return $this->_doc->saveHTML();
        }

        // Index terms by their text for constant-time lookup during the DOM walk
        $wordsToHighlightFlipped = array();
        foreach ($wordsToHighlight as $id => $token) {
            $wordsToHighlightFlipped[$token->getTermText()] = $id;
        }

        if (!is_callable($callback)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('$callback parameter must be a valid callback.');
        }

        $xpath = new DOMXPath($this->_doc);

        $matchedNodes = $xpath->query("/html/body");
        foreach ($matchedNodes as $matchedNode) {
            $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
        }

        // The method is documented to return a string on every path
        return $this->_doc->saveHTML();
    }


    /**
     * Serialize the stored DOM document to HTML
     *
     * @return string
     */
    public function getHTML()
    {
        $html = $this->_doc->saveHTML();

        return $html;
    }

    /**
     * Get HTML body
     *
     * Serializes every child node of the first BODY element with saveXML()
     * and concatenates the results. Returns an empty string when the document
     * has no BODY element.
     *
     * @return string
     */
    public function getHtmlBody()
    {
        $xpath = new DOMXPath($this->_doc);
        $body  = $xpath->query('/html/body')->item(0);

        if ($body === null) {
            // No BODY element (e.g. a frameset document): there is nothing to output
            return '';
        }

        $outputFragments = array();
        foreach ($body->childNodes as $bodyChild) {
            $outputFragments[] = $this->_doc->saveXML($bodyChild);
        }

        return implode('', $outputFragments);
    }
}



1			<?php
2			/**
3			* Zend Framework
4			*
5			* LICENSE
6			*
7			* This source file is subject to the new BSD license that is bundled
8			* with this package in the file LICENSE.txt.
9			* It is also available through the world-wide-web at this URL:
10			* http://framework.zend.com/license/new-bsd
11			* If you did not receive a copy of the license and are unable to
12			* obtain it through the world-wide-web, please send an email
13			* to license@zend.com so we can send you a copy immediately.
14			*
15			* @category Zend
16			* @package Zend_Search_Lucene
17			* @subpackage Document
18			* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
19			* @license http://framework.zend.com/license/new-bsd New BSD License
20			* @version $Id: Html.php 24593 2012-01-05 20:35:02Z matthew $
21			*/
22
23
24			/** Zend_Search_Lucene_Document */
25			require_once 'Zend/Search/Lucene/Document.php';
26
27
28			/**
29			* HTML document.
30			*
31			* @category Zend
32			* @package Zend_Search_Lucene
33			* @subpackage Document
34			* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
35			* @license http://framework.zend.com/license/new-bsd New BSD License
36			*/
37			class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
			Issue – Coding Style (introduced 2016-03-07 17:58 UTC): Since you have declared the constructor as private, maybe you should also declare the class as final.
38			{
39			/**
40			* List of document links
41			*
42			* @var array
43			*/
44			private $_links = array();
45
46			/**
47			* List of document header links
48			*
49			* @var array
50			*/
51			private $_headerLinks = array();
52
53			/**
54			* Stored DOM representation
55			*
56			* @var DOMDocument
57			*/
58			private $_doc;
59
60			/**
61			* Exclud nofollow links flag
62			*
63			* If true then links with rel='nofollow' attribute are not included into
64			* document links.
65			*
66			* @var boolean
67			*/
68			private static $_excludeNoFollowLinks = false;
69
70			/**
71			*
72			* List of inline tags
73			*
74			* @var array
75			*/
76			private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
77			'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
78			'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
79			'q', 'sub', 'sup');
80
81			/**
82			* Object constructor
83			*
84			* @param string $data HTML string (may be HTML fragment, )
85			* @param boolean $isFile
86			* @param boolean $storeContent
87			* @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
88			*/
89			private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
90			{
91			$this->_doc = new DOMDocument();
92			$this->_doc->substituteEntities = true;
93
94			if ($isFile) {
95			$htmlData = file_get_contents($data);
96			} else {
97			$htmlData = $data;
98			}
99			@$this->_doc->loadHTML($htmlData);
			Issue – Security Best Practice (introduced 2016-03-07 17:58 UTC): It seems like you do not handle an error condition here. This can introduce security issues, and is generally not recommended. If you suppress an error, we recommend checking for the error condition explicitly. For example, instead of `@mkdir($dir);` better use `if (@mkdir($dir) === false) { throw new \RuntimeException('The directory '.$dir.' could not be created.'); }`
100
101			if ($this->_doc->encoding === null) {
102			// Document encoding is not recognized
103
104			/** @todo improve HTML vs HTML fragment recognition */
105			if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
106			// It's an HTML document
107			// Add additional HEAD section and recognize document
108			$htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);
109
110			@$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
			Issue – Security Best Practice (introduced 2016-03-07 17:58 UTC): It seems like you do not handle an error condition here. This can introduce security issues, and is generally not recommended. If you suppress an error, we recommend checking for the error condition explicitly. For example, instead of `@mkdir($dir);` better use `if (@mkdir($dir) === false) { throw new \RuntimeException('The directory '.$dir.' could not be created.'); }`
111			. '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
112			. iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));
113
114			// Remove additional HEAD section
115			$xpath = new DOMXPath($this->_doc);
116			$head = $xpath->query('/html/head')->item(0);
117			$head->parentNode->removeChild($head);
118			} else {
119			// It's an HTML fragment
120			@$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
			Issue – Security Best Practice (introduced 2016-03-07 17:58 UTC): It seems like you do not handle an error condition here. This can introduce security issues, and is generally not recommended. If you suppress an error, we recommend checking for the error condition explicitly. For example, instead of `@mkdir($dir);` better use `if (@mkdir($dir) === false) { throw new \RuntimeException('The directory '.$dir.' could not be created.'); }`
121			. iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
122			. '</body></html>');
123			}
124
125			}
126			/** @todo Add correction of wrong HTML encoding recognition processing
127			* The case is:
128			* Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
129			* even $this->_doc->encoding demonstrates another recognized encoding
130			*/
131
132			$xpath = new DOMXPath($this->_doc);
133
134			$docTitle = '';
135			$titleNodes = $xpath->query('/html/head/title');
136			foreach ($titleNodes as $titleNode) {
137			// title should always have only one entry, but we process all nodeset entries
138			$docTitle .= $titleNode->nodeValue . ' ';
139			}
140			$this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));
141
142			$metaNodes = $xpath->query('/html/head/meta[@name]');
143			foreach ($metaNodes as $metaNode) {
144			$this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
145			$metaNode->getAttribute('content'),
146			'UTF-8'));
147			}
148
149			$docBody = '';
150			$bodyNodes = $xpath->query('/html/body');
151			foreach ($bodyNodes as $bodyNode) {
152			// body should always have only one entry, but we process all nodeset entries
153			$this->_retrieveNodeText($bodyNode, $docBody);
154			}
155			if ($storeContent) {
156			$this->addField(Zend_Search_Lucene_Field::Text('contents', $docBody, 'UTF-8'));
157			} else {
158			$this->addField(Zend_Search_Lucene_Field::UnStored('contents', $docBody, 'UTF-8'));
159			}
160
161			$linkNodes = $this->_doc->getElementsByTagName('a');
162			foreach ($linkNodes as $linkNode) {
163			if (($href = $linkNode->getAttribute('href')) != '' &&
164			(!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' )
165			) {
166			$this->_links[] = $href;
167			}
168			}
169			$linkNodes = $this->_doc->getElementsByTagName('area');
170			foreach ($linkNodes as $linkNode) {
171			if (($href = $linkNode->getAttribute('href')) != '' &&
172			(!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' )
173			) {
174			$this->_links[] = $href;
175			}
176			}
177			$this->_links = array_unique($this->_links);
178
179			$linkNodes = $xpath->query('/html/head/link');
180			foreach ($linkNodes as $linkNode) {
181			if (($href = $linkNode->getAttribute('href')) != '') {
182			$this->_headerLinks[] = $href;
183			}
184			}
185			$this->_headerLinks = array_unique($this->_headerLinks);
186			}
187
188			/**
189			* Set exclude nofollow links flag
190			*
191			* @param boolean $newValue
192			*/
193			public static function setExcludeNoFollowLinks($newValue)
194			{
195			self::$_excludeNoFollowLinks = $newValue;
196			}
197
198			/**
199			* Get exclude nofollow links flag
200			*
201			* @return boolean
202			*/
203			public static function getExcludeNoFollowLinks()
204			{
205			return self::$_excludeNoFollowLinks;
206			}
207
208			/**
209			* Get node text
210			*
211			* We should exclude scripts, which may be not included into comment tags, CDATA sections,
212			*
213			* @param DOMNode $node
214			* @param string &$text
215			*/
216			private function _retrieveNodeText(DOMNode $node, &$text)
217			{
218			if ($node->nodeType == XML_TEXT_NODE) {
219			$text .= $node->nodeValue;
220			if(!in_array($node->parentNode->tagName, $this->_inlineTags)) {
221			$text .= ' ';
222			}
223			} else if ($node->nodeType == XML_ELEMENT_NODE && $node->nodeName != 'script') {
224			foreach ($node->childNodes as $childNode) {
225			$this->_retrieveNodeText($childNode, $text);
226			}
227			}
228			}
229
230			/**
231			* Get document HREF links
232			*
233			* @return array
234			*/
235			public function getLinks()
236			{
237			return $this->_links;
238			}
239
240			/**
241			* Get document header links
242			*
243			* @return array
244			*/
245			public function getHeaderLinks()
246			{
247			return $this->_headerLinks;
248			}
249
250			/**
251			* Load HTML document from a string
252			*
253			* @param string $data
254			* @param boolean $storeContent
255			* @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
256			* @return Zend_Search_Lucene_Document_Html
257			*/
258			public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
259			{
260			return new Zend_Search_Lucene_Document_Html($data, false, $storeContent, $defaultEncoding);
261			}
262
263			/**
264			* Load HTML document from a file
265			*
266			* @param string $file
267			* @param boolean $storeContent
268			* @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
269			* @return Zend_Search_Lucene_Document_Html
270			*/
271			public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
272			{
273			return new Zend_Search_Lucene_Document_Html($file, true, $storeContent, $defaultEncoding);
274			}
275
276
277			/**
278			* Highlight text in text node
279			*
280			* @param DOMText $node
281			* @param array $wordsToHighlight
282			* @param callback $callback Callback method, used to transform (highlighting) text.
283			* @param array $params Array of additionall callback parameters (first non-optional parameter is a text to transform)
284			* @throws Zend_Search_Lucene_Exception
285			*/
286			protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params)
287			{
288			/** Zend_Search_Lucene_Analysis_Analyzer */
289			require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
290
291			$analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
292			$analyzer->setInput($node->nodeValue, 'UTF-8');
293
294			$matchedTokens = array();
295
296			while (($token = $analyzer->nextToken()) !== null) {
297			if (isset($wordsToHighlight[$token->getTermText()])) {
298			$matchedTokens[] = $token;
299			}
300			}
301
302			if (count($matchedTokens) == 0) {
303			return;
304			}
305
306			$matchedTokens = array_reverse($matchedTokens);
307
308			foreach ($matchedTokens as $token) {
309			// Cut text after matched token
310			$node->splitText($token->getEndOffset());
311
312			// Cut matched node
313			$matchedWordNode = $node->splitText($token->getStartOffset());
314
315			// Retrieve HTML string representation for highlihted word
316			$fullCallbackparamsList = $params;
317			array_unshift($fullCallbackparamsList, $matchedWordNode->nodeValue);
318			$highlightedWordNodeSetHtml = call_user_func_array($callback, $fullCallbackparamsList);
319
320			// Transform HTML string to a DOM representation and automatically transform retrieved string
321			// into valid XHTML (It's automatically done by loadHTML() method)
322			$highlightedWordNodeSetDomDocument = new DOMDocument('1.0', 'UTF-8');
323			$success = @$highlightedWordNodeSetDomDocument->
324			loadHTML('<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
325			. $highlightedWordNodeSetHtml
326			. '</body></html>');
327			if (!$success) {
328			require_once 'Zend/Search/Lucene/Exception.php';
329			throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHtml'.");
330			}
331			$highlightedWordNodeSetXpath = new DOMXPath($highlightedWordNodeSetDomDocument);
332			$highlightedWordNodeSet = $highlightedWordNodeSetXpath->query('/html/body')->item(0)->childNodes;
333
334			for ($count = 0; $count < $highlightedWordNodeSet->length; $count++) {
335			$nodeToImport = $highlightedWordNodeSet->item($count);
336			$node->parentNode->insertBefore($this->_doc->importNode($nodeToImport, true /* deep copy */),
337			$matchedWordNode);
338			}
339
340			$node->parentNode->removeChild($matchedWordNode);
341			}
342			}
343
344
/**
 * Walk the subtree below the given node and highlight words in its text nodes.
 *
 * Direct text-node children are collected first and handed to
 * _highlightTextNode() only after the child list has been fully traversed,
 * so the list being iterated is not modified during the walk. Every other
 * child is descended into recursively, except nodes named "script".
 *
 * @param DOMNode  $contextNode      Node whose children are processed
 * @param array    $wordsToHighlight Words to highlight (passed through to _highlightTextNode())
 * @param callback $callback         Callback method, used to transform (highlighting) text.
 * @param array    $params           Array of additional callback parameters
 *                                   (first non-optional parameter is a text to transform)
 */
protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params)
{
    // Nothing to do for leaf nodes
    if (!$contextNode->hasChildNodes()) {
        return;
    }

    $deferredTextNodes = array();

    foreach ($contextNode->childNodes as $child) {
        if ($child->nodeType == XML_TEXT_NODE) {
            // Handled after the loop to leave the childNodes structure untouched
            $deferredTextNodes[] = $child;
            continue;
        }

        // Script nodes are never descended into
        if ($child->nodeName == 'script') {
            continue;
        }

        $this->_highlightNodeRecursive($child, $wordsToHighlight, $callback, $params);
    }

    foreach ($deferredTextNodes as $deferredTextNode) {
        $this->_highlightTextNode($deferredTextNode, $wordsToHighlight, $callback, $params);
    }
}
377
/**
 * Standard callback method used to highlight words.
 *
 * Wraps the given string in a <b> element whose inline style sets a black
 * text colour and the requested background colour.
 *
 * The colour is escaped for use inside a double-quoted HTML attribute, so a
 * value containing quotes or angle brackets cannot terminate the style
 * attribute. The highlighted string itself is inserted unchanged.
 *
 * @param string $stringToHighlight Text/HTML fragment to wrap (inserted as is)
 * @param string $colour            Background colour value, e.g. '#66ffff'
 * @return string
 * @internal
 */
public function applyColour($stringToHighlight, $colour)
{
    // Attribute-context escaping; plain colour values such as '#66ffff' are unaffected
    $escapedColour = htmlspecialchars($colour, ENT_QUOTES, 'UTF-8');

    return '<b style="color:black;background-color:' . $escapedColour . '">' . $stringToHighlight . '</b>';
}
389
/**
 * Highlight text with specified colour.
 *
 * Convenience wrapper: delegates to highlightExtended() using this
 * object's applyColour() method as the callback and the colour as its
 * single additional parameter.
 *
 * @param string|array $words  Words to highlight
 * @param string       $colour Background colour handed to applyColour()
 * @return string|null Whatever highlightExtended() returns
 */
public function highlight($words, $colour = '#66ffff')
{
    $colourCallback = array($this, 'applyColour');
    $callbackParams = array($colour);

    return $this->highlightExtended($words, $colourCallback, $callbackParams);
}
401
402
403
/**
 * Highlight text using specified View helper or callback function.
 *
 * The words are tokenized with the default analyzer; every text node below
 * /html/body is then processed via _highlightNodeRecursive(). The document
 * is serialized and returned in all non-error cases (when no tokens are
 * produced the document is returned without any highlighting applied).
 *
 * @param string|array $words    Words to highlight. Words could be organized using the array or string.
 * @param callback     $callback Callback method, used to transform (highlighting) text.
 * @param array        $params   Array of additional callback parameters passed through into it
 *                               (first non-optional parameter is an HTML fragment for highlighting)
 * @return string HTML of the document
 * @throws Zend_Search_Lucene_Exception If words were supplied and $callback is not callable
 */
public function highlightExtended($words, $callback, $params = array())
{
    /** Zend_Search_Lucene_Analysis_Analyzer */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

    if (!is_array($words)) {
        $words = array($words);
    }

    // Collect tokens of all word strings into one flat list. Appending
    // directly (rather than array_merge() via call_user_func_array())
    // stays well-defined when $words is an empty array.
    $wordsToHighlight = array();
    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    foreach ($words as $wordString) {
        foreach ($analyzer->tokenize($wordString) as $token) {
            $wordsToHighlight[] = $token;
        }
    }

    if (count($wordsToHighlight) == 0) {
        return $this->_doc->saveHTML();
    }

    // Map term text => position in the token list (later duplicates win)
    $wordsToHighlightFlipped = array();
    foreach ($wordsToHighlight as $id => $token) {
        $wordsToHighlightFlipped[$token->getTermText()] = $id;
    }

    if (!is_callable($callback)) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('$callback parameter must be a valid callback.');
    }

    $xpath = new DOMXPath($this->_doc);

    $matchedNodes = $xpath->query("/html/body");
    foreach ($matchedNodes as $matchedNode) {
        $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
    }

    // Return the highlighted document, as promised by the @return contract
    // (previously this path fell through and returned null).
    return $this->_doc->saveHTML();
}
451
452
/**
 * Get HTML
 *
 * Serializes the whole underlying DOM document with saveHTML().
 *
 * @return string
 */
public function getHTML()
{
    $serializedDocument = $this->_doc->saveHTML();

    return $serializedDocument;
}
462
/**
 * Get HTML body
 *
 * Serializes every child node of /html/body with saveXML() and
 * concatenates the fragments. Returns an empty string when the document
 * has no /html/body element (instead of dereferencing a null node).
 *
 * @return string
 */
public function getHtmlBody()
{
    $xpath = new DOMXPath($this->_doc);
    $bodyNode = $xpath->query('/html/body')->item(0);

    // item(0) yields null when the query matched nothing
    if ($bodyNode === null) {
        return '';
    }

    $outputFragments = array();
    foreach ($bodyNode->childNodes as $bodyChild) {
        $outputFragments[] = $this->_doc->saveXML($bodyChild);
    }

    return implode('', $outputFragments);
}
480			}
481
482

adamjakab / SuiteCRM

Push — CI ( 0f01dd...c95a04 )

Zend_Search_Lucene_Document_Html::__construct() F

Complexity

Size

Duplication

How to fix Long Method Complexity

Long Method

Duplication Side-by-Side

Filter issues like