SearchHighlighter::extract() - Code Metrics - wikimedia/mediawiki - Measure and Improve Code Quality continuously with Scrutinizer

SearchHighlighter::extract() B
last analyzed 2016-11-13 17:42 UTC

↳ Parent: SearchHighlighter

Complexity

Conditions	6
Paths	32

Size

Total Lines	23
Code Lines	15

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	6
eloc	15
nc	32
nop	5
dl	0
loc	23
rs	8.5906
c	0
b	0
f	0

<?php
/**
 * Basic search engine highlighting
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Search
 */

/**
 * Highlight bits of wikitext
 *
 * @ingroup Search
 */
class SearchHighlighter {
	/** @var bool When true, splitAndAdd() passes text through removeWiki() before storing it */
	protected $mCleanWikitext = true;

	/**
	 * @param bool $cleanupWikitext Whether extracted lines should have basic
	 *   wiki markup stripped (see removeWiki()) before they are stored
	 */
	public function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true
	 *
	 * The text is first split into plain-text lines and "other" extracts
	 * (templates, image links, tables and, if the Cite class exists, <ref>
	 * blocks). Snippets containing the terms are then selected, padded to a
	 * roughly constant size, joined, and the matched terms are wrapped in
	 * <span class='searchmatch'>.
	 *
	 * @param string $text
	 * @param array $terms Terms to highlight (not html escaped but
	 *   regex escaped via SearchDatabase::regexTerm())
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang, $wgSearchHighlightBoundaries;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = [
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

		// @todo FIXME: This should prolly be a hook or something
		// instead of hardcoding a class name from the Cite extension
		if ( class_exists( 'Cite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = []; // text extracts
		$otherExt = []; // other extracts
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					// with PREG_OFFSET_CAPTURE an unmatched group has offset -1
					if ( $key > 0 && $val[1] != - 1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, - 1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								break;
							}

						}
						$epat = $endPatterns[$key];
						// everything before the opening marker is plain text
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						// group 2 is only present when the closing marker matched
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback(
					'/./us',
					[ $this, 'caseCallback' ],
					$terms[$index]
				);
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// Guard against an empty term list, which would otherwise divide by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		$left = $contextlines;

		$snippets = [];
		$offsets = [];

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			// skip lines starting with ';' or ':'
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = [];
		// Always defined; only read inside the loop below, which does not run
		// when no snippet could be selected at all.
		$targetchars = 0;
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$targetchars = $contextchars * $contextlines;
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = [ $first => $snippets[$first] ];
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					// the 4th argument is an out-parameter: extract() writes the
					// actual start position back into $offsets[$index]
					$extended[$index] = $this->extract(
						$all[$index],
						$offsets[$index],
						$offsets[$index] + $targetchars,
						$offsets[$index]
					);
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
						&& array_key_exists( $add, $all )
						&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// $snippets = array_map( 'htmlspecialchars', $extended );
		$snippets = $extended;
		$last = - 1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == - 1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
			) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract ) {
			$extract .= '<b> ... </b>';
		}

		// wrap each distinct term once
		$processed = [];
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		return $extract;
	}

	/**
	 * Split text into lines and add it to extracts array
	 *
	 * Each line is trimmed; lines that are empty after trimming are skipped.
	 * A line consisting only of "0" is kept (it is not an empty line).
	 *
	 * @param array &$extracts Index -> $line
	 * @param int &$count Next index to use; incremented for every stored line
	 * @param string $text
	 */
	public function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			// compare against '' explicitly: "0" is falsy in PHP but is real content
			if ( $tt !== '' ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * Do manual case conversion for non-ascii chars
	 *
	 * Multi-byte characters are replaced by a character class holding their
	 * lower- and upper-case forms; single-byte characters are returned as is.
	 *
	 * @param array $matches
	 * @return string
	 */
	public function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		}

		return $matches[0];
	}

	/**
	 * Extract part of the text from start to end, but by
	 * not chopping up words
	 *
	 * The out-parameters are only written when the caller passes a
	 * non-null variable for them.
	 *
	 * @param string $text
	 * @param int $start
	 * @param int $end
	 * @param int|null &$posStart (out) actual start position
	 * @param int|null &$posEnd (out) actual end position
	 * @return string
	 */
	public function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			// +1 so the snippet starts after the boundary character
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( $posStart !== null ) {
			$posStart = $start;
		}
		if ( $posEnd !== null ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		}

		return '';
	}

	/**
	 * Find a nonletter near a point (index) in the text
	 *
	 * Looks for the first non-letter inside the window of $tolerance bytes
	 * on either side of $point. If there is none, $point is moved forward
	 * past any UTF-8 continuation bytes.
	 *
	 * @param string $text
	 * @param int $point
	 * @param int $offset Offset to found index
	 * @return int Nonletter index (plus $offset), or beginning of utf8 char
	 *   if none; never an index past the end of $text in the latter case
	 */
	public function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = [];

		if ( preg_match(
			'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
			substr( $text, $s, $l ),
			$m,
			PREG_OFFSET_CAPTURE
		) ) {
			return $m[0][1] + $s + $offset;
		}

		// $point beyond the text: there is no byte to inspect, clamp to the end
		$textLen = strlen( $text );
		if ( $point >= $textLen ) {
			return $textLen;
		}

		// check if point is on a valid first UTF8 char
		$char = ord( $text[$point] );
		while ( $char >= 0x80 && $char < 0xc0 ) {
			// skip trailing bytes
			$point++;
			if ( $point >= $textLen ) {
				return $textLen;
			}
			$char = ord( $text[$point] );
		}

		return $point;
	}

	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * For every extract that matches and is not already in $out, a snippet of
	 * about $contextchars bytes around the match is stored in $out and its
	 * start position in $offsets. Stops once $linesleft reaches zero.
	 *
	 * @param string $pattern Regexp for matching lines
	 * @param array $extracts Extracts to search
	 * @param int &$linesleft Number of extracts to make
	 * @param int &$contextchars Length of snippet
	 * @param array &$out Map for highlighted snippets
	 * @param array &$offsets Map of starting points of snippets
	 */
	public function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = [];
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				// match fits in the first $contextchars bytes: start at the beginning
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				// match is longer than the snippet: start at the match
				$begin = $offset;
			} else {
				// centre the match inside the snippet
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal
	 *
	 * Strips parameterless templates, keeps the part after the first pipe of
	 * other templates, unwraps links (see linkReplace() for piped links),
	 * removes HTML-like tags and bold/italic quote markup.
	 *
	 * @param string $text
	 * @return mixed
	 */
	public function removeWiki( $text ) {
		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback(
			"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
			[ $this, 'linkReplace' ],
			$text
		);
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		return $text;
	}

	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param array $matches
	 * @return string
	 */
	public function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		}

		return $matches[2];
	}

	/**
	 * Simple & fast snippet extraction, but gives completely irrelevant
	 * snippets
	 *
	 * Used when $wgAdvancedSearchHighlighting is false.
	 *
	 * @param string $text
	 * @param array $terms Escaped for regex by SearchDatabase::regexTerm()
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$extract = "";
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = [];
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

			if ( count( $m ) < 3 ) {
				$post = '';
			} else {
				$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
			}

			$found = $m[2];

			// NOTE(review): the terms are matched against the already-escaped
			// line below, so a term could match inside an HTML entity — confirm
			// whether that matters for callers.
			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

			$extract .= "{$line}\n";
		}

		return $extract;
	}

	/**
	 * Returns the first few lines of the text
	 *
	 * @param string $text
	 * @param int $contextlines Max number of returned lines
	 * @param int $contextchars Average number of characters per line
	 * @return string HTML-escaped text with newlines turned into <br>
	 */
	public function highlightNone( $text, $contextlines, $contextchars ) {
		$match = [];
		$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
		$text = str_replace( "\n\n", "\n", $text ); // remove empty lines
		preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

		// Trim and limit to max number of chars
		$text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) );
		return str_replace( "\n", '<br>', $text );
	}
}


1			<?php
2			/**
3			* Basic search engine highlighting
4			*
5			* This program is free software; you can redistribute it and/or modify
6			* it under the terms of the GNU General Public License as published by
7			* the Free Software Foundation; either version 2 of the License, or
8			* (at your option) any later version.
9			*
10			* This program is distributed in the hope that it will be useful,
11			* but WITHOUT ANY WARRANTY; without even the implied warranty of
12			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13			* GNU General Public License for more details.
14			*
15			* You should have received a copy of the GNU General Public License along
16			* with this program; if not, write to the Free Software Foundation, Inc.,
17			* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18			* http://www.gnu.org/copyleft/gpl.html
19			*
20			* @file
21			* @ingroup Search
22			*/
23
24			/**
25			* Highlight bits of wikitext
26			*
27			* @ingroup Search
28			*/
29			class SearchHighlighter {
30			protected $mCleanWikitext = true;
31
32			function __construct( $cleanupWikitext = true ) {
33			$this->mCleanWikitext = $cleanupWikitext;
34			}
35
36			/**
37			* Wikitext highlighting when $wgAdvancedSearchHighlighting = true
38			*
39			* @param string $text
40			* @param array $terms Terms to highlight (not html escaped but
41			* regex escaped via SearchDatabase::regexTerm())
42			* @param int $contextlines
43			* @param int $contextchars
44			* @return string
45			*/
46			public function highlightText( $text, $terms, $contextlines, $contextchars ) {
47			global $wgContLang, $wgSearchHighlightBoundaries;
48
49			if ( $text == '' ) {
50			return '';
51			}
52
53			// spli text into text + templates/links/tables
54			$spat = "/(\\{\\{)\|(\\[\\[[^\\]:]+:)\|(\n\\{\\\|)";
55			// first capture group is for detecting nested templates/links/tables/references
56			$endPatterns = [
57			1 => '/(\{\{)\|(\}\})/', // template
58			2 => '/(\[\[)\|(\]\])/', // image
59			3 => "/(\n\\{\\\|)\|(\n\\\|\\})/" ]; // table
60
61			// @todo FIXME: This should prolly be a hook or something
62			// instead of hardcoding a class name from the Cite extension
63			if ( class_exists( 'Cite' ) ) {
64			$spat .= '\|(<ref>)'; // references via cite extension
65			$endPatterns[4] = '/(<ref>)\|(<\/ref>)/';
66			}
67			$spat .= '/';
68			$textExt = []; // text extracts
69			$otherExt = []; // other extracts
70			$start = 0;
71			$textLen = strlen( $text );
72			$count = 0; // sequence number to maintain ordering
73			while ( $start < $textLen ) {
74			// find start of template/image/table
75			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
76			$epat = '';
77			foreach ( $matches as $key => $val ) {
78			if ( $key > 0 && $val[1] != - 1 ) {
79			if ( $key == 2 ) {
80			// see if this is an image link
81			$ns = substr( $val[0], 2, - 1 );
82			if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
83			break;
84			}
85
86			}
87			$epat = $endPatterns[$key];
88			$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
89			$start = $val[1];
90			break;
91			}
92			}
93			if ( $epat ) {
94			// find end (and detect any nested elements)
95			$level = 0;
96			$offset = $start + 1;
97			$found = false;
98			while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
99			if ( array_key_exists( 2, $endMatches ) ) {
100			// found end
101			if ( $level == 0 ) {
102			$len = strlen( $endMatches[2][0] );
103			$off = $endMatches[2][1];
104			$this->splitAndAdd( $otherExt, $count,
105			substr( $text, $start, $off + $len - $start ) );
106			$start = $off + $len;
107			$found = true;
108			break;
109			} else {
110			// end of nested element
111			$level -= 1;
112			}
113			} else {
114			// nested
115			$level += 1;
116			}
117			$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
118			}
119			if ( !$found ) {
120			// couldn't find appropriate closing tag, skip
121			$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
122			$start += strlen( $matches[0][0] );
123			}
124			continue;
125			}
126			}
127			// else: add as text extract
128			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
129			break;
130			}
131
132			$all = $textExt + $otherExt; // these have disjunct key sets
133
134			// prepare regexps
135			foreach ( $terms as $index => $term ) {
136			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
137			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
138			$terms[$index] = preg_replace_callback(
139			'/./us',
140			[ $this, 'caseCallback' ],
141			$terms[$index]
142			);
143			} else {
144			$terms[$index] = $term;
145			}
146			}
147			$anyterm = implode( '\|', $terms );
148			$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
149			// @todo FIXME: A hack to scale contextchars, a correct solution
150			// would be to have contextchars actually be char and not byte
151			// length, and do proper utf-8 substrings and lengths everywhere,
152			// but PHP is making that very hard and unclean to implement :(
153			$scale = strlen( $anyterm ) / mb_strlen( $anyterm );
154			$contextchars = intval( $contextchars * $scale );
155
156			$patPre = "(^\|$wgSearchHighlightBoundaries)";
157			$patPost = "($wgSearchHighlightBoundaries\|$)";
158
159			$pat1 = "/(" . $phrase . ")/ui";
160			$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
161
162			$left = $contextlines;
163
164			$snippets = [];
165			$offsets = [];
166
167			// show beginning only if it contains all words
168			$first = 0;
169			$firstText = '';
170			foreach ( $textExt as $index => $line ) {
171			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
172			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
173			$first = $index;
174			break;
175			}
176			}
177			if ( $firstText ) {
178			$succ = true;
179			// check if first text contains all terms
180			foreach ( $terms as $term ) {
181			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
182			$succ = false;
183			break;
184			}
185			}
186			if ( $succ ) {
187			$snippets[$first] = $firstText;
188			$offsets[$first] = 0;
189			}
190			}
191			if ( !$snippets ) {
192			// match whole query on text
193			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
194			// match whole query on templates/tables/images
195			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
196			// match any words on text
197			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
198			// match any words on templates/tables/images
199			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
200
201			ksort( $snippets );
202			}
203
204			// add extra chars to each snippet to make snippets constant size
205			$extended = [];
206			if ( count( $snippets ) == 0 ) {
207			// couldn't find the target words, just show beginning of article
208			if ( array_key_exists( $first, $all ) ) {
209			$targetchars = $contextchars * $contextlines;
210			$snippets[$first] = '';
211			$offsets[$first] = 0;
212			}
213			} else {
214			// if begin of the article contains the whole phrase, show only that !!
215			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
216			&& $offsets[$first] < $contextchars * 2 ) {
217			$snippets = [ $first => $snippets[$first] ];
218			}
219
220			// calc by how much to extend existing snippets
221			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
222			}
223
224			foreach ( $snippets as $index => $line ) {
225			$extended[$index] = $line;
226			$len = strlen( $line );
227			if ( $len < $targetchars - 20 ) {
228			// complete this line
229			if ( $len < strlen( $all[$index] ) ) {
230			$extended[$index] = $this->extract(
231			$all[$index],
232			$offsets[$index],
233			$offsets[$index] + $targetchars,
			0 ignored issues – show
			Bug introduced 2016-01-16 18:00 UTC by
			Report Bug · Copy Issue Report

			The variable `$targetchars` does not seem to be defined for all execution paths leading up to this point.

			If you define a variable conditionally, it can happen that it is not defined for all execution paths. Let’s take a look at an example:

			    function myFunction($a) {
			        switch ($a) {
			            case 'foo':
			                $x = 1;
			                break;

			            case 'bar':
			                $x = 2;
			                break;
			        }

			        // $x is potentially undefined here.
			        echo $x;
			    }

			In the above example, the variable `$x` is defined if you pass “foo” or “bar” as argument for `$a`. However, since the `switch` statement has no default case statement, if you pass any other value, the variable `$x` would be undefined.

			Available Fixes

			1. Check for existence of the variable explicitly:

			    function myFunction($a) {
			        switch ($a) {
			            case 'foo':
			                $x = 1;
			                break;

			            case 'bar':
			                $x = 2;
			                break;
			        }

			        if (isset($x)) { // Make sure it's always set.
			            echo $x;
			        }
			    }

			2. Define a default value for the variable:

			    function myFunction($a) {
			        $x = ''; // Set a default which gets overridden for certain paths.
			        switch ($a) {
			            case 'foo':
			                $x = 1;
			                break;

			            case 'bar':
			                $x = 2;
			                break;
			        }

			        echo $x;
			    }

			3. Add a value for the missing path:

			    function myFunction($a) {
			        switch ($a) {
			            case 'foo':
			                $x = 1;
			                break;

			            case 'bar':
			                $x = 2;
			                break;

			            // We add support for the missing case.
			            default:
			                $x = '';
			                break;
			        }

			        echo $x;
			    }

			Loading history...
234			$offsets[$index]
235			);
236			$len = strlen( $extended[$index] );
237			}
238
239			// add more lines
240			$add = $index + 1;
241			while ( $len < $targetchars - 20
242			&& array_key_exists( $add, $all )
243			&& !array_key_exists( $add, $snippets ) ) {
244			$offsets[$add] = 0;
245			$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
246			$extended[$add] = $tt;
247			$len += strlen( $tt );
248			$add++;
249			}
250			}
251			}
252
253			// $snippets = array_map( 'htmlspecialchars', $extended );
254			$snippets = $extended;
255			$last = - 1;
256			$extract = '';
257			foreach ( $snippets as $index => $line ) {
258			if ( $last == - 1 ) {
259			$extract .= $line; // first line
260			} elseif ( $last + 1 == $index
261			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
262			) {
263			$extract .= " " . $line; // continous lines
264			} else {
265			$extract .= '<b> ... </b>' . $line;
266			}
267
268			$last = $index;
269			}
270			if ( $extract ) {
271			$extract .= '<b> ... </b>';
272			}
273
274			$processed = [];
275			foreach ( $terms as $term ) {
276			if ( !isset( $processed[$term] ) ) {
277			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
278			$extract = preg_replace( $pat3,
279			"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
280			$processed[$term] = true;
281			}
282			}
283
284			return $extract;
285			}
286
287			/**
288			* Split text into lines and add it to extracts array
289			*
290			* @param array $extracts Index -> $line
291			* @param int $count
292			* @param string $text
293			*/
294			function splitAndAdd( &$extracts, &$count, $text ) {
295			$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
296			foreach ( $split as $line ) {
297			$tt = trim( $line );
298			if ( $tt ) {
299			$extracts[$count++] = $tt;
300			}
301			}
302			}
303
304			/**
305			* Do manual case conversion for non-ascii chars
306			*
307			* @param array $matches
308			* @return string
309			*/
310			function caseCallback( $matches ) {
311			global $wgContLang;
312			if ( strlen( $matches[0] ) > 1 ) {
313			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
314			} else {
315			return $matches[0];
316			}
317			}
318
319			/**
320			* Extract part of the text from start to end, but by
321			* not chopping up words
322			* @param string $text
323			* @param int $start
324			* @param int $end
325			* @param int $posStart (out) actual start position
326			* @param int $posEnd (out) actual end position
327			* @return string
328			*/
329			function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
330			if ( $start != 0 ) {
331			$start = $this->position( $text, $start, 1 );
332			}
333			if ( $end >= strlen( $text ) ) {
334			$end = strlen( $text );
335			} else {
336			$end = $this->position( $text, $end );
337			}
338
339			if ( !is_null( $posStart ) ) {
340			$posStart = $start;
341			}
342			if ( !is_null( $posEnd ) ) {
343			$posEnd = $end;
344			}
345
346			if ( $end > $start ) {
347			return substr( $text, $start, $end - $start );
348			} else {
349			return '';
350			}
351			}
352
353			/**
354			* Find a nonletter near a point (index) in the text
355			*
356			* @param string $text
357			* @param int $point
358			* @param int $offset Offset to found index
359			* @return int Nearest nonletter index, or beginning of utf8 char if none
360			*/
361			function position( $text, $point, $offset = 0 ) {
362			$tolerance = 10;
363			$s = max( 0, $point - $tolerance );
364			$l = min( strlen( $text ), $point + $tolerance ) - $s;
365			$m = [];
366
367			if ( preg_match(
368			'/[ ,.!?~!@#$%^&*\(\)+=\-\\\\|\[\]"\'<>]/',
369			substr( $text, $s, $l ),
370			$m,
371			PREG_OFFSET_CAPTURE
372			) ) {
373			return $m[0][1] + $s + $offset;
374			} else {
375			// check if point is on a valid first UTF8 char
376			$char = ord( $text[$point] );
377			while ( $char >= 0x80 && $char < 0xc0 ) {
378			// skip trailing bytes
379			$point++;
380			if ( $point >= strlen( $text ) ) {
381			return strlen( $text );
382			}
383			$char = ord( $text[$point] );
384			}
385
386			return $point;
387
388			}
389			}
390
391			/**
392			* Search extracts for a pattern, and return snippets
393			*
394			* @param string $pattern Regexp for matching lines
395			* @param array $extracts Extracts to search
396			* @param int $linesleft Number of extracts to make
397			* @param int $contextchars Length of snippet
398			* @param array $out Map for highlighted snippets
399			* @param array $offsets Map of starting points of snippets
400			* @protected
401			*/
402			function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
403			if ( $linesleft == 0 ) {
404			return; // nothing to do
405			}
406			foreach ( $extracts as $index => $line ) {
407			if ( array_key_exists( $index, $out ) ) {
408			continue; // this line already highlighted
409			}
410
411			$m = [];
412			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
413			continue;
414			}
415
416			$offset = $m[0][1];
417			$len = strlen( $m[0][0] );
418			if ( $offset + $len < $contextchars ) {
419			$begin = 0;
420			} elseif ( $len > $contextchars ) {
421			$begin = $offset;
422			} else {
423			$begin = $offset + intval( ( $len - $contextchars ) / 2 );
424			}
425
426			$end = $begin + $contextchars;
427
428			$posBegin = $begin;
429			// basic snippet from this line
430			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
431			$offsets[$index] = $posBegin;
432			$linesleft--;
433			if ( $linesleft == 0 ) {
434			return;
435			}
436			}
437			}
438
439			/**
440			* Basic wikitext removal
441			* @protected
442			* @param string $text
443			* @return mixed
444			*/
445			function removeWiki( $text ) {
446			$text = preg_replace( "/\\{\\{([^\|]+?)\\}\\}/", "", $text );
447			$text = preg_replace( "/\\{\\{([^\|]+\\\|)(.*?)\\}\\}/", "\\2", $text );
448			$text = preg_replace( "/\\[\\[([^\|]+?)\\]\\]/", "\\1", $text );
449			$text = preg_replace_callback(
450			"/\\[\\[([^\|]+\\\|)(.*?)\\]\\]/",
451			[ $this, 'linkReplace' ],
452			$text
453			);
454			$text = preg_replace( "/<\/?[^>]+>/", "", $text );
455			$text = preg_replace( "/'''''/", "", $text );
456			$text = preg_replace( "/('''\|<\/?[iIuUbB]>)/", "", $text );
457			$text = preg_replace( "/''/", "", $text );
458
459			return $text;
460			}
461
462			/**
463			* callback to replace [[target\|caption]] kind of links, if
464			* the target is category or image, leave it
465			*
466			* @param array $matches
467			* @return string
468			*/
469			function linkReplace( $matches ) {
470			$colon = strpos( $matches[1], ':' );
471			if ( $colon === false ) {
472			return $matches[2]; // replace with caption
473			}
474			global $wgContLang;
475			$ns = substr( $matches[1], 0, $colon );
476			$index = $wgContLang->getNsIndex( $ns );
477			if ( $index !== false && ( $index == NS_FILE \|\| $index == NS_CATEGORY ) ) {
478			return $matches[0]; // return the whole thing
479			} else {
480			return $matches[2];
481			}
482			}
483
484			/**
485			* Simple & fast snippet extraction, but gives completely unrelevant
486			* snippets
487			*
488			* Used when $wgAdvancedSearchHighlighting is false.
489			*
490			* @param string $text
491			* @param array $terms Escaped for regex by SearchDatabase::regexTerm()
492			* @param int $contextlines
493			* @param int $contextchars
494			* @return string
495			*/
496			public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
497			global $wgContLang;
498
499			$lines = explode( "\n", $text );
500
501			$terms = implode( '\|', $terms );
502			$max = intval( $contextchars ) + 1;
503			$pat1 = "/(.*)($terms)(.{0,$max})/i";
504
505			$lineno = 0;
506
507			$extract = "";
508			foreach ( $lines as $line ) {
509			if ( 0 == $contextlines ) {
510			break;
511			}
512			++$lineno;
513			$m = [];
514			if ( !preg_match( $pat1, $line, $m ) ) {
515			continue;
516			}
517			--$contextlines;
518			// truncate function changes ... to relevant i18n message.
519			$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );
520
521			if ( count( $m ) < 3 ) {
522			$post = '';
523			} else {
524			$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
525			}
526
527			$found = $m[2];
528
529			$line = htmlspecialchars( $pre . $found . $post );
530			$pat2 = '/(' . $terms . ")/i";
531			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );
532
533			$extract .= "${line}\n";
534			}
535
536			return $extract;
537			}
538
539			/**
540			* Returns the first few lines of the text
541			*
542			* @param string $text
543			* @param int $contextlines Max number of returned lines
544			* @param int $contextchars Average number of characters per line
545			* @return string
546			*/
547			public function highlightNone( $text, $contextlines, $contextchars ) {
548			$match = [];
549			$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
550			$text = str_replace( "\n\n", "\n", $text ); // remove empty lines
551			preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );
552
553			// Trim and limit to max number of chars
554			$text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) );
555			return str_replace( "\n", '<br>', $text );
556			}
557			}
558

wikimedia / mediawiki

SearchHighlighter::extract() B last analyzed 2016-11-13 17:42 UTC

Complexity

Size

Duplication

Importance

Available Fixes

Duplication Side-by-Side

Filter issues like

SearchHighlighter::extract() B
last analyzed 2016-11-13 17:42 UTC