SearchHighlighter   D
last analyzed

Complexity

Total Complexity 82

Size/Duplication

Total Lines 529
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 0

Importance

Changes 0
Metric Value
dl 0
loc 529
rs 4.8717
c 0
b 0
f 0
wmc 82
lcom 1
cbo 0

11 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 3 1
A splitAndAdd() 0 9 4
A caseCallback() 0 8 2
B extract() 0 23 6
B position() 0 29 5
C process() 0 36 8
A removeWiki() 0 16 1
B linkReplace() 0 14 5
B highlightSimple() 0 42 5
A highlightNone() 0 10 1
F highlightText() 0 240 44

How to fix   Complexity   

Complex Class

Complex classes like SearchHighlighter often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use SearchHighlighter, and based on these observations, apply Extract Interface, too.

1
<?php
2
/**
3
 * Basic search engine highlighting
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License along
16
 * with this program; if not, write to the Free Software Foundation, Inc.,
17
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18
 * http://www.gnu.org/copyleft/gpl.html
19
 *
20
 * @file
21
 * @ingroup Search
22
 */
23
24
/**
25
 * Highlight bits of wikitext
26
 *
27
 * @ingroup Search
28
 */
29
class SearchHighlighter {
30
	protected $mCleanWikitext = true;
31
32
	/**
	 * Create a highlighter.
	 *
	 * @param bool $cleanupWikitext Stored in $mCleanWikitext; when true,
	 *   splitAndAdd() runs removeWiki() on text before splitting it into lines
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}
35
36
	/**
	 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true
	 *
	 * The text is split into plain-text lines and "other" extracts
	 * (templates, image links, tables and, if the Cite class exists,
	 * references). Lines matching the terms are selected, padded to a
	 * roughly constant size, joined, and every matched term is wrapped in
	 * <span class='searchmatch'>.
	 *
	 * NOTE(review): the returned extract is not passed through
	 * htmlspecialchars() here; callers presumably handle escaping - confirm.
	 *
	 * @param string $text
	 * @param array $terms Terms to highlight (not html escaped but
	 *   regex escaped via SearchDatabase::regexTerm())
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgSearchHighlightBoundaries;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		list( $textExt, $otherExt ) = $this->splitIntoExtracts( $text );
		$all = $textExt + $otherExt; // these have disjunct key sets

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback(
					'/./us',
					[ $this, 'caseCallback' ],
					$term
				);
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// An empty term list gives an empty $anyterm; avoid dividing by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		$left = $contextlines;

		$snippets = [];
		$offsets = [];

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = [];
		// Defined up front so it exists on every path; refined below when
		// there are several snippets to share the budget.
		$targetchars = $contextchars * $contextlines;
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = [ $first => $snippets[$first] ];
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract(
						$all[$index],
						$offsets[$index],
						$offsets[$index] + $targetchars,
						$offsets[$index]
					);
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
						&& array_key_exists( $add, $all )
						&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// join the snippets, marking gaps between non-adjacent ones
		$snippets = $extended;
		$last = -1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == -1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
			) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract ) {
			$extract .= '<b> ... </b>';
		}

		$processed = [];
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		return $extract;
	}

	/**
	 * Split wikitext into plain-text lines and template/image/table/reference extracts
	 *
	 * Both returned maps are keyed by one shared, increasing sequence
	 * number, so their key sets never overlap and document order is kept.
	 *
	 * @param string $text
	 * @return array[] [ text extracts, other extracts ]
	 */
	private function splitIntoExtracts( $text ) {
		global $wgContLang;

		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = [
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

		// @todo FIXME: This should prolly be a hook or something
		// instead of hardcoding a class name from the Cite extension
		if ( class_exists( 'Cite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = []; // text extracts
		$otherExt = []; // other extracts
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					if ( $key > 0 && $val[1] != -1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, -1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								break;
							}

						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						// group 2 (the closer) is only present when it matched
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		return [ $textExt, $otherExt ];
	}
286
287
	/**
	 * Split text into lines and add it to extracts array
	 *
	 * The text is first passed through removeWiki() when $mCleanWikitext
	 * is set. Each non-empty trimmed line is stored under the next
	 * sequence number.
	 *
	 * @param array &$extracts Index -> $line; appended to in place
	 * @param int &$count Next sequence number; incremented for every line added
	 * @param string $text
	 */
	function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			// Compare with '' explicitly: a plain truthiness test would also
			// discard a line that consists solely of "0".
			if ( $tt !== '' ) {
				$extracts[$count++] = $tt;
			}
		}
	}
303
304
	/**
	 * Do manual case conversion for non-ascii chars
	 *
	 * Single-byte matches are returned untouched; a multi-byte match is
	 * turned into a regex character class holding its lower- and
	 * upper-case forms as produced by $wgContLang.
	 *
	 * @param array $matches Match array from preg_replace_callback(); only [0] is used
	 * @return string
	 */
	function caseCallback( $matches ) {
		global $wgContLang;

		$char = $matches[0];
		if ( strlen( $char ) <= 1 ) {
			return $char;
		}

		return '[' . $wgContLang->lc( $char ) . $wgContLang->uc( $char ) . ']';
	}
318
319
	/**
	 * Extract part of the text from start to end, without
	 * chopping up words
	 *
	 * Both ends are adjusted with position(), except that a start of 0 and
	 * an end at or past the end of the text are used as they are.
	 *
	 * @param string $text
	 * @param int $start
	 * @param int $end
	 * @param int &$posStart (out) actual start position; only written when
	 *   the caller passes a non-null variable
	 * @param int &$posEnd (out) actual end position; only written when
	 *   the caller passes a non-null variable
	 * @return string The extracted part, or '' when the adjusted end is not past the start
	 */
	function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		$textLen = strlen( $text );

		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		$end = ( $end >= $textLen ) ? $textLen : $this->position( $text, $end );

		if ( $posStart !== null ) {
			$posStart = $start;
		}
		if ( $posEnd !== null ) {
			$posEnd = $end;
		}

		return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
	}
352
353
	/**
	 * Find a nonletter near a point (index) in the text
	 *
	 * Looks at the bytes from 10 before to 10 after $point and returns the
	 * index of the first separator character in that window (not
	 * necessarily the one closest to $point). If the window holds none,
	 * $point is moved forward past any UTF-8 continuation bytes so that it
	 * never lands in the middle of a character.
	 *
	 * @param string $text
	 * @param int $point
	 * @param int $offset Offset added to a found separator index
	 * @return int Index of the separator plus $offset, or the beginning of
	 *   a utf8 char at/after $point if none; never past the end of the
	 *   text in the latter case
	 */
	function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$textLen = strlen( $text );
		$windowStart = max( 0, $point - $tolerance );
		$windowLen = min( $textLen, $point + $tolerance ) - $windowStart;
		$m = [];

		// An empty or negative window cannot contain a separator
		if ( $windowLen > 0 && preg_match(
			'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
			substr( $text, $windowStart, $windowLen ),
			$m,
			PREG_OFFSET_CAPTURE
		) ) {
			return $m[0][1] + $windowStart + $offset;
		}

		// Reading $text[$point] at or beyond the end is an invalid offset;
		// clamp to the end of the text, as the loop below already does.
		if ( $point >= $textLen ) {
			return $textLen;
		}

		// check if point is on a valid first UTF8 char
		$char = ord( $text[$point] );
		while ( $char >= 0x80 && $char < 0xc0 ) {
			// skip trailing bytes
			$point++;
			if ( $point >= $textLen ) {
				return $textLen;
			}
			$char = ord( $text[$point] );
		}

		return $point;
	}
390
391
	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * For every extract that matches and has no snippet yet, a window of
	 * $contextchars is chosen around the match, cut out with extract() and
	 * stored. Stops as soon as $linesleft reaches zero.
	 *
	 * @param string $pattern Regexp for matching lines
	 * @param array $extracts Extracts to search
	 * @param int &$linesleft Number of extracts to make; decremented per snippet
	 * @param int &$contextchars Length of snippet
	 * @param array &$out Map for highlighted snippets
	 * @param array &$offsets Map of starting points of snippets
	 * @protected
	 */
	function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}

		foreach ( $extracts as $index => $line ) {
			$match = [];
			// skip lines that are already highlighted or do not match
			if ( array_key_exists( $index, $out )
				|| !preg_match( $pattern, $line, $match, PREG_OFFSET_CAPTURE )
			) {
				continue;
			}

			$matchPos = $match[0][1];
			$matchLen = strlen( $match[0][0] );
			if ( $matchPos + $matchLen < $contextchars ) {
				// match fits inside a window starting at the line start
				$begin = 0;
			} elseif ( $matchLen > $contextchars ) {
				// match is longer than the window, start at the match
				$begin = $matchPos;
			} else {
				// centre the window on the match
				$begin = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
			}

			// basic snippet from this line; extract() reports the real start
			$snippetStart = $begin;
			$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $snippetStart );
			$offsets[$index] = $snippetStart;

			if ( --$linesleft == 0 ) {
				return;
			}
		}
	}
438
439
	/**
	 * Basic wikitext removal
	 *
	 * Applies a fixed sequence of regex substitutions: parameterless
	 * templates are dropped, templates with parameters keep the part after
	 * the first pipe, plain links keep their target, piped links go
	 * through linkReplace(), then tags and quote-style markup are removed.
	 *
	 * @protected
	 * @param string $text
	 * @return mixed
	 */
	function removeWiki( $text ) {
		// pattern => replacement, applied strictly in this order; an array
		// replacement is a callback
		$steps = [
			"/\\{\\{([^|]+?)\\}\\}/" => "",
			"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
			"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
			"/\\[\\[([^|]+\\|)(.*?)\\]\\]/" => [ $this, 'linkReplace' ],
			"/<\/?[^>]+>/" => "",
			"/'''''/" => "",
			"/('''|<\/?[iIuUbB]>)/" => "",
			"/''/" => "",
		];

		foreach ( $steps as $pattern => $replacement ) {
			if ( is_array( $replacement ) ) {
				$text = preg_replace_callback( $pattern, $replacement, $text );
			} else {
				$text = preg_replace( $pattern, $replacement, $text );
			}
		}

		return $text;
	}
461
462
	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param array $matches [0] whole link, [1] target including the
	 *   trailing pipe, [2] caption
	 * @return string
	 */
	function linkReplace( $matches ) {
		// everything before the first colon, or false when there is none
		$ns = strstr( $matches[1], ':', true );
		if ( $ns === false ) {
			return $matches[2]; // replace with caption
		}

		global $wgContLang;
		$index = $wgContLang->getNsIndex( $ns );
		$keepWhole = $index !== false && in_array( $index, [ NS_FILE, NS_CATEGORY ] );

		// file and category links are returned whole, others as caption
		return $keepWhole ? $matches[0] : $matches[2];
	}
484
	/**
	 * Simple & fast snippet extraction, but gives completely irrelevant
	 * snippets
	 *
	 * Used when $wgAdvancedSearchHighlighting is false.
	 *
	 * Each line of the text that contains one of the terms yields one
	 * output line: the match with up to $contextchars of context on either
	 * side, HTML-escaped, with the terms wrapped in
	 * <span class='searchmatch'>. At most $contextlines lines are produced.
	 *
	 * @param string $text
	 * @param array $terms Escaped for regex by SearchDatabase::regexTerm()
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string Newline-terminated snippet lines, '' if nothing matched
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";
		// does not depend on the line, so build it once
		$pat2 = '/(' . $terms . ")/i";

		$extract = "";
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = [];
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $wgContLang->truncate( $m[1], -$contextchars, '...', false );

			// guard the trailing-context group itself rather than a match count
			if ( isset( $m[3] ) ) {
				$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
			} else {
				$post = '';
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

			// "{$var}" form: the "${var}" form is deprecated as of PHP 8.2
			$extract .= "{$line}\n";
		}

		return $extract;
	}
538
539
	/**
	 * Returns the first few lines of the text
	 *
	 * Empty lines are removed, the first $contextlines lines are kept, the
	 * result is limited to $contextlines * $contextchars bytes without
	 * splitting a multi-byte character, HTML-escaped, and line breaks are
	 * turned into <br>.
	 *
	 * @param string $text
	 * @param int $contextlines Max number of returned lines
	 * @param int $contextchars Average number of characters per line
	 * @return string
	 */
	public function highlightNone( $text, $contextlines, $contextchars ) {
		// the value is interpolated into a regex quantifier, so force a
		// non-negative integer
		$maxLines = max( 0, (int)$contextlines );
		$maxBytes = max( 0, $maxLines * (int)$contextchars );

		$match = [];
		$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
		// remove empty lines; collapsing whole runs also handles three or
		// more consecutive newlines, which a single "\n\n" replacement misses
		$text = preg_replace( "/\n{2,}/", "\n", $text );
		preg_match( "/^(.*\n){0,$maxLines}/", $text, $match );
		$head = isset( $match[0] ) ? trim( $match[0] ) : '';

		// Trim and limit to max number of chars. mb_strcut() cuts by bytes
		// but only on character boundaries, so htmlspecialchars() is never
		// handed a broken UTF-8 sequence.
		$text = htmlspecialchars( mb_strcut( $head, 0, $maxBytes, 'UTF-8' ) );
		return str_replace( "\n", '<br>', $text );
	}
557
}
558