Completed
Branch master (dc3656)
by
unknown
30:14
created

WikiTextStructure::getAuxiliaryText()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nc 1
nop 0
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
<?php
2
3
use HtmlFormatter\HtmlFormatter;
0 ignored issues
show
Bug introduced by
This use statement conflicts with another class in this namespace, HtmlFormatter.

Let’s assume that you have a directory layout like this:

.
|-- OtherDir
|   |-- Bar.php
|   `-- Foo.php
`-- SomeDir
    `-- Foo.php

and let’s assume the following content of Bar.php:

// Bar.php
namespace OtherDir;

use SomeDir\Foo; // This now conflicts with the class OtherDir\Foo

If both files OtherDir/Foo.php and SomeDir/Foo.php are loaded in the same runtime, you will see a PHP error such as the following:

PHP Fatal error:  Cannot use SomeDir\Foo as Foo because the name is already in use in OtherDir/Foo.php

However, as OtherDir/Foo.php does not necessarily have to be loaded and the error is only triggered if it is loaded before OtherDir/Bar.php, this problem might go unnoticed for a while. In order to prevent this error from surfacing, you must import the namespace with a different alias:

// Bar.php
namespace OtherDir;

use SomeDir\Foo as SomeDirFoo; // There is no conflict anymore.
Loading history...
4
use MediaWiki\Logger\LoggerFactory;
5
6
/**
 * Class allowing to explore structure of parsed wikitext.
 *
 * Wraps a ParserOutput and lazily splits its rendered HTML into the parts
 * used for search: opening text (everything before the first heading), main
 * text and auxiliary text. Headings and the default sort key are read
 * directly from the ParserOutput.
 */
class WikiTextStructure {
	/**
	 * @var string|null Text before the first heading. Null until the text has
	 *  been extracted, and also when there is no heading or no text before it.
	 */
	private $openingText;
	/**
	 * @var string|null Main text with all tags stripped. Null until the text
	 *  has been extracted.
	 */
	private $allText;
	/**
	 * @var string[] Text of each auxiliary element, tags stripped
	 */
	private $auxText = [];
	/**
	 * @var ParserOutput
	 */
	private $parserOutput;

	/**
	 * @var string[] selectors to elements that are excluded entirely from search
	 */
	private $excludedElementSelectors = [
		'audio', 'video',       // "it looks like you don't have javascript enabled..."
		                        // do not need to index
		'sup.reference',        // The [1] for references
		'.mw-cite-backlink',    // The ↑ next to references in the references section
		'h1', 'h2', 'h3',       // Headings are already indexed in their own field.
		'h5', 'h6', 'h4',
		'.autocollapse',        // Collapsed fields are hidden by default so we don't want them
		                        // showing up.
	];

	/**
	 * @var string[] selectors to elements that are considered auxiliary to article text for search
	 */
	private $auxiliaryElementSelectors = [
		'.thumbcaption',        // Thumbnail captions aren't really part of the text proper
		'table',                // Neither are tables
		'.rellink',             // Common style for "See also:".
		'.dablink',             // Common style for calling out helpful links at the top
		                        // of the article.
		'.searchaux',           // New class users can use to mark stuff as auxiliary to searches.
	];

	/**
	 * WikiTextStructure constructor.
	 * @param ParserOutput $parserOutput Parsed page to explore
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Get headings on the page.
	 *
	 * First strip out things that look like references.  We can't use HTML filtering because
	 * the references come back as <sup> tags without a class.  To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag.  We also don't want to strip the <sup> tag and remove
	 * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
	 * or something.  Whatever.  So we only strip things that look like <sup> tags wrapping a
	 * reference.  And since the data looks like:
	 *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Heading texts with tags stripped, minus the ignored headings
	 */
	public function headings() {
		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $this->parserOutput->getSections() as $section ) {
			$heading = $section['line'];

			// Some wikis wrap the brackets in a span:
			// http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work.
			$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore.  The comparison is strict so that numeric-looking
			// headings (e.g. "1e3" vs "1000") are only ignored on an exact match.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Each line is one entry; everything from a '#' to the end of the line is a
	 * comment, and surrounding whitespace is ignored. The keys of the returned
	 * array are the original zero-based line indexes, so they may have gaps.
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
		$lines = array_map( 'trim', $lines );          // Remove extra spaces
		// Remove empty lines.  Compare against '' explicitly: array_filter() without
		// a callback would also discard a legitimate entry reading "0".
		$lines = array_filter( $lines, function ( $line ) {
			return $line !== '';
		} );
		return $lines;
	}

	/**
	 * Get list of heading to ignore.
	 *
	 * The list is read from the 'search-ignored-headings' message, falling back
	 * to 'cirrussearch-ignored-headings' when the former is blank. The result
	 * is kept in a function-level static, so the messages are only parsed once.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings() {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			$ignoredHeadings = [];
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			if ( $source->isBlank() ) {
				// Try old version too, just in case
				$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
			}
			if ( !$source->isDisabled() ) {
				// Once comments and blank lines are gone we just have headings!
				$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
			}
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does nothing if the main text has already been extracted. Note that this
	 * turns off edit section tokens and the TOC on the wrapped ParserOutput
	 * before fetching its text.
	 */
	private function extractWikitextParts() {
		if ( $this->allText !== null ) {
			return;
		}
		$this->parserOutput->setEditSectionTokens( false );
		$this->parserOutput->setTOCEnabled( false );
		$text = $this->parserOutput->getText();
		if ( strlen( $text ) === 0 ) {
			$this->allText = "";
			// empty text - nothing to seek here
			return;
		}

		$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

		// Add extra spacing around break tags so text crammed together like<br>this
		// doesn't make one word.
		$text = str_replace( '<br', "\n<br", $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text.  These will still be
		// searched but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( $this->auxiliaryElementSelectors );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * Only plain <h1>..<h6> opening tags (without attributes) are recognised
	 * as headings. Excluded and auxiliary elements are removed from the result.
	 *
	 * @param string $text Rendered HTML of the page
	 * @return string|null Null when there is no heading, or no text is left
	 *  before the first heading
	 */
	private function extractHeadingBeforeFirstHeading( $text ) {
		$matches = [];
		if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		$text = substr( $text, 0, $matches[0][1] );
		// Compare explicitly instead of using !$text, which would also reject "0".
		if ( $text === '' || $text === false ) {
			// There isn't any text before the first heading so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->remove( $this->auxiliaryElementSelectors );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get opening text
	 * @return string|null Null when the page has no opening text
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get main text
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get auxiliary text
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}

	/**
	 * Get the defaultsort property
	 * @return string|null
	 */
	public function getDefaultSort() {
		return $this->parserOutput->getProperty( 'defaultsort' );
	}
}
245