Test Setup Failed
Push — master ( 1f58dd...e46154 )
by
unknown
31:29
created

src/LingoParser.php (1 issue)

1
<?php
2
3
/**
4
 * File holding the Lingo\LingoParser class.
5
 *
6
 * This file is part of the MediaWiki extension Lingo.
7
 *
8
 * @copyright 2011 - 2018, Stephan Gambke
9
 * @license GPL-2.0-or-later
10
 *
11
 * The Lingo extension is free software: you can redistribute it and/or modify
12
 * it under the terms of the GNU General Public License as published by the Free
13
 * Software Foundation; either version 2 of the License, or (at your option) any
14
 * later version.
15
 *
16
 * The Lingo extension is distributed in the hope that it will be useful, but
17
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
19
 * details.
20
 *
21
 * You should have received a copy of the GNU General Public License along
22
 * with this program. If not, see <http://www.gnu.org/licenses/>.
23
 *
24
 * @author Stephan Gambke
25
 *
26
 * @file
27
 * @ingroup Lingo
28
 */
29
namespace Lingo;
30
31
use DOMDocument;
32
use DOMXPath;
33
use ObjectCache;
34
use Parser;
35
use StubObject;
36
use Title;
37
use Wikimedia\AtEase\AtEase;
38
39
/**
40
 * This class parses the given text and enriches it with definitions for defined
41
 * terms.
42
 *
43
 * Contains a static function to initiate the parsing.
44
 *
45
 * @ingroup Lingo
46
 */
47
class LingoParser {
48
49
	/** Index of the matched text within a word descriptor (a PREG_OFFSET_CAPTURE match entry). */
	const WORD_VALUE = 0;

	/** Index of the byte offset within a word descriptor (a PREG_OFFSET_CAPTURE match entry). */
	const WORD_OFFSET = 1;

	/**
	 * @var Tree|null The glossary tree; remains null until getLingoTree() fills it
	 */
	private $mLingoTree;

	/**
	 * @var Backend|null The backend in use; assigned by setBackend()
	 */
	private $mLingoBackend;

	/**
	 * @var LingoParser|null The shared instance handed out by getInstance()
	 */
	private static $parserSingleton;

	/**
	 * @var array|null Api params passed in from ApiMakeParserOptions Hook
	 */
	private $mApiParams;

	/**
	 * @var string|null The RegEx to split a chunk of text into words; assembled in the constructor
	 */
	public $regex;
65 8
66
	/**
	 * Lingo\LingoParser constructor.
	 *
	 * Assembles the regular expression that is later used to split a chunk of
	 * text into "words".
	 *
	 * @param MessageLog|null &$messages Not used by the constructor body (static
	 *  analysis reports it as an unused parameter); it is left in place so that
	 *  the signature stays unchanged for existing callers.
	 */
	public function __construct( MessageLog &$messages = null ) {
		$markerPrefix = preg_quote( Parser::MARKER_PREFIX, '/' );
		$markerSuffix = preg_quote( Parser::MARKER_SUFFIX, '/' );

		// A "word" is one of the following alternatives, tried in this order:
		$wordPatterns = [
			// a placeholder for a stripped item
			$markerPrefix . '.*?' . $markerSuffix,
			// a sequence of letters and numbers
			'[\p{L}\p{N}]+',
			// a single character that is neither letter nor number
			'[^\p{L}\p{N}]',
		];

		$this->regex = '/' . implode( '|', $wordPatterns ) . '/u';
	}
75
76 7
	/**
	 * Entry point: annotates the output of the given parser with glossary terms,
	 * provided shouldParse() allows it for this parser.
	 *
	 * @param Parser $mwParser
	 *
	 * @return bool always true
	 */
	public function parse( $mwParser ) {
		// Nothing to do for parsers/pages that are excluded from the glossary
		if ( !$this->shouldParse( $mwParser ) ) {
			return true;
		}

		$this->realParse( $mwParser );

		return true;
	}
88
89
	/**
	 * Returns the shared LingoParser instance, creating it on first use.
	 *
	 * @return LingoParser
	 * @since 2.0.1
	 */
	public static function getInstance() {
		// The static property holds either null or the instance created here
		if ( self::$parserSingleton === null ) {
			self::$parserSingleton = new self();
		}

		return self::$parserSingleton;
	}
101
102
	/**
	 * Builds the cache key under which the glossary tree is stored.
	 *
	 * The key is composed of fixed name parts, the tree version and the class
	 * name of the current backend.
	 *
	 * @return string
	 * @throws \MWException if no backend is available (raised by getBackend())
	 */
	private function getCacheKey() {
		// FIXME: If Lingo ever stores the glossary tree per user, then the cache key also needs to include the user id (see T163608)
		$keyMaker = ObjectCache::getLocalClusterInstance();
		$backendClass = get_class( $this->getBackend() );

		return $keyMaker->makeKey( 'ext', 'lingo', 'lingotree', Tree::TREE_VERSION, $backendClass );
	}
109
110
	/**
	 * Returns the backend that was registered via setBackend().
	 *
	 * @return Backend the backend used by the parser
	 * @throws \MWException if no backend has been set
	 */
	public function getBackend() {
		if ( $this->mLingoBackend !== null ) {
			return $this->mLingoBackend;
		}

		throw new \MWException( 'No Lingo backend available!' );
	}
121
122
	/**
	 * Returns the list of terms in the glossary
	 *
	 * Convenience wrapper: obtains the glossary tree and asks it for its term list.
	 *
	 * @return array an array mapping terms (keys) to descriptions (values)
	 */
	public function getLingoArray() {
		$tree = $this->getLingoTree();

		return $tree->getTermList();
	}
130
131
	/**
	 * Returns the list of terms in the glossary as a Lingo\Tree
	 *
	 * The tree is kept on the instance, so it is fetched or built at most once
	 * per LingoParser instance. If the backend allows caching, the tree is looked
	 * up in the configured cache and written (back) to it with a lifetime of one
	 * month; otherwise it is built directly from the backend.
	 *
	 * @return Tree a Lingo\Tree mapping terms (keys) to descriptions (values)
	 * @throws \MWException if no backend is available (raised by getBackend())
	 */
	public function getLingoTree() {
		// build glossary array only once per request
		if ( $this->mLingoTree ) {
			return $this->mLingoTree;
		}

		if ( !$this->getBackend()->useCache() ) {
			wfDebug( "Caching of lingo tree disabled.\n" );
			$this->mLingoTree = $this->buildLingo();

			return $this->mLingoTree;
		}

		// Caching is enabled: try the cache first
		global $wgexLingoCacheType;

		if ( $wgexLingoCacheType === null ) {
			$cache = wfGetMainCache();
		} else {
			$cache = wfGetCache( $wgexLingoCacheType );
		}

		$cacheKey = $this->getCacheKey();
		$treeFromCache = $cache->get( $cacheKey );

		if ( $treeFromCache === false || $treeFromCache === null ) {
			// cache miss: build the tree from the backend
			wfDebug( "Cache miss: Lingo tree not found in cache.\n" );
			$this->mLingoTree = $this->buildLingo();
			wfDebug( "Cached lingo tree.\n" );
		} else {
			// cache hit: reuse the stored tree
			wfDebug( "Cache hit: Got lingo tree from cache.\n" );
			$this->mLingoTree = $treeFromCache;
			wfDebug( "Re-cached lingo tree.\n" );
		}

		// Keep for one month
		// Limiting the cache validity will allow to purge stale cache
		// entries inserted by older versions after one month
		$cache->set( $cacheKey, $this->mLingoTree, 60 * 60 * 24 * 30 );

		return $this->mLingoTree;
	}
177
178
	/**
	 * Builds a fresh glossary tree from the entries delivered by the backend.
	 *
	 * Entries are pulled from the backend one at a time until it returns a
	 * falsy value; each entry is added to the tree under its term.
	 *
	 * @return Tree
	 */
	protected function &buildLingo() {
		$tree = new Tree();

		// assemble the result tree; the backend property is read on every
		// iteration, it is not copied into a local variable
		for (
			$entry = $this->mLingoBackend->next();
			$entry;
			$entry = $this->mLingoBackend->next()
		) {
			$tree->addTerm( $entry[ Element::ELEMENT_TERM ], $entry );
		}

		return $tree;
	}
192
193
	/**
	 * Parses the given text and enriches applicable terms
	 *
	 * Takes the HTML text of the parser output, loads it into a DOM document,
	 * searches every eligible text node for glossary terms and replaces the
	 * node by a sequence of plain text nodes and formatted term nodes. If at
	 * least one term was found, the modules are loaded and the rewritten HTML
	 * plus the parsed definitions are stored as the new parser output text.
	 *
	 * @param Parser &$parser
	 *
	 * @return bool always true
	 */
	protected function realParse( &$parser ) {
		// Parse text identical to options used in includes/api/ApiParse.php
		$apiParams = $this->mApiParams;

		if ( $apiParams === null ) {
			$text = $parser->getOutput()->getText();
		} else {
			$text = $parser->getOutput()->getText( [
				'allowTOC' => !$apiParams['disabletoc'],
				'enableSectionEditLinks' => !$apiParams['disableeditsection'],
				'wrapperDivClass' => $apiParams['wrapoutputclass'],
				'deduplicateStyles' => !$apiParams['disablestylededuplication'],
			] );
		}

		// No text, nothing to annotate
		if ( $text === null || $text === '' ) {
			return true;
		}

		// Get the glossary terms
		$glossary = $this->getLingoTree();

		if ( $glossary == null ) {
			return true;
		}

		// Parse HTML from page, with warnings suppressed while loading.

		// TODO: Remove the calls to \MediaWiki\suppressWarnings() and
		// \MediaWiki\restoreWarnings() for MW 1.34+.
		// \Wikimedia\AtEase\AtEase::suppressWarnings() is available from MW 1.34.
		$hasAtEase = method_exists( AtEase::class, 'suppressWarnings' );

		if ( $hasAtEase ) {
			AtEase::suppressWarnings();
		} else {
			\MediaWiki\suppressWarnings();
		}

		$doc = new DOMDocument( '1.0', 'utf-8' );
		$doc->loadHTML( '<html><head><meta http-equiv="content-type" content="charset=utf-8"/></head><body>' . $text . '</body></html>' );

		if ( $hasAtEase ) {
			AtEase::restoreWarnings();
		} else {
			\MediaWiki\restoreWarnings();
		}

		// Find all text in HTML, skipping links and anything inside an element
		// whose class attribute is exactly 'noglossary'.
		$xpath = new DOMXPath( $doc );
		$textNodes = $xpath->query(
			"//*[not(ancestor-or-self::*[@class='noglossary'] or ancestor-or-self::a)][text()!=' ']/text()"
		);

		$textNodeCount = $textNodes->length;

		// Formatted definitions of all terms found, keyed by definition id
		$definitions = [];

		// Iterate all HTML text matches
		for ( $nodeIndex = 0; $nodeIndex < $textNodeCount; $nodeIndex++ ) {
			$textNode = $textNodes->item( $nodeIndex );

			// Text shorter (in bytes) than the shortest term can not contain a term
			if ( strlen( $textNode->nodeValue ) < $glossary->getMinTermLength() ) {
				continue;
			}

			$matches = [];
			preg_match_all(
				$this->regex,
				$textNode->nodeValue,
				$matches,
				PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER
			);

			// No words in this text node
			if ( empty( $matches[ 0 ] ) ) {
				continue;
			}

			$words = $matches[ 0 ]; // See __construct() for definition of "word"
			$wordCount = count( $words );

			$parent = $textNode->parentNode;

			$wordIndex = 0;
			$nodeChanged = false;

			while ( $wordIndex < $wordCount ) {

				/** @var \Lingo\Element $definition */
				list( $skippedWords, $usedWords, $definition ) =
					$glossary->findNextTerm( $words, $wordIndex, $wordCount );

				if ( !( $usedWords > 0 ) ) {
					// Did not find any (further) term. If no term was found
					// before either, the element is left untouched; otherwise
					// the rest of the text is appended as is.
					if ( $nodeChanged ) {
						$tailStart = $words[ $wordIndex ][ self::WORD_OFFSET ];

						$parent->insertBefore(
							$doc->createTextNode(
								substr( $textNode->nodeValue, $tailStart )
							),
							$textNode
						);
					}

					// In principle superfluous, the loop would run out anyway. Might save a bit of time.
					break;
				}

				// Found a term. Text skipped in front of it is inserted as is.
				if ( $skippedWords > 0 ) {
					$start = $words[ $wordIndex ][ self::WORD_OFFSET ];
					$length = $words[ $wordIndex + $skippedWords ][ self::WORD_OFFSET ] - $start;

					$parent->insertBefore(
						$doc->createTextNode(
							substr( $textNode->nodeValue, $start, $length )
						),
						$textNode
					);
				}

				$parent->insertBefore( $definition->getFormattedTerm( $doc ), $textNode );

				$definitions[ $definition->getId() ] = $definition->getFormattedDefinitions();

				$nodeChanged = true;

				$wordIndex += $usedWords + $skippedWords;
			}

			// The original text node has been replaced by the nodes inserted before it
			if ( $nodeChanged ) {
				$parent->removeChild( $textNode );
			}
		}

		if ( $definitions !== [] ) {

			$this->loadModules( $parser );

			// Strip everything up to <body> and from </body> on.
			// U - Ungreedy, D - dollar matches only end of string, s - dot matches newlines
			$newText = preg_replace( '%(^.*<body>)|(</body>.*$)%UDs', '', $doc->saveHTML() );
			$newText .= $parser->recursiveTagParseFully( implode( $definitions ) );

			$parser->getOutput()->setText( $newText );
		}

		return true;
	}
354
355
	/**
	 * Registers the 'ext.Lingo' module for the page that is being parsed.
	 *
	 * The module is always added to the parser output. If the global output
	 * page reports that it is not showing an article, the module is added to
	 * the output page as well.
	 *
	 * @param Parser &$parser
	 */
	protected function loadModules( &$parser ) {
		global $wgOut;

		// Module names are passed as an array rather than a bare string.
		// NOTE(review): newer MediaWiki releases are expected to require an
		// array for ParserOutput::addModules() while older ones accept both
		// forms - confirm against the MediaWiki API documentation.
		$modules = [ 'ext.Lingo' ];

		// load scripts
		$parser->getOutput()->addModules( $modules );

		if ( !$wgOut->isArticle() ) {
			$wgOut->addModules( $modules );
		}
	}
370
371
	/**
	 * Purges the lingo tree from the cache.
	 *
	 * Static wrapper that forwards to purgeGlossaryFromCache() on the shared
	 * instance.
	 *
	 * @deprecated 2.0.2
	 */
	public static function purgeCache() {
		$lingoParser = self::getInstance();
		$lingoParser->purgeGlossaryFromCache();
	}
379
380
	/**
	 * Purges the lingo tree from the cache.
	 *
	 * Deletes the entry stored under this parser's cache key from the cache
	 * selected by $wgexLingoCacheType, or from the main cache if that setting
	 * is null.
	 *
	 * @since 2.0.2
	 */
	public function purgeGlossaryFromCache() {
		global $wgexLingoCacheType;

		if ( $wgexLingoCacheType === null ) {
			$cache = wfGetMainCache();
		} else {
			$cache = wfGetCache( $wgexLingoCacheType );
		}

		$cache->delete( $this->getCacheKey() );
	}
390 1
391
	/**
	 * Registers the backend to be used by this parser and tells the backend
	 * which parser it belongs to.
	 *
	 * @since 2.0.1
	 * @param Backend $backend
	 */
	public function setBackend( Backend $backend ) {
		$this->mLingoBackend = $backend;

		// Link back from the backend to this parser
		$this->mLingoBackend->setLingoParser( $this );
	}
399 5
400 1
	/**
	 * Set parser options from API
	 *
	 * The params are stored on the instance; realParse() reads them when it
	 * fetches the text of the parser output.
	 *
	 * @param array $params
	 */
	public function setApiParams( array $params ) {
		$this->mApiParams = $params;
	}
408
409 3
	/**
	 * Decides whether the output of the given parser should be annotated.
	 *
	 * Returns false if the argument is neither a Parser nor a StubObject, if
	 * the 'noglossary' double underscore is set, if the parser has no Title,
	 * or if the title's namespace is explicitly set to false in
	 * $wgexLingoUseNamespaces. Returns true otherwise.
	 *
	 * @param Parser &$parser
	 * @return bool
	 */
	protected function shouldParse( &$parser ) {
		global $wgexLingoUseNamespaces;

		$isParser = $parser instanceof Parser || $parser instanceof StubObject;

		if ( !$isParser ) {
			return false;
		}

		// __NOGLOSSARY__ found in wikitext
		if ( isset( $parser->mDoubleUnderscores[ 'noglossary' ] ) ) {
			return false;
		}

		$title = $parser->getTitle();

		if ( !( $title instanceof Title ) ) {
			return false;
		}

		$namespace = $title->getNamespace();

		// Only an explicit false switches the glossary off for a namespace
		$namespaceDisabled = isset( $wgexLingoUseNamespaces[ $namespace ] )
			&& $wgexLingoUseNamespaces[ $namespace ] === false;

		return !$namespaceDisabled;
	}
438
}
439