Completed
Push — master ( 841b7d...3ed1d3 )
by
unknown
13:44 queued 04:45
created

src/LingoParser.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
/**
4
 * File holding the Lingo\LingoParser class.
5
 *
6
 * This file is part of the MediaWiki extension Lingo.
7
 *
8
 * @copyright 2011 - 2018, Stephan Gambke
9
 * @license   GNU General Public License, version 2 (or any later version)
10
 *
11
 * The Lingo extension is free software: you can redistribute it and/or modify
12
 * it under the terms of the GNU General Public License as published by the Free
13
 * Software Foundation; either version 2 of the License, or (at your option) any
14
 * later version.
15
 *
16
 * The Lingo extension is distributed in the hope that it will be useful, but
17
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
19
 * details.
20
 *
21
 * You should have received a copy of the GNU General Public License along
22
 * with this program. If not, see <http://www.gnu.org/licenses/>.
23
 *
24
 * @author Stephan Gambke
25
 *
26
 * @file
27
 * @ingroup Lingo
28
 */
29
namespace Lingo;
30
31
use DOMDocument;
32
use DOMXPath;
33
use ObjectCache;
34
use Parser;
35
use Title;
36
37
/**
38
 * This class parses the given text and enriches it with definitions for defined
39
 * terms.
40
 *
41
 * Contains a static function to initiate the parsing.
42
 *
43
 * @ingroup Lingo
44
 */
45
class LingoParser {
46
47
	/** Index of the matched text within a word descriptor (a preg_match_all() match captured with PREG_OFFSET_CAPTURE) */
	const WORD_VALUE = 0;

	/** Index of the match offset within a word descriptor (a preg_match_all() match captured with PREG_OFFSET_CAPTURE) */
	const WORD_OFFSET = 1;

	/**
	 * @var Tree|null the glossary tree; built lazily by getLingoTree()
	 */
	private $mLingoTree;

	/**
	 * @var Backend|null the glossary backend; assigned by setBackend()
	 */
	private $mLingoBackend;

	/**
	 * @var LingoParser|null the shared instance handed out by getInstance()
	 */
	private static $parserSingleton;

	/**
	 * @var string|null the RegEx to split a chunk of text into words; assigned in the constructor
	 */
	public $regex;
60
61
	/**
	 * Lingo\LingoParser constructor.
	 *
	 * Assembles the RegEx that splits a chunk of text into words. Words are:
	 * placeholders for stripped items, sequences of letters and numbers,
	 * single characters that are neither letter nor number.
	 *
	 * @param MessageLog|null $messages not used by the constructor
	 */
	public function __construct( MessageLog &$messages = null ) {
		// Delimiters of the placeholders the parser leaves for stripped items
		$stripMarkerStart = preg_quote( Parser::MARKER_PREFIX, '/' );
		$stripMarkerEnd = preg_quote( Parser::MARKER_SUFFIX, '/' );

		$this->regex = '/' . $stripMarkerStart . '.*?' . $stripMarkerEnd . '|[\p{L}\p{N}]+|[^\p{L}\p{N}]/u';
	}
70
71
	/**
	 * Marks up glossary terms in the output of the global parser ($wgParser),
	 * provided shouldParse() allows it.
	 *
	 * @return Boolean always true
	 */
	public function parse( /*$content, $title, $po */ ) {

		/** @var \Parser $parser */
		$parser = $GLOBALS[ 'wgParser' ];

		if ( !$this->shouldParse( $parser ) ) {
			return true;
		}

		$this->realParse( $parser );

		return true;
	}
85
86
	/**
	 * Returns the shared parser instance, creating it on first use.
	 *
	 * @return LingoParser
	 * @since 2.0.1
	 */
	public static function getInstance() {
		if ( self::$parserSingleton ) {
			return self::$parserSingleton;
		}

		self::$parserSingleton = new LingoParser();

		return self::$parserSingleton;
	}
98
99
	/**
	 * Builds the key under which the glossary tree is stored in the cache.
	 *
	 * The key incorporates the tree version and the class name of the backend
	 * of this parser.
	 *
	 * @return string
	 * @throws \MWException if no backend has been set
	 */
	private function getCacheKey() {
		// FIXME: If Lingo ever stores the glossary tree per user, then the cache key also needs to include the user id (see T163608)

		// Use the backend of this very instance. Going through getInstance()
		// would consult the singleton's backend, which is the wrong one (or
		// missing) whenever this object is not the singleton.
		return ObjectCache::getLocalClusterInstance()->makeKey( 'ext', 'lingo', 'lingotree', Tree::TREE_VERSION, get_class( $this->getBackend() ) );
	}
106
107
	/**
	 * Returns the backend that was assigned via setBackend().
	 *
	 * @return Backend the backend used by the parser
	 * @throws \MWException if no backend has been set
	 */
	public function getBackend() {

		if ( $this->mLingoBackend !== null ) {
			return $this->mLingoBackend;
		}

		throw new \MWException( 'No Lingo backend available!' );
	}
119
120
	/**
	 * Returns the list of terms in the glossary
	 *
	 * The list is taken from the glossary tree (see getLingoTree()).
	 *
	 * @return array an array mapping terms (keys) to descriptions (values)
	 */
	public function getLingoArray() {
		$tree = $this->getLingoTree();

		return $tree->getTermList();
	}
128
129
	/**
	 * Returns the list of terms in the glossary as a Lingo\Tree
	 *
	 * The tree is built only once per parser instance. If the backend allows
	 * caching, the tree is taken from the cache (or built and stored there on
	 * a cache miss); otherwise it is built directly from the backend.
	 *
	 * @return Tree a Lingo\Tree mapping terms (keys) to descriptions (values)
	 * @throws \MWException if no backend has been set
	 */
	public function getLingoTree() {
		global $wgexLingoCacheType;

		// build glossary tree only once per request
		if ( $this->mLingoTree ) {
			return $this->mLingoTree;
		}

		// getBackend() throws a descriptive exception when no backend was
		// set, instead of failing with a method call on null
		$backend = $this->getBackend();

		if ( !$backend->useCache() ) {
			wfDebug( "Caching of lingo tree disabled.\n" );
			$this->mLingoTree = $this->buildLingo();

			return $this->mLingoTree;
		}

		// Try cache first
		$cache = ( $wgexLingoCacheType !== null ) ? wfGetCache( $wgexLingoCacheType ) : wfGetMainCache();
		$cachekey = $this->getCacheKey();
		$cachedLingoTree = $cache->get( $cachekey );

		$cacheHit = $cachedLingoTree !== false && $cachedLingoTree !== null;

		if ( $cacheHit ) {
			wfDebug( "Cache hit: Got lingo tree from cache.\n" );
			$this->mLingoTree = $cachedLingoTree;
		} else {
			wfDebug( "Cache miss: Lingo tree not found in cache.\n" );
			$this->mLingoTree = $this->buildLingo();
		}

		// Keep for one month
		// Limiting the cache validity will allow to purge stale cache
		// entries inserted by older versions after one month.
		// The tree is stored after a cache hit as well as after a miss.
		$cache->set( $cachekey, $this->mLingoTree, 60 * 60 * 24 * 30 );

		// Report the store only after it actually happened
		wfDebug( $cacheHit ? "Re-cached lingo tree.\n" : "Cached lingo tree.\n" );

		return $this->mLingoTree;
	}
176
177
	/**
	 * Builds a new glossary tree from the elements delivered by the backend.
	 *
	 * Elements are fetched with Backend::next() until it returns a falsy
	 * value; each one is added to the tree under its term.
	 *
	 * @return Tree
	 */
	protected function &buildLingo() {

		$tree = new Tree();

		// assemble the result tree
		for ( $entry = $this->mLingoBackend->next(); $entry; $entry = $this->mLingoBackend->next() ) {
			$tree->addTerm( $entry[ Element::ELEMENT_TERM ], $entry );
		}

		// the method returns by reference, so a variable has to be returned
		return $tree;
	}
192
193
	/**
	 * Parses the text of the parser output and enriches applicable terms
	 *
	 * The text is loaded into a DOM document. Every text node that is not
	 * inside an `a` element or an element whose class attribute is
	 * 'noglossary' is searched for glossary terms. Each term found is replaced
	 * by its formatted markup, and the formatted definitions of all terms
	 * found are parsed and appended to the text. The parser output is only
	 * modified if at least one term was found.
	 *
	 * @param Parser $parser
	 *
	 * @return Boolean always true
	 */
	protected function realParse( &$parser ) {

		$text = $parser->getOutput()->getText();

		if ( $text === null || $text === '' ) {
			return true;
		}

		// Get the glossary terms
		$glossary = $this->getLingoTree();

		if ( $glossary == null ) {
			return true;
		}

		// Parse HTML from page; warnings raised while loading it are suppressed
		\MediaWiki\suppressWarnings();

		$doc = new DOMDocument( '1.0', 'utf-8' );
		$doc->loadHTML( '<html><head><meta http-equiv="content-type" content="charset=utf-8"/></head><body>' . $text . '</body></html>' );

		\MediaWiki\restoreWarnings();

		// Find all text in HTML.
		$xpath = new DOMXPath( $doc );
		$textElements = $xpath->query(
			"//*[not(ancestor-or-self::*[@class='noglossary'] or ancestor-or-self::a)][text()!=' ']/text()"
		);

		// Iterate all HTML text matches
		$numberOfTextElements = $textElements->length;

		$definitions = [];

		for ( $textElementIndex = 0; $textElementIndex < $numberOfTextElements; $textElementIndex++ ) {

			// array_replace() keeps the position of ids already present,
			// overwrites their value and appends new ids, i.e. it behaves
			// like assigning the definitions one by one
			$definitions = array_replace(
				$definitions,
				$this->enrichTextNode( $doc, $textElements->item( $textElementIndex ), $glossary )
			);
		}

		if ( count( $definitions ) > 0 ) {

			$this->loadModules( $parser );

			// Strip everything outside of the body element again
			// U - Ungreedy, D - dollar matches only end of string, s - dot matches newlines
			$text = preg_replace( '%(^.*<body>)|(</body>.*$)%UDs', '', $doc->saveHTML() );
			$text .= $parser->recursiveTagParseFully( implode( '', $definitions ) );

			$parser->getOutput()->setText( $text );
		}

		return true;
	}

	/**
	 * Marks up the glossary terms found in a single text node.
	 *
	 * If at least one term is found, the text node is replaced in its parent
	 * by a sequence of plain text nodes and formatted terms; otherwise the
	 * node is left untouched.
	 *
	 * @param DOMDocument $doc the document the text node belongs to
	 * @param \DOMNode $textElement the text node to search
	 * @param Tree $glossary the glossary tree to look up terms in
	 *
	 * @return array the formatted definitions of the terms found, keyed by definition id
	 */
	private function enrichTextNode( DOMDocument $doc, $textElement, $glossary ) {

		$definitions = [];

		if ( strlen( $textElement->nodeValue ) < $glossary->getMinTermLength() ) {
			return $definitions;
		}

		$matches = [];
		preg_match_all(
			$this->regex,
			$textElement->nodeValue,
			$matches,
			PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER
		);

		if ( count( $matches ) === 0 || count( $matches[ 0 ] ) === 0 ) {
			return $definitions;
		}

		$wordDescriptors = $matches[ 0 ];  // See __construct() for definition of "word"
		$numberOfWordDescriptors = count( $wordDescriptors );

		$parentNode = $textElement->parentNode;

		$wordDescriptorIndex = 0;
		$changedElem = false;

		while ( $wordDescriptorIndex < $numberOfWordDescriptors ) {

			/** @var \Lingo\Element $definition */
			list( $skippedWords, $usedWords, $definition ) =
				$glossary->findNextTerm( $wordDescriptors, $wordDescriptorIndex, $numberOfWordDescriptors );

			if ( $usedWords <= 0 ) { // did not find any term, just use the rest of the text

				// If we found no term now and no term before, there was no
				// term in the whole element. Might as well not change the
				// element at all. Only change element if found term before.
				if ( $changedElem === true ) {

					$start = $wordDescriptors[ $wordDescriptorIndex ][ self::WORD_OFFSET ];

					$parentNode->insertBefore(
						$doc->createTextNode( substr( $textElement->nodeValue, $start ) ),
						$textElement
					);
				}

				// In principle superfluous, the loop would run out anyway. Might save a bit of time.
				break;
			}

			// found a term

			if ( $skippedWords > 0 ) { // skipped some text, insert it as is

				// The offsets captured by preg_match_all() are byte offsets, which is what substr() expects
				$start = $wordDescriptors[ $wordDescriptorIndex ][ self::WORD_OFFSET ];
				$length = $wordDescriptors[ $wordDescriptorIndex + $skippedWords ][ self::WORD_OFFSET ] - $start;

				$parentNode->insertBefore(
					$doc->createTextNode( substr( $textElement->nodeValue, $start, $length ) ),
					$textElement
				);
			}

			$parentNode->insertBefore( $definition->getFormattedTerm( $doc ), $textElement );

			$definitions[ $definition->getId() ] = $definition->getFormattedDefinitions();

			$changedElem = true;

			$wordDescriptorIndex += $usedWords + $skippedWords;
		}

		// The replacement nodes were inserted in front of the original text node, which is now obsolete
		if ( $changedElem ) {
			$parentNode->removeChild( $textElement );
		}

		return $definitions;
	}
335
336
	/**
	 * Registers the script and style modules of the extension.
	 *
	 * The modules are added to the output of the given parser. If the global
	 * output page ($wgOut) reports that it is not an article (isArticle()),
	 * they are added to the output page as well.
	 *
	 * @param Parser $parser
	 */
	protected function loadModules( &$parser ) {
		// NOTE(review): static analysis flags this use of global state, as it
		// makes the method harder to test and to reuse. Passing the output
		// page in as a parameter would remove the dependency.
		global $wgOut;

		// Module names are passed as arrays
		$scriptModules = [ 'ext.Lingo.Scripts' ];
		$styleModules = [ 'ext.Lingo.Styles' ];

		$parserOutput = $parser->getOutput();
		$parserOutput->addModules( $scriptModules );
		$parserOutput->addModuleStyles( $styleModules );

		// One check covers both scripts and styles
		if ( !$wgOut->isArticle() ) {
			$wgOut->addModules( $scriptModules );
			$wgOut->addModuleStyles( $styleModules );
		}
	}
358
359
	/**
	 * Purges the lingo tree from the cache.
	 *
	 * Static shortcut that calls purgeGlossaryFromCache() on the shared instance.
	 *
	 * @deprecated 2.0.2
	 */
	public static function purgeCache() {
		$parser = self::getInstance();
		$parser->purgeGlossaryFromCache();
	}
368
369
	/**
	 * Purges the lingo tree from the cache.
	 *
	 * The cache is the one selected by $wgexLingoCacheType, or the main cache
	 * if that setting is null.
	 *
	 * @since 2.0.2
	 */
	public function purgeGlossaryFromCache() {
		global $wgexLingoCacheType;

		if ( $wgexLingoCacheType === null ) {
			$cache = wfGetMainCache();
		} else {
			$cache = wfGetCache( $wgexLingoCacheType );
		}

		$cache->delete( $this->getCacheKey() );
	}
380
381
	/**
	 * Assigns the backend to this parser and registers this parser with the backend.
	 *
	 * @since 2.0.1
	 * @param Backend $backend
	 */
	public function setBackend( Backend $backend ) {
		$this->mLingoBackend = $backend;
		$this->mLingoBackend->setLingoParser( $this );
	}
389
390
	/**
	 * Decides whether the output of the given parser should be marked up.
	 *
	 * Returns false if the argument is not a Parser, if __NOGLOSSARY__ was
	 * found in the wikitext, if the parser has no Title, or if the namespace
	 * of the title is explicitly set to false in $wgexLingoUseNamespaces.
	 *
	 * @param Parser $parser
	 * @return bool
	 */
	protected function shouldParse( &$parser ) {
		global $wgexLingoUseNamespaces;

		// Not a parser, or __NOGLOSSARY__ found in wikitext
		if ( !( $parser instanceof Parser ) || isset( $parser->mDoubleUnderscores[ 'noglossary' ] ) ) {
			return false;
		}

		$title = $parser->getTitle();

		if ( !( $title instanceof Title ) ) {
			return false;
		}

		$namespace = $title->getNamespace();

		// Only an explicit false switches a namespace off
		$namespaceDisabled = isset( $wgexLingoUseNamespaces[ $namespace ] )
			&& $wgexLingoUseNamespaces[ $namespace ] === false;

		return !$namespaceDisabled;
	}
419
}
420
421