These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | |||
3 | /** |
||
4 | * File holding the Lingo\LingoParser class. |
||
5 | * |
||
6 | * This file is part of the MediaWiki extension Lingo. |
||
7 | * |
||
8 | * @copyright 2011 - 2018, Stephan Gambke |
||
9 | * @license GNU General Public License, version 2 (or any later version) |
||
10 | * |
||
11 | * The Lingo extension is free software: you can redistribute it and/or modify |
||
12 | * it under the terms of the GNU General Public License as published by the Free |
||
13 | * Software Foundation; either version 2 of the License, or (at your option) any |
||
14 | * later version. |
||
15 | * |
||
16 | * The Lingo extension is distributed in the hope that it will be useful, but |
||
17 | * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
||
18 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
||
19 | * details. |
||
20 | * |
||
21 | * You should have received a copy of the GNU General Public License along |
||
22 | * with this program. If not, see <http://www.gnu.org/licenses/>. |
||
23 | * |
||
24 | * @author Stephan Gambke |
||
25 | * |
||
26 | * @file |
||
27 | * @ingroup Lingo |
||
28 | */ |
||
29 | namespace Lingo; |
||
30 | |||
31 | use DOMDocument; |
||
32 | use DOMXPath; |
||
33 | use ObjectCache; |
||
34 | use Parser; |
||
35 | use Title; |
||
36 | |||
/**
 * Parses rendered page text and enriches it with definitions for the terms
 * found in the glossary.
 *
 * A shared instance is available through the static getInstance(). A Backend
 * has to be registered with setBackend() before the glossary can be read.
 *
 * @ingroup Lingo
 */
class LingoParser {

	// Indices into a single "word descriptor", i.e. one entry of the match
	// list that preg_match_all() produces with PREG_OFFSET_CAPTURE.
	const WORD_VALUE = 0;
	const WORD_OFFSET = 1;

	/**
	 * @var Tree|null The glossary tree; built lazily by getLingoTree()
	 */
	private $mLingoTree = null;

	/**
	 * @var Backend|null The backend supplying glossary entries; set by setBackend()
	 */
	private $mLingoBackend = null;

	/**
	 * @var LingoParser|null The shared instance handed out by getInstance()
	 */
	private static $parserSingleton = null;

	// The RegEx to split a chunk of text into words
	public $regex = null;

	/**
	 * Lingo\LingoParser constructor.
	 *
	 * @param MessageLog|null &$messages Not used by this constructor; kept for
	 *  backward compatibility of the signature.
	 */
	public function __construct( MessageLog &$messages = null ) {
		// The RegEx to split a chunk of text into words
		// Words are: placeholders for stripped items, sequences of letters and
		// numbers, single characters that are neither letter nor number
		$this->regex = '/' . preg_quote( Parser::MARKER_PREFIX, '/' ) . '.*?' . preg_quote( Parser::MARKER_SUFFIX, '/' ) . '|[\p{L}\p{N}]+|[^\p{L}\p{N}]/u';
	}

	/**
	 * Runs the glossary markup on the output of the global parser
	 * ($GLOBALS['wgParser']) if shouldParse() allows it.
	 *
	 * The method declares no parameters; any arguments a caller passes are
	 * ignored. NOTE(review): presumably registered as a hook handler, which
	 * would explain the constant return value - confirm against the hook
	 * registration.
	 *
	 * @return bool Always true
	 * @throws \MWException if no backend has been set
	 */
	public function parse() {

		/** @var \Parser $parser */
		$parser = $GLOBALS[ 'wgParser' ];

		if ( $this->shouldParse( $parser ) ) {
			$this->realParse( $parser );
		}

		return true;
	}

	/**
	 * Returns the shared parser instance, creating it on first use.
	 *
	 * @return LingoParser
	 * @since 2.0.1
	 */
	public static function getInstance() {
		if ( !self::$parserSingleton ) {
			self::$parserSingleton = new LingoParser();
		}

		return self::$parserSingleton;
	}

	/**
	 * Builds the key under which the glossary tree is cached.
	 *
	 * The key contains the tree version and the class name of the backend of
	 * THIS parser instance, so that the key always matches the backend whose
	 * data is actually stored.
	 *
	 * @return string
	 * @throws \MWException if no backend has been set
	 */
	private function getCacheKey() {
		// FIXME: If Lingo ever stores the glossary tree per user, then the cache key also needs to include the user id (see T163608)
		return ObjectCache::getLocalClusterInstance()->makeKey( 'ext', 'lingo', 'lingotree', Tree::TREE_VERSION, get_class( $this->getBackend() ) );
	}

	/**
	 * Returns the cache used for the glossary tree: the cache of type
	 * $wgexLingoCacheType if that is configured, the main cache otherwise.
	 *
	 * @return mixed The cache object returned by wfGetCache() or wfGetMainCache()
	 */
	private function getGlossaryCache() {
		global $wgexLingoCacheType;

		return ( $wgexLingoCacheType !== null ) ? wfGetCache( $wgexLingoCacheType ) : wfGetMainCache();
	}

	/**
	 * @return Backend the backend used by the parser
	 * @throws \MWException if no backend has been set
	 */
	public function getBackend() {

		if ( $this->mLingoBackend === null ) {
			throw new \MWException( 'No Lingo backend available!' );
		}

		return $this->mLingoBackend;
	}

	/**
	 * Returns the list of terms in the glossary
	 *
	 * @return array an array mapping terms (keys) to descriptions (values)
	 * @throws \MWException if no backend has been set
	 */
	public function getLingoArray() {
		return $this->getLingoTree()->getTermList();
	}

	/**
	 * Returns the list of terms in the glossary as a Lingo\Tree
	 *
	 * The tree is built at most once per parser instance. If the backend
	 * allows caching, the tree is taken from the cache when present, built
	 * otherwise, and in both cases (re-)stored in the cache.
	 *
	 * @return Tree a Lingo\Tree mapping terms (keys) to descriptions (values)
	 * @throws \MWException if no backend has been set
	 */
	public function getLingoTree() {

		// build glossary tree only once per request
		if ( !$this->mLingoTree ) {

			// Go through getBackend() so that a missing backend is reported
			// by an exception instead of a method call on null.
			if ( $this->getBackend()->useCache() ) {

				// Try cache first
				$cache = $this->getGlossaryCache();
				$cachekey = $this->getCacheKey();
				$cachedLingoTree = $cache->get( $cachekey );

				// cache hit?
				if ( $cachedLingoTree !== false && $cachedLingoTree !== null ) {

					wfDebug( "Cache hit: Got lingo tree from cache.\n" );
					$this->mLingoTree = $cachedLingoTree;

					wfDebug( "Re-cached lingo tree.\n" );
				} else {

					wfDebug( "Cache miss: Lingo tree not found in cache.\n" );
					$this->mLingoTree = $this->buildLingo();
					wfDebug( "Cached lingo tree.\n" );
				}

				// Keep for one month
				// Limiting the cache validity will allow to purge stale cache
				// entries inserted by older versions after one month
				$cache->set( $cachekey, $this->mLingoTree, 60 * 60 * 24 * 30 );

			} else {
				wfDebug( "Caching of lingo tree disabled.\n" );
				$this->mLingoTree = $this->buildLingo();
			}

		}

		return $this->mLingoTree;
	}

	/**
	 * Builds a fresh glossary tree from all elements the backend delivers.
	 *
	 * @return Tree
	 * @throws \MWException if no backend has been set
	 */
	protected function &buildLingo() {

		$lingoTree = new Tree();
		$backend = $this->getBackend();

		// assemble the tree; the backend signals the end with a falsy value
		while ( $elementData = $backend->next() ) {
			$lingoTree->addTerm( $elementData[ Element::ELEMENT_TERM ], $elementData );
		}

		return $lingoTree;
	}

	/**
	 * Parses the text of the parser output and enriches applicable terms
	 *
	 * The text is loaded into a DOM document. Every text node that is neither
	 * inside a link nor inside an element of class "noglossary" is split into
	 * words, and each term found by the glossary tree is replaced by its
	 * formatted representation. If at least one term was found, the modules
	 * are loaded and the formatted definitions are appended to the text.
	 *
	 * NOTE(review): an earlier comment stated that only terms of at most one
	 * word are recognized; the loop below handles whatever word count
	 * Tree::findNextTerm() reports - confirm against Tree.
	 *
	 * @param Parser &$parser
	 *
	 * @return bool Always true
	 * @throws \MWException if no backend has been set
	 */
	protected function realParse( &$parser ) {

		$text = $parser->getOutput()->getText();

		if ( $text === null || $text === '' ) {
			return true;
		}

		// Get tree of terms
		$glossary = $this->getLingoTree();

		if ( $glossary == null ) {
			return true;
		}

		// Parse HTML from page; warnings emitted while loading the fragment are suppressed
		\MediaWiki\suppressWarnings();

		$doc = new DOMDocument( '1.0', 'utf-8' );
		$doc->loadHTML( '<html><head><meta http-equiv="content-type" content="charset=utf-8"/></head><body>' . $text . '</body></html>' );

		\MediaWiki\restoreWarnings();

		// Find all text in HTML.
		$xpath = new DOMXPath( $doc );
		$textElements = $xpath->query(
			"//*[not(ancestor-or-self::*[@class='noglossary'] or ancestor-or-self::a)][text()!=' ']/text()"
		);

		// Iterate all HTML text matches
		$numberOfTextElements = $textElements->length;

		$definitions = [];

		for ( $textElementIndex = 0; $textElementIndex < $numberOfTextElements; $textElementIndex++ ) {
			$textElement = $textElements->item( $textElementIndex );

			// Byte length. NOTE(review): presumably getMinTermLength() is
			// measured in bytes as well - confirm against Tree.
			if ( strlen( $textElement->nodeValue ) < $glossary->getMinTermLength() ) {
				continue;
			}

			$matches = [];
			preg_match_all(
				$this->regex,
				$textElement->nodeValue,
				$matches,
				PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER
			);

			if ( count( $matches ) === 0 || count( $matches[ 0 ] ) === 0 ) {
				continue;
			}

			$wordDescriptors = $matches[ 0 ]; // See __construct() for definition of "word"
			$numberOfWordDescriptors = count( $wordDescriptors );

			$parentNode = $textElement->parentNode;

			$wordDescriptorIndex = 0;
			$changedElem = false;

			while ( $wordDescriptorIndex < $numberOfWordDescriptors ) {

				/** @var \Lingo\Element $definition */
				list( $skippedWords, $usedWords, $definition ) =
					$glossary->findNextTerm( $wordDescriptors, $wordDescriptorIndex, $numberOfWordDescriptors );

				if ( $usedWords > 0 ) { // found a term

					if ( $skippedWords > 0 ) { // skipped some text, insert it as is

						// PREG_OFFSET_CAPTURE reports byte offsets, so the
						// byte-wise substr() is the matching function here.
						$start = $wordDescriptors[ $wordDescriptorIndex ][ self::WORD_OFFSET ];
						$length = $wordDescriptors[ $wordDescriptorIndex + $skippedWords ][ self::WORD_OFFSET ] - $start;

						$parentNode->insertBefore(
							$doc->createTextNode(
								substr( $textElement->nodeValue, $start, $length )
							),
							$textElement
						);
					}

					$parentNode->insertBefore( $definition->getFormattedTerm( $doc ), $textElement );

					// keyed by id, so each definition is collected only once
					$definitions[ $definition->getId() ] = $definition->getFormattedDefinitions();

					$changedElem = true;

				} else { // did not find any term, just use the rest of the text

					// If we found no term now and no term before, there was no
					// term in the whole element. Might as well not change the
					// element at all.

					// Only change element if found term before
					if ( $changedElem === true ) {

						$start = $wordDescriptors[ $wordDescriptorIndex ][ self::WORD_OFFSET ];

						$parentNode->insertBefore(
							$doc->createTextNode(
								substr( $textElement->nodeValue, $start )
							),
							$textElement
						);

					}

					// In principle superfluous, the loop would run out anyway. Might save a bit of time.
					break;
				}

				$wordDescriptorIndex += $usedWords + $skippedWords;
			}

			// The original text node was rebuilt piece by piece in front of
			// itself, so it can be dropped now.
			if ( $changedElem ) {
				$parentNode->removeChild( $textElement );
			}
		}

		if ( count( $definitions ) > 0 ) {

			$this->loadModules( $parser );

			// Strip the wrapper that was added for loadHTML()
			// U - Ungreedy, D - dollar matches only end of string, s - dot matches newlines
			$text = preg_replace( '%(^.*<body>)|(</body>.*$)%UDs', '', $doc->saveHTML() );
			$text .= $parser->recursiveTagParseFully( implode( '', $definitions ) );

			$parser->getOutput()->setText( $text );
		}

		return true;
	}

	/**
	 * Registers the Lingo script and style modules with the parser output
	 * and, if the global output page is not an article, with $wgOut as well.
	 *
	 * @param Parser &$parser
	 */
	protected function loadModules( &$parser ) {
		global $wgOut;

		$parserOutput = $parser->getOutput();
		$addToOutputPage = !$wgOut->isArticle();

		// load scripts
		$parserOutput->addModules( 'ext.Lingo.Scripts' );

		if ( $addToOutputPage ) {
			$wgOut->addModules( 'ext.Lingo.Scripts' );
		}

		// load styles
		$parserOutput->addModuleStyles( 'ext.Lingo.Styles' );

		if ( $addToOutputPage ) {
			$wgOut->addModuleStyles( 'ext.Lingo.Styles' );
		}
	}

	/**
	 * Purges the lingo tree of the shared parser instance from the cache.
	 *
	 * @deprecated 2.0.2 Use purgeGlossaryFromCache() on the instance instead
	 */
	public static function purgeCache() {

		self::getInstance()->purgeGlossaryFromCache();
	}

	/**
	 * Purges the lingo tree from the cache.
	 *
	 * @since 2.0.2
	 * @throws \MWException if no backend has been set
	 */
	public function purgeGlossaryFromCache() {

		$this->getGlossaryCache()->delete( $this->getCacheKey() );
	}

	/**
	 * Sets the backend and tells the backend about this parser.
	 *
	 * @since 2.0.1
	 * @param Backend $backend
	 */
	public function setBackend( Backend $backend ) {
		$this->mLingoBackend = $backend;
		$backend->setLingoParser( $this );
	}

	/**
	 * Decides whether the glossary markup should be applied.
	 *
	 * Parsing is skipped if the given object is not a Parser, if the page
	 * contains __NOGLOSSARY__, if the parser has no Title, or if the namespace
	 * of the title is explicitly set to false in $wgexLingoUseNamespaces.
	 *
	 * @param Parser &$parser
	 * @return bool
	 */
	protected function shouldParse( &$parser ) {
		global $wgexLingoUseNamespaces;

		if ( !( $parser instanceof Parser ) ) {
			return false;
		}

		if ( isset( $parser->mDoubleUnderscores[ 'noglossary' ] ) ) { // __NOGLOSSARY__ found in wikitext
			return false;
		}

		$title = $parser->getTitle();

		if ( !( $title instanceof Title ) ) {
			return false;
		}

		$namespace = $title->getNamespace();

		// Namespaces that are not listed at all are parsed
		if ( isset( $wgexLingoUseNamespaces[ $namespace ] ) && $wgexLingoUseNamespaces[ $namespace ] === false ) {
			return false;
		}

		return true;
	}
}
||
420 | |||
421 |
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.