These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | /** |
||
3 | * MediaWiki page data importer. |
||
4 | * |
||
5 | * Copyright © 2003,2005 Brion Vibber <[email protected]> |
||
6 | * https://www.mediawiki.org/ |
||
7 | * |
||
8 | * This program is free software; you can redistribute it and/or modify |
||
9 | * it under the terms of the GNU General Public License as published by |
||
10 | * the Free Software Foundation; either version 2 of the License, or |
||
11 | * (at your option) any later version. |
||
12 | * |
||
13 | * This program is distributed in the hope that it will be useful, |
||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
16 | * GNU General Public License for more details. |
||
17 | * |
||
18 | * You should have received a copy of the GNU General Public License along |
||
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
||
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
||
21 | * http://www.gnu.org/copyleft/gpl.html |
||
22 | * |
||
23 | * @file |
||
24 | * @ingroup SpecialPage |
||
25 | */ |
||
26 | |||
27 | /** |
||
28 | * XML file reader for the page data importer. |
||
29 | * |
||
30 | * implements Special:Import |
||
31 | * @ingroup SpecialPage |
||
32 | */ |
||
33 | class WikiImporter { |
||
34 | private $reader = null; |
||
35 | private $foreignNamespaces = null; |
||
36 | private $mLogItemCallback, $mUploadCallback, $mRevisionCallback, $mPageCallback; |
||
37 | private $mSiteInfoCallback, $mPageOutCallback; |
||
0 ignored issues
–
show
|
|||
38 | private $mNoticeCallback, $mDebug; |
||
39 | private $mImportUploads, $mImageBasePath; |
||
40 | private $mNoUpdates = false; |
||
41 | /** @var Config */ |
||
42 | private $config; |
||
43 | /** @var ImportTitleFactory */ |
||
44 | private $importTitleFactory; |
||
45 | /** @var array */ |
||
46 | private $countableCache = []; |
||
47 | |||
/**
 * Creates an ImportXMLReader drawing from the source provided.
 *
 * Registers the "uploadsource" stream wrapper when it is not registered yet,
 * opens the XMLReader on the registered source, then installs the default
 * callbacks and the default (naive) import title factory.
 *
 * @param ImportSource $source
 * @param Config $config Optional; when omitted a deprecation warning is
 *   emitted and the 'main' config from the default ConfigFactory is used
 * @throws Exception If the XMLReader class is not available
 * @throws MWException If the XMLReader cannot be opened on the source
 */
function __construct( ImportSource $source, Config $config = null ) {
	if ( !class_exists( 'XMLReader' ) ) {
		throw new Exception( 'Import requires PHP to have been compiled with libxml support' );
	}

	$this->reader = new XMLReader();
	if ( !$config ) {
		wfDeprecated( __METHOD__ . ' without a Config instance', '1.25' );
		$config = ConfigFactory::getDefaultInstance()->makeConfig( 'main' );
	}
	$this->config = $config;

	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', 'UploadSourceAdapter' );
	}
	$id = UploadSourceAdapter::registerSource( $source );

	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	$oldDisable = libxml_disable_entity_loader( false );
	if ( defined( 'LIBXML_PARSEHUGE' ) ) {
		$status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
	} else {
		$status = $this->reader->open( "uploadsource://$id" );
	}

	// Grab the libxml error (if any) before restoring the entity loader state,
	// so the restore happens exactly once on both the success and failure path.
	$error = $status ? null : libxml_get_last_error();
	libxml_disable_entity_loader( $oldDisable );

	if ( !$status ) {
		// libxml_get_last_error() returns false when no error was recorded;
		// do not dereference it blindly while building the exception message.
		$detail = $error ? $error->message : 'unknown libxml error';
		throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
			$detail );
	}

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory();
}
||
96 | |||
/**
 * Accessor for the XMLReader instance this importer reads from.
 *
 * @return null|XMLReader
 */
public function getReader() {
	$xmlReader = $this->reader;

	return $xmlReader;
}
||
103 | |||
/**
 * Report an XML error. Despite the name, nothing is thrown here: the message
 * goes to this importer's debug() channel and to wfDebug().
 *
 * @param string $err
 */
public function throwXmlError( $err ) {
	$this->debug( 'FAILURE: ' . $err );
	wfDebug( 'WikiImporter XML error: ' . $err . "\n" );
}
||
108 | |||
/**
 * Log a message through wfDebug(), but only while debug mode is switched on
 * (see setDebug()).
 *
 * @param string $data
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( 'IMPORT: ' . $data . "\n" );
}
||
114 | |||
/**
 * Log a warning through wfDebug(), regardless of the debug-mode flag.
 *
 * @param string $data
 */
public function warn( $data ) {
	$line = 'IMPORT: ' . $data . "\n";
	wfDebug( $line );
}
||
118 | |||
/**
 * Emit a notice. The first argument is the message key; any further
 * arguments are collected into a parameter list.
 *
 * When a callable notice callback is set it receives the key and the
 * parameter list; otherwise the message text is echoed.
 *
 * @param string $msg
 */
public function notice( $msg /*, $param, ...*/ ) {
	// Everything after the message key is a message parameter
	$params = array_slice( func_get_args(), 1 );

	if ( is_callable( $this->mNoticeCallback ) ) {
		call_user_func( $this->mNoticeCallback, $msg, $params );
		return;
	}

	# No ImportReporter -> CLI
	echo wfMessage( $msg, $params )->text() . "\n";
}
||
129 | |||
/**
 * Switch debug mode on or off; debug() only logs while it is on.
 *
 * @param bool $debug
 */
function setDebug( $debug ) {
	$this->mDebug = $debug;
}
||
137 | |||
/**
 * Switch 'no updates' mode on or off. The flag is handed on to every
 * WikiRevision this importer creates; in this mode the link tables are not
 * updated by the importer.
 *
 * @param bool $noupdates
 */
function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
||
145 | |||
/**
 * Install the callback used by notice() to display notice messages.
 * The assignment is delegated to wfSetVar(), whose result is returned.
 *
 * @param callable $callback
 * @return callable
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );

	return $previous;
}
||
155 | |||
/**
 * Install the action to perform as each new page in the stream is reached.
 *
 * @param callable $callback
 * @return callable The callback that was installed before
 */
public function setPageCallback( $callback ) {
	$replaced = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $replaced;
}
||
166 | |||
/**
 * Install the action to perform as each page in the stream is completed.
 *
 * The callback is given the page title (as a Title object), a second object
 * with the original title form (in case it has been overridden into a local
 * namespace), and a count of revisions.
 *
 * @param callable $callback
 * @return callable The callback that was installed before
 */
public function setPageOutCallback( $callback ) {
	$replaced = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $replaced;
}
||
181 | |||
/**
 * Install the action to perform as each page revision is reached.
 *
 * @param callable $callback
 * @return callable The callback that was installed before
 */
public function setRevisionCallback( $callback ) {
	$replaced = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $replaced;
}
||
192 | |||
/**
 * Install the action to perform as each file upload version is reached.
 *
 * @param callable $callback
 * @return callable The callback that was installed before
 */
public function setUploadCallback( $callback ) {
	$replaced = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $replaced;
}
||
203 | |||
/**
 * Install the action to perform as each log item is reached.
 *
 * @param callable $callback
 * @return callable The callback that was installed before
 */
public function setLogItemCallback( $callback ) {
	$replaced = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $replaced;
}
||
214 | |||
/**
 * Install the action to perform when site info is encountered.
 *
 * @param callable $callback
 * @return callable The callback that was installed before
 */
public function setSiteInfoCallback( $callback ) {
	$replaced = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $replaced;
}
||
225 | |||
/**
 * Replace the factory object used to convert ForeignTitle objects into
 * local Title objects.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
||
234 | |||
/**
 * Set a target namespace to override the defaults.
 *
 * Passing null restores the naive title factory (no namespace override).
 * A non-negative value naming an existing namespace installs a
 * NamespaceImportTitleFactory for it. Anything else is rejected.
 *
 * @param null|int $namespace
 * @return bool True when a factory was installed, false when $namespace was rejected
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory( new NaiveImportTitleFactory() );
		return true;
	}

	if ( !( $namespace >= 0 ) || !MWNamespace::exists( intval( $namespace ) ) ) {
		return false;
	}

	$namespace = intval( $namespace );
	$this->setImportTitleFactory( new NamespaceImportTitleFactory( $namespace ) );

	return true;
}
||
256 | |||
/**
 * Set a target root page under which all pages are imported.
 *
 * null restores the naive title factory; an empty string changes nothing.
 * Otherwise the (right-trimmed) root page must parse to a non-external
 * title in a namespace that has subpages, or the returned Status is fatal.
 *
 * @param null|string $rootpage
 * @return Status
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();

	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory( new NaiveImportTitleFactory() );
		return $status;
	}
	if ( $rootpage === '' ) {
		return $status;
	}

	$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
	$title = Title::newFromText( $rootpage );

	if ( !$title || $title->isExternal() ) {
		$status->fatal( 'import-rootpage-invalid' );
		return $status;
	}

	if ( MWNamespace::hasSubpages( $title->getNamespace() ) ) {
		// set namespace to 'all', so the namespace check in processTitle() can pass
		$this->setTargetNamespace( null );
		$this->setImportTitleFactory( new SubpageImportTitleFactory( $title ) );
		return $status;
	}

	// The namespace cannot hold subpages: report it by its display name
	global $wgContLang;

	if ( $title->getNamespace() == NS_MAIN ) {
		$displayNSText = wfMessage( 'blanknamespace' )->text();
	} else {
		$displayNSText = $wgContLang->getNsText( $title->getNamespace() );
	}
	$status->fatal( 'import-rootpage-nosubpage', $displayNSText );

	return $status;
}
||
290 | |||
/**
 * Store the image base path.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
||
297 | |||
/**
 * Store the "import uploads" flag.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
||
304 | |||
/**
 * Default per-page callback. Remembers whether the page is countable before
 * the import touches it, keyed by its prefixed title text, so that
 * finishImportPage() can compare afterwards.
 *
 * @param array $titleAndForeignTitle Two-element array, with Title object at
 *   index 0 and ForeignTitle object at index 1
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = WikiPage::factory( $title );

	$cacheKey = 'title_' . $title->getPrefixedText();
	$this->countableCache[$cacheKey] = $page->isCountable();

	return true;
}
||
317 | |||
/**
 * Default per-revision callback, performs the import.
 *
 * A notice is emitted and false returned when the revision's content handler
 * cannot be used on the target title, or when importing raises a
 * MWContentSerializationException.
 *
 * @param WikiRevision $revision
 * @return bool
 */
public function importRevision( $revision ) {
	if ( $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
		try {
			return $revision->importOldRevision();
		} catch ( MWContentSerializationException $ex ) {
			$failureKey = 'import-error-unserialize';
		}
	} else {
		$failureKey = 'import-error-bad-location';
	}

	// Both failure modes report the same four details about the revision
	$this->notice( $failureKey,
		$revision->getTitle()->getPrefixedText(),
		$revision->getID(),
		$revision->getModel(),
		$revision->getFormat() );

	return false;
}
||
346 | |||
/**
 * Default per-log-item callback; delegates to WikiRevision::importLogItem().
 *
 * @param WikiRevision $revision
 * @return bool
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();

	return $result;
}
||
355 | |||
/**
 * Default per-upload callback; delegates to WikiRevision::importUpload().
 *
 * @param WikiRevision $revision
 * @return bool
 */
public function importUpload( $revision ) {
	$result = $revision->importUpload();

	return $result;
}
||
364 | |||
/**
 * Default page-out callback: adjusts the article count statistic and then
 * runs the 'AfterImportPage' hook with the arguments this method received.
 * Mostly for hook use.
 *
 * @param Title $title
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sRevCount
 * @param array $pageInfo
 * @return bool Result of the 'AfterImportPage' hook run
 */
public function finishImportPage( $title, $foreignTitle, $revCount,
	$sRevCount, $pageInfo ) {
	// Captured up front; handed unchanged to the hook at the end
	$hookArgs = func_get_args();

	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	$page = WikiPage::factory( $title );
	$page->loadPageData( 'fromdbmaster' );
	$content = $page->getContent();

	if ( $content !== null ) {
		$editInfo = $page->prepareContentForEdit( $content );
		$countKey = 'title_' . $title->getPrefixedText();
		$countable = $page->isCountable( $editInfo );

		// Only adjust when beforeImportPage() recorded a prior state and it changed
		if ( array_key_exists( $countKey, $this->countableCache ) ) {
			$wasCountable = $this->countableCache[$countKey];
			if ( $countable != $wasCountable ) {
				$delta = (int)$countable - (int)$wasCountable;
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => $delta
				] ) );
			}
		}
	} else {
		wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $title .
			' because WikiPage::getContent() returned null' );
	}

	return Hooks::run( 'AfterImportPage', $hookArgs );
}
||
404 | |||
/**
 * Alternate per-revision callback, for debugging: dumps the revision's
 * title, user, timestamp, comment and text through debug().
 *
 * @param WikiRevision $revision
 */
public function debugRevisionHandler( &$revision ) {
	$this->debug( "Got revision:" );

	$titleText = is_object( $revision->title )
		? $revision->title->getPrefixedText()
		: '<invalid>';
	$this->debug( "-- Title: " . $titleText );

	$this->debug( "-- User: " . $revision->user_text );
	$this->debug( "-- Timestamp: " . $revision->timestamp );
	$this->debug( "-- Comment: " . $revision->comment );
	$this->debug( "-- Text: " . $revision->text );
}
||
421 | |||
/**
 * Notify the site-info callback, if one is set. The callback receives the
 * site info array and this importer.
 *
 * @param array $siteInfo
 * @return bool|mixed The callback's result, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	$callbackArgs = [ $siteInfo, $this ];

	return call_user_func_array( $this->mSiteInfoCallback, $callbackArgs );
}
||
435 | |||
/**
 * Notify the page callback, if one is set, that a new "<page>" was reached.
 * The argument is passed through to the callback untouched.
 *
 * @param Title $title
 */
function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
||
445 | |||
/**
 * Notify the page-out callback, if one is set, that a "</page>" was closed.
 * All arguments received here are forwarded to the callback.
 *
 * @param Title $title
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount Number of revisions for which callback returned true
 * @param array $pageInfo Associative array of page information
 */
private function pageOutCallback( $title, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	$forwarded = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwarded );
}
||
461 | |||
/**
 * Notify the revision callback, if one is set. The callback receives the
 * revision and this importer.
 *
 * @param WikiRevision $revision
 * @return bool|mixed The callback's result, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	$callbackArgs = [ $revision, $this ];

	return call_user_func_array( $this->mRevisionCallback, $callbackArgs );
}
||
475 | |||
/**
 * Notify the log-item callback, if one is set. The callback receives the
 * log item (wrapped in a WikiRevision) and this importer.
 *
 * @param WikiRevision $revision
 * @return bool|mixed The callback's result, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	$callbackArgs = [ $revision, $this ];

	return call_user_func_array( $this->mLogItemCallback, $callbackArgs );
}
||
489 | |||
/**
 * Retrieves the contents of the named attribute of the current element.
 *
 * @param string $attr The name of the attribute
 * @return string The value of the attribute or an empty string if it is not set in the current
 *   element.
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );

	// XMLReader::getAttribute() yields null for a missing attribute; normalise
	// that to the empty string promised by this method's contract.
	return $value === null ? '' : $value;
}
||
499 | |||
/**
 * Fetches the text contents of the current element, assuming it has no
 * sub-elements: text, CDATA and significant-whitespace nodes are
 * concatenated until the first end-element node is read.
 *
 * If the reader runs out of input before an end element is seen, the reader
 * is closed and an empty string is returned.
 *
 * @return string
 * @access private
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textualNodeTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];

	$collected = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;

		if ( $nodeType == XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textualNodeTypes ) ) {
			$collected .= $this->reader->value;
		}
	}

	// Input ended without a closing tag
	$this->reader->close();

	return '';
}
||
527 | |||
/**
 * Primary entry point: checks that the document starts with a <mediawiki>
 * tag, then dispatches each top-level tag to its handler until the closing
 * </mediawiki> tag or the end of input.
 *
 * The entity loader state is restored and the reader closed afterwards, even
 * when a handler throws an Exception (which is then re-thrown).
 *
 * @throws MWException If the first node read is not a <mediawiki> tag
 * @return bool Always true when no exception is thrown
 */
public function doImport() {
	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (bug 46932).
	$oldDisable = libxml_disable_entity_loader( true );
	$this->reader->read();

	if ( $this->reader->localName != 'mediawiki' ) {
		libxml_disable_entity_loader( $oldDisable );
		throw new MWException( "Expected <mediawiki> tag, got " .
			$this->reader->localName );
	}
	$this->debug( "<mediawiki> tag is correct." );

	$this->debug( "Starting primary dump processing loop." );

	$hasNode = $this->reader->read();
	$pendingException = null;
	try {
		while ( $hasNode ) {
			$tag = $this->reader->localName;
			$type = $this->reader->nodeType;

			// Set when the current node's whole subtree must be jumped over
			$skipSubtree = false;

			if ( !Hooks::run( 'ImportHandleToplevelXMLTag', [ $this ] ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skipSubtree = true;
			}

			if ( $skipSubtree ) {
				$hasNode = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$hasNode = $this->reader->read();
			}
		}
	} catch ( Exception $ex ) {
		// Held back so the cleanup below always runs first
		$pendingException = $ex;
	}

	// finally
	libxml_disable_entity_loader( $oldDisable );
	$this->reader->close();

	if ( $pendingException ) {
		throw $pendingException;
	}

	return true;
}
||
595 | |||
/**
 * Reads a <siteinfo> element: collects the plain fields into an array,
 * records every <namespace> entry in $this->foreignNamespaces (keyed by its
 * 'key' attribute), and passes the result to the site-info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$siteInfo = [];

	// Fields that can just be stuffed in the siteInfo object
	$plainFields = [ 'sitename', 'base', 'generator', 'case' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// The attribute has to be read before nodeContents() advances the reader
			$namespaceKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$namespaceKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $plainFields ) ) {
			$siteInfo[$tag] = $this->nodeContents();
		}
	}

	$siteInfo['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $siteInfo );
}
||
622 | |||
/**
 * Reads a <logitem> element: gives the 'ImportHandleLogItemXMLTag' hook the
 * first chance at every node, collects the plain fields and the contributor
 * into an array, and hands that array to processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$logInfo = [];

	// Fields that can just be stuffed in the logInfo array
	$plainFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		$hookArgs = [ $this, $logInfo ];
		if ( !Hooks::run( 'ImportHandleLogItemXMLTag', $hookArgs ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
||
654 | |||
/**
 * Builds a WikiRevision from the fields collected for one log item and
 * passes it to the log-item callback.
 *
 * 'type' and 'action' are read unconditionally; every other field is applied
 * only when present. A missing contributor username becomes 'Unknown user'.
 *
 * @param array $logInfo
 * @return bool|mixed Result of the log-item callback
 */
private function processLogItem( $logInfo ) {
	$item = new WikiRevision( $this->config );

	if ( isset( $logInfo['id'] ) ) {
		$item->setID( $logInfo['id'] );
	}

	$item->setType( $logInfo['type'] );
	$item->setAction( $logInfo['action'] );

	if ( isset( $logInfo['timestamp'] ) ) {
		$item->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$item->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$item->setTitle( $logTitle );
	}

	$item->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$item->setComment( $logInfo['comment'] );
	}

	if ( isset( $logInfo['contributor']['ip'] ) ) {
		$item->setUserIP( $logInfo['contributor']['ip'] );
	}

	$username = isset( $logInfo['contributor']['username'] )
		? $logInfo['contributor']['username']
		: 'Unknown user';
	$item->setUsername( $username );

	return $this->logItemCallback( $item );
}
||
698 | |||
/**
 * Reads a <page> element.
 *
 * Plain fields are collected into $pageInfo. On the first <revision> or
 * <upload> the title is resolved through processTitle(); when that yields an
 * array the page callback is fired and each revision/upload is handed to its
 * handler, otherwise the rest of the page is skipped. The page-out callback
 * fires at the end if a title was resolved.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	// When true, the next advance jumps over the current node's subtree
	$jumpToSibling = false;
	$titleRejected = false;

	while ( $jumpToSibling ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'page';
		if ( $atClosingTag ) {
			break;
		}

		$jumpToSibling = false;

		$tag = $this->reader->localName;

		if ( $titleRejected ) {
			// The title is invalid, bail out of this page
			$jumpToSibling = true;
			continue;
		}

		if ( !Hooks::run( 'ImportHandlePageXMLTag', [ $this,
			&$pageInfo ] ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
			continue;
		}

		if ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $title ) ) {
				$title = $this->processTitle( $pageInfo['title'],
					isset( $pageInfo['ns'] ) ? $pageInfo['ns'] : null );

				// $title is either an array of two titles or false.
				if ( is_array( $title ) ) {
					$this->pageCallback( $title );
					$pageInfo['_title'] = $title[0];
					$foreignTitle = $title[1];
				} else {
					$titleRejected = true;
					$jumpToSibling = true;
				}
			}

			if ( $title ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$jumpToSibling = true;
		}
	}

	// @note $pageInfo['_title'] is only set above when processTitle() returned
	// an array, in which case pageCallback was called and $foreignTitle was
	// assigned from the same array. If '_title' is absent, $foreignTitle was
	// not assigned by this method either.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$this->pageOutCallback( $pageInfo['_title'], $foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo );
	}
}
||
779 | |||
/**
 * Reads a <revision> element: gives the 'ImportHandleRevisionXMLTag' hook
 * the first chance at every node, collects the plain fields and the
 * contributor, then hands the result to processRevision().
 *
 * @param array &$pageInfo Page information; 'revisionCount' is incremented
 *   for every revision read and 'successfulRevisionCount' for every revision
 *   for which processRevision() returned a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'timestamp', 'comment', 'minor', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// Only the unhandled tag itself is skipped with next(); without this
		// reset one unknown tag would switch every later advance to next().
		// This matches the way handlePage() treats its own skip flag.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !Hooks::run( 'ImportHandleRevisionXMLTag', [
			$this, $pageInfo, $revisionInfo
		] ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
||
818 | |||
/**
 * Builds a WikiRevision from the fields collected for one revision and
 * passes it to the revision callback.
 *
 * @param array $pageInfo Page information; '_title' is used as the revision's title
 * @param array $revisionInfo Fields collected by handleRevision()
 * @return bool|mixed Result of the revision callback
 * @throws MWException If the text of a revision with a size-checked content
 *   model is larger than $wgMaxArticleSize kilobytes
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	global $wgMaxArticleSize;

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$modelIsSizeChecked = !isset( $revisionInfo['model'] ) ||
		in_array( $revisionInfo['model'], $sizeCheckedModels );

	// A revision may carry no text at all (that case is handled further
	// down), so only measure the text when it is actually present.
	if ( $modelIsSizeChecked &&
		isset( $revisionInfo['text'] ) &&
		strlen( $revisionInfo['text'] ) > $wgMaxArticleSize * 1024
	) {
		throw new MWException( 'The text of ' .
			( isset( $revisionInfo['id'] ) ?
				"the revision with ID $revisionInfo[id]" :
				'a revision'
			) . " exceeds the maximum allowable size ($wgMaxArticleSize KB)" );
	}

	$revision = new WikiRevision( $this->config );

	if ( isset( $revisionInfo['id'] ) ) {
		$revision->setID( $revisionInfo['id'] );
	}
	if ( isset( $revisionInfo['model'] ) ) {
		$revision->setModel( $revisionInfo['model'] );
	}
	if ( isset( $revisionInfo['format'] ) ) {
		$revision->setFormat( $revisionInfo['format'] );
	}
	$revision->setTitle( $pageInfo['_title'] );

	if ( isset( $revisionInfo['text'] ) ) {
		// Let the content handler transform the raw text before storing it
		$handler = $revision->getContentHandler();
		$text = $handler->importTransform(
			$revisionInfo['text'],
			$revision->getFormat() );

		$revision->setText( $text );
	}
	if ( isset( $revisionInfo['timestamp'] ) ) {
		$revision->setTimestamp( $revisionInfo['timestamp'] );
	} else {
		$revision->setTimestamp( wfTimestampNow() );
	}

	if ( isset( $revisionInfo['comment'] ) ) {
		$revision->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of the 'minor' field marks the revision as minor
	if ( isset( $revisionInfo['minor'] ) ) {
		$revision->setMinor( true );
	}
	if ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $revisionInfo['contributor']['ip'] );
	} elseif ( isset( $revisionInfo['contributor']['username'] ) ) {
		$revision->setUsername( $revisionInfo['contributor']['username'] );
	} else {
		$revision->setUsername( 'Unknown user' );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $revision );
}
||
894 | |||
/**
 * Read the children of the current <upload> element and, when upload
 * importing is enabled, hand the collected data to processUpload().
 *
 * @param array &$pageInfo Page data; passed to the ImportHandleUploadXMLTag
 *   hook and to processUpload()
 * @return mixed Result of processUpload(), or null when $this->mImportUploads
 *   is falsy
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );

	// Child tags whose contents are stored under the tag's own name.
	$plainTags = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];
	$upload = [];

	// Once set, the loop advances with next() instead of read(). Note that
	// the flag is never cleared again inside this method.
	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$tag = $this->reader->localName;

		if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $tag == 'upload' ) {
			// Closing </upload>: nothing more to read.
			break;
		}

		if ( !Hooks::run( 'ImportHandleUploadXMLTag', [
			$this, $pageInfo
		] ) ) {
			// The hook returned false: do nothing further with this node.
			continue;
		}

		if ( in_array( $tag, $plainTags ) ) {
			$upload[$tag] = $this->nodeContents();
			continue;
		}

		if ( $tag == 'contributor' ) {
			$upload['contributor'] = $this->handleContributor();
			continue;
		}

		if ( $tag == 'contents' ) {
			// The contents are always read, but only kept when they are
			// declared as base64; they are then decoded into a temporary file.
			$rawContents = $this->nodeContents();
			if ( $this->reader->getAttribute( 'encoding' ) === 'base64' ) {
				$upload['fileSrc'] = $this->dumpTemp( base64_decode( $rawContents ) );
				$upload['isTempSrc'] = true;
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	// A file found under the configured image base path replaces any file
	// source collected from the XML above.
	if ( $this->mImageBasePath && isset( $upload['rel'] ) ) {
		$localPath = $this->mImageBasePath . '/' . $upload['rel'];
		if ( file_exists( $localPath ) ) {
			$upload['fileSrc'] = $localPath;
			$upload['isTempSrc'] = false;
		}
	}

	if ( !$this->mImportUploads ) {
		return null;
	}

	return $this->processUpload( $pageInfo, $upload );
}
||
949 | |||
/**
 * Write the given data to a newly created temporary file.
 *
 * The file is created with tempnam() in the directory returned by
 * wfTempDir(), using the name prefix 'importupload'. Neither the tempnam()
 * nor the file_put_contents() result is checked.
 *
 * @param string $contents Data to write
 * @return string Name of the temporary file, as returned by tempnam()
 */
private function dumpTemp( $contents ) {
	$tempPath = tempnam( wfTempDir(), 'importupload' );

	file_put_contents( $tempPath, $contents );

	return $tempPath;
}
||
959 | |||
/**
 * Build a WikiRevision describing an uploaded file and pass it to the
 * upload callback.
 *
 * @param array $pageInfo Page data; the '_title' and 'id' entries are read
 * @param array $uploadInfo Collected upload fields. 'timestamp', 'filename',
 *   'src', 'size' and 'comment' are read unconditionally; 'text',
 *   'archivename', 'fileSrc', 'isTempSrc', 'sha1base36' and 'contributor'
 *   are optional
 * @return mixed Whatever the upload callback returns
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision( $this->config );

	// A missing text is treated as an empty string.
	if ( isset( $uploadInfo['text'] ) ) {
		$bodyText = $uploadInfo['text'];
	} else {
		$bodyText = '';
	}

	$upload->setTitle( $pageInfo['_title'] );
	$upload->setID( $pageInfo['id'] );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setText( $bodyText );
	$upload->setFilename( $uploadInfo['filename'] );

	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}

	$upload->setSrc( $uploadInfo['src'] );

	if ( isset( $uploadInfo['fileSrc'] ) ) {
		// The second argument says whether the source is a temporary file.
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}

	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}

	$upload->setSize( (int)$uploadInfo['size'] );
	$upload->setComment( $uploadInfo['comment'] );

	// Both checks are independent: when an IP and a username are both
	// present, both setters are called.
	$contributor = isset( $uploadInfo['contributor'] ) ? $uploadInfo['contributor'] : [];
	if ( isset( $contributor['ip'] ) ) {
		$upload->setUserIP( $contributor['ip'] );
	}
	if ( isset( $contributor['username'] ) ) {
		$upload->setUsername( $contributor['username'] );
	}

	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
||
998 | |||
/**
 * Collect the 'id', 'ip' and 'username' children of the current
 * <contributor> element.
 *
 * @return array Contents of the recognised child tags, keyed by tag name;
 *   empty when the element is empty or has none of them
 */
private function handleContributor() {
	$contributor = [];

	if ( !$this->reader->isEmptyElement ) {
		while ( $this->reader->read() ) {
			$tag = $this->reader->localName;

			if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $tag == 'contributor' ) {
				// Closing </contributor> reached.
				break;
			}

			// Any other node is ignored.
			switch ( $tag ) {
				case 'id':
				case 'ip':
				case 'username':
					$contributor[$tag] = $this->nodeContents();
					break;
			}
		}
	}

	return $contributor;
}
||
1024 | |||
/**
 * Turn a title string from the import source into a local title and check
 * that the page may be imported.
 *
 * A notice is emitted and false is returned when no title could be created,
 * when the title is external, when it cannot exist, or (outside command-line
 * mode) when the 'edit' check fails or the page does not exist and the
 * 'create' check fails.
 *
 * @param string $text Title text from the import source
 * @param string|null $ns Namespace from the import source; converted to an
 *   integer before use
 * @return array|bool [ $title, $foreignTitle ] on success, false otherwise
 */
private function processTitle( $text, $ns = null ) {
	// Without foreign namespace data only the naive factory can be used.
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory()
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, (int)$ns );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	$isCli = $this->config->get( 'CommandLineMode' );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->userCan( 'edit' ) && !$isCli ) {
		# Do not import if the importing wiki user cannot edit this page
		$this->notice( 'import-error-edit', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->exists() && !$title->userCan( 'create' ) && !$isCli ) {
		# Do not import if the importing wiki user cannot create this page
		$this->notice( 'import-error-create', $title->getPrefixedText() );
		return false;
	}

	return [ $title, $foreignTitle ];
}
||
1067 | } |
||
1068 |
Declaring only a single property per statement makes it easier to add doc comments later on.
It is also recommended by PSR-2, so it is a common style that many people expect.