Completed
Branch master (939199)
by
unknown
39:35
created

includes/import/WikiImporter.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
/**
3
 * MediaWiki page data importer.
4
 *
5
 * Copyright © 2003,2005 Brion Vibber <[email protected]>
6
 * https://www.mediawiki.org/
7
 *
8
 * This program is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License along
19
 * with this program; if not, write to the Free Software Foundation, Inc.,
20
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21
 * http://www.gnu.org/copyleft/gpl.html
22
 *
23
 * @file
24
 * @ingroup SpecialPage
25
 */
26
27
/**
28
 * XML file reader for the page data importer.
29
 *
30
 * implements Special:Import
31
 * @ingroup SpecialPage
32
 */
33
class WikiImporter {
34
	/** @var XMLReader|null Reader opened on the import source by the constructor */
	private $reader = null;
	/** @var array|null Namespace names collected from <siteinfo>, keyed by the "key" attribute */
	private $foreignNamespaces = null;
	/** @var callable|null Called for each log item */
	private $mLogItemCallback;
	/** @var callable|null Called for each file upload version */
	private $mUploadCallback;
	/** @var callable|null Called for each page revision */
	private $mRevisionCallback;
	/** @var callable|null Called when a new page is reached */
	private $mPageCallback;
	/** @var callable|null Called when site info is encountered */
	private $mSiteInfoCallback;
	/** @var callable|null Called when a page is completed */
	private $mPageOutCallback;
0 ignored issues
show
It is generally advisable to only define one property per statement.

Declaring only a single property per statement makes it easier to add doc comments later on.

It is also recommended by PSR2, so it is a common style that many people expect.

Loading history...
38
	/** @var callable|null Receives notice messages; when not callable, notice() echoes them */
	private $mNoticeCallback;
	/** @var bool|null When truthy, debug() forwards its messages to wfDebug() */
	private $mDebug;
	/** @var bool|null When truthy, handleUpload() passes uploads on to processUpload() */
	private $mImportUploads;
	/** @var string|null Directory prefix joined with an upload's "rel" value to locate its file */
	private $mImageBasePath;
	/** @var bool Value handed to WikiRevision::setNoUpdates() for everything imported */
	private $mNoUpdates = false;
	/** @var Config */
	private $config;
	/** @var ImportTitleFactory */
	private $importTitleFactory;
	/** @var array isCountable() result per page before import, keyed by 'title_' . prefixed title text */
	private $countableCache = [];
47
48
	/**
	 * Creates an ImportXMLReader drawing from the source provided
	 *
	 * Registers the source with UploadSourceAdapter, opens an XMLReader on the
	 * resulting "uploadsource://" URL and installs the default callbacks and
	 * the default (naive) import title factory.
	 *
	 * @param ImportSource $source
	 * @param Config $config Omitting it is deprecated; the 'main' config is used instead
	 * @throws Exception If the XMLReader class is not available
	 * @throws MWException If the reader could not be opened on the source
	 */
	function __construct( ImportSource $source, Config $config = null ) {
		if ( !class_exists( 'XMLReader' ) ) {
			throw new Exception( 'Import requires PHP to have been compiled with libxml support' );
		}

		$this->reader = new XMLReader();
		if ( !$config ) {
			wfDeprecated( __METHOD__ . ' without a Config instance', '1.25' );
			$config = ConfigFactory::getDefaultInstance()->makeConfig( 'main' );
		}
		$this->config = $config;

		if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
			stream_wrapper_register( 'uploadsource', 'UploadSourceAdapter' );
		}
		$id = UploadSourceAdapter::registerSource( $source );

		// Enable the entity loader, as it is needed for loading external URLs via
		// XMLReader::open (T86036)
		$oldDisable = libxml_disable_entity_loader( false );
		if ( defined( 'LIBXML_PARSEHUGE' ) ) {
			$status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
		} else {
			$status = $this->reader->open( "uploadsource://$id" );
		}
		if ( !$status ) {
			$error = libxml_get_last_error();
			libxml_disable_entity_loader( $oldDisable );
			// libxml_get_last_error() returns false when libxml recorded no error,
			// so only read ->message when an error object is actually available.
			$detail = $error ? $error->message : 'unknown libxml error';
			throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
				$detail );
		}
		libxml_disable_entity_loader( $oldDisable );

		// Default callbacks
		$this->setPageCallback( [ $this, 'beforeImportPage' ] );
		$this->setRevisionCallback( [ $this, "importRevision" ] );
		$this->setUploadCallback( [ $this, 'importUpload' ] );
		$this->setLogItemCallback( [ $this, 'importLogItem' ] );
		$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

		$this->importTitleFactory = new NaiveImportTitleFactory();
	}
96
97
	/**
	 * Accessor for the XMLReader this importer reads from.
	 *
	 * @return null|XMLReader
	 */
	public function getReader() {
		return $this->reader;
	}
103
104
	/**
	 * Report an XML error. The message goes to debug() and is always written
	 * with wfDebug(); despite the method name, nothing is thrown here.
	 *
	 * @param string $err
	 */
	public function throwXmlError( $err ) {
		$this->debug( 'FAILURE: ' . $err );
		wfDebug( 'WikiImporter XML error: ' . $err . "\n" );
	}
108
109
	/**
	 * Write a message with wfDebug(), but only while debug mode is on.
	 *
	 * @param string $data
	 */
	public function debug( $data ) {
		if ( !$this->mDebug ) {
			return;
		}
		wfDebug( 'IMPORT: ' . $data . "\n" );
	}
114
115
	/**
	 * Write a message with wfDebug() regardless of the debug mode setting.
	 *
	 * @param string $data
	 */
	public function warn( $data ) {
		wfDebug( 'IMPORT: ' . $data . "\n" );
	}
118
119
	/**
	 * Emit a notice. The message key and its parameters are handed to the
	 * notice callback when one is callable; otherwise the message text is echoed.
	 *
	 * @param string $msg Message key; any further arguments are message parameters
	 */
	public function notice( $msg /*, $param, ...*/ ) {
		// Everything after the message key is a message parameter
		$params = array_slice( func_get_args(), 1 );

		if ( !is_callable( $this->mNoticeCallback ) ) {
			# No ImportReporter -> CLI
			echo wfMessage( $msg, $params )->text() . "\n";
			return;
		}

		call_user_func( $this->mNoticeCallback, $msg, $params );
	}
129
130
	/**
	 * Switch debug mode on or off; debug() only logs while it is on.
	 *
	 * @param bool $debug
	 */
	public function setDebug( $debug ) {
		$this->mDebug = $debug;
	}
137
138
	/**
	 * Set 'no updates' mode. In this mode, the link tables will not be updated by the importer.
	 * The flag is passed on to each WikiRevision this importer creates.
	 *
	 * @param bool $noupdates
	 */
	public function setNoUpdates( $noupdates ) {
		$this->mNoUpdates = $noupdates;
	}
145
146
	/**
	 * Set a callback that displays notice messages.
	 * The assignment is delegated to wfSetVar().
	 *
	 * @param callable $callback
	 * @return callable Whatever wfSetVar() returns
	 */
	public function setNoticeCallback( $callback ) {
		$result = wfSetVar( $this->mNoticeCallback, $callback );

		return $result;
	}
155
156
	/**
	 * Sets the action to perform as each new page in the stream is reached.
	 *
	 * @param callable $callback
	 * @return callable The callback that was installed before this call
	 */
	public function setPageCallback( $callback ) {
		$replaced = $this->mPageCallback;
		$this->mPageCallback = $callback;

		return $replaced;
	}
166
167
	/**
	 * Sets the action to perform as each page in the stream is completed.
	 * Callback accepts the page title (as a Title object), a second object
	 * with the original title form (in case it's been overridden into a
	 * local namespace), and a count of revisions.
	 *
	 * @param callable $callback
	 * @return callable The callback that was installed before this call
	 */
	public function setPageOutCallback( $callback ) {
		$replaced = $this->mPageOutCallback;
		$this->mPageOutCallback = $callback;

		return $replaced;
	}
181
182
	/**
	 * Sets the action to perform as each page revision is reached.
	 *
	 * @param callable $callback
	 * @return callable The callback that was installed before this call
	 */
	public function setRevisionCallback( $callback ) {
		$replaced = $this->mRevisionCallback;
		$this->mRevisionCallback = $callback;

		return $replaced;
	}
192
193
	/**
	 * Sets the action to perform as each file upload version is reached.
	 *
	 * @param callable $callback
	 * @return callable The callback that was installed before this call
	 */
	public function setUploadCallback( $callback ) {
		$replaced = $this->mUploadCallback;
		$this->mUploadCallback = $callback;

		return $replaced;
	}
203
204
	/**
	 * Sets the action to perform as each log item is reached.
	 *
	 * @param callable $callback
	 * @return callable The callback that was installed before this call
	 */
	public function setLogItemCallback( $callback ) {
		$replaced = $this->mLogItemCallback;
		$this->mLogItemCallback = $callback;

		return $replaced;
	}
214
215
	/**
	 * Sets the action to perform when site info is encountered.
	 *
	 * @param callable $callback
	 * @return callable The callback that was installed before this call
	 */
	public function setSiteInfoCallback( $callback ) {
		$replaced = $this->mSiteInfoCallback;
		$this->mSiteInfoCallback = $callback;

		return $replaced;
	}
225
226
	/**
	 * Sets the factory object used to turn ForeignTitle objects into local
	 * Title objects.
	 *
	 * @param ImportTitleFactory $factory
	 */
	public function setImportTitleFactory( $factory ) {
		$this->importTitleFactory = $factory;
	}
234
235
	/**
	 * Set a target namespace to override the defaults.
	 *
	 * Passing null restores the naive title factory (no override). Any other
	 * value must be non-negative and name an existing namespace.
	 *
	 * @param null|int $namespace
	 * @return bool False when the namespace was rejected, true otherwise
	 */
	public function setTargetNamespace( $namespace ) {
		if ( is_null( $namespace ) ) {
			// Don't override namespaces
			$this->setImportTitleFactory( new NaiveImportTitleFactory() );
			return true;
		}

		// Reject negative values and namespaces unknown to this wiki
		if ( !( $namespace >= 0 ) || !MWNamespace::exists( intval( $namespace ) ) ) {
			return false;
		}

		$targetNs = intval( $namespace );
		$this->setImportTitleFactory( new NamespaceImportTitleFactory( $targetNs ) );
		return true;
	}
256
257
	/**
	 * Set a target root page under which all pages are imported.
	 *
	 * Null restores the naive title factory; an empty string changes nothing.
	 * Otherwise the root page must parse to a local title in a namespace that
	 * has subpages, or the returned Status is made fatal.
	 *
	 * @param null|string $rootpage
	 * @return Status
	 */
	public function setTargetRootPage( $rootpage ) {
		global $wgContLang;

		$status = Status::newGood();

		if ( is_null( $rootpage ) ) {
			// No rootpage
			$this->setImportTitleFactory( new NaiveImportTitleFactory() );
			return $status;
		}
		if ( $rootpage === '' ) {
			return $status;
		}

		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
			return $status;
		}

		if ( MWNamespace::hasSubpages( $title->getNamespace() ) ) {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			$this->setImportTitleFactory( new SubpageImportTitleFactory( $title ) );
			return $status;
		}

		// The namespace has no subpages: report it by its display name
		if ( $title->getNamespace() == NS_MAIN ) {
			$displayNSText = wfMessage( 'blanknamespace' )->text();
		} else {
			$displayNSText = $wgContLang->getNsText( $title->getNamespace() );
		}
		$status->fatal( 'import-rootpage-nosubpage', $displayNSText );

		return $status;
	}
290
291
	/**
	 * Set the directory that handleUpload() joins with an upload's "rel"
	 * value when looking for the file on disk.
	 *
	 * @param string $dir
	 */
	public function setImageBasePath( $dir ) {
		$this->mImageBasePath = $dir;
	}
297
298
	/**
	 * Choose whether uploads found in the dump are processed.
	 *
	 * @param bool $import
	 */
	public function setImportUploads( $import ) {
		$this->mImportUploads = $import;
	}
304
305
	/**
	 * Default per-page callback. Remembers whether the page is countable
	 * before the import so finishImportPage() can adjust the article count.
	 *
	 * @param array $titleAndForeignTitle Two-element array, with Title object at
	 * index 0 and ForeignTitle object at index 1
	 * @return bool Always true
	 */
	public function beforeImportPage( $titleAndForeignTitle ) {
		$localTitle = $titleAndForeignTitle[0];
		$wikiPage = WikiPage::factory( $localTitle );
		$cacheKey = 'title_' . $localTitle->getPrefixedText();
		$this->countableCache[$cacheKey] = $wikiPage->isCountable();

		return true;
	}
317
318
	/**
	 * Default per-revision callback, performs the import.
	 *
	 * A notice is emitted and false returned when the revision's content
	 * handler cannot be used on its title, or when importing raises an
	 * MWContentSerializationException.
	 *
	 * @param WikiRevision $revision
	 * @return bool
	 */
	public function importRevision( $revision ) {
		if ( $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
			try {
				return $revision->importOldRevision();
			} catch ( MWContentSerializationException $ex ) {
				$errorKey = 'import-error-unserialize';
			}
		} else {
			$errorKey = 'import-error-bad-location';
		}

		// Both failure modes report the same details under a different message key
		$this->notice( $errorKey,
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat() );

		return false;
	}
346
347
	/**
	 * Default per-log-item callback, performs the import by delegating to
	 * WikiRevision::importLogItem().
	 *
	 * @param WikiRevision $revision Revision object carrying the log item data
	 * @return bool
	 */
	public function importLogItem( $revision ) {
		return $revision->importLogItem();
	}
355
356
	/**
	 * Default per-upload callback, performs the import by delegating to
	 * WikiRevision::importUpload().
	 *
	 * @param WikiRevision $revision Revision object carrying the upload data
	 * @return bool
	 */
	public function importUpload( $revision ) {
		return $revision->importUpload();
	}
364
365
	/**
	 * Default page-out callback: adjusts the article count statistic and then
	 * runs the 'AfterImportPage' hook with the arguments it was given.
	 * Mostly for hook use.
	 *
	 * @param Title $title
	 * @param ForeignTitle $foreignTitle
	 * @param int $revCount
	 * @param int $sRevCount
	 * @param array $pageInfo
	 * @return bool Result of the 'AfterImportPage' hook
	 */
	public function finishImportPage( $title, $foreignTitle, $revCount,
			$sRevCount, $pageInfo ) {

		// Update article count statistics (T42009)
		// The normal counting logic in WikiPage->doEditUpdates() is designed for
		// one-revision-at-a-time editing, not bulk imports. In this situation it
		// suffers from issues of replica DB lag. We let WikiPage handle the total page
		// and revision count, and we implement our own custom logic for the
		// article (content page) count.
		$page = WikiPage::factory( $title );
		$page->loadPageData( 'fromdbmaster' );
		$content = $page->getContent();

		if ( $content !== null ) {
			$editInfo = $page->prepareContentForEdit( $content );
			$countKey = 'title_' . $title->getPrefixedText();
			$countable = $page->isCountable( $editInfo );

			// Only touch the statistic when the countable state actually flipped
			$wasRecorded = array_key_exists( $countKey, $this->countableCache );
			if ( $wasRecorded && $countable != $this->countableCache[$countKey] ) {
				$delta = (int)$countable - (int)$this->countableCache[$countKey];
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => $delta
				] ) );
			}
		} else {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $title .
				' because WikiPage::getContent() returned null' );
		}

		$hookArgs = func_get_args();

		return Hooks::run( 'AfterImportPage', $hookArgs );
	}
404
405
	/**
	 * Alternate per-revision callback, for debugging: dumps the revision's
	 * title, user, timestamp, comment and text through debug().
	 *
	 * @param WikiRevision $revision
	 */
	public function debugRevisionHandler( &$revision ) {
		$this->debug( 'Got revision:' );

		$titleText = is_object( $revision->title )
			? $revision->title->getPrefixedText()
			: '<invalid>';
		$this->debug( '-- Title: ' . $titleText );

		$this->debug( '-- User: ' . $revision->user_text );
		$this->debug( '-- Timestamp: ' . $revision->timestamp );
		$this->debug( '-- Comment: ' . $revision->comment );
		$this->debug( '-- Text: ' . $revision->text );
	}
421
422
	/**
	 * Notify the callback function of site info.
	 * The callback receives the site info array and this importer.
	 *
	 * @param array $siteInfo
	 * @return bool|mixed The callback's return value, or false when none is set
	 */
	private function siteInfoCallback( $siteInfo ) {
		if ( !isset( $this->mSiteInfoCallback ) ) {
			return false;
		}

		$callbackArgs = [ $siteInfo, $this ];

		return call_user_func_array( $this->mSiteInfoCallback, $callbackArgs );
	}
435
436
	/**
	 * Notify the callback function when a new "<page>" is reached.
	 *
	 * @param array $title Two-element array as passed by handlePage(): the local
	 *  Title object at index 0 and the ForeignTitle object at index 1
	 */
	function pageCallback( $title ) {
		if ( isset( $this->mPageCallback ) ) {
			call_user_func( $this->mPageCallback, $title );
		}
	}
445
446
	/**
	 * Notify the callback function when a "</page>" is closed.
	 * All arguments are forwarded to the callback unchanged.
	 *
	 * @param Title $title
	 * @param ForeignTitle $foreignTitle
	 * @param int $revCount
	 * @param int $sucCount Number of revisions for which callback returned true
	 * @param array $pageInfo Associative array of page information
	 */
	private function pageOutCallback( $title, $foreignTitle, $revCount,
			$sucCount, $pageInfo ) {
		if ( !isset( $this->mPageOutCallback ) ) {
			return;
		}

		$forwarded = func_get_args();
		call_user_func_array( $this->mPageOutCallback, $forwarded );
	}
461
462
	/**
	 * Notify the callback function of a revision.
	 * The callback receives the revision and this importer.
	 *
	 * @param WikiRevision $revision
	 * @return bool|mixed The callback's return value, or false when none is set
	 */
	private function revisionCallback( $revision ) {
		if ( !isset( $this->mRevisionCallback ) ) {
			return false;
		}

		$callbackArgs = [ $revision, $this ];

		return call_user_func_array( $this->mRevisionCallback, $callbackArgs );
	}
475
476
	/**
	 * Notify the callback function of a new log item.
	 * The callback receives the revision object and this importer.
	 *
	 * @param WikiRevision $revision
	 * @return bool|mixed The callback's return value, or false when none is set
	 */
	private function logItemCallback( $revision ) {
		if ( !isset( $this->mLogItemCallback ) ) {
			return false;
		}

		$callbackArgs = [ $revision, $this ];

		return call_user_func_array( $this->mLogItemCallback, $callbackArgs );
	}
489
490
	/**
	 * Retrieves the contents of the named attribute of the current element.
	 *
	 * This is a thin wrapper around XMLReader::getAttribute(), so a missing
	 * attribute is reported as null rather than as an empty string.
	 *
	 * @param string $attr The name of the attribute
	 * @return string|null The value of the attribute, or null if it is not set in the
	 *  current element.
	 */
	public function nodeAttribute( $attr ) {
		return $this->reader->getAttribute( $attr );
	}
499
500
	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * Text, CDATA and significant-whitespace nodes are concatenated until the
	 * first end-element node is read. If the input runs out before that, the
	 * reader is closed and an empty string is returned.
	 *
	 * @return string
	 * @access private
	 */
	public function nodeContents() {
		if ( $this->reader->isEmptyElement ) {
			return "";
		}

		$textNodeTypes = [
			XMLReader::TEXT,
			XMLReader::CDATA,
			XMLReader::SIGNIFICANT_WHITESPACE,
		];

		$collected = "";
		while ( $this->reader->read() ) {
			$nodeType = $this->reader->nodeType;

			if ( $nodeType === XMLReader::END_ELEMENT ) {
				return $collected;
			}
			if ( in_array( $nodeType, $textNodeTypes, true ) ) {
				$collected .= $this->reader->value;
			}
		}

		// Ran out of input before the element was closed
		$this->reader->close();
		return '';
	}
527
528
	/**
	 * Primary entry point: reads the dump and dispatches each top-level
	 * element (<siteinfo>, <page>, <logitem>) to its handler.
	 *
	 * Whatever happens, the previous libxml entity loader setting is restored
	 * and the reader is closed before this method returns or throws. That
	 * includes the case where the root element is not <mediawiki>.
	 *
	 * @throws MWException If the root element is not <mediawiki>
	 * @throws Exception Any exception raised while processing is re-thrown after cleanup
	 * @return bool Always true when no exception is thrown
	 */
	public function doImport() {
		// Calls to reader->read need to be wrapped in calls to
		// libxml_disable_entity_loader() to avoid local file
		// inclusion attacks (bug 46932).
		$oldDisable = libxml_disable_entity_loader( true );
		$rethrow = null;

		try {
			$this->reader->read();

			if ( $this->reader->localName != 'mediawiki' ) {
				// Thrown inside the try block so the shared cleanup below also
				// closes the reader on this path.
				throw new MWException( "Expected <mediawiki> tag, got " .
					$this->reader->localName );
			}
			$this->debug( "<mediawiki> tag is correct." );

			$this->debug( "Starting primary dump processing loop." );

			$keepReading = $this->reader->read();
			$skip = false;
			while ( $keepReading ) {
				$tag = $this->reader->localName;
				$type = $this->reader->nodeType;

				if ( !Hooks::run( 'ImportHandleToplevelXMLTag', [ $this ] ) ) {
					// Do nothing
				} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
					break;
				} elseif ( $tag == 'siteinfo' ) {
					$this->handleSiteInfo();
				} elseif ( $tag == 'page' ) {
					$this->handlePage();
				} elseif ( $tag == 'logitem' ) {
					$this->handleLogItem();
				} elseif ( $tag != '#text' ) {
					$this->warn( "Unhandled top-level XML tag $tag" );

					$skip = true;
				}

				if ( $skip ) {
					// Jump over the whole unhandled element, children included
					$keepReading = $this->reader->next();
					$skip = false;
					$this->debug( "Skip" );
				} else {
					$keepReading = $this->reader->read();
				}
			}
		} catch ( Exception $ex ) {
			$rethrow = $ex;
		}

		// finally
		libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();

		if ( $rethrow ) {
			throw $rethrow;
		}

		return true;
	}
595
596
	/**
	 * Read a <siteinfo> element: namespace names go into $this->foreignNamespaces,
	 * a few plain fields go into the site info array, and the result is handed
	 * to the site info callback.
	 */
	private function handleSiteInfo() {
		$this->debug( "Enter site info handler." );
		$siteInfo = [];

		// Fields that can just be stuffed in the siteInfo object
		$normalFields = [ 'sitename', 'base', 'generator', 'case' ];

		while ( $this->reader->read() ) {
			$isClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
				&& $this->reader->localName == 'siteinfo';
			if ( $isClosingTag ) {
				break;
			}

			$tag = $this->reader->localName;

			if ( $tag == 'namespace' ) {
				// Read the key first: nodeContents() advances the reader
				$nsKey = $this->nodeAttribute( 'key' );
				$this->foreignNamespaces[$nsKey] = $this->nodeContents();
				continue;
			}

			if ( in_array( $tag, $normalFields ) ) {
				$siteInfo[$tag] = $this->nodeContents();
			}
		}

		$siteInfo['_namespaces'] = $this->foreignNamespaces;
		$this->siteInfoCallback( $siteInfo );
	}
622
623
	/**
	 * Read a <logitem> element into an array and pass it to processLogItem().
	 * The 'ImportHandleLogItemXMLTag' hook is run for every node and can
	 * suppress the default handling by returning false.
	 */
	private function handleLogItem() {
		$this->debug( "Enter log item handler." );
		$logInfo = [];

		// Fields that can just be stuffed in the pageInfo object
		$normalFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
					'logtitle', 'params' ];

		while ( $this->reader->read() ) {
			$isClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
				&& $this->reader->localName == 'logitem';
			if ( $isClosingTag ) {
				break;
			}

			$tag = $this->reader->localName;

			if ( !Hooks::run( 'ImportHandleLogItemXMLTag', [ $this, $logInfo ] ) ) {
				// The hook took over; do nothing
				continue;
			}

			if ( in_array( $tag, $normalFields ) ) {
				$logInfo[$tag] = $this->nodeContents();
				continue;
			}

			if ( $tag == 'contributor' ) {
				$logInfo['contributor'] = $this->handleContributor();
				continue;
			}

			if ( $tag != '#text' ) {
				$this->warn( "Unhandled log-item XML tag $tag" );
			}
		}

		$this->processLogItem( $logInfo );
	}
654
655
	/**
	 * Build a WikiRevision from the collected log item fields and hand it to
	 * the log item callback.
	 *
	 * 'type' and 'action' are read unconditionally; the other fields are only
	 * applied when present. A missing contributor username becomes 'Unknown user'.
	 *
	 * @param array $logInfo
	 * @return bool|mixed Return value of the log item callback
	 */
	private function processLogItem( $logInfo ) {
		$logRevision = new WikiRevision( $this->config );

		if ( isset( $logInfo['id'] ) ) {
			$logRevision->setID( $logInfo['id'] );
		}
		$logRevision->setType( $logInfo['type'] );
		$logRevision->setAction( $logInfo['action'] );
		if ( isset( $logInfo['timestamp'] ) ) {
			$logRevision->setTimestamp( $logInfo['timestamp'] );
		}
		if ( isset( $logInfo['params'] ) ) {
			$logRevision->setParams( $logInfo['params'] );
		}
		if ( isset( $logInfo['logtitle'] ) ) {
			// @todo Using Title for non-local titles is a recipe for disaster.
			// We should use ForeignTitle here instead.
			$logTitle = Title::newFromText( $logInfo['logtitle'] );
			$logRevision->setTitle( $logTitle );
		}

		$logRevision->setNoUpdates( $this->mNoUpdates );

		if ( isset( $logInfo['comment'] ) ) {
			$logRevision->setComment( $logInfo['comment'] );
		}

		if ( isset( $logInfo['contributor']['ip'] ) ) {
			$logRevision->setUserIP( $logInfo['contributor']['ip'] );
		}

		$username = isset( $logInfo['contributor']['username'] )
			? $logInfo['contributor']['username']
			: 'Unknown user';
		$logRevision->setUsername( $username );

		return $this->logItemCallback( $logRevision );
	}
698
699
	/**
	 * Read a <page> element.
	 *
	 * Plain fields are collected into $pageInfo. The first <revision> or
	 * <upload> triggers title processing and the page callback; each one is
	 * then passed to its handler. If the title is rejected, the rest of the
	 * page is skipped. The page-out callback only runs for accepted titles.
	 */
	private function handlePage() {
		// Handle page data.
		$this->debug( "Enter page handler." );
		$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

		// Fields that can just be stuffed in the pageInfo object
		$normalFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

		$skipSubtree = false;
		$titleRejected = false;

		while ( true ) {
			// next() jumps over the current element's children, read() steps into them
			$advanced = $skipSubtree ? $this->reader->next() : $this->reader->read();
			if ( !$advanced ) {
				break;
			}

			if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
					$this->reader->localName == 'page' ) {
				break;
			}

			$skipSubtree = false;

			$tag = $this->reader->localName;

			if ( $titleRejected ) {
				// The title is invalid, bail out of this page
				$skipSubtree = true;
				continue;
			}

			if ( !Hooks::run( 'ImportHandlePageXMLTag', [ $this,
						&$pageInfo ] ) ) {
				// The hook took over; do nothing
				continue;
			}

			if ( in_array( $tag, $normalFields ) ) {
				// An XML snippet:
				// <page>
				//     <id>123</id>
				//     <title>Page</title>
				//     <redirect title="NewTitle"/>
				//     ...
				// Because the redirect tag is built differently, we need special handling for that case.
				$pageInfo[$tag] = ( $tag == 'redirect' )
					? $this->nodeAttribute( 'title' )
					: $this->nodeContents();
				continue;
			}

			if ( $tag == 'revision' || $tag == 'upload' ) {
				if ( !isset( $title ) ) {
					$title = $this->processTitle( $pageInfo['title'],
						isset( $pageInfo['ns'] ) ? $pageInfo['ns'] : null );

					// $title is either an array of two titles or false.
					if ( is_array( $title ) ) {
						$this->pageCallback( $title );
						list( $pageInfo['_title'], $foreignTitle ) = $title;
					} else {
						$titleRejected = true;
						$skipSubtree = true;
					}
				}

				if ( $title ) {
					if ( $tag == 'revision' ) {
						$this->handleRevision( $pageInfo );
					} else {
						$this->handleUpload( $pageInfo );
					}
				}
				continue;
			}

			if ( $tag != '#text' ) {
				$this->warn( "Unhandled page XML tag $tag" );
				$skipSubtree = true;
			}
		}

		// @note $pageInfo['_title'] is only set if a valid $title was processed
		//       above with no error. In that case pageCallback has been called
		//       and $foreignTitle is set too, since both come from $title, so
		//       pageOutCallback can be called here.
		if ( array_key_exists( '_title', $pageInfo ) ) {
			$this->pageOutCallback( $pageInfo['_title'], $foreignTitle,
					$pageInfo['revisionCount'],
					$pageInfo['successfulRevisionCount'],
					$pageInfo );
		}
	}
779
780
	/**
	 * Read a <revision> element, then process it and update the revision
	 * counters in $pageInfo.
	 *
	 * @param array &$pageInfo 'revisionCount' is always incremented;
	 *  'successfulRevisionCount' only when processRevision() returns a truthy value
	 */
	private function handleRevision( &$pageInfo ) {
		$this->debug( "Enter revision handler" );
		$revisionInfo = [];

		$normalFields = [ 'id', 'timestamp', 'comment', 'minor', 'model', 'format', 'text' ];

		// NOTE: once set, this flag is never cleared inside the loop, so every
		// step after the first unhandled tag uses next() instead of read().
		$useNext = false;

		while ( true ) {
			$advanced = $useNext ? $this->reader->next() : $this->reader->read();
			if ( !$advanced ) {
				break;
			}

			if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
					$this->reader->localName == 'revision' ) {
				break;
			}

			$tag = $this->reader->localName;

			if ( !Hooks::run( 'ImportHandleRevisionXMLTag', [
				$this, $pageInfo, $revisionInfo
			] ) ) {
				// The hook took over; do nothing
				continue;
			}

			if ( in_array( $tag, $normalFields ) ) {
				$revisionInfo[$tag] = $this->nodeContents();
				continue;
			}

			if ( $tag == 'contributor' ) {
				$revisionInfo['contributor'] = $this->handleContributor();
				continue;
			}

			if ( $tag != '#text' ) {
				$this->warn( "Unhandled revision XML tag $tag" );
				$useNext = true;
			}
		}

		$pageInfo['revisionCount']++;
		$imported = $this->processRevision( $pageInfo, $revisionInfo );
		if ( $imported ) {
			$pageInfo['successfulRevisionCount']++;
		}
	}
818
819
	/**
	 * Build a WikiRevision from the collected revision fields and hand it to
	 * the revision callback.
	 *
	 * @param array $pageInfo Must contain '_title'
	 * @param array $revisionInfo Fields read from the <revision> element; all are optional
	 * @throws MWException If the text of a size-checked content model exceeds $wgMaxArticleSize
	 * @return bool|mixed Return value of the revision callback
	 */
	private function processRevision( $pageInfo, $revisionInfo ) {
		global $wgMaxArticleSize;

		// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
		// database errors and instability. Testing for revisions with only listed
		// content models, as other content models might use serialization formats
		// which aren't checked against $wgMaxArticleSize.
		$sizeCheckedModels = [
			'wikitext',
			'css',
			'json',
			'javascript',
			'text',
			''
		];
		$modelIsSizeChecked = !isset( $revisionInfo['model'] ) ||
			in_array( $revisionInfo['model'], $sizeCheckedModels );

		// A revision may have no <text> at all (see the isset() check further
		// down), so only measure the text when it is present.
		if ( $modelIsSizeChecked &&
			isset( $revisionInfo['text'] ) &&
			strlen( $revisionInfo['text'] ) > $wgMaxArticleSize * 1024
		) {
			throw new MWException( 'The text of ' .
				( isset( $revisionInfo['id'] ) ?
					"the revision with ID $revisionInfo[id]" :
					'a revision'
				) . " exceeds the maximum allowable size ($wgMaxArticleSize KB)" );
		}

		$revision = new WikiRevision( $this->config );

		if ( isset( $revisionInfo['id'] ) ) {
			$revision->setID( $revisionInfo['id'] );
		}
		if ( isset( $revisionInfo['model'] ) ) {
			$revision->setModel( $revisionInfo['model'] );
		}
		if ( isset( $revisionInfo['format'] ) ) {
			$revision->setFormat( $revisionInfo['format'] );
		}
		$revision->setTitle( $pageInfo['_title'] );

		if ( isset( $revisionInfo['text'] ) ) {
			// Let the content handler transform the raw text before storing it
			$handler = $revision->getContentHandler();
			$text = $handler->importTransform(
				$revisionInfo['text'],
				$revision->getFormat() );

			$revision->setText( $text );
		}
		if ( isset( $revisionInfo['timestamp'] ) ) {
			$revision->setTimestamp( $revisionInfo['timestamp'] );
		} else {
			$revision->setTimestamp( wfTimestampNow() );
		}

		if ( isset( $revisionInfo['comment'] ) ) {
			$revision->setComment( $revisionInfo['comment'] );
		}

		// The mere presence of <minor> marks the revision as minor
		if ( isset( $revisionInfo['minor'] ) ) {
			$revision->setMinor( true );
		}
		if ( isset( $revisionInfo['contributor']['ip'] ) ) {
			$revision->setUserIP( $revisionInfo['contributor']['ip'] );
		} elseif ( isset( $revisionInfo['contributor']['username'] ) ) {
			$revision->setUsername( $revisionInfo['contributor']['username'] );
		} else {
			$revision->setUsername( 'Unknown user' );
		}
		$revision->setNoUpdates( $this->mNoUpdates );

		return $this->revisionCallback( $revision );
	}
894
895
	/**
	 * Read an <upload> element and, when upload importing is enabled, pass the
	 * collected data to processUpload().
	 *
	 * Base64-encoded <contents> are written to a temporary file. A file found
	 * under the image base path at the upload's "rel" value takes precedence
	 * as the file source.
	 *
	 * @param array &$pageInfo
	 * @return mixed Result of processUpload(), or null when uploads are not imported
	 */
	private function handleUpload( &$pageInfo ) {
		$this->debug( "Enter upload handler" );
		$uploadInfo = [];

		$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
					'src', 'size', 'sha1base36', 'archivename', 'rel' ];

		// NOTE: once set, this flag is never cleared inside the loop, so every
		// step after the first unhandled tag uses next() instead of read().
		$useNext = false;

		while ( true ) {
			$advanced = $useNext ? $this->reader->next() : $this->reader->read();
			if ( !$advanced ) {
				break;
			}

			if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
					$this->reader->localName == 'upload' ) {
				break;
			}

			$tag = $this->reader->localName;

			if ( !Hooks::run( 'ImportHandleUploadXMLTag', [
				$this, $pageInfo
			] ) ) {
				// The hook took over; do nothing
				continue;
			}

			if ( in_array( $tag, $normalFields ) ) {
				$uploadInfo[$tag] = $this->nodeContents();
				continue;
			}

			if ( $tag == 'contributor' ) {
				$uploadInfo['contributor'] = $this->handleContributor();
				continue;
			}

			if ( $tag == 'contents' ) {
				$contents = $this->nodeContents();
				$encoding = $this->reader->getAttribute( 'encoding' );
				if ( $encoding === 'base64' ) {
					$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
					$uploadInfo['isTempSrc'] = true;
				}
				continue;
			}

			if ( $tag != '#text' ) {
				$this->warn( "Unhandled upload XML tag $tag" );
				$useNext = true;
			}
		}

		if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
			$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
			if ( file_exists( $path ) ) {
				$uploadInfo['fileSrc'] = $path;
				$uploadInfo['isTempSrc'] = false;
			}
		}

		if ( !$this->mImportUploads ) {
			return null;
		}

		return $this->processUpload( $pageInfo, $uploadInfo );
	}
949
950
	/**
	 * Write the given data to a new temporary file created under wfTempDir().
	 *
	 * @param string $contents
	 * @return string Name of the temporary file
	 */
	private function dumpTemp( $contents ) {
		$tempFile = tempnam( wfTempDir(), 'importupload' );
		file_put_contents( $tempFile, $contents );

		return $tempFile;
	}
959
960
	/**
	 * Build a WikiRevision from the collected upload fields and hand it to
	 * the upload callback.
	 *
	 * 'timestamp', 'filename', 'src', 'size' and 'comment' as well as the page
	 * 'id' are read unconditionally; the other fields only when present.
	 *
	 * @param array $pageInfo
	 * @param array $uploadInfo
	 * @return mixed Return value of the upload callback
	 */
	private function processUpload( $pageInfo, $uploadInfo ) {
		$upload = new WikiRevision( $this->config );

		// A missing description text is imported as an empty string
		$description = '';
		if ( isset( $uploadInfo['text'] ) ) {
			$description = $uploadInfo['text'];
		}

		$upload->setTitle( $pageInfo['_title'] );
		$upload->setID( $pageInfo['id'] );
		$upload->setTimestamp( $uploadInfo['timestamp'] );
		$upload->setText( $description );
		$upload->setFilename( $uploadInfo['filename'] );
		if ( isset( $uploadInfo['archivename'] ) ) {
			$upload->setArchiveName( $uploadInfo['archivename'] );
		}
		$upload->setSrc( $uploadInfo['src'] );
		if ( isset( $uploadInfo['fileSrc'] ) ) {
			$isTemporary = !empty( $uploadInfo['isTempSrc'] );
			$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
		}
		if ( isset( $uploadInfo['sha1base36'] ) ) {
			$upload->setSha1Base36( $uploadInfo['sha1base36'] );
		}
		$upload->setSize( intval( $uploadInfo['size'] ) );
		$upload->setComment( $uploadInfo['comment'] );

		if ( isset( $uploadInfo['contributor']['ip'] ) ) {
			$upload->setUserIP( $uploadInfo['contributor']['ip'] );
		}
		if ( isset( $uploadInfo['contributor']['username'] ) ) {
			$upload->setUsername( $uploadInfo['contributor']['username'] );
		}
		$upload->setNoUpdates( $this->mNoUpdates );

		return call_user_func( $this->mUploadCallback, $upload );
	}
998
999
	/**
	 * Read a <contributor> element and collect its 'id', 'ip' and 'username'
	 * children. An empty element yields an empty array.
	 *
	 * @return array
	 */
	private function handleContributor() {
		$info = [];

		if ( $this->reader->isEmptyElement ) {
			return $info;
		}

		$knownFields = [ 'id', 'ip', 'username' ];

		while ( $this->reader->read() ) {
			$isClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
				&& $this->reader->localName == 'contributor';
			if ( $isClosingTag ) {
				break;
			}

			$tag = $this->reader->localName;
			if ( !in_array( $tag, $knownFields ) ) {
				continue;
			}

			$info[$tag] = $this->nodeContents();
		}

		return $info;
	}
1024
1025
	/**
	 * Turn the title text from the dump into a local Title and check that
	 * the page may be imported. A notice is emitted for every rejection.
	 *
	 * @param string $text
	 * @param string|null $ns
	 * @return array|bool Two-element array with the Title at index 0 and the
	 *  ForeignTitle at index 1, or false if the page must not be imported
	 */
	private function processTitle( $text, $ns = null ) {
		$foreignTitleFactory = is_null( $this->foreignNamespaces )
			? new NaiveForeignTitleFactory()
			: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

		$foreignTitle = $foreignTitleFactory->createForeignTitle( $text,
			intval( $ns ) );

		$title = $this->importTitleFactory->createTitleFromForeignTitle(
			$foreignTitle );

		$commandLineMode = $this->config->get( 'CommandLineMode' );

		if ( is_null( $title ) ) {
			# Invalid page title? Ignore the page
			$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
			return false;
		}

		if ( $title->isExternal() ) {
			$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
			return false;
		}

		if ( !$title->canExist() ) {
			$this->notice( 'import-error-special', $title->getPrefixedText() );
			return false;
		}

		if ( !$title->userCan( 'edit' ) && !$commandLineMode ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $title->getPrefixedText() );
			return false;
		}

		if ( !$title->exists() && !$title->userCan( 'create' ) && !$commandLineMode ) {
			# Do not import if the importing wiki user cannot create this page
			$this->notice( 'import-error-create', $title->getPrefixedText() );
			return false;
		}

		return [ $title, $foreignTitle ];
	}
1067
}
1068