Completed
Branch master (bb6f05)
by
unknown
31:42
created

MediaWikiTitleCodec::getTitleInvalidRegex()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 19
Code Lines 11

Duplication

Lines 0
Ratio 0 %
Metric Value
dl 0
loc 19
rs 9.4285
cc 2
eloc 11
nc 2
nop 0
1
<?php
2
/**
3
 * A codec for %MediaWiki page titles.
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License along
16
 * with this program; if not, write to the Free Software Foundation, Inc.,
17
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18
 * http://www.gnu.org/copyleft/gpl.html
19
 *
20
 * @file
21
 * @license GPL 2+
22
 * @author Daniel Kinzler
23
 */
24
use MediaWiki\Linker\LinkTarget;
25
26
/**
27
 * A codec for %MediaWiki page titles.
28
 *
29
 * @note Normalization and validation are applied while parsing, not when formatting.
30
 * It's possible to construct a TitleValue with an invalid title, and use MediaWikiTitleCodec
31
 * to generate an (invalid) title string from it. TitleValues should be constructed only
32
 * via parseTitle() or from a (semi)trusted source, such as the database.
33
 *
34
 * @see https://www.mediawiki.org/wiki/Requests_for_comment/TitleValue
35
 * @since 1.23
36
 */
37
class MediaWikiTitleCodec implements TitleFormatter, TitleParser {
38
	/**
39
	 * @var Language
40
	 */
41
	protected $language;
42
43
	/**
44
	 * @var GenderCache
45
	 */
46
	protected $genderCache;
47
48
	/**
49
	 * @var string[]
50
	 */
51
	protected $localInterwikis;
52
53
	/**
	 * Stores the collaborators used for formatting and parsing titles.
	 *
	 * @param Language $language The language object to use for localizing namespace names.
	 * @param GenderCache $genderCache The gender cache for generating gendered namespace names
	 * @param string[]|string $localInterwikis Interwiki prefix(es) that refer to the local
	 *        wiki. A single string is accepted and is stored as a one-element array.
	 */
	public function __construct( Language $language, GenderCache $genderCache,
		$localInterwikis = []
	) {
		$this->language = $language;
		$this->genderCache = $genderCache;
		// Cast so that the rest of the class can always iterate over an array.
		$this->localInterwikis = (array)$localInterwikis;
	}
65
66
	/**
	 * Returns the localized name of a namespace, using the gendered variant when
	 * both the language and the namespace call for one.
	 *
	 * @see TitleFormatter::getNamespaceName()
	 *
	 * @param int $namespace The namespace ID
	 * @param string $text The title text; only consulted (as a user name) when a
	 *        gendered namespace name is needed
	 *
	 * @throws InvalidArgumentException If the namespace is invalid
	 * @return string
	 */
	public function getNamespaceName( $namespace, $text ) {
		if ( $this->language->needsGenderDistinction() &&
			MWNamespace::hasGenderDistinction( $namespace )
		) {
			// NOTE: we are assuming here that the title text is a user name!
			$gender = $this->genderCache->getGenderOf( $text, __METHOD__ );
			$name = $this->language->getGenderNsText( $namespace, $gender );
		} else {
			$name = $this->language->getNsText( $namespace );
		}

		// The language object signals an unknown namespace with false.
		if ( $name === false ) {
			throw new InvalidArgumentException( 'Unknown namespace ID: ' . $namespace );
		}

		return $name;
	}
93
94
	/**
	 * Builds a display title of the form "interwiki:Namespace:text#fragment",
	 * omitting every part that is empty.
	 *
	 * @see TitleFormatter::formatTitle()
	 *
	 * @param int|bool $namespace The namespace ID (or false, if the namespace should be ignored)
	 * @param string $text The page title. Should be valid. Only minimal normalization is applied.
	 *        Underscores will be replaced.
	 * @param string $fragment The fragment name (may be empty).
	 * @param string $interwiki The interwiki name (may be empty).
	 *
	 * @throws InvalidArgumentException If the namespace is invalid
	 * @return string
	 */
	public function formatTitle( $namespace, $text, $fragment = '', $interwiki = '' ) {
		if ( $namespace !== false ) {
			// Keep the resolved name in its own variable instead of overwriting
			// $namespace, so the ID and the name are never confused.
			$nsName = $this->getNamespaceName( $namespace, $text );

			// An empty namespace name gets no "Name:" prefix.
			if ( $nsName !== '' ) {
				$text = $nsName . ':' . $text;
			}
		}

		if ( $fragment !== '' ) {
			$text = $text . '#' . $fragment;
		}

		if ( $interwiki !== '' ) {
			$text = $interwiki . ':' . $text;
		}

		// Done last, so underscores in the prefixes and the fragment are replaced as well.
		$text = str_replace( '_', ' ', $text );

		return $text;
	}
127
128
	/**
	 * Parses the given text and constructs a TitleValue. Normalization and
	 * validation are delegated to splitTitleString().
	 *
	 * @param string $text The text to parse
	 * @param int $defaultNamespace Namespace to assume per default (usually NS_MAIN)
	 *
	 * @throws MalformedTitleException If splitTitleString() rejects the text, or if
	 *         the resulting page name is empty
	 * @return TitleValue
	 */
	public function parseTitle( $text, $defaultNamespace ) {
		// NOTE: this is an ugly kludge that allows this class to share the
		// code for parsing with the old Title class. The parser code should
		// be refactored to avoid this.
		$parts = $this->splitTitleString( $text, $defaultNamespace );

		// Relative fragment links are not supported by TitleValue
		if ( $parts['dbkey'] === '' ) {
			throw new MalformedTitleException( 'title-invalid-empty', $text );
		}

		return new TitleValue(
			$parts['namespace'],
			$parts['dbkey'],
			$parts['fragment'],
			$parts['interwiki']
		);
	}
156
157
	/**
	 * Returns the title text without namespace, interwiki prefix or fragment.
	 *
	 * @see TitleFormatter::getText()
	 *
	 * @param LinkTarget $title
	 *
	 * @return string The value of $title->getText() as formatted by formatTitle(),
	 *         i.e. with underscores replaced by spaces
	 */
	public function getText( LinkTarget $title ) {
		// false tells formatTitle() to ignore the namespace entirely.
		return $this->formatTitle( false, $title->getText(), '' );
	}
167
168
	/**
	 * Returns the title text with interwiki and namespace prefixes, but without
	 * the fragment.
	 *
	 * @see TitleFormatter::getPrefixedText()
	 *
	 * @param LinkTarget $title
	 *
	 * @throws InvalidArgumentException If the namespace of $title is invalid
	 * @return string
	 */
	public function getPrefixedText( LinkTarget $title ) {
		return $this->formatTitle(
			$title->getNamespace(),
			$title->getText(),
			// The fragment is deliberately left out.
			'',
			$title->getInterwiki()
		);
	}
183
184
	/**
	 * Returns the prefixed title in database-key form: interwiki and namespace
	 * prefixes followed by the title text, with spaces turned into underscores.
	 *
	 * Unlike formatTitle(), an unknown namespace does not raise an exception here;
	 * the namespace name then falls back to an empty string.
	 *
	 * @since 1.27
	 * @see TitleFormatter::getPrefixedDBkey()
	 * @param LinkTarget $target
	 * @return string
	 */
	public function getPrefixedDBkey( LinkTarget $target ) {
		$key = '';
		if ( $target->isExternal() ) {
			$key .= $target->getInterwiki() . ':';
		}

		// The main namespace gets no prefix, so its name is not looked up at all.
		if ( $target->getNamespace() !== NS_MAIN ) {
			// Try to get a namespace name, but fall back
			// to an empty string if it doesn't exist
			try {
				$nsName = $this->getNamespaceName(
					$target->getNamespace(),
					$target->getText()
				);
			} catch ( InvalidArgumentException $e ) {
				$nsName = '';
			}

			$key .= $nsName . ':';
		}

		$key .= $target->getText();

		return strtr( $key, ' ', '_' );
	}
214
215
	/**
	 * Returns the title text with interwiki prefix, namespace prefix and fragment.
	 *
	 * @see TitleFormatter::getFullText()
	 *
	 * @param LinkTarget $title
	 *
	 * @throws InvalidArgumentException If the namespace of $title is invalid
	 * @return string
	 */
	public function getFullText( LinkTarget $title ) {
		return $this->formatTitle(
			$title->getNamespace(),
			$title->getText(),
			$title->getFragment(),
			$title->getInterwiki()
		);
	}
230
231
	/**
	 * Normalizes and splits a title string.
	 *
	 * This function strips bidi override characters, collapses whitespace, splits
	 * off the interwiki and namespace prefixes and the fragment, rejects illegal
	 * character sequences, and canonicalizes what remains.
	 *
	 * @todo this method is only exposed as a temporary measure to ease refactoring.
	 * It was copied with minimal changes from Title::secureAndSplit().
	 *
	 * @todo This method should be split up and an appropriate interface
	 * defined for use by the Title class.
	 *
	 * @param string $text
	 * @param int $defaultNamespace
	 *
	 * @throws MalformedTitleException If $text is not a valid title string.
	 * @return array A map with the fields 'interwiki', 'local_interwiki', 'fragment',
	 *         'namespace', 'user_case_dbkey', and 'dbkey'. 'local_interwiki' is true
	 *         when a prefix matching one of the local interwikis was found.
	 */
	public function splitTitleString( $text, $defaultNamespace = NS_MAIN ) {
		$dbkey = str_replace( ' ', '_', $text );

		# Initialisation
		$parts = [
			'interwiki' => '',
			'local_interwiki' => false,
			'fragment' => '',
			'namespace' => $defaultNamespace,
			'dbkey' => $dbkey,
			'user_case_dbkey' => $dbkey,
		];

		# Strip Unicode bidi override characters.
		# Sometimes they slip into cut-n-pasted page titles, where the
		# override chars get included in list displays.
		$dbkey = preg_replace( '/\xE2\x80[\x8E\x8F\xAA-\xAE]/S', '', $dbkey );

		# Clean up whitespace
		# Note: use of the /u option on preg_replace here will cause
		# input with invalid UTF-8 sequences to be nullified out in PHP 5.2.x,
		# conveniently disabling them.
		$dbkey = preg_replace(
			'/[ _\xA0\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u',
			'_',
			$dbkey
		);
		$dbkey = trim( $dbkey, '_' );

		if ( strpos( $dbkey, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
			# Contained illegal UTF-8 sequences or forbidden Unicode chars.
			throw new MalformedTitleException( 'title-invalid-utf8', $text );
		}

		$parts['dbkey'] = $dbkey;

		# Initial colon indicates main namespace rather than specified default
		# but should not create invalid {ns,title} pairs such as {0,Project:Foo}
		if ( $dbkey !== '' && ':' == $dbkey[0] ) {
			$parts['namespace'] = NS_MAIN;
			$dbkey = substr( $dbkey, 1 ); # remove the colon but continue processing
			$dbkey = trim( $dbkey, '_' ); # remove any subsequent whitespace
		}

		if ( $dbkey == '' ) {
			throw new MalformedTitleException( 'title-invalid-empty', $text );
		}

		# Namespace or interwiki prefix
		$prefixRegexp = "/^(.+?)_*:_*(.*)$/S";
		# The loop body runs once unless a local interwiki prefix is stripped, in
		# which case "continue 2" restarts it to split the remainder again.
		do {
			$m = [];
			if ( preg_match( $prefixRegexp, $dbkey, $m ) ) {
				$p = $m[1];
				$ns = $this->language->getNsIndex( $p );
				if ( $ns !== false ) {
					# Ordinary namespace
					$dbkey = $m[2];
					$parts['namespace'] = $ns;
					# For Talk:X pages, check if X has a "namespace" prefix
					if ( $ns == NS_TALK && preg_match( $prefixRegexp, $dbkey, $x ) ) {
						if ( $this->language->getNsIndex( $x[1] ) ) {
							# Disallow Talk:File:x type titles...
							throw new MalformedTitleException( 'title-invalid-talk-namespace', $text );
						} elseif ( Interwiki::isValidInterwiki( $x[1] ) ) {
							// TODO: get rid of global state!
							# Disallow Talk:Interwiki:x type titles...
							throw new MalformedTitleException( 'title-invalid-talk-namespace', $text );
						}
					}
				} elseif ( Interwiki::isValidInterwiki( $p ) ) {
					# Interwiki link
					$dbkey = $m[2];
					$parts['interwiki'] = $this->language->lc( $p );

					# Redundant interwiki prefix to the local wiki
					foreach ( $this->localInterwikis as $localIW ) {
						if ( 0 == strcasecmp( $parts['interwiki'], $localIW ) ) {
							if ( $dbkey == '' ) {
								# Empty self-links should point to the Main Page, to ensure
								# compatibility with cross-wiki transclusions and the like.
								$mainPage = Title::newMainPage();
								return [
									'interwiki' => $mainPage->getInterwiki(),
									'local_interwiki' => true,
									'fragment' => $mainPage->getFragment(),
									'namespace' => $mainPage->getNamespace(),
									'dbkey' => $mainPage->getDBkey(),
									'user_case_dbkey' => $mainPage->getUserCaseDBKey()
								];
							}
							$parts['interwiki'] = '';
							# local interwikis should behave like initial-colon links
							$parts['local_interwiki'] = true;

							# Do another namespace split...
							continue 2;
						}
					}

					# If there's an initial colon after the interwiki, that also
					# resets the default namespace
					if ( $dbkey !== '' && $dbkey[0] == ':' ) {
						$parts['namespace'] = NS_MAIN;
						$dbkey = substr( $dbkey, 1 );
					}
				}
				# If there's no recognized interwiki or namespace,
				# then let the colon expression be part of the title.
			}
			break;
		} while ( true );

		# Split off the fragment: everything from the first '#' onwards.
		$fragment = strstr( $dbkey, '#' );
		if ( false !== $fragment ) {
			$parts['fragment'] = str_replace( '_', ' ', substr( $fragment, 1 ) );
			$dbkey = substr( $dbkey, 0, strlen( $dbkey ) - strlen( $fragment ) );
			# remove whitespace again: prevents "Foo_bar_#"
			# becoming "Foo_bar_"
			$dbkey = preg_replace( '/_*$/', '', $dbkey );
		}

		# Reject illegal characters.
		$rxTc = self::getTitleInvalidRegex();
		$matches = [];
		if ( preg_match( $rxTc, $dbkey, $matches ) ) {
			throw new MalformedTitleException( 'title-invalid-characters', $text, [ $matches[0] ] );
		}

		# Pages with "/./" or "/../" appearing in the URLs will often be un-
		# reachable due to the way web browsers deal with 'relative' URLs.
		# Also, they conflict with subpage syntax.  Forbid them explicitly.
		if (
			strpos( $dbkey, '.' ) !== false &&
			(
				$dbkey === '.' || $dbkey === '..' ||
				strpos( $dbkey, './' ) === 0 ||
				strpos( $dbkey, '../' ) === 0 ||
				strpos( $dbkey, '/./' ) !== false ||
				strpos( $dbkey, '/../' ) !== false ||
				substr( $dbkey, -2 ) == '/.' ||
				substr( $dbkey, -3 ) == '/..'
			)
		) {
			throw new MalformedTitleException( 'title-invalid-relative', $text );
		}

		# Magic tilde sequences? Nu-uh!
		if ( strpos( $dbkey, '~~~' ) !== false ) {
			throw new MalformedTitleException( 'title-invalid-magic-tilde', $text );
		}

		# Limit the size of titles to 255 bytes. This is typically the size of the
		# underlying database field. We make an exception for special pages, which
		# don't need to be stored in the database, and may edge over 255 bytes due
		# to subpage syntax for long titles, e.g. [[Special:Block/Long name]]
		$maxLength = ( $parts['namespace'] != NS_SPECIAL ) ? 255 : 512;
		if ( strlen( $dbkey ) > $maxLength ) {
			throw new MalformedTitleException( 'title-invalid-too-long', $text,
				[ Message::numParam( $maxLength ) ] );
		}

		# Normally, all wiki links are forced to have an initial capital letter so [[foo]]
		# and [[Foo]] point to the same place.  Don't force it for interwikis, since the
		# other site might be case-sensitive.
		$parts['user_case_dbkey'] = $dbkey;
		if ( $parts['interwiki'] === '' ) {
			$dbkey = Title::capitalize( $dbkey, $parts['namespace'] );
		}

		# Can't make a link to a namespace alone... "empty" local links can only be
		# self-links with a fragment identifier.
		if ( $dbkey == '' && $parts['interwiki'] === '' ) {
			if ( $parts['namespace'] != NS_MAIN ) {
				throw new MalformedTitleException( 'title-invalid-empty', $text );
			}
		}

		// Allow IPv6 usernames to start with '::' by canonicalizing IPv6 titles.
		// IP names are not allowed for accounts, and can only be referring to
		// edits from the IP. Given '::' abbreviations and caps/lowercaps,
		// there are numerous ways to present the same IP. Having sp:contribs scan
		// them all is silly and having some show the edits and others not is
		// inconsistent. Same for talk/userpages. Keep them normalized instead.
		if ( $parts['namespace'] == NS_USER || $parts['namespace'] == NS_USER_TALK ) {
			$dbkey = IP::sanitizeIP( $dbkey );
		}

		// Any remaining initial :s are illegal.
		if ( $dbkey !== '' && ':' == $dbkey[0] ) {
			throw new MalformedTitleException( 'title-invalid-leading-colon', $text );
		}

		# Fill fields
		$parts['dbkey'] = $dbkey;

		return $parts;
	}
449
450
	/**
	 * Returns a simple regex that will match on characters and sequences invalid in titles.
	 * Note that this doesn't pick up many things that could be wrong with titles, but that
	 * replacing this regex with something valid will make many titles valid.
	 * Previously Title::getTitleInvalidRegex()
	 *
	 * The pattern is assembled on the first call and kept in a static local, so
	 * Title::legalChars() is only consulted once per request.
	 *
	 * @return string Regex string
	 * @since 1.25
	 */
	public static function getTitleInvalidRegex() {
		static $rxTc = false;
		if ( !$rxTc ) {
			# Matching titles will be held as illegal.
			$rxTc = '/' .
				# Any character not allowed is forbidden...
				'[^' . Title::legalChars() . ']' .
				# URL percent encoding sequences interfere with the ability
				# to round-trip titles -- you can't link to them consistently.
				'|%[0-9A-Fa-f]{2}' .
				# XML/HTML character references produce similar issues.
				'|&[A-Za-z0-9\x80-\xff]+;' .
				'|&#[0-9]+;' .
				'|&#x[0-9A-Fa-f]+;' .
				'/S';
		}

		return $rxTc;
	}
478
}
479