BacklinkCache::queryLinks()   C
last analyzed

Complexity

Conditions 14
Paths 33

Size

Total Lines 57
Code Lines 36

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 14
eloc 36
nc 33
nop 5
dl 0
loc 57
rs 6.5728
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

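Commonly applied refactorings include Extract Method: move a cohesive part of the method body into a new, well-named method and call it from the original.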

1
<?php
2
/**
3
 * Class for fetching backlink lists, approximate backlink counts and
4
 * partitions.
5
 *
6
 * This program is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License along
17
 * with this program; if not, write to the Free Software Foundation, Inc.,
18
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19
 * http://www.gnu.org/copyleft/gpl.html
20
 *
21
 * @file
22
 * @author Tim Starling
23
 * @author Aaron Schulz
24
 * @copyright © 2009, Tim Starling, Domas Mituzas
25
 * @copyright © 2010, Max Sem
26
 * @copyright © 2011, Antoine Musso
27
 */
28
29
/**
30
 * Class for fetching backlink lists, approximate backlink counts and
31
 * partitions. This is a shared cache.
32
 *
33
 * Instances of this class should typically be fetched with the method
34
 * $title->getBacklinkCache().
35
 *
36
 * Ideally you should only get your backlinks from here when you think
37
 * there is some advantage in caching them. Otherwise it's just a waste
38
 * of memory.
39
 *
40
 * Introduced by r47317
41
 */
42
class BacklinkCache {
43
	/** @var BacklinkCache */
44
	protected static $instance;
45
46
	/**
47
	 * Multi dimensions array representing batches. Keys are:
48
	 *  > (string) links table name
49
	 *   > (int) batch size
50
	 *    > 'numRows' : Number of rows for this link table
51
	 *    > 'batches' : [ $start, $end ]
52
	 *
53
	 * @see BacklinkCache::partitionResult()
54
	 *
55
	 * Cleared with BacklinkCache::clear()
56
	 * @var array[]
57
	 */
58
	protected $partitionCache = [];
59
60
	/**
61
	 * Contains the whole links from a database result.
62
	 * This is raw data that will be partitioned in $partitionCache
63
	 *
64
	 * Initialized with BacklinkCache::getLinks()
65
	 * Cleared with BacklinkCache::clear()
66
	 * @var ResultWrapper[]
67
	 */
68
	protected $fullResultCache = [];
69
70
	/**
71
	 * Local copy of a database object.
72
	 *
73
	 * Accessor: BacklinkCache::getDB()
74
	 * Mutator : BacklinkCache::setDB()
75
	 * Cleared with BacklinkCache::clear()
76
	 */
77
	protected $db;
78
79
	/**
80
	 * Local copy of a Title object
81
	 */
82
	protected $title;
83
84
	const CACHE_EXPIRY = 3600;
85
86
	/**
	 * Build a backlink cache bound to a single page title.
	 *
	 * @param Title $title Title object whose backlinks this cache will serve
	 */
	public function __construct( Title $title ) {
		$this->title = $title;
	}
94
95
	/**
	 * Return the shared BacklinkCache for a title, creating it when needed.
	 *
	 * Only one instance is remembered at a time: asking for a different
	 * title replaces the remembered instance. Callers that need several
	 * backlink caches at once must hold on to the returned objects.
	 *
	 * @param Title $title Title object to get a backlink cache for
	 * @return BacklinkCache
	 */
	public static function get( Title $title ) {
		$current = self::$instance;
		$isReusable = $current && $current->title->equals( $title );

		if ( !$isReusable ) {
			self::$instance = new self( $title );
		}

		return self::$instance;
	}
109
110
	/**
	 * Serialization handler. Disallows serializing the database object so
	 * that an instance restored from cache never carries a dead DB
	 * connection; only the listed properties are serialized.
	 *
	 * @return string[] Names of the properties to serialize
	 */
	public function __sleep() {
		return [ 'partitionCache', 'fullResultCache', 'title' ];
	}
120
121
	/**
	 * Forget everything held locally: the database object, the cached full
	 * results and the cached partitions.
	 */
	public function clear() {
		unset( $this->db );
		$this->fullResultCache = [];
		$this->partitionCache = [];
	}
129
130
	/**
	 * Inject the database object that subsequent queries will use.
	 *
	 * @param IDatabase $db Database object returned afterwards by getDB()
	 */
	public function setDB( $db ) {
		$this->db = $db;
	}
138
139
	/**
	 * Return the database object used for queries.
	 *
	 * If none has been set (or it was cleared), a replica connection is
	 * obtained via wfGetDB( DB_REPLICA ) and remembered for later calls.
	 *
	 * @return Database
	 */
	protected function getDB() {
		if ( isset( $this->db ) ) {
			return $this->db;
		}

		$this->db = wfGetDB( DB_REPLICA );

		return $this->db;
	}
151
152
	/**
	 * Fetch the backlinks of a links table as a title list.
	 *
	 * Thin wrapper around queryLinks() that converts the result rows into
	 * a TitleArray. Any caching is the in-process caching done by
	 * queryLinks().
	 *
	 * @param string $table
	 * @param int|bool $startId
	 * @param int|bool $endId
	 * @param int $max
	 * @return TitleArrayFromResult
	 */
	public function getLinks( $table, $startId = false, $endId = false, $max = INF ) {
		$rows = $this->queryLinks( $table, $startId, $endId, $max );

		return TitleArray::newFromResult( $rows );
	}
163
164
	/**
	 * Get the backlink rows for a given table. Cached in process memory only.
	 *
	 * An unbounded, unlimited request is answered from the full result
	 * cache when one exists. Otherwise the database is queried, and the
	 * result is stored in the full result cache only when it is a complete
	 * 'all' selection (no start/end bound and fewer rows than $max).
	 *
	 * @param string $table
	 * @param int|bool $startId Lowest source page ID to include, or false for no lower bound
	 * @param int|bool $endId Highest source page ID to include, or false for no upper bound
	 * @param int $max Maximum number of rows; INF for no limit
	 * @param string $select 'all' or 'ids'
	 * @throws MWException If the table is not a known links table
	 * @return ResultWrapper
	 */
	protected function queryLinks( $table, $startId, $endId, $max, $select = 'all' ) {
		$fromField = $this->getPrefix( $table ) . '_from';
		$isFullRange = !$startId && !$endId;

		if ( $isFullRange && is_infinite( $max ) && isset( $this->fullResultCache[$table] ) ) {
			wfDebug( __METHOD__ . ": got results from cache\n" );

			return $this->fullResultCache[$table];
		}

		wfDebug( __METHOD__ . ": got results from DB\n" );
		$conds = $this->buildLinkQueryConds( $table, $fromField, $startId, $endId );
		$options = [ 'ORDER BY' => $fromField ];
		if ( is_finite( $max ) && $max > 0 ) {
			$options['LIMIT'] = $max;
		}

		// __METHOD__ is evaluated here so the select() calls keep this method's name
		$res = $this->selectLinkRows( $table, $fromField, $conds, $options, $select, __METHOD__ );

		if ( $select === 'all' && $isFullRange && $res->numRows() < $max ) {
			// The full results fit within the limit, so cache them
			$this->fullResultCache[$table] = $res;
		} else {
			wfDebug( __METHOD__ . ": results from DB were uncacheable\n" );
		}

		return $res;
	}

	/**
	 * Build the WHERE conditions for a backlink query, including the
	 * optional bounds on the source page ID.
	 *
	 * @param string $table
	 * @param string $fromField Name of the table's "from" column
	 * @param int|bool $startId
	 * @param int|bool $endId
	 * @throws MWException
	 * @return array
	 */
	private function buildLinkQueryConds( $table, $fromField, $startId, $endId ) {
		$conds = $this->getConditions( $table );
		// Bound on the from field rather than the joined page_id, because
		// databases don't necessarily propagate indexes across the join.
		if ( $startId ) {
			$conds[] = "$fromField >= " . intval( $startId );
		}
		if ( $endId ) {
			$conds[] = "$fromField <= " . intval( $endId );
		}

		return $conds;
	}

	/**
	 * Run the backlink SELECT, either on the links table alone ('ids') or
	 * joined with the page table (any other $select value).
	 *
	 * @param string $table
	 * @param string $fromField Name of the table's "from" column
	 * @param array $conds Conditions as built by buildLinkQueryConds()
	 * @param array $options Query options (ORDER BY and, optionally, LIMIT)
	 * @param string $select 'all' or 'ids'
	 * @param string $fname Passed as the fourth argument to select()
	 * @return ResultWrapper
	 */
	private function selectLinkRows( $table, $fromField, $conds, $options, $select, $fname ) {
		$dbr = $this->getDB();

		if ( $select === 'ids' ) {
			// Just select from the backlink table and ignore the page JOIN,
			// which means dropping every condition that mentions page_id.
			$linkOnlyConds = array_filter( $conds, function ( $clause ) { // kind of janky
				return !preg_match( '/(\b|=)page_id(\b|=)/', $clause );
			} );

			return $dbr->select(
				$table,
				[ "$fromField AS page_id" ],
				$linkOnlyConds,
				$fname,
				$options
			);
		}

		// Select from the backlink table and JOIN with page title information
		return $dbr->select(
			[ $table, 'page' ],
			[ 'page_namespace', 'page_title', 'page_id' ],
			$conds,
			$fname,
			array_merge( [ 'STRAIGHT_JOIN' ], $options )
		);
	}
230
231
	/**
	 * Look up the column name prefix of a links table.
	 *
	 * Core tables are resolved from a fixed map. For any other table the
	 * 'BacklinkCacheGetPrefix' hook is run so that a handler can fill in
	 * the prefix.
	 *
	 * @param string $table
	 * @throws MWException If neither the map nor the hook yields a prefix
	 * @return string
	 */
	protected function getPrefix( $table ) {
		static $corePrefixes = [
			'pagelinks' => 'pl',
			'imagelinks' => 'il',
			'categorylinks' => 'cl',
			'templatelinks' => 'tl',
			'redirect' => 'rd',
		];

		if ( isset( $corePrefixes[$table] ) ) {
			return $corePrefixes[$table];
		}

		// Not a core table: let hook handlers supply the prefix by reference
		$hookPrefix = null;
		Hooks::run( 'BacklinkCacheGetPrefix', [ $table, &$hookPrefix ] );
		if ( !$hookPrefix ) {
			throw new MWException( "Invalid table \"$table\" in " . __CLASS__ );
		}

		return $hookPrefix;
	}
258
259
	/**
	 * Build the SQL condition array that selects the backlinks of this
	 * title from a links table joined with the page table.
	 *
	 * For tables not handled here, the 'BacklinkCacheGetConditions' hook is
	 * run so that a handler can fill in the conditions.
	 *
	 * @param string $table
	 * @throws MWException If the table is unknown and no hook supplies conditions
	 * @return array
	 */
	protected function getConditions( $table ) {
		$prefix = $this->getPrefix( $table );
		// Every built-in case joins the links table to page on the source page ID
		$joinCond = "page_id={$prefix}_from";

		switch ( $table ) {
			case 'pagelinks':
			case 'templatelinks':
				return [
					"{$prefix}_namespace" => $this->title->getNamespace(),
					"{$prefix}_title" => $this->title->getDBkey(),
					$joinCond
				];
			case 'redirect':
				$redirectConds = [
					"{$prefix}_namespace" => $this->title->getNamespace(),
					"{$prefix}_title" => $this->title->getDBkey(),
				];
				// Match rows whose interwiki field is either empty or NULL
				$redirectConds[] = $this->getDB()->makeList( [
					"{$prefix}_interwiki" => '',
					"{$prefix}_interwiki IS NULL",
				], LIST_OR );
				$redirectConds[] = $joinCond;

				return $redirectConds;
			case 'imagelinks':
			case 'categorylinks':
				return [
					"{$prefix}_to" => $this->title->getDBkey(),
					$joinCond
				];
			default:
				$hookConds = null;
				Hooks::run( 'BacklinkCacheGetConditions', [ $table, $this->title, &$hookConds ] );
				if ( !$hookConds ) {
					throw new MWException( "Invalid table \"$table\" in " . __CLASS__ );
				}

				return $hookConds;
		}
	}
306
307
	/**
	 * Tell whether the title has at least one backlink in the given table.
	 *
	 * Implemented as a link count capped at 1.
	 *
	 * @param string $table
	 * @return bool
	 */
	public function hasLinks( $table ) {
		$cappedCount = $this->getNumLinks( $table, 1 );

		return $cappedCount > 0;
	}
315
316
	/**
	 * Get the approximate number of backlinks.
	 *
	 * Sources are consulted in order: the in-process partition cache, the
	 * in-process full result cache, the shared object cache, and finally
	 * the database.
	 *
	 * @param string $table
	 * @param int $max Only count up to this many backlinks
	 * @return int
	 */
	public function getNumLinks( $table, $max = INF ) {
		global $wgUpdateRowsPerJob;

		$wanCache = ObjectCache::getMainWANInstance();

		// 1) Partition cache: every batch size entry carries the same row count
		if ( isset( $this->partitionCache[$table] ) ) {
			$firstEntry = reset( $this->partitionCache[$table] );

			return min( $max, $firstEntry['numRows'] );
		}

		// 2) Full result cache
		if ( isset( $this->fullResultCache[$table] ) ) {
			return min( $max, $this->fullResultCache[$table]->numRows() );
		}

		// 3) Shared object cache; a falsy value is treated as a miss
		$countKey = wfMemcKey( 'numbacklinks', md5( $this->title->getPrefixedDBkey() ), $table );
		$cachedCount = $wanCache->get( $countKey );
		if ( $cachedCount ) {
			return min( $max, $cachedCount );
		}

		// 4) Database
		if ( is_infinite( $max ) ) {
			// No limit at all: use partition() since it will batch the query and
			// skip the JOIN. $wgUpdateRowsPerJob is used just to encourage cache
			// reuse for jobs. The call fills $this->partitionCache.
			$this->partition( $table, $wgUpdateRowsPerJob );

			return $this->partitionCache[$table][$wgUpdateRowsPerJob]['numRows'];
		}

		// Probably some sane limit: fetch the full title info, since the caller
		// will likely need it next
		$dbCount = $this->getLinks( $table, false, false, $max )->count();
		if ( $dbCount < $max ) {
			// Below the limit, so this is the full count and worth sharing
			$wanCache->set( $countKey, $dbCount, self::CACHE_EXPIRY );
		}

		return min( $max, $dbCount );
	}
362
363
	/**
	 * Partition the backlinks into batches.
	 * Returns an array giving the start and end of each range. The first
	 * batch has a start of false, and the last batch has an end of false.
	 *
	 * Sources are consulted in order: the in-process partition cache, the
	 * in-process full result cache, the shared object cache, and finally
	 * the database. The in-process partition cache is only written once a
	 * complete entry is available, so a failure part-way through (e.g. a
	 * query exception) cannot leave a placeholder behind that later calls
	 * would mistake for a cached result.
	 *
	 * @param string $table The links table name
	 * @param int $batchSize
	 * @return array
	 */
	public function partition( $table, $batchSize ) {
		// 1) try partition cache ...
		if ( isset( $this->partitionCache[$table][$batchSize] ) ) {
			wfDebug( __METHOD__ . ": got from partition cache\n" );

			return $this->partitionCache[$table][$batchSize]['batches'];
		}

		$cache = ObjectCache::getMainWANInstance();

		// 2) ... then try full result cache ...
		if ( isset( $this->fullResultCache[$table] ) ) {
			$cacheEntry = $this->partitionResult( $this->fullResultCache[$table], $batchSize );
			$this->partitionCache[$table][$batchSize] = $cacheEntry;
			wfDebug( __METHOD__ . ": got from full result cache\n" );

			return $cacheEntry['batches'];
		}

		$memcKey = wfMemcKey(
			'backlinks',
			md5( $this->title->getPrefixedDBkey() ),
			$table,
			$batchSize
		);

		// 3) ... fallback to memcached ...
		$memcValue = $cache->get( $memcKey );
		if ( is_array( $memcValue ) ) {
			$this->partitionCache[$table][$batchSize] = $memcValue;
			wfDebug( __METHOD__ . ": got from memcached $memcKey\n" );

			return $memcValue['batches'];
		}

		// 4) ... finally fetch from the slow database :(
		$cacheEntry = [ 'numRows' => 0, 'batches' => [] ]; // final result
		// Do the selects in batches to avoid client-side OOMs (bug 43452).
		// Use a LIMIT that plays well with $batchSize to keep equal sized partitions.
		$selectSize = max( $batchSize, 200000 - ( 200000 % $batchSize ) );
		$start = false;
		do {
			$res = $this->queryLinks( $table, $start, false, $selectSize, 'ids' );
			$partitions = $this->partitionResult( $res, $batchSize, false );
			// Merge the link count and range partitions for this chunk
			$cacheEntry['numRows'] += $partitions['numRows'];
			$cacheEntry['batches'] = array_merge( $cacheEntry['batches'], $partitions['batches'] );
			if ( count( $partitions['batches'] ) ) {
				$lastBatch = end( $partitions['batches'] );
				$start = $lastBatch[1] + 1; // pick up after this inclusive range
			}
		} while ( $partitions['numRows'] >= $selectSize );
		// Make sure the first range has start=false and the last one has end=false
		if ( count( $cacheEntry['batches'] ) ) {
			$cacheEntry['batches'][0][0] = false;
			$cacheEntry['batches'][count( $cacheEntry['batches'] ) - 1][1] = false;
		}

		// The entry is complete: only now make it visible in process memory
		$this->partitionCache[$table][$batchSize] = $cacheEntry;

		// Save partitions to memcached
		$cache->set( $memcKey, $cacheEntry, self::CACHE_EXPIRY );

		// Save backlink count to memcached
		$memcKey = wfMemcKey( 'numbacklinks', md5( $this->title->getPrefixedDBkey() ), $table );
		$cache->set( $memcKey, $cacheEntry['numRows'], self::CACHE_EXPIRY );

		wfDebug( __METHOD__ . ": got from database\n" );

		return $cacheEntry['batches'];
	}
442
443
	/**
	 * Partition a DB result with backlinks in it into batches.
	 *
	 * Each batch is a [ $start, $end ] pair of page IDs read from the
	 * page_id field of the first and last row of that batch. When
	 * $isComplete is true, the first batch gets a start of false and the
	 * last batch an end of false instead.
	 *
	 * @param ResultWrapper $res Database result
	 * @param int $batchSize
	 * @param bool $isComplete Whether $res includes all the backlinks
	 * @throws MWException If a batch starts at a higher page ID than it ends
	 * @return array [ 'numRows' => int, 'batches' => array of [ $start, $end ] ]
	 */
	protected function partitionResult( $res, $batchSize, $isComplete = true ) {
		$batches = [];
		$numRows = $res->numRows();
		// ceil() returns a float, hence the loose comparison against $i below
		$numBatches = ceil( $numRows / $batchSize );

		for ( $i = 0; $i < $numBatches; $i++ ) {
			if ( $i === 0 && $isComplete ) {
				$start = false;
			} else {
				$rowNum = $i * $batchSize;
				$res->seek( $rowNum );
				$row = $res->fetchObject();
				$start = (int)$row->page_id;
			}

			if ( $i == ( $numBatches - 1 ) && $isComplete ) {
				$end = false;
			} else {
				// The final batch may be short, so clamp to the last row
				$rowNum = min( $numRows - 1, ( $i + 1 ) * $batchSize - 1 );
				$res->seek( $rowNum );
				$row = $res->fetchObject();
				$end = (int)$row->page_id;
			}

			# Sanity check order. false means "open ended"; compare strictly so
			# that an ID of 0 is not mistaken for an open bound.
			if ( $start !== false && $end !== false && $start > $end ) {
				throw new MWException( __METHOD__ . ': Internal error: query result out of order' );
			}

			$batches[] = [ $start, $end ];
		}

		return [ 'numRows' => $numRows, 'batches' => $batches ];
	}
485
486
	/**
	 * Get a Title iterator for cascade-protected template/file use backlinks.
	 *
	 * Selects the pages that link to this title through templatelinks (and,
	 * for titles in the file namespace, through imagelinks as well) and
	 * that have a page_restrictions row with pr_cascade = 1. Rows are
	 * de-duplicated by page ID.
	 *
	 * @return TitleArray
	 * @since 1.25
	 */
	public function getCascadeProtectedLinks() {
		$dbr = $this->getDB();
		$namespace = $this->title->getNamespace();
		$dbKey = $this->title->getDBkey();
		$pageFields = [ 'page_namespace', 'page_title', 'page_id' ];
		$distinct = [ 'DISTINCT' ];

		// @todo: use UNION without breaking tests that use temp tables
		$resSets = [
			$dbr->select(
				[ 'templatelinks', 'page_restrictions', 'page' ],
				$pageFields,
				[
					'tl_namespace' => $namespace,
					'tl_title' => $dbKey,
					'tl_from = pr_page',
					'pr_cascade' => 1,
					'page_id = tl_from'
				],
				__METHOD__,
				$distinct
			)
		];
		if ( $namespace == NS_FILE ) {
			$resSets[] = $dbr->select(
				[ 'imagelinks', 'page_restrictions', 'page' ],
				$pageFields,
				[
					'il_to' => $dbKey,
					'il_from = pr_page',
					'pr_cascade' => 1,
					'page_id = il_from'
				],
				__METHOD__,
				$distinct
			);
		}

		// Combine the result sets; keying by page ID drops duplicates
		$rowsByPageId = [];
		foreach ( $resSets as $resSet ) {
			foreach ( $resSet as $row ) {
				$rowsByPageId[$row->page_id] = $row;
			}
		}

		$merged = new FakeResultWrapper( array_values( $rowsByPageId ) );

		return TitleArray::newFromResult( $merged );
	}
536
}
537