Completed
Branch master (771964)
by
unknown
26:13
created

UpdateCollation   B

Complexity

Total Complexity 44

Size/Duplication

Total Lines 302
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 5

Importance

Changes 1
Bugs 0 Features 1
Metric Value
wmc 44
lcom 1
cbo 5
dl 0
loc 302
rs 8.3396
c 1
b 0
f 1

5 Methods

Rating   Name   Duplication   Size   Complexity  
B __construct() 0 27 1
F execute() 0 164 24
B getBatchCondition() 0 31 6
A updateSortKeySizeHistogram() 0 7 2
C showSortKeySizeHistogram() 0 55 11

How to fix   Complexity   

Complex Class

Complex classes like UpdateCollation often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use UpdateCollation, and based on these observations, apply Extract Interface, too.

1
<?php
0 ignored issues
show
Coding Style Compatibility introduced by
For compatibility and reusability of your code, PSR1 recommends that a file should introduce either new symbols (like classes, functions, etc.) or have side-effects (like outputting something, or including other files), but not both at the same time. The first symbol is defined on line 35 and the first side effect is on line 27.

The PSR-1: Basic Coding Standard recommends that a file should either introduce new symbols, that is classes, functions, constants or similar, or have side effects. Side effects are anything that executes logic, like for example printing output, changing ini settings or writing to a file.

The idea behind this recommendation is that merely auto-loading a class should not change the state of an application. It also promotes a cleaner style of programming and makes your code less prone to errors, because the logic is not spread out all over the place.

To learn more about the PSR-1, please see the PHP-FIG site on the PSR-1.

Loading history...
2
/**
3
 * Find all rows in the categorylinks table whose collation is out-of-date
4
 * (cl_collation != $wgCategoryCollation) and repopulate cl_sortkey
5
 * using the page title and cl_sortkey_prefix.
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License along
18
 * with this program; if not, write to the Free Software Foundation, Inc.,
19
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20
 * http://www.gnu.org/copyleft/gpl.html
21
 *
22
 * @file
23
 * @ingroup Maintenance
24
 * @author Aryeh Gregor (Simetrical)
25
 */
26
27
require_once __DIR__ . '/Maintenance.php';
28
29
/**
30
 * Maintenance script that will find all rows in the categorylinks table
31
 * whose collation is out-of-date.
32
 *
33
 * @ingroup Maintenance
34
 */
35
class UpdateCollation extends Maintenance {
36
	const BATCH_SIZE = 100; // Number of rows to process in one batch
37
	const SYNC_INTERVAL = 20; // Wait for slaves after this many batches
38
39
	public $sizeHistogram = [];
40
41
	/**
	 * Register the script's description and its command-line options.
	 */
	public function __construct() {
		global $wgCategoryCollation;

		parent::__construct();

		// The configured collation name is interpolated into the help text.
		$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix.  If all collations are
up-to-date, it will do nothing.
TEXT;
		$this->addDescription( $description );

		$this->addOption(
			'force',
			'Run on all rows, even if the collation is supposed to be up-to-date.'
		);
		// NOTE(review): the trailing "false, true" arguments presumably mean
		// "not required, takes a value" - confirm against Maintenance::addOption().
		$this->addOption(
			'previous-collation',
			'Set the previous value of $wgCategoryCollation here to speed up this ' .
				'script, especially if your categorylinks table is large. This will ' .
				'only update rows with that collation, though, so it may miss ' .
				'out-of-date rows with a different, even older collation.',
			false,
			true
		);
		$this->addOption(
			'target-collation',
			'Set this to the new collation type to use instead of ' .
				'$wgCategoryCollation. Usually you should not use this, you should ' .
				'just update $wgCategoryCollation in LocalSettings.php.',
			false,
			true
		);
		$this->addOption(
			'dry-run',
			"Don't actually change the collations, just compile statistics."
		);
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}
68
69
	/**
	 * Recompute cl_sortkey, cl_sortkey_prefix, cl_collation and cl_type for
	 * categorylinks rows, working through the table in batches of
	 * self::BATCH_SIZE rows.
	 *
	 * Options read here:
	 *  - target-collation: collation to convert to (default: $wgCategoryCollation)
	 *  - previous-collation: only select rows that currently have this collation
	 *  - force: select every row regardless of its current collation
	 *  - dry-run: select every row but write nothing to the database
	 *  - verbose-stats: record sort key sizes and print a histogram at the end
	 */
	public function execute() {
		global $wgCategoryCollation;

		$dbw = $this->getDB( DB_MASTER );
		$force = $this->getOption( 'force' );
		$dryRun = $this->getOption( 'dry-run' );
		$verboseStats = $this->getOption( 'verbose-stats' );
		$hasPreviousCollation = $this->hasOption( 'previous-collation' );

		if ( $this->hasOption( 'target-collation' ) ) {
			$collationName = $this->getOption( 'target-collation' );
			$collation = Collation::factory( $collationName );
		} else {
			$collationName = $wgCategoryCollation;
			$collation = Collation::singleton();
		}

		// Collation sanity check: in some cases the constructor will work,
		// but this will raise an exception, breaking all category pages
		$collation->getFirstLetter( 'MediaWiki' );

		// Locally at least, (my local is a rather old version of mysql)
		// mysql seems to filesort if there is both an equality
		// (but not for an inequality) condition on cl_collation in the
		// WHERE and it is also the first item in the ORDER BY.
		$options = [
			'LIMIT' => self::BATCH_SIZE,
			'ORDER BY' => $hasPreviousCollation
				? 'cl_to, cl_type, cl_from'
				: 'cl_collation, cl_to, cl_type, cl_from',
		];

		// Start from an explicit empty list so no branch below appends to an
		// undefined variable. With --force or --dry-run it stays empty and
		// every row is selected.
		$collationConds = [];
		if ( !$force && !$dryRun ) {
			if ( $hasPreviousCollation ) {
				$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
			} else {
				$collationConds[] = 'cl_collation != ' . $dbw->addQuotes( $collationName );
			}

			$count = $dbw->estimateRowCount(
				'categorylinks',
				'*',
				$collationConds,
				__METHOD__
			);
			// Improve estimate if feasible
			if ( $count < 1000000 ) {
				$count = $dbw->selectField(
					'categorylinks',
					'COUNT(*)',
					$collationConds,
					__METHOD__
				);
			}
			if ( $count == 0 ) {
				$this->output( "Collations up-to-date.\n" );

				return;
			}
			$this->output( "Fixing collation for $count rows.\n" );
		}

		// cl_type must be selected as a number for proper paging because
		// enums suck. The database type cannot change between batches, so
		// this is worked out once, outside the loop.
		$clType = $dbw->getType() === 'mysql'
			? 'cl_type+0 AS "cl_type_numeric"'
			: 'cl_type';

		$count = 0;
		$batchCount = 0;
		$batchConds = [];
		do {
			$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );

			$res = $dbw->select(
				[ 'categorylinks', 'page' ],
				[ 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', $clType,
					'page_namespace', 'page_title'
				],
				array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( !$dryRun ) {
				$this->beginTransaction( $dbw, __METHOD__ );
			}
			$lastRow = null;
			foreach ( $res as $row ) {
				$title = Title::newFromRow( $row );
				$prefix = $this->getSortKeyPrefixForRow( $row, $title );
				$newSortKey = $collation->getSortKey(
					$title->getCategorySortkey( $prefix ) );
				if ( $verboseStats ) {
					$this->updateSortKeySizeHistogram( $newSortKey );
				}

				if ( !$dryRun ) {
					$dbw->update(
						'categorylinks',
						[
							'cl_sortkey' => $newSortKey,
							'cl_sortkey_prefix' => $prefix,
							'cl_collation' => $collationName,
							# cl_type will be wrong for lots of pages if cl_collation is 0,
							# so let's update it while we're here.
							'cl_type' => $this->getLinkTypeForTitle( $title ),
							// NOTE(review): presumably keeps the stored timestamp from
							// being auto-updated by this write - confirm against schema.
							'cl_timestamp = cl_timestamp',
						],
						[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
						__METHOD__
					);
				}
				$lastRow = $row;
			}
			// Only the final row of the batch determines where the next batch
			// starts, so build the paging condition once instead of per row.
			// An empty batch leaves the previous condition untouched.
			if ( $lastRow !== null ) {
				$batchConds = [ $this->getBatchCondition( $lastRow, $dbw ) ];
			}
			if ( !$dryRun ) {
				$this->commitTransaction( $dbw, __METHOD__ );
			}

			$count += $res->numRows();
			$this->output( "$count done.\n" );

			if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
				$this->output( "Waiting for slaves ... " );
				wfWaitForSlaves();
				$this->output( "done\n" );
			}
		} while ( $res->numRows() == self::BATCH_SIZE );

		$this->output( "$count rows processed\n" );

		if ( $verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Work out the sortkey prefix to store for a categorylinks row.
	 *
	 * @param stdClass $row Row providing cl_collation, cl_sortkey and cl_sortkey_prefix
	 * @param Title $title Title built from the same row
	 * @return string
	 */
	private function getSortKeyPrefixForRow( $row, $title ) {
		if ( $row->cl_collation ) {
			return $row->cl_sortkey_prefix;
		}

		# This is an old-style row, so the sortkey needs to be
		# converted.
		if ( $row->cl_sortkey == $title->getText()
			|| $row->cl_sortkey == $title->getPrefixedText()
		) {
			return '';
		}

		# Custom sortkey, use it as a prefix
		return $row->cl_sortkey;
	}

	/**
	 * Map a title's namespace to the cl_type value written for it.
	 *
	 * @param Title $title
	 * @return string 'subcat', 'file' or 'page'
	 */
	private function getLinkTypeForTitle( $title ) {
		$namespace = $title->getNamespace();
		if ( $namespace == NS_CATEGORY ) {
			return 'subcat';
		}
		if ( $namespace == NS_FILE ) {
			return 'file';
		}

		return 'page';
	}
233
234
	/**
	 * Build an SQL expression matching the rows that sort strictly after the
	 * given row. The ordering assumed is cl_to, cl_type, cl_from, preceded by
	 * cl_collation unless the previous-collation option is set.
	 *
	 * @param stdClass $row Row to continue after
	 * @param DatabaseBase $dbw
	 * @return string
	 */
	function getBatchCondition( $row, $dbw ) {
		$fields = [ 'cl_to', 'cl_type', 'cl_from' ];
		if ( !$this->hasOption( 'previous-collation' ) ) {
			array_unshift( $fields, 'cl_collation' );
		}
		$isMysql = $dbw->getType() === 'mysql';

		// "col = value" terms for all columns already visited
		$matchedSoFar = [];
		// One alternative per column: equal on all earlier columns, greater on this one
		$alternatives = [];
		foreach ( $fields as $column ) {
			if ( $isMysql && $column === 'cl_type' ) {
				// Range conditions with enums are weird in mysql
				// This must be a numeric literal, or it won't work.
				$literal = intval( $row->cl_type_numeric );
			} else {
				$literal = $dbw->addQuotes( $row->$column );
			}

			$isGreater = "$column > $literal";
			$alternatives[] = $matchedSoFar
				? '(' . implode( ' AND ', $matchedSoFar ) . " AND $isGreater)"
				: $isGreater;
			$matchedSoFar[] = "$column = $literal";
		}

		return implode( ' OR ', $alternatives );
	}
272
273
	/**
	 * Count one more sort key of the given byte length in $this->sizeHistogram.
	 *
	 * @param string $key Sort key; its strlen() is the histogram index
	 */
	function updateSortKeySizeHistogram( $key ) {
		$bytes = strlen( $key );
		$seenSoFar = isset( $this->sizeHistogram[$bytes] )
			? $this->sizeHistogram[$bytes]
			: 0;
		$this->sizeHistogram[$bytes] = $seenSoFar + 1;
	}
280
281
	/**
	 * Print the recorded sort key sizes: first the raw count for every length
	 * from 0 to the largest one seen, then a 20-bin text histogram in which
	 * the fullest bin is drawn 60 asterisks wide.
	 *
	 * Prints nothing when no sizes were recorded, or when the largest
	 * recorded length is 0.
	 */
	function showSortKeySizeHistogram() {
		if ( !$this->sizeHistogram ) {
			// max() cannot be applied to an empty array (it throws on PHP 8),
			// and there is nothing to display anyway.
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}

		$numBins = 20;
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		// Exclusive upper bound of each bin; the final bin always ends just
		// past the largest recorded length.
		$coarseBoundaries = [];
		$boundary = 0;
		for ( $i = 0; $i < $numBins - 1; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = round( $boundary );
		}
		$coarseBoundaries[$numBins - 1] = $maxLength + 1;

		$rawCounts = [];
		for ( $length = 0; $length <= $maxLength; $length++ ) {
			$val = isset( $this->sizeHistogram[$length] )
				? $this->sizeHistogram[$length]
				: 0;
			$rawCounts[] = $val;

			// The last boundary is $maxLength + 1, so a matching bin always exists.
			foreach ( $coarseBoundaries as $coarseIndex => $upperBound ) {
				if ( $upperBound > $length ) {
					$coarseHistogram[$coarseIndex] += $val;
					break;
				}
			}
		}
		$raw = implode( ', ', $rawCounts );

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		$maxBinVal = max( $coarseHistogram );
		// Guard against a histogram whose counts are all zero.
		$scale = $maxBinVal > 0 ? 60 / $maxBinVal : 0;
		$prevBoundary = 0;
		foreach ( $coarseHistogram as $coarseIndex => $val ) {
			$boundary = $coarseBoundaries[$coarseIndex];
			// str_repeat() needs an integer count; truncate explicitly rather
			// than relying on an implicit float-to-int conversion.
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				str_repeat( '*', (int)( $scale * $val ) ) ) );
			$prevBoundary = $boundary;
		}
	}
336
}
337
338
// Name this script's class in $maintClass, then include the file named by the
// RUN_MAINTENANCE_IF_MAIN constant.
// NOTE(review): the constant is presumably defined by Maintenance.php and the
// included file presumably runs $maintClass when this script is the entry
// point - confirm.
$maintClass = "UpdateCollation";
339
require_once RUN_MAINTENANCE_IF_MAIN;
340