Completed
Branch master (939199)
by
unknown
39:35
created

maintenance/updateCollation.php (2 issues)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
/**
3
 * Find all rows in the categorylinks table whose collation is out-of-date
4
 * (cl_collation != $wgCategoryCollation) and repopulate cl_sortkey
5
 * using the page title and cl_sortkey_prefix.
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License along
18
 * with this program; if not, write to the Free Software Foundation, Inc.,
19
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20
 * http://www.gnu.org/copyleft/gpl.html
21
 *
22
 * @file
23
 * @ingroup Maintenance
24
 * @author Aryeh Gregor (Simetrical)
25
 */
26
27
require_once __DIR__ . '/Maintenance.php';
28
29
/**
30
 * Maintenance script that will find all rows in the categorylinks table
31
 * whose collation is out-of-date.
32
 *
33
 * @ingroup Maintenance
34
 */
35
class UpdateCollation extends Maintenance {
36
	const BATCH_SIZE = 100; // Number of rows to process in one batch
37
	const SYNC_INTERVAL = 5; // Wait for replica DBs after this many batches
38
39
	public $sizeHistogram = [];
40
41
	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		global $wgCategoryCollation;

		parent::__construct();

		// The configured collation name is interpolated into the help text.
		$description = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix.  If all collations are
up-to-date, it will do nothing.
TEXT;
		$this->addDescription( $description );

		$forceHelp = 'Run on all rows, even if the collation is supposed to be up-to-date.';
		$this->addOption( 'force', $forceHelp, false, false, 'f' );

		$previousHelp = 'Set the previous value of $wgCategoryCollation here to speed up ' .
			'this script, especially if your categorylinks table is large. This will ' .
			'only update rows with that collation, though, so it may miss out-of-date ' .
			'rows with a different, even older collation.';
		$this->addOption( 'previous-collation', $previousHelp, false, true );

		$targetHelp = 'Set this to the new collation type to use instead of ' .
			'$wgCategoryCollation. Usually you should not use this, you should just ' .
			'update $wgCategoryCollation in LocalSettings.php.';
		$this->addOption( 'target-collation', $targetHelp, false, true );

		$dryRunHelp = "Don't actually change the collations, just compile statistics.";
		$this->addOption( 'dry-run', $dryRunHelp );

		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}
68
69
	/**
	 * Recompute cl_sortkey (and fix cl_sortkey_prefix, cl_collation and
	 * cl_type) for categorylinks rows, working in batches of BATCH_SIZE rows.
	 *
	 * Unless --force is given, only rows whose cl_collation differs from the
	 * target collation (or equals --previous-collation, when given) are
	 * selected, and the script returns early if there are none. With
	 * --dry-run nothing is written; rows are only counted and, with
	 * --verbose-stats, measured for the sort key size histogram.
	 */
	public function execute() {
		global $wgCategoryCollation;

		$dbw = $this->getDB( DB_MASTER );
		$dbr = $this->getDB( DB_REPLICA );
		$force = $this->getOption( 'force' );
		$dryRun = $this->getOption( 'dry-run' );
		$verboseStats = $this->getOption( 'verbose-stats' );
		$hasPreviousCollation = $this->hasOption( 'previous-collation' );
		if ( $this->hasOption( 'target-collation' ) ) {
			$collationName = $this->getOption( 'target-collation' );
			$collation = Collation::factory( $collationName );
		} else {
			$collationName = $wgCategoryCollation;
			$collation = Collation::singleton();
		}

		// Collation sanity check: in some cases the constructor will work,
		// but this will raise an exception, breaking all category pages
		$collation->getFirstLetter( 'MediaWiki' );

		// Locally at least, (my local is a rather old version of mysql)
		// mysql seems to filesort if there is both an equality
		// (but not for an inequality) condition on cl_collation in the
		// WHERE and it is also the first item in the ORDER BY.
		if ( $hasPreviousCollation ) {
			$orderBy = 'cl_to, cl_type, cl_from';
		} else {
			$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
		}
		$options = [
			'LIMIT' => self::BATCH_SIZE,
			'ORDER BY' => $orderBy,
			'STRAIGHT_JOIN' // per T58041
		];

		// Always start from an empty condition list so every branch below
		// works on an initialised array.
		$collationConds = [];
		if ( !$force ) {
			if ( $hasPreviousCollation ) {
				$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
			} else {
				$collationConds[] = 'cl_collation != ' . $dbw->addQuotes( $collationName );
			}

			$count = $dbr->estimateRowCount(
				'categorylinks',
				'*',
				$collationConds,
				__METHOD__
			);
			// Improve estimate if feasible
			if ( $count < 1000000 ) {
				$count = $dbr->selectField(
					'categorylinks',
					'COUNT(*)',
					$collationConds,
					__METHOD__
				);
			}
			if ( $count == 0 ) {
				$this->output( "Collations up-to-date.\n" );

				return;
			}
			if ( $dryRun ) {
				$this->output( "$count rows would be updated.\n" );
			} else {
				$this->output( "Fixing collation for $count rows.\n" );
			}
			// NOTE(review): wfWaitForSlaves() is deprecated since 1.27 in favour
			// of LBFactory::waitForReplication. Kept as-is here because the
			// replacement's timeout/error behaviour should be confirmed first.
			wfWaitForSlaves();
		}

		// cl_type must be selected as a number for proper paging because
		// enums suck. The DB type cannot change between batches, so the
		// select expression is built once, outside the loop.
		if ( $dbw->getType() === 'mysql' ) {
			$clType = 'cl_type+0 AS "cl_type_numeric"';
		} else {
			$clType = 'cl_type';
		}

		$count = 0;
		$batchCount = 0;
		$batchConds = [];
		do {
			$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );

			$res = $dbw->select(
				[ 'categorylinks', 'page' ],
				[ 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', $clType,
					'page_namespace', 'page_title'
				],
				array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( !$dryRun ) {
				$this->beginTransaction( $dbw, __METHOD__ );
			}
			$lastRow = null;
			foreach ( $res as $row ) {
				$title = Title::newFromRow( $row );
				if ( !$row->cl_collation ) {
					# This is an old-style row, so the sortkey needs to be
					# converted.
					if ( $row->cl_sortkey == $title->getText()
						|| $row->cl_sortkey == $title->getPrefixedText()
					) {
						$prefix = '';
					} else {
						# Custom sortkey, use it as a prefix
						$prefix = $row->cl_sortkey;
					}
				} else {
					$prefix = $row->cl_sortkey_prefix;
				}
				# cl_type will be wrong for lots of pages if cl_collation is 0,
				# so let's update it while we're here.
				if ( $title->getNamespace() == NS_CATEGORY ) {
					$type = 'subcat';
				} elseif ( $title->getNamespace() == NS_FILE ) {
					$type = 'file';
				} else {
					$type = 'page';
				}
				$newSortKey = $collation->getSortKey(
					$title->getCategorySortkey( $prefix ) );
				if ( $verboseStats ) {
					$this->updateSortKeySizeHistogram( $newSortKey );
				}

				if ( !$dryRun ) {
					$dbw->update(
						'categorylinks',
						[
							'cl_sortkey' => $newSortKey,
							'cl_sortkey_prefix' => $prefix,
							'cl_collation' => $collationName,
							'cl_type' => $type,
							// Self-assignment, presumably to keep the DB from
							// auto-updating the timestamp column.
							'cl_timestamp = cl_timestamp',
						],
						[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
						__METHOD__
					);
				}
				$lastRow = $row;
			}
			// Only the last row of a batch determines where the next batch
			// starts, so the paging condition is built once per batch.
			if ( $lastRow !== null ) {
				$batchConds = [ $this->getBatchCondition( $lastRow, $dbw ) ];
			}
			if ( !$dryRun ) {
				$this->commitTransaction( $dbw, __METHOD__ );
			}

			$count += $res->numRows();
			$this->output( "$count done.\n" );

			if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
				$this->output( "Waiting for replica DBs ... " );
				// NOTE(review): deprecated since 1.27, see the note on the
				// earlier wfWaitForSlaves() call in this method.
				wfWaitForSlaves();
				$this->output( "done\n" );
			}
			// A short batch means the result set is exhausted.
		} while ( $res->numRows() == self::BATCH_SIZE );

		$this->output( "$count rows processed\n" );

		if ( $verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}
240
241
	/**
	 * Build the SQL paging condition that matches every row sorting strictly
	 * after the given row.
	 *
	 * The ordering is cl_collation, cl_to, cl_type, cl_from; cl_collation is
	 * left out when the previous-collation option is set. On mysql the row
	 * must carry cl_type_numeric, which is compared as a bare integer.
	 *
	 * @param stdClass $row
	 * @param Database $dbw
	 * @return string
	 */
	function getBatchCondition( $row, $dbw ) {
		$fields = [ 'cl_to', 'cl_type', 'cl_from' ];
		if ( !$this->hasOption( 'previous-collation' ) ) {
			array_unshift( $fields, 'cl_collation' );
		}

		// "field = value" terms for all fields handled so far
		$equalities = [];
		// One alternative per field: earlier fields equal, this one greater
		$alternatives = [];
		foreach ( $fields as $field ) {
			if ( $field === 'cl_type' && $dbw->getType() === 'mysql' ) {
				// Range conditions with enums are weird in mysql
				// This must be a numeric literal, or it won't work.
				$encValue = (int)$row->cl_type_numeric;
			} else {
				$encValue = $dbw->addQuotes( $row->$field );
			}

			$greater = "$field > $encValue";
			if ( $equalities ) {
				$alternatives[] = '(' . implode( ' AND ', $equalities ) . " AND $greater)";
			} else {
				$alternatives[] = $greater;
			}
			$equalities[] = "$field = $encValue";
		}

		return implode( ' OR ', $alternatives );
	}
279
280
	/**
	 * Count one more sort key in the histogram bucket for its byte length.
	 *
	 * @param string $key Sort key to measure
	 */
	function updateSortKeySizeHistogram( $key ) {
		$size = strlen( $key );
		$seenSoFar = isset( $this->sizeHistogram[$size] )
			? $this->sizeHistogram[$size]
			: 0;
		$this->sizeHistogram[$size] = $seenSoFar + 1;
	}
287
288
	/**
	 * Print the sort key size histogram collected in $this->sizeHistogram.
	 *
	 * Outputs the raw per-length counts, then a 20-bin bar chart scaled so
	 * that the fullest bin is 60 characters wide. Nothing is printed when no
	 * key was recorded or when the longest recorded key has length 0.
	 */
	function showSortKeySizeHistogram() {
		// No key was recorded (e.g. no rows were processed); max() must not
		// be called on an empty array.
		if ( !$this->sizeHistogram ) {
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}
		$numBins = 20;
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		// Exclusive upper bound of each bin; the last bin ends just past
		// the longest key so that every length falls into some bin.
		$coarseBoundaries = [];
		$boundary = 0;
		for ( $i = 0; $i < $numBins - 1; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = round( $boundary );
		}
		$coarseBoundaries[$numBins - 1] = $maxLength + 1;
		$raw = '';
		for ( $i = 0; $i <= $maxLength; $i++ ) {
			if ( $raw !== '' ) {
				$raw .= ', ';
			}
			$val = isset( $this->sizeHistogram[$i] ) ? $this->sizeHistogram[$i] : 0;
			// Find the first bin whose upper bound lies above this length;
			// if none of the leading bins match, the loop ends on the last bin.
			for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
				if ( $coarseBoundaries[$coarseIndex] > $i ) {
					break;
				}
			}
			$coarseHistogram[$coarseIndex] += $val;
			$raw .= $val;
		}

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		// At least one key was recorded, so the fullest bin is non-zero.
		$maxBinVal = max( $coarseHistogram );
		$scale = 60 / $maxBinVal;
		$prevBoundary = 0;
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			$val = $coarseHistogram[$coarseIndex];
			$boundary = $coarseBoundaries[$coarseIndex];
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				// Explicit truncation: str_repeat() expects an integer count
				str_repeat( '*', (int)( $scale * $val ) ) ) );
			$prevBoundary = $boundary;
		}
	}
343
}
344
345
// Tell the maintenance runner which class implements this script.
$maintClass = UpdateCollation::class;
require_once RUN_MAINTENANCE_IF_MAIN;
347