CopyFileBackend::filesAreSame()   D
last analyzed

Complexity

Conditions 9
Paths 36

Size

Total Lines 41
Code Lines 24

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 9
eloc 24
nc 36
nop 4
dl 0
loc 41
rs 4.909
c 0
b 0
f 0
1
<?php
0 ignored issues
show
Coding Style Compatibility introduced by
For compatibility and reusability of your code, PSR-1 recommends that a file should introduce either new symbols (like classes, functions, etc.) or have side-effects (like outputting something, or including other files), but not both at the same time. The first symbol is defined on line 37 and the first side effect is on line 24.

The PSR-1: Basic Coding Standard recommends that a file should either introduce new symbols, that is classes, functions, constants or similar, or have side effects. Side effects are anything that executes logic, like for example printing output, changing ini settings or writing to a file.

The idea behind this recommendation is that merely auto-loading a class should not change the state of an application. It also promotes a cleaner style of programming and makes your code less prone to errors, because the logic is not spread out all over the place.

To learn more about the PSR-1, please see the PHP-FIG site on the PSR-1.

Loading history...
2
/**
3
 * Copy all files in some containers of one backend to another.
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License along
16
 * with this program; if not, write to the Free Software Foundation, Inc.,
17
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18
 * http://www.gnu.org/copyleft/gpl.html
19
 *
20
 * @file
21
 * @ingroup Maintenance
22
 */
23
24
require_once __DIR__ . '/Maintenance.php';
25
26
/**
27
 * Copy all files in one container of one backend to another.
28
 *
29
 * This can also be used to re-shard the files for one backend using the
30
 * config of second backend. The second backend should have the same config
31
 * as the first, except for it having a different name and different sharding
32
 * configuration. The backend should be made read-only while this runs.
33
 * After this script finishes, the old files in the containers can be deleted.
34
 *
35
 * @ingroup Maintenance
36
 */
37
class CopyFileBackend extends Maintenance {
38
	/** @var array|null (path sha1 => stat) Pre-computed dst stat entries from listings */
39
	protected $statCache = null;
40
41
	/**
	 * Set the script description, register its command-line options and
	 * set the initial batch size to 50.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Copy files in one backend to another.' );

		// Registered with both trailing flags true
		// (presumably "required" and "takes a value" - confirm against Maintenance::addOption)
		$mandatoryValued = [
			'src' => 'Backend containing the source files',
			'dst' => 'Backend where files should be copied to',
			'containers' => 'Pipe separated list of containers',
		];
		foreach ( $mandatoryValued as $optName => $optDesc ) {
			$this->addOption( $optName, $optDesc, true, true );
		}

		// Registered with only the last flag true
		$optionalValued = [
			'subdir' => 'Only do items in this child directory',
			'ratefile' => 'File to check periodically for batch size',
		];
		foreach ( $optionalValued as $optName => $optDesc ) {
			$this->addOption( $optName, $optDesc, false, true );
		}

		// Registered with name and description only
		$switches = [
			'prestat' => 'Stat the destination files first (try to use listings)',
			'skiphash' => 'Skip SHA-1 sync checks for files',
			'missingonly' => 'Only copy files missing from destination listing',
			'syncviadelete' => 'Delete destination files missing from source listing',
			'utf8only' => 'Skip source files that do not have valid UTF-8 names',
		];
		foreach ( $switches as $optName => $optDesc ) {
			$this->addOption( $optName, $optDesc );
		}

		$this->setBatchSize( 50 );
	}
56
57
	/**
	 * Copy the files of every requested container from the 'src' backend to
	 * the 'dst' backend and, with --syncviadelete, delete destination files
	 * that are absent from the source listing.
	 *
	 * Calls to error() that pass 1 as second argument are the ones the
	 * original code marks as "die": the script is expected to stop there,
	 * so the listings used afterwards are never null.
	 *
	 * @return void
	 */
	public function execute() {
		$src = FileBackendGroup::singleton()->get( $this->getOption( 'src' ) );
		$dst = FileBackendGroup::singleton()->get( $this->getOption( 'dst' ) );
		$containers = explode( '|', $this->getOption( 'containers' ) );
		$subDir = rtrim( $this->getOption( 'subdir', '' ), '/' );

		$rateFile = $this->getOption( 'ratefile' );

		foreach ( $containers as $container ) {
			if ( $subDir != '' ) {
				$backendRel = "$container/$subDir";
				$this->output( "Doing container '$container', directory '$subDir'...\n" );
			} else {
				$backendRel = $container;
				$this->output( "Doing container '$container'...\n" );
			}

			if ( $this->hasOption( 'missingonly' ) ) {
				$this->output( "\tBuilding list of missing files..." );
				$srcPathsRel = $this->getListingDiffRel( $src, $dst, $backendRel );
				$this->output( count( $srcPathsRel ) . " file(s) need to be copied.\n" );
			} else {
				$srcPathsRel = $src->getFileList( [
					'dir' => $src->getRootStoragePath() . "/$backendRel",
					'adviseStat' => true // avoid HEADs
				] );
				if ( $srcPathsRel === null ) {
					$this->error( "Could not list files in $container.", 1 ); // die
				}
			}

			if ( $this->getOption( 'prestat' ) && !$this->hasOption( 'missingonly' ) ) {
				// Build the stat cache for the destination files
				$this->output( "\tBuilding destination stat cache..." );
				$dstPathsRel = $dst->getFileList( [
					'dir' => $dst->getRootStoragePath() . "/$backendRel",
					'adviseStat' => true // avoid HEADs
				] );
				if ( $dstPathsRel === null ) {
					$this->error( "Could not list files in $container.", 1 ); // die
				}
				$this->statCache = [];
				foreach ( $dstPathsRel as $dstPathRel ) {
					$path = $dst->getRootStoragePath() . "/$backendRel/$dstPathRel";
					// Keyed by SHA-1 of the full path; filesAreSame() looks entries up the same way
					$this->statCache[sha1( $path )] = $dst->getFileStat( [ 'src' => $path ] );
				}
				$this->output( "done [" . count( $this->statCache ) . " file(s)]\n" );
			}

			$this->output( "\tCopying file(s)...\n" );
			$count = $this->runInBatches(
				$srcPathsRel,
				$rateFile,
				function ( array $batch ) use ( $backendRel, $src, $dst ) {
					$this->copyFileBatch( $batch, $backendRel, $src, $dst );
				}
			);
			// $count is the number of listed paths iterated, not the number actually stored
			$this->output( "\tCopied $count file(s).\n" );

			if ( $this->hasOption( 'syncviadelete' ) ) {
				$this->output( "\tBuilding list of excess destination files..." );
				// Arguments are swapped on purpose: paths in $dst that are not in $src
				$delPathsRel = $this->getListingDiffRel( $dst, $src, $backendRel );
				$this->output( count( $delPathsRel ) . " file(s) need to be deleted.\n" );

				$this->output( "\tDeleting file(s)...\n" );
				$count = $this->runInBatches(
					$delPathsRel,
					$rateFile,
					function ( array $batch ) use ( $backendRel, $dst ) {
						$this->delFileBatch( $batch, $backendRel, $dst );
					}
				);

				$this->output( "\tDeleted $count file(s).\n" );
			}

			if ( $subDir != '' ) {
				$this->output( "Finished container '$container', directory '$subDir'.\n" );
			} else {
				$this->output( "Finished container '$container'.\n" );
			}
		}

		$this->output( "Done.\n" );
	}

	/**
	 * Walk a listing of relative paths and hand them to a callback in
	 * de-duplicated batches of at most $this->mBatchSize entries.
	 *
	 * When a rate file is given, it is re-read before the first path and
	 * then every 500 paths; its integer content (minimum 1) becomes the new
	 * batch size.
	 *
	 * @param array|Traversable $pathsRel Relative paths to process
	 * @param string|null $rateFile Value of the 'ratefile' option, if any
	 * @param callable $handleBatch Invoked with one array of relative paths per batch
	 * @return int Number of paths iterated (duplicates included)
	 */
	private function runInBatches( $pathsRel, $rateFile, callable $handleBatch ) {
		$count = 0;
		$batchPaths = [];
		foreach ( $pathsRel as $pathRel ) {
			// Check up on the rate file periodically to adjust the concurrency
			if ( $rateFile && ( $count % 500 ) == 0 ) {
				$this->mBatchSize = max( 1, (int)file_get_contents( $rateFile ) );
				$this->output( "\tBatch size is now {$this->mBatchSize}.\n" );
			}
			$batchPaths[$pathRel] = 1; // remove duplicates
			if ( count( $batchPaths ) >= $this->mBatchSize ) {
				$handleBatch( array_keys( $batchPaths ) );
				$batchPaths = []; // start the next batch
			}
			++$count;
		}
		if ( count( $batchPaths ) ) { // left-overs
			$handleBatch( array_keys( $batchPaths ) );
		}

		return $count;
	}
166
167
	/**
	 * List the relative paths found under "$backendRel" in $src that are
	 * not found under the same directory in $dst.
	 *
	 * Both listings are fetched up front; a listing of null is reported
	 * through error() with exit code 1.
	 *
	 * @param FileBackend $src Backend whose extra paths are returned
	 * @param FileBackend $dst Backend to compare against
	 * @param string $backendRel Directory relative to each backend's root storage path
	 * @return array (rel paths in $src minus those in $dst)
	 */
	protected function getListingDiffRel( FileBackend $src, FileBackend $dst, $backendRel ) {
		$srcDir = $src->getRootStoragePath() . "/$backendRel";
		$srcListing = $src->getFileList( [ 'dir' => $srcDir ] );
		if ( $srcListing === null ) {
			$this->error( "Could not list files in source container.", 1 ); // die
		}

		$dstDir = $dst->getRootStoragePath() . "/$backendRel";
		$dstListing = $dst->getFileList( [ 'dir' => $dstDir ] );
		if ( $dstListing === null ) {
			$this->error( "Could not list files in destination container.", 1 ); // die
		}

		// Remember every destination path by the SHA-1 of its relative path
		$knownInDst = [];
		foreach ( $dstListing as $relPath ) {
			$knownInDst[sha1( $relPath )] = 1;
		}
		unset( $dstListing ); // free

		// Keep the source paths whose hash was not seen in the destination
		$onlyInSrc = [];
		foreach ( $srcListing as $relPath ) {
			if ( isset( $knownInDst[sha1( $relPath )] ) ) {
				continue;
			}
			$onlyInSrc[] = $relPath;
		}
		unset( $srcListing ); // free

		return $onlyInSrc;
	}
201
202
	/**
	 * Copy one batch of files from $src to the same relative location in $dst.
	 *
	 * Unless --missingonly is set, files for which filesAreSame() returns
	 * true are skipped. Files that cannot be fetched locally are reported
	 * and skipped. A failed prepare(), or a store batch that still fails
	 * after one retry, is reported through error() with exit code 1.
	 *
	 * @param array $srcPathsRel Paths relative to "$backendRel" in the source backend
	 * @param string $backendRel Directory relative to each backend's root storage path
	 * @param FileBackend $src
	 * @param FileBackend $dst
	 * @return void
	 */
	protected function copyFileBatch(
		array $srcPathsRel, $backendRel, FileBackend $src, FileBackend $dst
	) {
		$ops = [];
		$fsFiles = [];
		$copiedRel = []; // for output message
		$wikiId = $src->getWikiId();

		// Download the batch of source files into backend cache...
		if ( $this->hasOption( 'missingonly' ) ) {
			$srcPaths = [];
			foreach ( $srcPathsRel as $srcPathRel ) {
				$srcPaths[] = $src->getRootStoragePath() . "/$backendRel/$srcPathRel";
			}
			$t_start = microtime( true );
			$fsFiles = $src->getLocalReferenceMulti( [ 'srcs' => $srcPaths, 'latest' => 1 ] );
			$elapsed_ms = floor( ( microtime( true ) - $t_start ) * 1000 );
			$this->output( "\n\tDownloaded these file(s) [{$elapsed_ms}ms]:\n\t" .
				implode( "\n\t", $srcPaths ) . "\n\n" );
		}

		// Determine what files need to be copied over...
		foreach ( $srcPathsRel as $srcPathRel ) {
			$srcPath = $src->getRootStoragePath() . "/$backendRel/$srcPathRel";
			$dstPath = $dst->getRootStoragePath() . "/$backendRel/$srcPathRel";
			if ( $this->hasOption( 'utf8only' ) && !mb_check_encoding( $srcPath, 'UTF-8' ) ) {
				$this->error( "$wikiId: Detected illegal (non-UTF8) path for $srcPath." );
				continue;
			} elseif ( !$this->hasOption( 'missingonly' )
				&& $this->filesAreSame( $src, $dst, $srcPath, $dstPath )
			) {
				$this->output( "\tAlready have $srcPathRel.\n" );
				continue; // assume already copied...
			}
			// Reuse the pre-downloaded reference when there is one for this path
			$fsFile = array_key_exists( $srcPath, $fsFiles )
				? $fsFiles[$srcPath]
				: $src->getLocalReference( [ 'src' => $srcPath, 'latest' => 1 ] );
			if ( !$fsFile ) {
				$src->clearCache( [ $srcPath ] );
				if ( $src->fileExists( [ 'src' => $srcPath, 'latest' => 1 ] ) === false ) {
					$this->error( "$wikiId: File '$srcPath' was listed but does not exist." );
				} else {
					$this->error( "$wikiId: Could not get local copy of $srcPath." );
				}
				continue;
			} elseif ( !$fsFile->exists() ) {
				// FSFileBackends just return the path for getLocalReference() and paths with
				// illegal slashes may get normalized to a different path. This can cause the
				// local reference to not exist...skip these broken files.
				$this->error( "$wikiId: Detected possible illegal path for $srcPath." );
				continue;
			}
			$fsFiles[] = $fsFile; // keep TempFSFile objects alive as needed
			// Note: prepare() is usually fast for key/value backends
			$status = $dst->prepare( [ 'dir' => dirname( $dstPath ), 'bypassReadOnly' => 1 ] );
			if ( !$status->isOK() ) {
				// getErrors() is declared on StatusValue; getErrorsArray() is not
				$this->error( print_r( $status->getErrors(), true ) );
				$this->error( "$wikiId: Could not copy $srcPath to $dstPath.", 1 ); // die
			}
			$ops[] = [ 'op' => 'store',
				'src' => $fsFile->getPath(), 'dst' => $dstPath, 'overwrite' => 1 ];
			$copiedRel[] = $srcPathRel;
		}

		// Copy in the batch of source files...
		$t_start = microtime( true );
		$status = $dst->doQuickOperations( $ops, [ 'bypassReadOnly' => 1 ] );
		if ( !$status->isOK() ) {
			sleep( 10 ); // wait and retry copy again
			$status = $dst->doQuickOperations( $ops, [ 'bypassReadOnly' => 1 ] );
		}
		// Elapsed time includes the 10 second pause when a retry happened
		$elapsed_ms = floor( ( microtime( true ) - $t_start ) * 1000 );
		if ( !$status->isOK() ) {
			// getErrors() is declared on StatusValue; getErrorsArray() is not
			$this->error( print_r( $status->getErrors(), true ) );
			$this->error( "$wikiId: Could not copy file batch.", 1 ); // die
		} elseif ( count( $copiedRel ) ) {
			$this->output( "\n\tCopied these file(s) [{$elapsed_ms}ms]:\n\t" .
				implode( "\n\t", $copiedRel ) . "\n\n" );
		}
	}
289
290
	/**
	 * Delete one batch of files from the destination backend.
	 *
	 * The delete batch is retried once after a 10 second pause; if it still
	 * fails, the errors are reported through error() with exit code 1.
	 *
	 * @param array $dstPathsRel Paths relative to "$backendRel" in the destination backend
	 * @param string $backendRel Directory relative to the backend's root storage path
	 * @param FileBackend $dst
	 * @return void
	 */
	protected function delFileBatch(
		array $dstPathsRel, $backendRel, FileBackend $dst
	) {
		$ops = [];
		$deletedRel = []; // for output message
		$wikiId = $dst->getWikiId();

		// Build one delete operation per destination file...
		foreach ( $dstPathsRel as $dstPathRel ) {
			$dstPath = $dst->getRootStoragePath() . "/$backendRel/$dstPathRel";
			$ops[] = [ 'op' => 'delete', 'src' => $dstPath ];
			$deletedRel[] = $dstPathRel;
		}

		// Delete the batch of destination files...
		$t_start = microtime( true );
		$status = $dst->doQuickOperations( $ops, [ 'bypassReadOnly' => 1 ] );
		if ( !$status->isOK() ) {
			sleep( 10 ); // wait and retry the delete again
			$status = $dst->doQuickOperations( $ops, [ 'bypassReadOnly' => 1 ] );
		}
		// Elapsed time includes the 10 second pause when a retry happened
		$elapsed_ms = floor( ( microtime( true ) - $t_start ) * 1000 );
		if ( !$status->isOK() ) {
			// getErrors() is declared on StatusValue; getErrorsArray() is not
			$this->error( print_r( $status->getErrors(), true ) );
			$this->error( "$wikiId: Could not delete file batch.", 1 ); // die
		} elseif ( count( $deletedRel ) ) {
			$this->output( "\n\tDeleted these file(s) [{$elapsed_ms}ms]:\n\t" .
				implode( "\n\t", $deletedRel ) . "\n\n" );
		}
	}
326
327
	/**
	 * Decide whether the destination file can be treated as an up-to-date
	 * copy of the source file.
	 *
	 * Checks run from cheapest to most expensive: both stats must be arrays
	 * with identical sizes; then MD5 values are compared if both stats carry
	 * one; otherwise, with --skiphash, modification times are compared;
	 * otherwise the SHA-1 of both files is fetched and compared.
	 *
	 * @param FileBackend $src
	 * @param FileBackend $dst
	 * @param string $sPath Full storage path of the source file
	 * @param string $dPath Full storage path of the destination file
	 * @return bool
	 */
	protected function filesAreSame( FileBackend $src, FileBackend $dst, $sPath, $dPath ) {
		$srcStat = $src->getFileStat( [ 'src' => $sPath ] );

		if ( $this->statCache === null ) {
			$dstStat = $dst->getFileStat( [ 'src' => $dPath ] );
		} else {
			// All dst files are already in stat cache, keyed by SHA-1 of the path
			$cacheKey = sha1( $dPath );
			$dstStat = isset( $this->statCache[$cacheKey] )
				? $this->statCache[$cacheKey]
				: false;
		}

		// Fast rejection: a side is missing or the sizes differ
		if ( !is_array( $srcStat ) || !is_array( $dstStat ) ) {
			return false;
		}
		if ( $srcStat['size'] !== $dstStat['size'] ) {
			return false;
		}

		// If MD5 was already in the stat info, just use it.
		// This is useful as many objects stores can return this in object listing,
		// so we can use it to avoid slow per-file HEADs.
		if ( isset( $srcStat['md5'], $dstStat['md5'] ) ) {
			return $srcStat['md5'] === $dstStat['md5'];
		}

		// This mode is good for copying to a backup location or resyncing clone
		// backends in FileBackendMultiWrite (since they get writes second, they have
		// higher timestamps). However, when copying the other way, this hits loads of
		// false positives (possibly 100%) and wastes a bunch of time on GETs/PUTs.
		if ( $this->hasOption( 'skiphash' ) ) {
			return $srcStat['mtime'] <= $dstStat['mtime'];
		}

		// This is the slowest method which does many per-file HEADs (unless an object
		// store tracks SHA-1 in listings).
		$srcSha1 = $src->getFileSha1Base36( [ 'src' => $sPath, 'latest' => 1 ] );
		$dstSha1 = $dst->getFileSha1Base36( [ 'src' => $dPath, 'latest' => 1 ] );

		return $srcSha1 === $dstSha1;
	}
375
}
376
377
// Name of the maintenance class declared above; presumably picked up by the
// runner file included below - confirm against Maintenance.php.
$maintClass = 'CopyFileBackend';
378
// RUN_MAINTENANCE_IF_MAIN is not defined in this file; presumably a constant
// from Maintenance.php holding the path of the runner script - TODO confirm.
require_once RUN_MAINTENANCE_IF_MAIN;
379