<?php

/**
 * Experimental file sync
 */

require_once dirname( __FILE__ ) . '/class.jetpack-sync-queue.php';

class Jetpack_Sync_Files {
	const FILE_HASH_SIZE = 4096;
	const POST_TYPE_BACKUP = 'jetpack_backup';
	const QUEUE_NAME = 'backup';

	// singleton functions
	private static $instance;

	public static function get_instance() {
		if ( null === self::$instance ) {
			self::$instance = new self();
		}

		return self::$instance;
	}

	// this is necessary because you can't use "new" when declaring instance properties >:(
	protected function __construct() {
		$this->init();
	}

	private function init() {
		// register the Backup post type
		register_post_type( self::POST_TYPE_BACKUP, array(
			'description'     => __( 'Jetpack Backup', 'jetpack' ),
			'rewrite'         => array(
				'slug'       => 'backup',
				'with_front' => false,
				'feeds'      => false,
				'pages'      => false,
			),
			'public'          => false,
			'show_ui'         => false,
			'capability_type' => 'backup',
			'map_meta_cap'    => false,
			'has_archive'     => false,
			'query_var'       => 'backup',
			'show_in_rest'    => false,
		) );
	}

	public function backup( $label = null ) {
		if ( ! $label ) {
			$label = 'Backup ' . current_time( 'mysql' );
		}

		// refuse to start if another backup is already in progress
		$backup = $this->get_running_backup();

		if ( $backup ) {
			return new WP_Error( 'multiple_backups', 'There is already a running backup' );
		}

		// a backup counts as "running" while its post is in draft status (see get_running_backup())
		return wp_insert_post(
			array(
				'post_title'  => $label,
				'post_type'   => self::POST_TYPE_BACKUP,
				'post_status' => 'draft',
			)
		);
	}

	public function cancel_backup( $post_id ) {
		$backup = get_post( $post_id );

		if ( ! $backup || self::POST_TYPE_BACKUP !== $backup->post_type ) {
			return new WP_Error( 'no_such_backup', 'No such backup with that ID' );
		}

		return wp_trash_post( $post_id );
	}

	private function get_running_backup() {
		$running_backups = get_posts( array(
			'post_type'   => self::POST_TYPE_BACKUP,
			'post_status' => 'draft',
		) );

		if ( empty( $running_backups ) ) {
			return false;
		} elseif ( count( $running_backups ) > 1 ) {
			return new WP_Error( 'multiple_backups', 'More than one running backup - this should never happen' );
		} else {
			return reset( $running_backups ); // the first (and only) element
		}
	}

	public function scan_directories() {
		$backup = $this->get_running_backup();

		if ( ! $backup || is_wp_error( $backup ) ) {
			return $backup;
		}

		global $scanned_dirs_count;
		global $scanned_files_count;

		$scanned_dirs_count  = 0;
		$scanned_files_count = 0;

		$begin = microtime( true );
		list( $hash, $children ) = $this->get_directory_hash_and_children( ABSPATH );
		echo 'Scan duration: ' . ( microtime( true ) - $begin ) . " seconds\n";
		echo "Scanned $scanned_dirs_count directories and $scanned_files_count files\n";
		// print_r( $children );

		// flatten the hash tree and send it to the server for comparison
		// perhaps we should send the whole tree instead, so the server can also confirm that
		// files have the same names, and do security scanning, e.g. flag files whose hashes
		// should _not_ have changed - but for now let's go with the naive, compact, stateless approach...
		$keys = $this->get_all_child_hashes( $children );
		// print_r( $keys );
		echo 'JSON size: ' . strlen( json_encode( $keys ) ) . "\n";

		return $keys;
	}

	/**
	 * Find out which of my hashes are already present on the server.
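	 *
	 * Keys are sent in chunks of 100 per XML-RPC call, presumably to keep
	 * individual requests small on large sites.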
	 */
	public function check_server( $keys ) {
		Jetpack::load_xml_rpc_client();

		$query_args = array( 'timeout' => 60 );
		$url        = add_query_arg( $query_args, Jetpack::xmlrpc_api_url() );
		$rpc        = new Jetpack_IXR_Client( array(
			'url'     => $url,
			'user_id' => JETPACK_MASTER_USER,
			'timeout' => $query_args['timeout'],
		) );

		$response = array();
		error_log( "contacting server at $url" );
		$start = microtime( true );
		foreach ( array_chunk( $keys, 100 ) as $keys_chunk ) {
			$result = $rpc->query( 'jetpack.checkFiles', $keys_chunk );
			error_log( 'received response for ' . count( $keys_chunk ) . ' keys in ' . ( microtime( true ) - $start ) . ' seconds' );
			if ( ! $result ) {
				return $rpc->get_jetpack_error();
			}
			// collect the hashes the server is missing
			// TODO: queue the corresponding files for upload
			$hashes_to_upload = $rpc->getResponse();
			$response         = array_merge( $response, (array) $hashes_to_upload );
		}

		return $response;
	}

	/**
	 * Recursively flattens the directory hierarchy into a simple array of hashes to
	 * check against the server.
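	 *
	 * For example (illustrative hashes), a tree like
	 *
	 *     array(
	 *         'index.php'  => 'hash1',
	 *         'wp-content' => array(
	 *             'hash'     => 'hash2',
	 *             'children' => array( 'debug.log' => 'hash3' ),
	 *         ),
	 *     )
	 *
	 * flattens to array( 'hash1', 'hash2', 'hash3' ).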
	 */
	private function get_all_child_hashes( $directory, &$hashes = array() ) {
		foreach ( $directory as $filename => $hash_or_children ) {
			if ( is_scalar( $hash_or_children ) ) {
				$hashes[] = $hash_or_children;
			} else {
				$hashes[] = $hash_or_children['hash'];
				$this->get_all_child_hashes( $hash_or_children['children'], $hashes );
			}
		}
		return $hashes;
	}

	/**
	 * Recursively fetches the hash of a directory and all its children, except certain blacklisted directories.
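	 *
	 * A directory's hash is the sha1 of its children's hashes concatenated in alphabetical
	 * order, so a change anywhere in the tree bubbles up to the root hash, e.g.
	 * sha1( hash( 'a.php' ) . hash( 'b.php' ) . hash( 'subdir' ) ) (illustrative names).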
	 */
	private function get_directory_hash_and_children( $path ) {
		$children = array();

		// normalize the path so it always ends with a directory separator
		if ( $path[ strlen( $path ) - 1 ] !== DIRECTORY_SEPARATOR ) {
			$path .= DIRECTORY_SEPARATOR;
		}

		// we concatenate the hashes of all files and subdirectories to generate
		// a unique hash for this directory
		$directory_hash_content = '';

		// NOTE: a directory's mtime changes when a file is added or removed, but NOT when a
		// file's contents change - useful! ...but it can be faked, so it should not be trusted
		$files = scandir( $path ); // scandir rather than opendir because it sorts alphabetically

		if ( false === $files ) {
			// unreadable directory - treat it as empty
			return array( sha1( $directory_hash_content ), $children );
		}

		global $scanned_dirs_count;
		global $scanned_files_count;

		foreach ( $files as $entry ) {
			$fullpath = $path . $entry;
			if ( is_dir( $fullpath )
				&& $entry != '.'
				&& $entry != '..'
				&& $entry != 'node_modules' // TODO: make configurable
				&& $entry[0] != '.' ) { // is this desirable? could we miss files necessary for the site to function?
				$scanned_dirs_count += 1;
				list( $hash, $subchildren ) = $this->get_directory_hash_and_children( $fullpath );
				$children[ $entry ] = array(
					'hash'     => $hash,
					'children' => $subchildren,
				);
				$directory_hash_content .= $hash;
				continue;
			}

			if ( is_file( $fullpath ) && $entry[0] != '.' ) {
				$scanned_files_count += 1;
				$hash = $this->get_file_hash( $fullpath );
				if ( false !== $hash ) { // skip unreadable files
					$children[ $entry ]      = $hash;
					$directory_hash_content .= $hash;
				}
			}
		}
		return array( sha1( $directory_hash_content ), $children );
	}

	/**
	 * Generates a hash for a file as quickly as possible.
	 *
	 * It does this by hashing only the first FILE_HASH_SIZE bytes of the file and, if the
	 * file is larger than FILE_HASH_SIZE bytes, also the last FILE_HASH_SIZE bytes of the
	 * file (or the remaining $filesize - FILE_HASH_SIZE bytes, if that is smaller).
	 *
	 * TODO: include the file's path relative to ABSPATH in the hash, since the same file
	 * often ends up in different parts of the tree.
	 * TODO: test and account for 32 bit precision http://php.net/manual/en/function.filesize.php#121406
	 */
	private function get_file_hash( $fullpath ) {
		$filehandle = fopen( $fullpath, 'r' );
		if ( ! $filehandle ) {
			return false;
		}

		$firstPart = fread( $filehandle, self::FILE_HASH_SIZE );

		// if we read fewer than FILE_HASH_SIZE bytes we already have the whole file, so just
		// return; otherwise also hash the last FILE_HASH_SIZE bytes to maximise the chance
		// of picking up changes
		if ( strlen( $firstPart ) < self::FILE_HASH_SIZE ) {
			fclose( $filehandle );
			return sha1( $firstPart );
		}

		// calculate how much more of the file to read. Note that fseek uses an int internally,
		// so files > 2GB will break on 32 bit PHP builds
		$lastPartBytesToRead = self::FILE_HASH_SIZE;
		$filesize            = filesize( $fullpath );

		// avoid re-reading bytes already covered by the first part, if the file is smaller
		// than 2 * FILE_HASH_SIZE
		if ( $filesize < 2 * self::FILE_HASH_SIZE ) {
			$lastPartBytesToRead = $filesize - self::FILE_HASH_SIZE;
		}

		if ( $lastPartBytesToRead < 1 ) {
			// the file is exactly FILE_HASH_SIZE bytes, so the first part covers it all
			fclose( $filehandle );
			return sha1( $firstPart );
		}

		fseek( $filehandle, -$lastPartBytesToRead, SEEK_END );

		$lastPart = fread( $filehandle, $lastPartBytesToRead );

		fclose( $filehandle );
		return sha1( $firstPart . $lastPart );
	}
}

add_action( 'init', array( 'Jetpack_Sync_Files', 'get_instance' ) );
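
/*
 * Usage sketch (illustrative - nothing below is wired up to a UI or CLI yet):
 *
 *     $sync      = Jetpack_Sync_Files::get_instance();
 *     $backup_id = $sync->backup( 'Nightly backup' );
 *     if ( ! is_wp_error( $backup_id ) ) {
 *         $keys    = $sync->scan_directories();     // hash the whole tree under ABSPATH
 *         $missing = $sync->check_server( $keys );  // hashes the server doesn't have yet
 *     }
 */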