|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* Job to update link tables for pages |
|
4
|
|
|
* |
|
5
|
|
|
* This program is free software; you can redistribute it and/or modify |
|
6
|
|
|
* it under the terms of the GNU General Public License as published by |
|
7
|
|
|
* the Free Software Foundation; either version 2 of the License, or |
|
8
|
|
|
* (at your option) any later version. |
|
9
|
|
|
* |
|
10
|
|
|
* This program is distributed in the hope that it will be useful, |
|
11
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13
|
|
|
* GNU General Public License for more details. |
|
14
|
|
|
* |
|
15
|
|
|
* You should have received a copy of the GNU General Public License along |
|
16
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc., |
|
17
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
|
18
|
|
|
* http://www.gnu.org/copyleft/gpl.html |
|
19
|
|
|
* |
|
20
|
|
|
* @file |
|
21
|
|
|
* @ingroup JobQueue |
|
22
|
|
|
*/ |
|
23
|
|
|
use MediaWiki\MediaWikiServices; |
|
24
|
|
|
|
|
25
|
|
|
/** |
|
26
|
|
|
* Job to update link tables for pages |
|
27
|
|
|
* |
|
28
|
|
|
* This job comes in a few variants: |
|
29
|
|
|
* - a) Recursive jobs to update links for backlink pages for a given title. |
|
30
|
|
|
* These jobs have (recursive:true,table:<table>) set. |
|
31
|
|
|
* - b) Jobs to update links for a set of pages (the job title is ignored). |
|
32
|
|
|
* These jobs have (pages:(<page ID>:(<namespace>,<title>),...) set. |
|
33
|
|
|
* - c) Jobs to update links for a single page (the job title) |
|
34
|
|
|
* These jobs need no extra fields set. |
|
35
|
|
|
* |
|
36
|
|
|
* @ingroup JobQueue |
|
37
|
|
|
*/ |
|
38
|
|
|
class RefreshLinksJob extends Job {
	/** @var float Cache parser output when it takes at least this many seconds to render */
	const PARSE_THRESHOLD_SEC = 1.0;

	/** @var integer Lag safety margin, in seconds, when comparing root job times to last-refresh times */
	const CLOCK_FUDGE = 10;

	/** @var integer How many seconds to wait for slaves to catch up before giving up */
	const LAG_WAIT_TIMEOUT = 15;
|
45
|
|
|
|
|
46
|
|
|
function __construct( Title $title, array $params ) { |
|
47
|
|
|
parent::__construct( 'refreshLinks', $title, $params ); |
|
48
|
|
|
// Avoid the overhead of de-duplication when it would be pointless |
|
49
|
|
|
$this->removeDuplicates = ( |
|
50
|
|
|
// Ranges rarely will line up |
|
51
|
|
|
!isset( $params['range'] ) && |
|
52
|
|
|
// Multiple pages per job make matches unlikely |
|
53
|
|
|
!( isset( $params['pages'] ) && count( $params['pages'] ) != 1 ) |
|
54
|
|
|
); |
|
55
|
|
|
} |
|
56
|
|
|
|
|
57
|
|
|
/** |
|
58
|
|
|
* @param Title $title |
|
59
|
|
|
* @param array $params |
|
60
|
|
|
* @return RefreshLinksJob |
|
61
|
|
|
*/ |
|
62
|
|
|
public static function newPrioritized( Title $title, array $params ) { |
|
63
|
|
|
$job = new self( $title, $params ); |
|
64
|
|
|
$job->command = 'refreshLinksPrioritized'; |
|
65
|
|
|
|
|
66
|
|
|
return $job; |
|
67
|
|
|
} |
|
68
|
|
|
|
|
69
|
|
|
/** |
|
70
|
|
|
* @param Title $title |
|
71
|
|
|
* @param array $params |
|
72
|
|
|
* @return RefreshLinksJob |
|
73
|
|
|
*/ |
|
74
|
|
|
public static function newDynamic( Title $title, array $params ) { |
|
75
|
|
|
$job = new self( $title, $params ); |
|
76
|
|
|
$job->command = 'refreshLinksDynamic'; |
|
77
|
|
|
|
|
78
|
|
|
return $job; |
|
79
|
|
|
} |
|
80
|
|
|
|
|
81
|
|
|
function run() { |
|
82
|
|
|
global $wgUpdateRowsPerJob; |
|
83
|
|
|
|
|
84
|
|
|
// Job to update all (or a range of) backlink pages for a page |
|
85
|
|
|
if ( !empty( $this->params['recursive'] ) ) { |
|
86
|
|
|
// When the base job branches, wait for the slaves to catch up to the master. |
|
87
|
|
|
// From then on, we know that any template changes at the time the base job was |
|
88
|
|
|
// enqueued will be reflected in backlink page parses when the leaf jobs run. |
|
89
|
|
|
if ( !isset( $params['range'] ) ) { |
|
|
|
|
|
|
90
|
|
|
try { |
|
91
|
|
|
wfGetLBFactory()->waitForReplication( [ |
|
|
|
|
|
|
92
|
|
|
'wiki' => wfWikiID(), |
|
93
|
|
|
'timeout' => self::LAG_WAIT_TIMEOUT |
|
94
|
|
|
] ); |
|
95
|
|
|
} catch ( DBReplicationWaitError $e ) { // only try so hard |
|
96
|
|
|
$stats = MediaWikiServices::getInstance()->getStatsdDataFactory(); |
|
97
|
|
|
$stats->increment( 'refreshlinks.lag_wait_failed' ); |
|
98
|
|
|
} |
|
99
|
|
|
} |
|
100
|
|
|
// Carry over information for de-duplication |
|
101
|
|
|
$extraParams = $this->getRootJobParams(); |
|
102
|
|
|
$extraParams['triggeredRecursive'] = true; |
|
103
|
|
|
// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title |
|
104
|
|
|
// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks |
|
105
|
|
|
$jobs = BacklinkJobUtils::partitionBacklinkJob( |
|
106
|
|
|
$this, |
|
107
|
|
|
$wgUpdateRowsPerJob, |
|
108
|
|
|
1, // job-per-title |
|
109
|
|
|
[ 'params' => $extraParams ] |
|
110
|
|
|
); |
|
111
|
|
|
JobQueueGroup::singleton()->push( $jobs ); |
|
112
|
|
|
// Job to update link tables for a set of titles |
|
113
|
|
|
} elseif ( isset( $this->params['pages'] ) ) { |
|
114
|
|
|
foreach ( $this->params['pages'] as $pageId => $nsAndKey ) { |
|
115
|
|
|
list( $ns, $dbKey ) = $nsAndKey; |
|
116
|
|
|
$this->runForTitle( Title::makeTitleSafe( $ns, $dbKey ) ); |
|
|
|
|
|
|
117
|
|
|
} |
|
118
|
|
|
// Job to update link tables for a given title |
|
119
|
|
|
} else { |
|
120
|
|
|
$this->runForTitle( $this->title ); |
|
121
|
|
|
} |
|
122
|
|
|
|
|
123
|
|
|
return true; |
|
124
|
|
|
} |
|
125
|
|
|
|
|
126
|
|
|
	/**
	 * Update the link tables for a single page.
	 *
	 * Loads the triggering (or current) revision, tries to reuse a fresh
	 * ParserCache entry where safe, runs the content's secondary data updates,
	 * and guards against clobbering newer edits via a check-and-set on
	 * page_latest (lockAndGetLatest()).
	 *
	 * @param Title $title
	 * @return bool False when the revision is missing/stale or the CAS check fails
	 */
	protected function runForTitle( Title $title ) {
		$page = WikiPage::factory( $title );
		if ( !empty( $this->params['triggeringRevisionId'] ) ) {
			// Fetch the specified revision; lockAndGetLatest() below detects if the page
			// was edited since and aborts in order to avoid corrupting the link tables
			$revision = Revision::newFromId(
				$this->params['triggeringRevisionId'],
				Revision::READ_LATEST
			);
		} else {
			// Fetch current revision; READ_LATEST reduces lockAndGetLatest() check failures
			$revision = Revision::newFromTitle( $title, false, Revision::READ_LATEST );
		}

		$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();

		if ( !$revision ) {
			$stats->increment( 'refreshlinks.rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
			return false; // just deleted?
		} elseif ( !$revision->isCurrent() ) {
			// If the revision isn't current, there's no point in doing a bunch
			// of work just to fail at the lockAndGetLatest() check later.
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );
			return false;
		}

		$content = $revision->getContent( Revision::RAW );
		if ( !$content ) {
			// If there is no content, pretend the content is empty
			$content = $revision->getContentHandler()->makeEmptyContent();
		}

		// $parserOutput stays false unless a reusable cache entry is found below
		$parserOutput = false;
		$parserOptions = $page->makeParserOptions( 'canonical' );
		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		if ( isset( $this->params['rootJobTimestamp'] ) ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );

			$skewedTimestamp = $this->params['rootJobTimestamp'];
			if ( $opportunistic ) {
				// Neither clock skew nor DB snapshot/slave lag matter much for such
				// updates; focus on reusing the (often recently updated) cache
			} else {
				// For transclusion updates, the template changes must be reflected;
				// pad the root job timestamp by CLOCK_FUDGE seconds for clock skew
				$skewedTimestamp = wfTimestamp( TS_MW,
					wfTimestamp( TS_UNIX, $skewedTimestamp ) + self::CLOCK_FUDGE
				);
			}

			if ( $page->getLinksTimestamp() > $skewedTimestamp ) {
				// Something already updated the backlinks since this job was made
				$stats->increment( 'refreshlinks.update_skipped' );
				return true;
			}

			if ( $page->getTouched() >= $this->params['rootJobTimestamp'] || $opportunistic ) {
				// Cache is suspected to be up-to-date. As long as the cache rev ID matches
				// and it reflects the job's triggering change, then it is usable.
				$parserOutput = ParserCache::singleton()->getDirty( $page, $parserOptions );
				if ( !$parserOutput
					|| $parserOutput->getCacheRevisionId() != $revision->getId()
					|| $parserOutput->getCacheTime() < $skewedTimestamp
				) {
					$parserOutput = false; // too stale
				}
			}
		}

		// Fetch the current revision and parse it if necessary...
		if ( $parserOutput ) {
			$stats->increment( 'refreshlinks.parser_cached' );
		} else {
			$start = microtime( true );
			// Revision ID must be passed to the parser output to get revision variables correct
			$parserOutput = $content->getParserOutput(
				$title, $revision->getId(), $parserOptions, false );
			$elapsed = microtime( true ) - $start;
			// If it took a long time to render, then save this back to the cache to avoid
			// wasted CPU by other apaches or job runners. We don't want to always save to
			// cache as this can cause high cache I/O and LRU churn when a template changes.
			if ( $elapsed >= self::PARSE_THRESHOLD_SEC
				&& $page->shouldCheckParserCache( $parserOptions, $revision->getId() )
				&& $parserOutput->isCacheable()
			) {
				$ctime = wfTimestamp( TS_MW, (int)$start ); // cache time
				ParserCache::singleton()->save(
					$parserOutput, $page, $parserOptions, $ctime, $revision->getId()
				);
			}
			$stats->increment( 'refreshlinks.parser_uncached' );
		}

		$updates = $content->getSecondaryDataUpdates(
			$title,
			null,
			!empty( $this->params['useRecursiveLinksUpdate'] ),
			$parserOutput
		);

		foreach ( $updates as $key => $update ) {
			// FIXME: This code probably shouldn't be here?
			// Needed by things like Echo notifications which need
			// to know which user caused the links update
			if ( $update instanceof LinksUpdate ) {
				$update->setRevision( $revision );
				if ( !empty( $this->params['triggeringUser'] ) ) {
					// NOTE(review): assumes 'triggeringUser' is a map with
					// 'userId' and 'userName' keys — set by the enqueuing code
					$userInfo = $this->params['triggeringUser'];
					if ( $userInfo['userId'] ) {
						$user = User::newFromId( $userInfo['userId'] );
					} else {
						// Anonymous, use the username
						$user = User::newFromName( $userInfo['userName'], false );
					}
					$update->setTriggeringUser( $user );
				}
			}
		}

		// Check-and-set: abort if the page was edited since the revision was loaded
		$latestNow = $page->lockAndGetLatest();
		if ( !$latestNow || $revision->getId() != $latestNow ) {
			// Do not clobber over newer updates with older ones. If all jobs where FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$stats->increment( 'refreshlinks.rev_cas_failure' );
			$this->setLastError( "page_latest changed from {$revision->getId()} to $latestNow" );
			return false;
		}

		DataUpdate::runUpdates( $updates );

		InfoAction::invalidateCache( $title );

		return true;
	}
|
269
|
|
|
|
|
270
|
|
|
public function getDeduplicationInfo() { |
|
271
|
|
|
$info = parent::getDeduplicationInfo(); |
|
272
|
|
|
if ( is_array( $info['params'] ) ) { |
|
273
|
|
|
// For per-pages jobs, the job title is that of the template that changed |
|
274
|
|
|
// (or similar), so remove that since it ruins duplicate detection |
|
275
|
|
|
if ( isset( $info['pages'] ) ) { |
|
276
|
|
|
unset( $info['namespace'] ); |
|
277
|
|
|
unset( $info['title'] ); |
|
278
|
|
|
} |
|
279
|
|
|
} |
|
280
|
|
|
|
|
281
|
|
|
return $info; |
|
282
|
|
|
} |
|
283
|
|
|
|
|
284
|
|
|
public function workItemCount() { |
|
285
|
|
|
return isset( $this->params['pages'] ) ? count( $this->params['pages'] ) : 1; |
|
286
|
|
|
} |
|
287
|
|
|
} |
|
288
|
|
|
|
This check looks for calls to `isset(...)` or `empty()` on variables that are not yet defined. Such calls always produce the same result and can be removed. This is most likely caused by the renaming of a variable or the removal of a function/method parameter.