Completed
Branch master (e2eefa)
by
unknown
25:58
created

BaseDump   B

Complexity

Total Complexity 36

Size/Duplication

Total Lines 177
Duplicated Lines 19.21 %

Coupling/Cohesion

Components 1
Dependencies 0
Metric Value
dl 34
loc 177
rs 8.8
wmc 36
lcom 1
cbo 0

9 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 10 2
D prefetch() 0 29 10
A debug() 0 5 1
A nextPage() 5 16 4
A nextRev() 7 9 3
A nextText() 0 5 1
B skipTo() 0 21 7
C nodeContents() 22 22 7
A close() 0 6 1

How to fix   Duplicated Code   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

1
<?php
2
/**
3
 * Helper class for the --prefetch option of dumpTextPass.php
4
 *
5
 * Copyright © 2005 Brion Vibber <[email protected]>
6
 * https://www.mediawiki.org/
7
 *
8
 * This program is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License along
19
 * with this program; if not, write to the Free Software Foundation, Inc.,
20
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21
 * http://www.gnu.org/copyleft/gpl.html
22
 *
23
 * @file
24
 * @ingroup Maintenance
25
 */
26
27
/**
28
 * Readahead helper for making large MediaWiki data dumps;
29
 * reads in a previous XML dump to sequentially prefetch text
30
 * records already normalized and decompressed.
31
 *
32
 * This can save load on the external database servers, hopefully.
33
 *
34
 * Assumes that dumps will be recorded in the canonical order:
35
 * - ascending by page_id
36
 * - ascending by rev_id within each page
37
 * - text contents are immutable and should not change once
38
 *   recorded, so the previous dump is a reliable source
39
 *
40
 * @ingroup Maintenance
41
 */
42
class BaseDump {
	/** @var XMLReader|null Reader over the dump file currently being scanned */
	protected $reader = null;

	/** @var bool True once close() has run and no further input file has been opened */
	protected $atEnd = false;

	/** @var bool True once nextRev() found no further <revision> before the end of the page */
	protected $atPageEnd = false;

	/** @var int ID of the page most recently read by nextPage() */
	protected $lastPage = 0;

	/** @var int ID of the revision most recently read by nextRev(); 0 right after a new page */
	protected $lastRev = 0;

	/** @var string[]|null File names that have not been opened yet, in reading order */
	protected $infiles = null;

	/**
	 * Opens the first input file. If it cannot be opened, the failure is
	 * logged and the instance starts out at end-of-input, so prefetch()
	 * simply returns null.
	 *
	 * @param string $infile One or more file names separated by ';'
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$infile = array_shift( $this->infiles );
		if ( !$this->openReader( $infile ) ) {
			$this->debug( __METHOD__ . " was unable to open $infile" );
			$this->atEnd = true;
		}
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page or the
	 * revision text is unavailable.
	 *
	 * The stream is only ever read forwards, so requests have to arrive
	 * in ascending page/revision order to produce hits.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @return string|null
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Advance page by page until the requested page is reached or passed.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page "
				. "looking for rev $rev  [$this->lastPage, $this->lastRev]" );

			return null;
		}

		// Same forward scan, now over the revisions of the current page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev === $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

			return $this->nextText();
		}

		$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
			. "[$this->lastPage, $this->lastRev]" );

		return null;
	}

	/**
	 * Sends a message to the debug log via wfDebug().
	 *
	 * @param string $str Message, without trailing newline
	 */
	public function debug( $str ) {
		wfDebug( $str . "\n" );
	}

	/**
	 * Moves to the next <page> element and records its ID. When the
	 * current file has no further page, switches to the next input file
	 * if there is one; the first page of that file is read by the
	 * following call.
	 *
	 * Marked "@access private" in the original source; intended for internal use.
	 */
	public function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}

			return;
		}

		$this->close();
		if ( count( $this->infiles ) ) {
			$infile = array_shift( $this->infiles );
			if ( !$this->openReader( $infile ) ) {
				// Stay at end-of-input (set by close() above); reading from a
				// reader that failed to open would only produce errors.
				$this->debug( __METHOD__ . " was unable to open $infile" );
				$this->infiles = array();

				return;
			}
			$this->atEnd = false;
		}
	}

	/**
	 * Moves to the next <revision> element of the current page and
	 * records its ID, or flags the end of the page when there is none.
	 *
	 * Marked "@access private" in the original source; intended for internal use.
	 */
	public function nextRev() {
		if ( $this->skipTo( 'revision' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastRev = intval( $this->nodeContents() );
			}
		} else {
			$this->atPageEnd = true;
		}
	}

	/**
	 * Reads the <text> element of the current revision.
	 *
	 * The search stops at the closing </revision> tag, so a revision
	 * without a <text> element yields null instead of picking up content
	 * that belongs to a later revision or page.
	 *
	 * Marked "@access private" in the original source; intended for internal use.
	 *
	 * @return string|null Text contents, or null if the revision has no <text> element
	 */
	public function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}

		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward until an opening element called $name is found.
	 *
	 * Marked "@access private" in the original source; intended for internal use.
	 *
	 * @param string $name Element name to stop at
	 * @param string $parent Element name whose closing tag aborts the search
	 * @return bool|null True if found, false if already at end of input or
	 *   </$parent> was reached first, null if the input ran out (the reader
	 *   is closed in that case)
	 */
	public function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			$nodeType = $this->reader->nodeType;
			$nodeName = $this->reader->name;
			if ( $nodeType === XMLReader::ELEMENT && $nodeName === $name ) {
				return true;
			}
			if ( $nodeType === XMLReader::END_ELEMENT && $nodeName === $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

				return false;
			}
		}

		return $this->close();
	}

	/**
	 * Fetches the text contents of the current element, assuming it has
	 * no sub-elements. Text and significant-whitespace nodes are
	 * concatenated up to the next closing tag.
	 *
	 * Marked "@access private" in the original source; intended for internal use.
	 *
	 * @return string|null Element text ("" for an empty element), or null
	 *   when already at end of input or the input runs out before a closing tag
	 */
	public function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					return $buffer;
			}
		}

		return $this->close();
	}

	/**
	 * Closes the reader and marks the instance as being at end of input.
	 *
	 * Marked "@access private" in the original source; intended for internal use.
	 *
	 * @return null Always null, so callers can write "return $this->close();"
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;

		return null;
	}

	/**
	 * Points the reader at a dump file, using LIBXML_PARSEHUGE when the
	 * constant is defined. Every input file goes through here so that the
	 * first file and all follow-up files are opened with the same options.
	 *
	 * @param string $infile File name or URI to open
	 * @return bool Whether the file could be opened
	 */
	private function openReader( $infile ) {
		if ( defined( 'LIBXML_PARSEHUGE' ) ) {
			return $this->reader->open( $infile, null, LIBXML_PARSEHUGE );
		}

		return $this->reader->open( $infile );
	}
}
219