DjVuImage::__construct()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * DjVu image handler.
4
 *
5
 * Copyright © 2006 Brion Vibber <[email protected]>
6
 * https://www.mediawiki.org/
7
 *
8
 * This program is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License along
19
 * with this program; if not, write to the Free Software Foundation, Inc.,
20
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21
 * http://www.gnu.org/copyleft/gpl.html
22
 *
23
 * @file
24
 * @ingroup Media
25
 */
26
27
/**
28
 * Support for detecting/validating DjVu image files and getting
29
 * some basic file metadata (resolution etc)
30
 *
31
 * File format docs are available in source package for DjVuLibre:
32
 * http://djvulibre.djvuzone.org/
33
 *
34
 * @ingroup Media
35
 */
36
class DjVuImage {
37
	/**
	 * Memory limit for the DjVu description software; retrieveMetaData()
	 * passes it to wfShellExec() as the 'memory' limit of the djvutxt call.
	 */
	const DJVUTXT_MEMORY_LIMIT = 300000;

	/**
	 * @var string The DjVu file name given to the constructor.
	 *
	 * Declared explicitly instead of being created on the fly by the
	 * constructor. It stays public because an undeclared property was
	 * publicly accessible, and outside code may rely on that.
	 */
	public $mFilename;

	/**
	 * Constructor
	 *
	 * @param string $filename The DjVu file name.
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}
50
51
	/**
	 * Tell whether the file could be parsed as a DjVu image.
	 *
	 * @return bool True unless getInfo() reported failure by returning false
	 */
	public function isValid() {
		return $this->getInfo() !== false;
	}
60
61
	/**
	 * Describe the image in the style of getimagesize().
	 *
	 * @return array|bool [ width, height, 'DjVu', 'width="…" height="…"' ],
	 *   or false when getInfo() fails
	 */
	public function getImageSize() {
		$info = $this->getInfo();
		if ( $info === false ) {
			return false;
		}

		$width = $info['width'];
		$height = $info['height'];
		$sizeAttribs = "width=\"$width\" height=\"$height\"";

		return [ $width, $height, 'DjVu', $sizeAttribs ];
	}
78
79
	// ---------
80
81
	/**
	 * For debugging; dump the IFF chunk structure
	 *
	 * Echoes the id and length of the top-level chunk, then lets dumpForm()
	 * walk the chunks nested inside it. Nothing is printed when the file
	 * cannot be opened or when its 12-byte header is incomplete.
	 */
	function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			// fopen() already raised a warning; there is no stream to read from
			return;
		}

		$header = fread( $file, 12 );
		// unpack() needs all 12 bytes: magic (4), chunk id (4), chunk length (4)
		if ( $header !== false && strlen( $header ) === 12 ) {
			$arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo "$chunk $chunkLength\n";
			$this->dumpForm( $file, $chunkLength, 1 );
		}
		fclose( $file );
	}
94
95
	/**
	 * Echo the chunks contained in a FORM chunk, recursing into nested FORMs.
	 *
	 * @param resource $file Open stream, positioned at the FORM's 4-byte
	 *   secondary identifier
	 * @param int $length Length of the FORM chunk, counted from that position
	 * @param int $indent Nesting depth; every level indents by four spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		$prefix = str_repeat( ' ', $indent * 4 );
		echo $prefix . "($secondary)\n";
		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			if ( $chunkHeader === false || strlen( $chunkHeader ) < 8 ) {
				// End of file or a truncated header, which unpack() cannot decode
				break;
			}
			$arr = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo $prefix . "$chunk $chunkLength\n";

			if ( $chunk == 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				fseek( $file, $chunkLength, SEEK_CUR );
				// Parenthesised on purpose: `==` binds tighter than `&` in PHP
				if ( ( $chunkLength & 1 ) === 1 ) {
					// Padding byte between chunks
					fseek( $file, 1, SEEK_CUR );
				}
			}
		}
	}
120
121
	/**
	 * Open the file, check its 16-byte header and read the page information.
	 *
	 * A 'DJVU' subtype is handed to getPageInfo(), a 'DJVM' subtype to
	 * getMultiPageInfo().
	 *
	 * @return array|bool Page information array, or false when the file cannot
	 *   be opened, is too short, lacks the 'AT&T' magic, has an unrecognized
	 *   subtype, or the page information itself could not be read
	 */
	function getInfo() {
		MediaWiki\suppressWarnings();
		$handle = fopen( $this->mFilename, 'rb' );
		MediaWiki\restoreWarnings();
		if ( $handle === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read\n" );

			return false;
		}

		$result = false;
		$header = fread( $handle, 16 );

		if ( strlen( $header ) >= 16 ) {
			$fields = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );
			$subtype = $fields['subtype'];

			if ( $fields['magic'] != 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file\n" );
			} else {
				switch ( $subtype ) {
					case 'DJVU':
						// Single-page document
						$result = $this->getPageInfo( $handle );
						break;
					case 'DJVM':
						// Multi-page document
						$result = $this->getMultiPageInfo( $handle, $fields['formLength'] );
						break;
					default:
						wfDebug( __METHOD__ . ": unrecognized DJVU file type '{$fields['subtype']}'\n" );
				}
			}
		} else {
			wfDebug( __METHOD__ . ": too short file header\n" );
		}
		fclose( $handle );

		return $result;
	}
156
157
	/**
	 * Read one 8-byte chunk header (4-byte id, 32-bit big-endian length).
	 *
	 * @param resource $file Open stream positioned at a chunk header
	 * @return array [ chunk id, length ], or [ false, 0 ] when fewer than
	 *   8 bytes could be read
	 */
	private function readChunk( $file ) {
		$raw = fread( $file, 8 );
		if ( strlen( $raw ) >= 8 ) {
			$fields = unpack( 'a4chunk/Nlength', $raw );

			return [ $fields['chunk'], $fields['length'] ];
		}

		return [ false, 0 ];
	}
167
168
	/**
	 * Move the stream past a chunk body, including the padding byte that
	 * follows a body of odd length.
	 *
	 * @param resource $file Open stream positioned at the start of the body
	 * @param int $chunkLength Number of body bytes to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		// Parenthesised on purpose: `==` binds tighter than `&` in PHP, so the
		// unparenthesised form only worked because true is coerced to 1.
		if ( ( $chunkLength & 1 ) === 1 && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}
176
177
	/**
	 * Find the first FORM:DJVU page inside a multi-page document and return
	 * its page information.
	 *
	 * Only the first page is examined; other pages are hoped to have the
	 * same size.
	 *
	 * @param resource $file Open stream positioned after the 16-byte file header
	 * @param int $formLength Length of the enclosing FORM chunk
	 * @return array|bool Result of getPageInfo() for the first page, or false
	 *   when no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		$origin = ftell( $file );
		while ( true ) {
			list( $chunk, $length ) = $this->readChunk( $file );
			if ( !$chunk ) {
				break;
			}

			if ( $chunk != 'FORM' ) {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
				$this->skipChunk( $file, $length );
			} else {
				$subtype = fread( $file, 4 );
				if ( $subtype == 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage\n" );

					return $this->getPageInfo( $file );
				}
				// The 4-byte subtype just read counts towards the chunk length
				$this->skipChunk( $file, $length - 4 );
			}

			// Stop on an empty chunk, at end of file, or once the FORM is used up
			if ( $length == 0 || feof( $file ) || ftell( $file ) - $origin >= $formLength ) {
				break;
			}
		}

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );

		return false;
	}
205
206
	/**
	 * Read the INFO chunk of a page and decode its fields.
	 *
	 * The chunk is decoded as: 16-bit big-endian width and height, one byte
	 * each for minor and major version, 16-bit little-endian resolution and
	 * one byte of gamma, which is reported divided by 10.
	 *
	 * @param resource $file Open stream positioned at the page's INFO chunk header
	 * @return array|bool Array with keys 'width', 'height', 'version',
	 *   'resolution' and 'gamma'; false when the next chunk is not INFO, is
	 *   shorter than 9 bytes, or is cut off
	 */
	private function getPageInfo( $file ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( $chunk != 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );

			return false;
		}
		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );

			return false;
		}

		$payload = fread( $file, $length );
		if ( strlen( $payload ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off\n" );

			return false;
		}

		# Newer files have rotation info in byte 10, but we don't use it yet.
		$fields = unpack( 'nwidth/nheight/Cminor/Cmajor/vresolution/Cgamma', $payload );
		$version = $fields['major'] . '.' . $fields['minor'];

		return [
			'width' => $fields['width'],
			'height' => $fields['height'],
			'version' => $version,
			'resolution' => $fields['resolution'],
			'gamma' => $fields['gamma'] / 10.0,
		];
	}
243
244
	/**
	 * Return an XML string describing the DjVu image
	 *
	 * The structure comes from $wgDjvuDump when that is set (converted with
	 * convertDumpToXML()), otherwise from $wgDjvuToXML. When $wgDjvuTxt is set
	 * and exits with status 0, the page texts are appended as a <DjVuTxt>
	 * document and the result is wrapped in <mw-djvu>.
	 *
	 * @return string|bool False when the file is not a valid DjVu image
	 */
	function retrieveMetaData() {
		global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;

		if ( !$this->isValid() ) {
			return false;
		}

		$xml = null;
		if ( isset( $wgDjvuDump ) ) {
			# djvudump is faster as of version 3.5
			# https://sourceforge.net/p/djvu/bugs/71/
			$dumpCmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
			$xml = $this->convertDumpToXML( wfShellExec( $dumpCmd ) );
		} elseif ( isset( $wgDjvuToXML ) ) {
			$toXmlCmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
				wfEscapeShellArg( $this->mFilename );
			$xml = wfShellExec( $toXmlCmd );
		}

		# Text layer
		if ( isset( $wgDjvuTxt ) ) {
			$cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename );
			wfDebug( __METHOD__ . ": $cmd\n" );
			$retval = '';
			$limits = [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ];
			$txt = wfShellExec( $cmd, $retval, [], $limits );
			if ( $retval == 0 ) {
				# Strip some control characters
				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
				$reg = <<<EOR
					/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
					((?>    # Text to match is composed of atoms of either:
					  \\\\. # - any escaped character
					  |     # - any character different from " and \
					  [^"\\\\]+
					)*?)
					"\s*\)
					| # Or page can be empty ; in this case, djvutxt dumps ()
					\(\s*()\)/sx
EOR;
				$pages = preg_replace_callback( $reg, [ $this, 'pageTextCallback' ], $txt );
				$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $pages . "</BODY>\n</DjVuTxt>\n";
				# Open the wrapper in front of the first <DjVuXML> tag only
				$wrapped = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 );
				$xml = $wrapped . $txt . '</mw-djvu>';
			}
		}

		return $xml;
	}
297
298
	/**
	 * preg_replace_callback() handler that turns the text of one page into a
	 * <PAGE value="…" /> element.
	 *
	 * @param array $matches Regex matches; index 1 holds the escaped page text
	 * @return string
	 */
	function pageTextCallback( $matches ) {
		# Get rid of invalid UTF-8, strip control characters
		$unescaped = stripcslashes( $matches[1] );
		$cleaned = UtfNormal\Validator::cleanUp( $unescaped );
		$escaped = htmlspecialchars( $cleaned );
		# Encode newlines as character references and drop U+FFFD characters
		$value = str_replace( [ "\n", '�' ], [ '&#10;', '' ], $escaped );

		return '<PAGE value="' . $value . '" />';
	}
304
305
	/**
	 * Hack to temporarily work around djvutoxml bug
	 *
	 * Builds a DjVuXML document from djvudump output, adding one OBJECT
	 * element per FORM:DJVU page via parseFormDjvu(). The dump is walked with
	 * strtok(), whose position is shared with parseFormDjvu().
	 *
	 * @param string $dump Output of djvudump
	 * @return string|bool The XML, or false when the dump is empty, describes
	 *   an indirect multi-page document, contains no page, or a page has no
	 *   parsable INFO line
	 */
	function convertDumpToXML( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

		$dump = str_replace( "\r", '', $dump );
		$firstLine = strtok( $dump, "\n" );
		$matches = [];
		$foundPage = false;

		if ( preg_match( '/^( *)FORM:DJVU/', $firstLine, $matches ) ) {
			# Single-page
			if ( !$this->parseFormDjvu( $firstLine, $xml ) ) {
				return false;
			}
			$foundPage = true;
		} elseif ( preg_match( '/^( *)FORM:DJVM/', $firstLine, $matches ) ) {
			# Multi-page
			$parentLevel = strlen( $matches[1] );
			# Find DIRM
			for ( $line = strtok( "\n" ); $line !== false; $line = strtok( "\n" ) ) {
				if ( strspn( $line, ' ' ) <= $parentLevel ) {
					# End of chunk
					break;
				}

				if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
					wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );

					return false;
				}

				# Found page
				if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
					if ( !$this->parseFormDjvu( $line, $xml ) ) {
						return false;
					}
					$foundPage = true;
				}
			}
		}

		if ( !$foundPage ) {
			return false;
		}

		return $xml . "</BODY>\n</DjVuXML>\n";
	}
370
371
	/**
	 * Scan the lines nested under a FORM:DJVU line for its INFO line and
	 * append the matching OBJECT element to the XML.
	 *
	 * Continues the strtok() tokenisation begun by the caller, so it consumes
	 * lines from the same dump.
	 *
	 * @param string $line The FORM:DJVU line; its indentation marks the parent level
	 * @param string &$xml XML under construction, appended to on success
	 * @return bool True when an INFO line was found, false otherwise
	 */
	function parseFormDjvu( $line, &$xml ) {
		$parentLevel = strspn( $line, ' ' );
		$infoPattern = '/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/';

		# Find INFO
		for ( $line = strtok( "\n" ); $line !== false; $line = strtok( "\n" ) ) {
			if ( strspn( $line, ' ' ) <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( !preg_match( $infoPattern, $line, $m ) ) {
				continue;
			}

			list( , $width, $height, $dpi, $gamma ) = $m;
			$params = "\n" .
				Xml::element( 'PARAM', [ 'name' => 'DPI', 'value' => $dpi ] ) . "\n" .
				Xml::element( 'PARAM', [ 'name' => 'GAMMA', 'value' => $gamma ] ) . "\n";
			# 'data', 'type' and 'usemap' attributes are intentionally not emitted
			$xml .= Xml::tags( 'OBJECT', [ 'height' => $height, 'width' => $width ], $params ) . "\n";

			return true;
		}

		# Not found
		return false;
	}
410
}
411