Completed
Push — vendor/getid3 ( dfd0b4...d7a1ee )
by Pauli
02:47
created

getid3_pdf::parseXREF()   A

Complexity

Conditions 5
Paths 7

Size

Total Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
nc 7
nop 1
dl 0
loc 28
rs 9.1608
c 0
b 0
f 0
1
<?php
2
3
/////////////////////////////////////////////////////////////////
4
/// getID3() by James Heinrich <info@getid3.org>               //
5
//  available at https://github.com/JamesHeinrich/getID3       //
6
//            or https://www.getid3.org                        //
7
//            or http://getid3.sourceforge.net                 //
8
//  see readme.txt for more details                            //
9
/////////////////////////////////////////////////////////////////
10
//                                                             //
11
// module.misc.pdf.php                                         //
12
// module for analyzing PDF files                              //
13
// dependencies: NONE                                          //
14
//                                                            ///
15
/////////////////////////////////////////////////////////////////
16
17
// Stop immediately when this module is requested directly instead of being loaded by getID3:
// prevents path-exposing attacks that access modules directly on public webservers.
if (defined('GETID3_INCLUDEPATH') === false) {
	exit;
}
20
21
class getid3_pdf extends getid3_handler
{
	/**
	 * Return full details of the PDF Cross-Reference Table (XREF) in the result array.
	 * When false, the per-object offset/generation/entry lists are removed once parsing is done.
	 *
	 * @var bool
	 */
	public $returnXREF = false;

	/**
	 * Identify a PDF file, read its version from the header, parse the cross-reference
	 * table(s) and try to extract the page count from the first matching /Pages object.
	 *
	 * @return bool true if the "%PDF-x.y" header was found (even if later parsing reported errors),
	 *              false if the file does not start with a PDF header
	 */
	public function Analyze() {
		$info = &$this->getid3->info;

		$this->fseek(0);
		if (!preg_match('#^%PDF-([0-9\\.]+)$#', rtrim($this->fgets()), $matches)) {
			$this->error('Did not find "%PDF" at the beginning of the PDF');
			return false;
		}
		$info['pdf']['header']['version'] = floatval($matches[1]);
		$info['fileformat'] = 'pdf';

		// the PDF Cross-Reference Table (XREF) is located near the end of the file
		// the starting offset is specified in the penultimate section, on the two lines just before "%%EOF"
		// the first line is "startxref", the second line is the byte offset of the XREF.
		// We know the length of "%%EOF" and "startxref", but the offset could be 2-10 bytes,
		// and we're not sure if the line ends are one or two bytes, so we might find "startxref" as little as 18(?) bytes
		// from EOF, but it could be 30 bytes, so we start 40 bytes back just to be safe and do a search for the data we want.
		$this->fseek(-40, SEEK_END);
		if (preg_match('#[\r\n]startxref[ \r\n]+([0-9]+)[ \r\n]+#', $this->fread(40), $matches)) {
			$info['pdf']['trailer']['startxref'] = intval($matches[1]);
			$this->parseXREF($info['pdf']['trailer']['startxref']);
			if (!empty($info['pdf']['xref']['offset'])) {
				while (!$this->feof() && (max(array_keys($info['pdf']['xref']['offset'])) > $info['pdf']['xref']['count'])) {
					// the highest known object number exceeds the number of parsed entries, so we
					// suspect that there may be another XREF entry somewhere in the file, brute-force scan for it
					// starting at the beginning of the file
					$this->fseek(0);
					while (!$this->feof()) {
						$XREFoffset = $this->ftell();
						if (rtrim($this->fgets()) == 'xref') {
							// only parse tables we have not seen yet; parseXREF() registers the offset
							// before parsing, so every pass either finds a new table or runs into EOF
							if (empty($info['pdf']['xref']['xref_offsets']) || !in_array($XREFoffset, $info['pdf']['xref']['xref_offsets'])) {
								$this->parseXREF($XREFoffset);
								break;
							}
						}
					}
				}
				foreach ($info['pdf']['xref']['offset'] as $objectNumber => $offset) {
					if ($info['pdf']['xref']['entry'][$objectNumber] == 'f') {
						// "free" object means "deleted", ignore
						continue;
					}
					$this->fseek($offset);
					$line = rtrim($this->fgets());
					if (preg_match('#^'.$objectNumber.' ([0-9]+) obj#', $line, $matches)) {
						if (strlen($line) > strlen($matches[0])) {
							// object header line not actually on its own line, rewind file pointer to start reading data
							$this->fseek($offset + strlen($matches[0]));
						}
						$objectData = $this->readObjectData();
						if ($objectData === false) {
							// truncated/corrupt file: stop instead of reading past the end of the data forever
							$this->error('Did not find "endobj" for object '.$objectNumber.' at offset '.$offset);
							break;
						}
						if (preg_match('#^<<[\r\n\s]*(/Type|/Pages|/Parent [0-9]+ [0-9]+ [A-Z]|/Count [0-9]+|/Kids *\\[[0-9A-Z ]+\\]|[\r\n\s])+[\r\n\s]*>>#', $objectData, $matches)) {
							if (preg_match('#/Count ([0-9]+)#', $objectData, $matches)) {
								$info['pdf']['pages'] = (int) $matches[1];
								break; // for now this is the only data we're looking for in the PDF not need to loop through every object in the file (and a large PDF may contain MANY objects). And it MAY be possible that there are other objects elsewhere in the file that define additional (or removed?) pages
							}
						}
					} else {
						$this->error('Unexpected structure "'.$line.'" at offset '.$offset);
						break;
					}
				}
				if (!$this->returnXREF) {
					unset($info['pdf']['xref']['offset'], $info['pdf']['xref']['generation'], $info['pdf']['xref']['entry']);
				}

			} else {
				$this->error('Did not find "xref" at offset '.$info['pdf']['trailer']['startxref']);
			}
		} else {
			$this->error('Did not find "startxref" in the last 40 bytes of the PDF');
		}

		$this->warning('PDF parsing incomplete in this version of getID3() ['.$this->getid3->version().']');
		return true;
	}

	/**
	 * Collect the lines of an object body, starting at the current file position and
	 * stopping at (and consuming) the line that consists only of "endobj".
	 *
	 * @return string|false the object data without the "endobj" line,
	 *                      or false if the data ran out before "endobj" was found
	 */
	private function readObjectData() {
		$objectData = '';
		while (true) {
			$line = $this->fgets();
			if (($line === false) || (($line === '') && $this->feof())) {
				// nothing left to read: the object is not terminated
				return false;
			}
			if (rtrim($line) == 'endobj') {
				return $objectData;
			}
			$objectData .= $line;
		}
	}

	/**
	 * Parse one cross-reference table ("xref" keyword, one "first count" subsection header
	 * and "count" entries) and merge its entries into $info['pdf']['xref'].
	 *
	 * Only the first subsection of the table is read.
	 *
	 * @param int $XREFoffset byte offset of the line containing the "xref" keyword
	 *
	 * @return bool true if the whole subsection was parsed, false otherwise
	 */
	private function parseXREF($XREFoffset) {
		$info = &$this->getid3->info;

		$this->fseek($XREFoffset);
		if (rtrim($this->fgets()) != 'xref') {
			$this->warning('failed to find expected XREF structure at offset '.$XREFoffset);
			return false;
		}

		// Remember this table before parsing it, so that the brute-force scan in Analyze() never
		// picks the same offset twice, even when parsing fails below. Entries are appended (not
		// keyed by offset) because sort() reindexes the list and an offset could otherwise
		// overwrite an unrelated entry.
		if (empty($info['pdf']['xref']['xref_offsets']) || !in_array($XREFoffset, $info['pdf']['xref']['xref_offsets'])) {
			$info['pdf']['xref']['xref_offsets'][] = $XREFoffset;
			sort($info['pdf']['xref']['xref_offsets']);
		}

		// subsection header: "<first object number> <number of entries>"
		if (!preg_match('#^([0-9]+) +([0-9]+)#', rtrim($this->fgets()), $matches)) {
			$this->error('failed to parse XREF subsection header in XREF table at offset '.$XREFoffset);
			return false;
		}
		$firstObjectNumber = (int) $matches[1];
		$XREFcount         = (int) $matches[2];

		// running total over all tables parsed so far
		$info['pdf']['xref']['count'] = $XREFcount + (!empty($info['pdf']['xref']['count']) ? $info['pdf']['xref']['count'] : 0);
		for ($i = 0; $i < $XREFcount; $i++) {
			$line = rtrim($this->fgets());
			// entry: "<byte offset> <generation number> <n = in use | f = free>"
			if (!preg_match('#^([0-9]+) ([0-9]+) ([nf])$#', $line, $matches)) {
				$this->error('failed to parse XREF entry #'.$i.' in XREF table at offset '.$XREFoffset);
				return false;
			}
			$objectNumber = $firstObjectNumber + $i;
			$info['pdf']['xref']['offset'][$objectNumber]     = (int) $matches[1];
			$info['pdf']['xref']['generation'][$objectNumber] = (int) $matches[2];
			$info['pdf']['xref']['entry'][$objectNumber]      =       $matches[3];
		}
		return true;
	}

}
148