This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include
, or for example
via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
<?php

/**
 * @file
 * Class PdfParser
 *
 * @author : Sebastien MALOT <[email protected]>
 * @date : 2013-08-08
 *
 * References :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 */
class PdfParser
{
    /**
     * Parse a PDF file and return the text it contains.
     *
     * @param string $filename Path of the PDF file to read.
     *
     * @return string|null The extracted text, or null when the file cannot
     *                     be read or no text could be extracted.
     */
    public static function parseFile($filename)
    {
        $content = file_get_contents($filename);

        if ($content === false) {
            // An unreadable file yields the same result as a document
            // without any text instead of feeding `false` to the parser.
            return null;
        }

        return self::extractText($content);
    }

    /**
     * Parse PDF content that is already loaded in memory.
     *
     * @param string $content Raw bytes of the PDF document.
     *
     * @return string|null The extracted text, or null when no text could be
     *                     extracted.
     */
    public static function parseContent($content)
    {
        return self::extractText($content);
    }

    /**
     * Convert a PDF into text.
     *
     * The document is split into `obj ... endobj` sections. For each section
     * the first `<< ... >>` dictionary is kept as the "filter" description
     * and the first `stream ... endstream` payload as the "data". Every
     * payload is decoded (FlateDecode only) and scanned for text operators.
     *
     * @param string $data Raw bytes of the PDF document.
     *
     * @return string|null The extracted text with whitespace runs collapsed
     *                     to single spaces, or null when nothing was found.
     */
    protected static function extractText($data)
    {
        $a_chunks = array();

        // Split the document into objects and keep, for each of them, the
        // dictionary and the raw (still encoded) stream payload.
        foreach (self::getDataArray($data, 'obj', 'endobj') as $obj) {
            $a_filter = self::getDataArray($obj, '<<', '>>');

            if (!isset($a_filter[0])) {
                continue;
            }

            $chunk = array('filter' => $a_filter[0]);
            $a_data = self::getDataArray($obj, 'stream', 'endstream');

            if (isset($a_data[0])) {
                // Drop the surrounding keywords. The payload is NOT trimmed
                // here because it may be binary (see decodeStreamData()).
                $chunk['data'] = substr(
                    $a_data[0],
                    strlen('stream'),
                    strlen($a_data[0]) - strlen('stream') - strlen('endstream')
                );
            }

            $a_chunks[] = $chunk;
        }

        $result_data = '';

        // Decode the chunks that carry a stream and collect their text.
        foreach ($a_chunks as $chunk) {
            if (!isset($chunk['data'])) {
                continue;
            }

            $decoded = self::decodeStreamData($chunk['filter'], $chunk['data']);

            if (trim($decoded) !== '') {
                $result_data .= ' ' . self::extractTextElements($decoded);
            }
        }

        if (trim($result_data) === '') {
            return null;
        }

        // Re-join words hyphenated across a line break, then collapse every
        // whitespace run (including new lines) into a single space.
        $result_data = preg_replace('/\s*-[\r\n]+\s*/', '', $result_data);

        return preg_replace('/\s+/', ' ', $result_data);
    }

    /**
     * Decode the payload of a stream according to its dictionary.
     *
     * Only FlateDecode is handled; any other payload is returned as-is with
     * the surrounding whitespace trimmed.
     *
     * @param string $filter Dictionary text of the object (`<< ... >>`).
     * @param string $raw    Bytes found between `stream` and `endstream`.
     *
     * @return string The decoded payload, or '' when inflating failed.
     */
    private static function decodeStreamData($filter, $raw)
    {
        if (strpos($filter, 'FlateDecode') === false) {
            return trim($raw);
        }

        // A zlib stream never starts with a whitespace byte (its first byte
        // always has 8 as low nibble), so left-trimming is safe. The tail is
        // different: the last checksum byte may itself be a space, CR, LF...
        // and a blind trim() would truncate the stream. Try the least
        // destructive variants first and keep the historical trim() last.
        $head_trimmed = ltrim($raw);
        $candidates = array(
            preg_replace('/\r?\n$|\r$/D', '', $head_trimmed), // one EOL before "endstream"
            $head_trimmed,
            trim($raw),
        );

        foreach (array_unique($candidates) as $candidate) {
            // gzuncompress() emits a warning on corrupt input; failure is an
            // expected outcome here, hence the suppression.
            $inflated = @gzuncompress($candidate);

            if ($inflated !== false) {
                return $inflated;
            }
        }

        return '';
    }

    /**
     * Extract the text shown by the operators of a decoded content stream.
     *
     * The stream is processed line by line; each line is expected to hold
     * operands followed by a single operator (for example `(Hello) Tj`).
     *
     * @param string $content Decoded content stream.
     *
     * @return string The text found in the stream.
     */
    protected static function extractTextElements($content)
    {
        // Streams starting with /CIDInit are skipped entirely.
        if (strpos($content, '/CIDInit') === 0) {
            return '';
        }

        $text = '';

        foreach (explode("\n", $content) as $line) {
            $line = trim($line);
            $matches = array();

            // Split the line into its operands ("command") and its operator.
            if (preg_match('/^(?<command>.*[\)\] ])(?<operator>[a-z]+[\*]?)$/i', $line, $matches)) {
                $command = self::decodeOperands(trim($matches['command']));
                $operator = trim($matches['operator']);
            } else {
                $command = $line;
                // "T*" takes no operand, so a line holding only that operator
                // cannot match the pattern above; recognise it explicitly.
                $operator = ($line === 'T*') ? 'T*' : '';
            }

            // Handle main operators
            switch ($operator) {
                // Move text current point.
                case 'Td':
                    $values = explode(' ', $command);
                    // Cast explicitly: comparing a raw string with an integer
                    // gives different results on PHP 7 and PHP 8.
                    $y = (float) array_pop($values);
                    $x = (float) array_pop($values);
                    if ($x > 0) {
                        $text .= ' ';
                    }
                    if ($y < 0) {
                        $text .= ' ';
                    }
                    break;

                // Move text current point and set leading.
                case 'TD':
                    $values = explode(' ', $command);
                    $y = (float) array_pop($values);
                    if ($y < 0) {
                        $text .= "\n";
                    }
                    break;

                // Set font name and size.
                case 'Tf':
                    $text .= ' ';
                    break;

                // Display text, allowing individual character positioning
                case 'TJ':
                    $text .= self::parseTextCommand(self::extractEnclosed($command, '[', ']'));
                    break;

                // Display text.
                case 'Tj':
                    $text .= self::extractEnclosed($command, '(', ')');
                    break;

                // Move to start of next line.
                case 'T*':
                    $text .= "\n";
                    break;

                // Operators that do not produce any text:
                case 'Tc': // Set character spacing.
                case 'TL': // Set leading.
                case 'Tm': // Set text matrix.
                case 'Tr': // Set text rendering mode.
                case 'Ts': // Set super/subscripting text rise.
                case 'Tw': // Set text spacing.
                case 'Tz': // Set horizontal scaling.
                case 'g':  // Internal use
                case 'gs':
                case 're':
                case 'f':
                case 'BT': // Begin text
                case 'ET': // End text
                case '':
                    break;

                default:
                    // Unknown operator: ignored.
                    break;
            }
        }

        // Unescape round brackets that were escaped inside string operands.
        return str_replace(array('\\(', '\\)'), array('(', ')'), $text);
    }

    /**
     * Decode the escape sequences of an operand string and force UTF-8.
     *
     * @param string $command Operands of one content stream line.
     *
     * @return string The operands with octal escapes resolved, escaped
     *                control characters removed and converted to UTF-8
     *                whenever the conversion succeeds.
     */
    private static function decodeOperands($command)
    {
        // Convert octal encoding (\ddd). Only real octal digits are accepted
        // so that octdec() never receives an invalid character.
        $found_octal_values = array();
        preg_match_all('/\\\\([0-7]{3})/', $command, $found_octal_values);

        foreach (array_unique($found_octal_values[0]) as $value) {
            $code = octdec(substr($value, 1));
            // Codes below 040 (space) are non printable characters: skip them.
            $command = str_replace($value, ($code < 32) ? '' : chr($code), $command);
        }

        // Removes encoded new lines, tabs, ...
        $command = preg_replace('/\\\\[\r\n]/', '', $command);
        $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);

        // Force UTF-8 charset
        $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));

        if ($encoding === false || strtoupper($encoding) !== 'UTF-8') {
            // iconv() raises a notice on bytes it cannot convert; in that
            // case the operands are deliberately kept unchanged.
            $decoded = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command);

            if ($decoded) {
                $command = $decoded;
            }
        }

        return $command;
    }

    /**
     * Return what lies between the first $open and the last $close.
     *
     * @param string $command Operands of one content stream line.
     * @param string $open    Opening delimiter.
     * @param string $close   Closing delimiter.
     *
     * @return string The enclosed text, or '' when a delimiter is missing or
     *                the closing one does not come after the opening one.
     */
    private static function extractEnclosed($command, $open, $close)
    {
        // Offsets are passed as 0: null is deprecated for these parameters.
        $start = mb_strpos($command, $open, 0, 'UTF-8');
        $end = mb_strrpos($command, $close, 0, 'UTF-8');

        if ($start === false || $end === false || $end <= $start) {
            return '';
        }

        return mb_substr($command, $start + 1, $end - $start - 1, 'UTF-8');
    }

    /**
     * Strip out the text from the operand array of a TJ operator.
     *
     * The operand is a sequence of `(text)` fragments separated by optional
     * numeric adjustments. A space is inserted before a fragment when the
     * adjustment preceding it is lower than -50, or when more than 8
     * characters separate it from the previous fragment.
     *
     * @param string $text      Content found between the square brackets.
     * @param int    $font_size Currently not used
     *
     * @return string
     */
    protected static function parseTextCommand($text, $font_size = 0)
    {
        $result = '';
        $cur_start_pos = 0;

        while (($cur_start_text = mb_strpos($text, '(', $cur_start_pos, 'UTF-8')) !== false) {
            // New text element found
            $gap_length = $cur_start_text - $cur_start_pos;

            if ($gap_length > 8) {
                $spacing = ' ';
            } else {
                // Cast explicitly: on PHP 8 an empty or non numeric string
                // compared with an integer is compared as a string, which
                // made `'' < -50` true and added a space before every
                // fragment.
                $adjustment = (float) mb_substr($text, $cur_start_pos, $gap_length, 'UTF-8');
                $spacing = ($adjustment < -50) ? ' ' : '';
            }
            $cur_start_text++;

            // Look for the closing bracket, ignoring the ones preceded by a
            // backslash.
            $start_search_end = $cur_start_text;
            while (($cur_start_pos = mb_strpos($text, ')', $start_search_end, 'UTF-8')) !== false) {
                if (mb_substr($text, $cur_start_pos - 1, 1, 'UTF-8') != '\\') {
                    break;
                }
                $start_search_end = $cur_start_pos + 1;
            }

            // Unterminated fragment: stop here and keep what was collected.
            if ($cur_start_pos === false) {
                break;
            }

            // Add to result
            $result .= $spacing . mb_substr($text, $cur_start_text, $cur_start_pos - $cur_start_text, 'UTF-8');
            $cur_start_pos++;
        }

        return $result;
    }

    /**
     * Convert a section of data into an array, separated by the start and end words.
     *
     * Every returned element starts with $start_word and ends with the
     * first $end_word found at or after it. The search for the next element
     * resumes right after that end word.
     *
     * @param string $data       The data.
     * @param string $start_word The start of each section of data.
     * @param string $end_word   The end of each section of data.
     *
     * @return array The array of data.
     */
    protected static function getDataArray($data, $start_word, $end_word)
    {
        $data = (string) $data;
        $data_length = strlen($data);
        $end_length = strlen($end_word);
        $a_results = array();
        $offset = 0;

        while ($offset <= $data_length) {
            $start = strpos($data, $start_word, $offset);
            if ($start === false) {
                break;
            }

            $end = strpos($data, $end_word, $start);
            if ($end === false) {
                break;
            }

            // data is between start and end
            $a_results[] = substr($data, $start, $end - $start + $end_length);

            // Resume after the end word so that a start word contained in it
            // (the "obj" of "endobj") is not matched again, and always move
            // forward so that the loop terminates whatever the words are.
            $offset = max($end + $end_length, $start + 1);
        }

        return $a_results;
    }
}
||
338 |
This check looks for PHPDoc comments describing methods or function parameters that do not exist on the corresponding method or function.
Consider the following example. The parameter $italy is not defined by the method finale(...). The most likely cause is that the parameter was removed, but the annotation was not.