1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* @file This file is part of the PdfParser library. |
5
|
|
|
* |
6
|
|
|
* @author Konrad Abicht <[email protected]> |
7
|
|
|
* @date 2020-06-01 |
8
|
|
|
* |
9
|
|
|
* @author Sébastien MALOT <[email protected]> |
10
|
|
|
* @date 2017-01-03 |
11
|
|
|
* |
12
|
|
|
* @license LGPLv3 |
13
|
|
|
* @url <https://github.com/smalot/pdfparser> |
14
|
|
|
* |
15
|
|
|
* PdfParser is a pdf library written in PHP, extraction oriented. |
16
|
|
|
* Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
17
|
|
|
* |
18
|
|
|
* This program is free software: you can redistribute it and/or modify |
19
|
|
|
* it under the terms of the GNU Lesser General Public License as published by |
20
|
|
|
* the Free Software Foundation, either version 3 of the License, or |
21
|
|
|
* (at your option) any later version. |
22
|
|
|
* |
23
|
|
|
* This program is distributed in the hope that it will be useful, |
24
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
25
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
26
|
|
|
* GNU Lesser General Public License for more details. |
27
|
|
|
* |
28
|
|
|
* You should have received a copy of the GNU Lesser General Public License |
29
|
|
|
* along with this program. |
30
|
|
|
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
31
|
|
|
*/ |
32
|
|
|
|
33
|
|
|
namespace Tests\Smalot\PdfParser\Performance; |
34
|
|
|
|
35
|
|
|
use Exception; |
36
|
|
|
use Smalot\PdfParser\Element; |
37
|
|
|
use Smalot\PdfParser\Encoding; |
38
|
|
|
|
39
|
|
|
/** |
40
|
|
|
* This test runs a performance check against certain PDF files that extensively use |
41
|
|
|
* the getFirstFont() method of Document.php. If Document.php correctly uses a dictionary |
42
|
|
|
* to cache the objects inside the PDF file, then the parsing should be quick. |
43
|
|
|
* If it does not, the parsing can be extremely slow or even crash. |
44
|
|
|
*/ |
45
|
|
|
class DocumentDictionaryCacheTest extends AbstractPerformanceTest
{
    /**
     * Downloads a sample PDF, parses it and measures how long the text
     * extraction takes for the pages with index 77 and 78.
     *
     * For every measured page the elapsed time, the current memory usage and
     * the extracted text are printed.
     *
     * NOTE(review): the PDF is fetched over the network on every run, so this
     * test needs connectivity and depends on the remote file staying available.
     *
     * @throws Exception if the PDF file could not be loaded
     */
    public function run()
    {
        $url = 'https://comserv.cs.ut.ee/home/files/Shoush_ComputerScience_2020.pdf?study=ATILoputoo&reference=76F6FAFD4C9E6981D9A434D32D2E7EE2AE9C49E7';

        // only pages whose index lies in this (inclusive) range are measured
        $firstPage = 77;
        $lastPage = 78;

        $parser = new \Smalot\PdfParser\Parser();

        // load PDF file content; file_get_contents() returns false on failure,
        // which must not be handed to the parser as if it were PDF data
        $data = file_get_contents($url);
        if (false === $data) {
            throw new Exception('Unable to load PDF file: ' . $url);
        }

        // give PDF content to function and parse it
        $pdf = $parser->parseContent($data);

        foreach ($pdf->getPages() as $i => $page) {
            /** @var \Smalot\PdfParser\Page $page */
            if ($i < $firstPage || $i > $lastPage) {
                continue;
            }

            // time only the text extraction of this single page
            $startTime = microtime(true);
            $pageText = $page->getText();
            $endTime = microtime(true);

            echo '<b>Page ' . $i . ' (took ' . ($endTime - $startTime) . ' seconds, ' . round(memory_get_usage() / (1000 * 1000), 2) . ' MB RAM)</b><br>';
            var_dump($pageText);
            echo '<br><br>';
        }
    }
}
75
|
|
|
|