Completed
Branch master (3c4846)
by Lucas
02:09
created

PdfParser   A

Complexity

Total Complexity 29

Size/Duplication

Total Lines 264
Duplicated Lines 0 %

Coupling/Cohesion

Components 2
Dependencies 2

Importance

Changes 11
Bugs 2 Features 1
Metric Value
wmc 29
c 11
b 2
f 1
lcom 2
cbo 2
dl 0
loc 264
rs 10

11 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 5 1
A addAvailableProcessor() 0 4 1
A getAvailableProcessors() 0 4 1
A getProcessor() 0 4 1
A setProcessor() 0 4 1
A parse() 0 13 1
B doParse() 0 40 6
B parseBlock() 0 29 6
B findSpaceGroups() 0 38 6
A guessWidth() 0 15 2
A getTextVersion() 0 22 3
1
<?php
2
3
namespace Kasifi\PdfParserBundle;
4
5
use Doctrine\Common\Collections\ArrayCollection;
6
use Exception;
7
use Kasifi\PdfParserBundle\Processor\ProcessorInterface;
8
use Kasifi\PdfParserBundle\Util\ParseHelper;
9
use Psr\Log\LoggerInterface;
10
use Symfony\Component\Process\Exception\ProcessFailedException;
11
use Symfony\Component\Process\Process;
12
13
/**
 * Extracts tabular rows from a PDF file.
 *
 * The PDF is first converted to text with the external `pdftotext` binary,
 * then cut into blocks delimited by the start/end conditions of the selected
 * processor. Each block is split into columns by looking for runs of space
 * characters shared by every row, and the resulting rows are handed to the
 * processor for formatting.
 */
class PdfParser
{
    /**
     * Logger receiving the prepared rows (debug) and the pdftotext output.
     *
     * @var LoggerInterface
     */
    private $logger;

    /**
     * Processor used by parse() for its configuration and for formatting.
     *
     * @var ProcessorInterface
     */
    private $processor;

    /**
     * Configuration of the selected processor, captured at the start of parse().
     *
     * @var array
     */
    private $processorConfiguration;

    /**
     * Directory in which the intermediate text file is created.
     *
     * @var string
     */
    private $temporaryDirectoryPath;

    /**
     * Registered processors, indexed by the 'id' entry of their configuration.
     *
     * @var ProcessorInterface[]
     */
    private $availableProcessors = [];

    /**
     * PdfParser constructor.
     *
     * The system temporary directory is used for intermediate files.
     *
     * @param LoggerInterface $logger
     */
    public function __construct(LoggerInterface $logger)
    {
        $this->logger = $logger;
        $this->temporaryDirectoryPath = sys_get_temp_dir();
    }

    /**
     * Registers a processor under the 'id' found in its configuration.
     *
     * A processor registered later with the same id replaces the earlier one.
     *
     * @param ProcessorInterface $processor
     */
    public function addAvailableProcessor(ProcessorInterface $processor)
    {
        $configuration = $processor->getConfiguration();
        $this->availableProcessors[$configuration['id']] = $processor;
    }

    /**
     * Returns every registered processor, indexed by configuration id.
     *
     * @return ProcessorInterface[]
     */
    public function getAvailableProcessors()
    {
        return $this->availableProcessors;
    }

    /**
     * Returns the processor currently selected for parsing, if any.
     *
     * @return ProcessorInterface
     */
    public function getProcessor()
    {
        return $this->processor;
    }

    /**
     * Selects the processor that parse() will use.
     *
     * @param ProcessorInterface $processor
     */
    public function setProcessor(ProcessorInterface $processor)
    {
        $this->processor = $processor;
    }

    /**
     * Converts the given PDF to text, extracts its rows and lets the selected
     * processor format them.
     *
     * @param string $filePath Path of the PDF file to parse.
     *
     * @return ArrayCollection Whatever the processor's format() returns for the parsed rows.
     *
     * @throws Exception When no processor has been selected, when the text
     *                   conversion fails, or when the start/end conditions
     *                   cannot be located in the extracted text.
     */
    public function parse($filePath)
    {
        // Fail with a catchable, explicit error instead of a method call on null.
        if (null === $this->processor) {
            throw new Exception('No processor has been set; call setProcessor() before parse().');
        }

        $this->processorConfiguration = $this->processor->getConfiguration();

        $rawData = $this->getTextVersion($filePath);
        $rows = new ArrayCollection($this->doParse($rawData));

        return $this->processor->format($rows);
    }

    /**
     * Cuts the text into blocks using the configured start/end conditions and
     * returns the rows of all blocks merged into a single list.
     *
     * @param string $data Text version of the PDF.
     *
     * @return array Rows of every block, in document order.
     *
     * @throws Exception When no block start is found at all, or when a block
     *                   start has no matching end.
     */
    private function doParse($data)
    {
        $blocks = [];

        // NOTE(review): the loop stops on any falsy position, so a start marker
        // located at offset 0 is treated as "not found". This is kept as-is
        // because it also guarantees the loop terminates; confirm against
        // ParseHelper::findPosition() before tightening it.
        while ($startPos = ParseHelper::findPosition($data, $this->processorConfiguration['startConditions'])) {
            // Drop everything before the start marker, then the rest of the
            // marker's own line. Without a newline the data is left untouched.
            $data = substr($data, $startPos);
            $lineEnd = strpos($data, "\n");
            if (false !== $lineEnd) {
                $data = substr($data, $lineEnd);
            }

            $endPos = ParseHelper::findPosition($data, $this->processorConfiguration['endConditions']);
            if (is_null($endPos)) {
                throw new Exception('End condition not reached at the ' . (count($blocks) + 1) . ' nth loop of block.');
            }

            $blockData = rtrim(substr($data, 0, $endPos));
            // Continue searching for the next block after the end marker position.
            $data = substr($data, $endPos);

            $blocks[] = $this->parseBlock(
                $blockData,
                $this->processorConfiguration['rowsToSkip'],
                $this->processorConfiguration['rowMergeColumnTokens'],
                $this->processorConfiguration['rowSkipConditions']
            );
        }

        // This check used to live inside the loop, where the loop condition
        // made it unreachable; it has to run once the search is over.
        if (!count($blocks)) {
            throw new Exception('Start condition never reached.');
        }

        // Merge blocks.
        $rows = [];
        foreach ($blocks as $block) {
            $rows = array_merge($rows, $block);
        }

        return $rows;
    }

    /**
     * Turns the text of one block into a list of rows split into columns.
     *
     * A row is merged into the previous one when any of the columns listed in
     * $rowMergeColumnTokens is empty for that row.
     *
     * @param string $blockData            Text of the block, one row per line.
     * @param mixed  $skipKeys             Passed to ParseHelper::prepareRows() (configuration 'rowsToSkip').
     * @param array  $rowMergeColumnTokens Column keys whose emptiness triggers a merge with the previous row.
     * @param mixed  $rowSkipConditions    Passed to ParseHelper::prepareRows() (configuration 'rowSkipConditions').
     *
     * @return array
     */
    private function parseBlock($blockData, $skipKeys, $rowMergeColumnTokens, $rowSkipConditions)
    {
        $rows = [];
        $rawRows = explode("\n", $blockData);
        $rawRows = ParseHelper::prepareRows($rawRows, $skipKeys, $rowSkipConditions);
        $this->logger->debug(implode("\n", $rawRows));

        $colWidths = $this->guessWidth($rawRows);
        $previousIndex = 0;

        foreach ($rawRows as $rawRow) {
            $row = ParseHelper::parseRow($colWidths, $rawRow);

            // Only consider merging when a previous row actually exists. This
            // matches the former "$key > 0" test for sequential keys and stays
            // safe if the prepared rows are not indexed from 0.
            $toMergeWithPrevious = false;
            if (count($rows)) {
                foreach ($rowMergeColumnTokens as $rowMergeColumnToken) {
                    if (!strlen($row[$rowMergeColumnToken])) {
                        $toMergeWithPrevious = true;
                        break; // One empty column is enough to decide.
                    }
                }
            }

            if ($toMergeWithPrevious) {
                $rows[$previousIndex] = ParseHelper::mergeRows($rows[$previousIndex], $row);
            } else {
                $rows[] = $row;
                $previousIndex = count($rows) - 1;
            }
        }

        return $rows;
    }

    /**
     * Finds the runs of character positions that hold a space in every row.
     *
     * Runs of a single position are discarded as false positives, so only
     * runs of at least two consecutive positions are returned.
     *
     * @param array $rawRows Rows of one block.
     *
     * @return array List of ['start' => int, 'end' => int] groups; 'end' is exclusive.
     *               Keys may be non-contiguous because of the final filtering.
     */
    private function findSpaceGroups($rawRows)
    {
        // null means "no row seen yet". An empty array cannot be used as that
        // marker: once the running intersection is empty it must stay empty,
        // otherwise the next row's positions would wrongly become the result.
        $commonPositions = null;
        foreach ($rawRows as $rawRow) {
            $rowPositions = ParseHelper::getSpacePositions($rawRow);

            $commonPositions = null === $commonPositions
                ? $rowPositions
                : array_intersect($commonPositions, $rowPositions);
        }
        $commonPositions = null === $commonPositions ? [] : array_values($commonPositions);

        // Group consecutive positions into [start, end) ranges.
        $spaceGroups = [];
        $spaceGroupIndex = -1;
        $previousPosition = null;
        foreach ($commonPositions as $spacePosition) {
            if (null !== $previousPosition && $spacePosition - $previousPosition == 1) {
                ++$spaceGroups[$spaceGroupIndex]['end'];
            } else {
                ++$spaceGroupIndex;
                $spaceGroups[$spaceGroupIndex] = ['start' => $spacePosition, 'end' => $spacePosition + 1];
            }
            $previousPosition = $spacePosition;
        }

        // Clean "false positive" space groups.
        return array_filter($spaceGroups, function ($spaceGroup) {
            return $spaceGroup['end'] - $spaceGroup['start'] > 1;
        });
    }

    /**
     * Derives column boundaries from the space groups shared by all rows.
     *
     * Each column starts where the previous space group ends and stops where
     * the next one starts; the last column runs to the end of the first row.
     *
     * @param array $rawRows Rows of one block.
     *
     * @return array List of ['start' => int, 'length' => int] column definitions.
     */
    private function guessWidth($rawRows)
    {
        $spaceGroups = $this->findSpaceGroups($rawRows);

        $widths = [];
        $spaceEnd = 0;
        foreach ($spaceGroups as $spaceGroup) {
            $widths[] = ['start' => $spaceEnd, 'length' => $spaceGroup['start'] - $spaceEnd];
            $spaceEnd = $spaceGroup['end'];
        }

        // Use the first row whatever its key is, and treat "no rows" as an
        // empty line instead of reading an undefined offset.
        $firstRow = reset($rawRows);
        $firstRowLength = false === $firstRow ? 0 : strlen($firstRow);
        $widths[] = ['start' => $spaceEnd, 'length' => $firstRowLength - $spaceEnd];

        return $widths;
    }

    /**
     * Runs `pdftotext -layout` on the file and returns the produced text.
     *
     * The text is written to a uniquely named temporary file which is removed
     * before returning, including when the conversion fails.
     *
     * @param string $filePath Path of the PDF file.
     *
     * @return string
     *
     * @throws ProcessFailedException When pdftotext exits unsuccessfully.
     * @throws Exception              When the temporary file cannot be created or read.
     */
    private function getTextVersion($filePath)
    {
        // tempnam() reserves a unique file, avoiding collisions between
        // concurrent runs that a small random number could not rule out.
        $tmpPath = tempnam($this->temporaryDirectoryPath, 'pdfparser_');
        if (false === $tmpPath) {
            throw new Exception('Unable to create a temporary file in "' . $this->temporaryDirectoryPath . '".');
        }

        // Both paths are escaped: the command is a shell string and the file
        // path may contain spaces or shell metacharacters.
        $process = new Process(
            '/usr/bin/pdftotext -layout ' . escapeshellarg($filePath) . ' ' . escapeshellarg($tmpPath)
        );
        $this->logger->info('Execute Pdftotext', ['file' => $filePath]);
        $process->run(function ($type, $buffer) {
            if (Process::ERR === $type) {
                $this->logger->error($buffer);
            } else {
                $this->logger->info($buffer);
            }
        });

        if (!$process->isSuccessful()) {
            if (file_exists($tmpPath)) {
                unlink($tmpPath);
            }

            throw new ProcessFailedException($process);
        }

        $content = file_get_contents($tmpPath);
        unlink($tmpPath);

        if (false === $content) {
            throw new Exception('Unable to read the text version of "' . $filePath . '".');
        }

        return $content;
    }
}
277