Total Complexity | 55 |
Total Lines | 280 |
Duplicated Lines | 0 % |
Changes | 19 | ||
Bugs | 4 | Features | 2 |
Complex classes like Parser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Parser, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
49 | class Parser |
||
50 | { |
||
51 | /** |
||
52 | * @var Config |
||
53 | */ |
||
54 | private $config; |
||
55 | |||
56 | /** |
||
57 | * @var PDFObject[] |
||
58 | */ |
||
59 | protected $objects = []; |
||
60 | |||
61 | protected $rawDataParser; |
||
62 | |||
/**
 * Set up the parser's configuration and its raw data parser.
 *
 * @param mixed       $cfg    passed unchanged to RawDataParser (defaults to an empty array)
 * @param Config|null $config configuration to use; a new Config is created when null is given
 */
public function __construct($cfg = [], ?Config $config = null)
{
    if (null === $config) {
        $config = new Config();
    }

    $this->config = $config;
    // The raw data parser receives the very same configuration instance.
    $this->rawDataParser = new RawDataParser($cfg, $this->config);
}
||
68 | |||
69 | public function getConfig(): Config |
||
72 | } |
||
73 | |||
/**
 * Read a file from disk and parse its content as a PDF.
 *
 * @param string $filename path of the file to read
 *
 * @throws \Exception if the file cannot be read
 * @throws \Exception if parseContent() rejects the content
 */
public function parseFile(string $filename): Document
{
    $content = file_get_contents($filename);

    // file_get_contents() reports failure by returning false. Passing that on
    // to parseContent(string) would hide the real cause, so fail here with a
    // message that names the file.
    if (false === $content) {
        throw new \Exception('Unable to read file: '.$filename);
    }

    /*
     * 2018/06/20 @doganoo: users have complained multiple times that the
     * parseFile() method dies silently, so it is a better option to leave
     * out the error control operator (@) and let users know that the
     * method throws an exception by adding the @throws tag to the PHPDoc.
     *
     * See here for an example: https://github.com/smalot/pdfparser/issues/204
     */
    return $this->parseContent($content);
}
||
93 | |||
/**
 * Parse raw PDF content into a Document.
 *
 * @param string $content PDF content to parse
 *
 * @throws \Exception if secured PDF file was detected
 * @throws \Exception if no object list was found
 */
public function parseContent(string $content): Document
{
    // The raw data parser returns the xref information and the object structures.
    [$xref, $data] = $this->rawDataParser->parseData($content);

    // An encrypt entry in the trailer is only tolerated when the configuration says so.
    $hasEncryptEntry = isset($xref['trailer']['encrypt']);
    if ($hasEncryptEntry && false === $this->config->getIgnoreEncryption()) {
        throw new \Exception('Secured pdf file are currently not supported.');
    }

    if (empty($data)) {
        throw new \Exception('Object list not found. Possible secured file.');
    }

    // Destination document; the object registry is reset before it is filled.
    $document = new Document();
    $this->objects = [];

    foreach ($data as $objectId => $objectStructure) {
        $this->parseObject($objectId, $objectStructure, $document);
        // Drop the raw entry once it has been handled.
        unset($data[$objectId]);
    }

    $trailer = $this->parseTrailer($xref['trailer'], $document);
    $document->setTrailer($trailer);
    $document->setObjects($this->objects);

    return $document;
}
||
127 | |||
128 | protected function parseTrailer(array $structure, ?Document $document) |
||
148 | } |
||
149 | |||
/**
 * Convert one raw object structure into a PDFObject and register it under $id.
 *
 * The parts of $structure are visited in order: a '[' part or a '<<' part
 * replaces the current header, a 'stream' part provides the content, and any
 * other part is parsed as a single header element. When the stream belongs to
 * an 'ObjStm' object, the objects stored inside it are registered instead and
 * the method returns without registering the stream itself.
 *
 * @param string        $id        key under which the resulting object is stored
 * @param array         $structure parts describing the object; each part is indexed as [type, value, ...]
 * @param Document|null $document  document handed to every header and object that is created
 */
protected function parseObject(string $id, array $structure, ?Document $document)
{
    $header = new Header([], $document);
    $content = '';

    foreach ($structure as $part) {
        // An integer part is swapped for an empty [type, value] pair so it can
        // be indexed like the other parts.
        if (\is_int($part)) {
            $part = [null, null];
        }

        switch ($part[0]) {
            case '[':
                $items = [];
                foreach ($part[1] as $item) {
                    $items[] = $this->parseHeaderElement($item[0], $item[1], $document);
                }

                $header = new Header($items, $document);
                break;

            case '<<':
                $header = $this->parseHeader($part[1], $document);
                break;

            case 'stream':
                // Use the entry at [3][0] when it is set, otherwise fall back to [1].
                $content = $part[3][0] ?? $part[1];

                if ($header->get('Type')->equals('ObjStm')) {
                    $pieces = [];

                    // Split xrefs and contents.
                    preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $pieces);
                    $content = $pieces[3];

                    // Extract xrefs.
                    $pairs = preg_split(
                        '/(\d+\s+\d+\s*)/s',
                        $pieces[1],
                        -1,
                        \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
                    );

                    // Map each position to the id listed with it.
                    $idsByPosition = [];
                    foreach ($pairs as $pair) {
                        [$entryId, $entryPosition] = preg_split("/\s+/", trim($pair));
                        $idsByPosition[$entryPosition] = $entryId;
                    }

                    ksort($idsByPosition);

                    $sortedIds = array_values($idsByPosition);
                    $sortedPositions = array_keys($idsByPosition);

                    foreach ($sortedPositions as $index => $start) {
                        $objectKey = $sortedIds[$index].'_0';
                        // Each object ends where the next one starts; the last one runs to the end of the content.
                        $end = $sortedPositions[$index + 1] ?? \strlen($content);
                        $objectSource = substr($content, $start, (int) $end - (int) $start);

                        $objectHeader = Header::parse($objectSource, $document);
                        $this->objects[$objectKey] = PDFObject::factory($document, $objectHeader, '', $this->config);
                    }

                    // It is not necessary to store this content.

                    return;
                }

                if ($header->get('Type')->equals('Metadata')) {
                    // Attempt to parse XMP XML Metadata
                    $document->extractXMPMetadata($content);
                }
                break;

            default:
                if ('null' == $part) {
                    break;
                }

                $element = $this->parseHeaderElement($part[0], $part[1], $document);
                if ($element) {
                    $header = new Header([$element], $document);
                }
                break;
        }
    }

    // Keep an object that is already registered under this id.
    if (isset($this->objects[$id])) {
        return;
    }

    $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
}
||
240 | |||
/**
 * Build a Header from a flat structure of alternating name and value entries.
 *
 * @param array         $structure even positions hold the name at index 1; the entry that follows
 *                                 holds the value's type at index 0 and the value at index 1
 * @param Document|null $document  document handed to the parsed elements and to the header
 *
 * @throws \Exception
 */
protected function parseHeader(array $structure, ?Document $document): Header
{
    $elements = [];
    $total = \count($structure);
    $index = 0;

    // Walk the structure two entries at a time: name entry, then value entry.
    while ($index < $total) {
        $key = $structure[$index][1];
        $valueType = $structure[$index + 1][0];
        $rawValue = $structure[$index + 1][1];

        $elements[$key] = $this->parseHeaderElement($valueType, $rawValue, $document);

        $index += 2;
    }

    return new Header($elements, $document);
}
||
259 | |||
260 | /** |
||
261 | * @param string|array $value |
||
262 | * |
||
263 | * @return Element|Header|null |
||
264 | * |
||
265 | * @throws \Exception |
||
266 | */ |
||
267 | protected function parseHeaderElement(?string $type, $value, ?Document $document) |
||
329 | } |
||
330 | } |
||
331 | } |
||
332 |