Passed
Push — master ( e8c874...e40a60 )
by Domenico
03:39 queued 12s
created

XliffParserV2::parse()   A

Complexity

Conditions 4
Paths 5

Size

Total Lines 31
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 14
c 0
b 0
f 0
nc 5
nop 2
dl 0
loc 31
rs 9.7998
1
<?php
2
3
namespace Matecat\XliffParser\XliffParser;
4
5
use DOMDocument;
6
use DOMElement;
7
use Matecat\XliffParser\Constants\Placeholder;
8
use Matecat\XliffParser\Exception\DuplicateTransUnitIdInXliff;
9
use Matecat\XliffParser\Exception\NotFoundIdInTransUnit;
10
use Matecat\XliffParser\Exception\SegmentIdTooLongException;
11
use Matecat\XliffParser\Utils\Strings;
12
use Matecat\XliffParser\XliffUtils\DataRefReplacer;
13
14
class XliffParserV2 extends AbstractXliffParser
15
{
16
    /**
     * Parse every <file> element of the document into the $output array.
     *
     * Files are numbered from 1. For each of them the file-level metadata is
     * stored under $output['files'][$i]['attr'], the file-level notes under
     * $output['files'][$i]['notes'], and every child node of the <file> is handed
     * to extractTuFromNode() (defined outside this class body) together with the
     * running indexes.
     *
     * @inheritDoc
     *
     * @param DOMDocument $dom    the loaded XLIFF document
     * @param array       $output array the parsed data is written into
     *
     * @return array the populated $output
     *
     * @throws DuplicateTransUnitIdInXliff when the ids collected for one <file> are not unique
     * @throws \Exception
     */
    public function parse( DOMDocument $dom, $output = [])
    {
        $i = 1;

        /** @var DOMElement $file */
        foreach ($dom->getElementsByTagName('file') as $file) {

            // metadata
            $fileAttributes = $this->extractMetadata($dom);

            // extractMetadata() reads 'original' from the FIRST <file> of the document only;
            // re-read it from the <file> being parsed so that every file of a multi-file
            // document reports its own name (same 'no-name' fallback)
            $originalAttribute            = $file->attributes->getNamedItem('original');
            $fileAttributes[ 'original' ] = (null !== $originalAttribute) ? $originalAttribute->nodeValue : 'no-name';

            $output[ 'files' ][ $i ][ 'attr' ] = $fileAttributes;

            // notes
            $output[ 'files' ][ $i ][ 'notes' ] = $this->extractNotes($file);

            // trans-units
            $transUnitIdArrayForUniquenessCheck = [];
            $j = 1;
            // childNodes also contains non-element nodes; filtering is left to extractTuFromNode()
            foreach ($file->childNodes as $childNode) {
                $this->extractTuFromNode($childNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j);
            }

            // trans-unit re-count check: array_unique() shrinks the list only when an id is repeated
            $totalTransUnitsId  = count($transUnitIdArrayForUniquenessCheck);
            $transUnitsUniqueId = count(array_unique($transUnitIdArrayForUniquenessCheck));
            if ($totalTransUnitsId !== $transUnitsUniqueId) {
                throw new DuplicateTransUnitIdInXliff("Invalid trans-unit id, duplicate found.", 400);
            }

            $i++;
        }

        return $output;
    }
52
53
    /**
     * Build the file-level metadata array.
     *
     * Returned keys, in this order:
     *  - 'original'        'original' attribute of the first <file> element, or 'no-name'
     *  - 'source-language' 'srcLang' attribute of the first <xliff> element, or 'en-US'
     *  - 'target-language' 'trgLang' attribute of the first <xliff> element, or 'en-US'
     *
     * The fallback values are also used when the <xliff> or <file> element itself
     * is missing from the document.
     *
     * @param DOMDocument $dom
     *
     * @return array
     */
    private function extractMetadata( DOMDocument $dom)
    {
        // item(0) returns null when the document has no such element
        $xliffNode = $dom->getElementsByTagName('xliff')->item(0);
        $fileNode  = $dom->getElementsByTagName('file')->item(0);

        // null-safe attribute lookup: a missing node or a missing attribute yields $default
        $readAttribute = static function ($node, $name, $default) {
            if (null === $node || null === $node->attributes) {
                return $default;
            }

            $attribute = $node->attributes->getNamedItem($name);

            return (null !== $attribute) ? $attribute->nodeValue : $default;
        };

        $metadata = [];

        // original
        $metadata[ 'original' ] = $readAttribute($fileNode, 'original', 'no-name');

        // source-language
        $metadata[ 'source-language' ] = $readAttribute($xliffNode, 'srcLang', 'en-US');

        // datatype
        // @TODO to be implemented

        // target-language
        $metadata[ 'target-language' ] = $readAttribute($xliffNode, 'trgLang', 'en-US');

        // custom MateCat x-attribute
        // @TODO to be implemented

        return $metadata;
    }
82
83
    /**
     * Collect the notes declared directly under a <file> element.
     *
     * Only the <notes> children of $file are inspected. Every node nested in
     * them whose trimmed value is not empty is converted with
     * JSONOrRawContentArray() and appended to the result.
     *
     * @param DOMElement $file
     *
     * @return array
     * @throws \Exception
     */
    private function extractNotes( DOMElement $file)
    {
        $collected = [];

        foreach ($file->childNodes as $candidate) {
            // skip everything that is not a <notes> container
            if ($candidate->nodeName !== 'notes') {
                continue;
            }

            foreach ($candidate->childNodes as $noteNode) {
                $text = trim($noteNode->nodeValue);

                // whitespace-only nodes (e.g. indentation between tags) carry no note
                if ('' === $text) {
                    continue;
                }

                $collected[] = $this->JSONOrRawContentArray($text);
            }
        }

        return $collected;
    }
107
108
    /**
     * Extract and populate 'trans-units' array
     *
     * Writes one entry at $output['files'][$i]['trans-units'][$j] with the keys
     * 'attr', 'notes', optionally 'original-data' and 'additional-tag-data', and
     * finally 'source', 'target', 'seg-source' and 'seg-target'. $j is incremented
     * once the entry is complete.
     *
     * @param DOMElement  $transUnit                          the unit node to read
     * @param array       $transUnitIdArrayForUniquenessCheck list of ids collected so far (the unit id is appended)
     * @param DOMDocument $dom                                the owning document, forwarded to the content extractors
     * @param array       $output                             the array being populated
     * @param int         $i                                  index of the current file
     * @param int         $j                                  index of the current unit, incremented on exit
     *
     * @throws \Exception
     */
    protected function extractTransUnit($transUnit, &$transUnitIdArrayForUniquenessCheck, $dom, &$output, &$i, &$j)
    {
        // metadata
        $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'attr' ] = $this->extractTransUnitMetadata($transUnit, $transUnitIdArrayForUniquenessCheck);

        // notes
        // merge <notes> with key and key-note contained in metadata <mda:metaGroup>
        $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'notes' ] = $this->extractTransUnitNotes($transUnit);

        // uuid: a note whose raw content is a valid UUID is promoted to the 'uuid' attribute
        // (when several notes qualify the last one wins)
        foreach ($output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'notes' ] as $note) {
            if (isset($note[ 'raw-content' ]) && Strings::isAValidUuid($note[ 'raw-content' ])) {
                $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'attr' ][ 'uuid' ] = $note[ 'raw-content' ];
            }
        }

        // original-data (exclusive for V2)
        // http://docs.oasis-open.org/xliff/xliff-core/v2.0/xliff-core-v2.0.html#originaldata
        $dataRefMap      = []; // always defined, even when the unit has no <originalData>
        $originalData    = $this->extractTransUnitOriginalData($transUnit);
        $hasOriginalData = !empty($originalData);
        if ($hasOriginalData) {
            $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'original-data' ] = $originalData;
            $dataRefMap = $this->getDataRefMap($originalData);
        }

        // additionalTagData (exclusive for V2)
        $additionalTagData = $this->extractTransUnitAdditionalTagData($transUnit);
        if (!empty($additionalTagData)) {
            $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'additional-tag-data' ] = $additionalTagData;
        }

        // content

        $source = [
            'attr'        => [],
            'raw-content' => [],
        ];

        $target = [
            'attr'        => [],
            'raw-content' => [],
        ];

        $segSource = [];
        $segTarget = [];

        // $c counts the <segment> children met so far and indexes the per-segment entries
        $c = 0;

        /** @var DOMElement $segment */
        foreach ($transUnit->childNodes as $segment) {
            if ($segment->nodeName !== 'segment') {
                continue;
            }

            // check segment id consistency
            $attr = $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'attr' ];
            $this->checkSegmentIdConsistency($segment, $attr);

            // loop <segment> to get nested <source> and <target> tag
            foreach ($segment->childNodes as $childNode) {
                if ($childNode->nodeName === 'source') {
                    $extractedSource             = $this->extractContent($dom, $childNode);
                    $source[ 'raw-content' ][ $c ] = $extractedSource[ 'raw-content' ];

                    if ($hasOriginalData) {
                        $source[ 'replaced-content' ][ $c ] = (new DataRefReplacer($dataRefMap))->replace($source[ 'raw-content' ][ $c ]);
                    }

                    if (!empty($extractedSource[ 'attr' ])) {
                        $source[ 'attr' ][ $c ] = $extractedSource[ 'attr' ];
                    }

                    // append value to 'seg-source'
                    if ($this->stringContainsMarks($extractedSource[ 'raw-content' ])) {
                        // NOTE(review): this assignment replaces whatever earlier segments
                        // appended to $segSource - confirm this is intended
                        $segSource = $this->extractContentWithMarksAndExtTags($dom, $childNode, $extractedSource[ 'raw-content' ], $originalData);
                    } else {
                        $segSource[] = [
                            'mid'              => count($segSource),
                            'ext-prec-tags'    => '',
                            'raw-content'      => $extractedSource[ 'raw-content' ],
                            'replaced-content' => $hasOriginalData ? (new DataRefReplacer($dataRefMap))->replace($extractedSource[ 'raw-content' ]) : null,
                            'ext-succ-tags'    => '',
                        ];
                    }
                }

                if ($childNode->nodeName === 'target') {
                    $extractedTarget             = $this->extractContent($dom, $childNode);
                    $target[ 'raw-content' ][ $c ] = $extractedTarget[ 'raw-content' ];

                    if ($hasOriginalData) {
                        $target[ 'replaced-content' ][ $c ] = (new DataRefReplacer($dataRefMap))->replace($target[ 'raw-content' ][ $c ]);
                    }

                    if (!empty($extractedTarget[ 'attr' ])) {
                        $target[ 'attr' ][ $c ] = $extractedTarget[ 'attr' ];
                    }

                    // append value to 'seg-target'
                    if ($this->stringContainsMarks($extractedTarget[ 'raw-content' ])) {
                        // NOTE(review): same overwrite as for $segSource - confirm this is intended
                        $segTarget = $this->extractContentWithMarksAndExtTags($dom, $childNode, $extractedTarget[ 'raw-content' ], $originalData);
                    } else {
                        $segTarget[] = [
                            'mid'              => count($segTarget),
                            'ext-prec-tags'    => '',
                            'raw-content'      => $extractedTarget[ 'raw-content' ],
                            'replaced-content' => $hasOriginalData ? (new DataRefReplacer($dataRefMap))->replace($extractedTarget[ 'raw-content' ]) : null,
                            'ext-succ-tags'    => '',
                        ];
                    }
                }
            }

            $c++;
        }

        $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'source' ]     = $source;
        $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'target' ]     = $target;
        $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'seg-source' ] = $segSource;
        $output[ 'files' ][ $i ][ 'trans-units' ][ $j ][ 'seg-target' ] = $segTarget;

        $j++;
    }
240
241
    /**
     * Read the attributes of a unit node into a metadata array.
     *
     * 'id' is mandatory and is also appended to $transUnitIdArrayForUniquenessCheck.
     * 'translate', 'tGroupBegin' and 'tGroupEnd' are copied as they are when
     * present; 'sizeRestriction' is copied as an integer when present and not empty.
     *
     * @param DOMElement $transUnit
     * @param array      $transUnitIdArrayForUniquenessCheck list the id is appended to
     *
     * @return array
     *
     * @throws NotFoundIdInTransUnit     when the node has no 'id' attribute
     * @throws SegmentIdTooLongException when strlen() of the id exceeds 100
     */
    private function extractTransUnitMetadata( DOMElement $transUnit, &$transUnitIdArrayForUniquenessCheck)
    {
        $attributes = $transUnit->attributes;

        // id
        $idNode = $attributes->getNamedItem('id');
        if (null === $idNode) {
            throw new NotFoundIdInTransUnit('Invalid trans-unit id found. EMPTY value', 400);
        }

        $id = $idNode->nodeValue;

        // strlen() measures bytes, not characters
        if (strlen($id) > 100) {
            throw new SegmentIdTooLongException('Segment-id too long. Max 100 characters allowed', 400);
        }

        $transUnitIdArrayForUniquenessCheck[] = $id;
        $metadata = [ 'id' => $id ];

        // translate, tGroupBegin, tGroupEnd
        foreach ([ 'translate', 'tGroupBegin', 'tGroupEnd' ] as $optionalName) {
            $optionalNode = $attributes->getNamedItem($optionalName);
            if (null !== $optionalNode) {
                $metadata[ $optionalName ] = $optionalNode->nodeValue;
            }
        }

        // sizeRestriction
        $sizeNode = $attributes->getNamedItem('sizeRestriction');
        if (null !== $sizeNode && '' !== $sizeNode->nodeValue) {
            $metadata[ 'sizeRestriction' ] = (int)$sizeNode->nodeValue;
        }

        return $metadata;
    }
287
288
    /**
     * Collect the entries nested in the <originalData> children of a unit.
     *
     * Every nested node owning an 'id' attribute and a non-empty value becomes
     * one entry: the array returned by JSONOrRawContentArray() merged with
     * ['attr' => ['id' => <id>]].
     *
     * @param DOMElement $transUnit
     *
     * @return array
     * @throws \Exception
     */
    private function extractTransUnitOriginalData( DOMElement $transUnit)
    {
        $collected = [];

        $tagPlaceholders = [ Placeholder::LT_PLACEHOLDER, Placeholder::GT_PLACEHOLDER ];
        $tagEntities     = [ '&lt;', '&gt;' ];

        // loop <originalData> to get nested content
        foreach ($transUnit->childNodes as $container) {
            if ($container->nodeName !== 'originalData') {
                continue;
            }

            foreach ($container->childNodes as $data) {
                // nodes without attributes (e.g. text nodes) cannot carry an id
                if (null === $data->attributes) {
                    continue;
                }

                $idAttribute = $data->attributes->getNamedItem('id');
                if (null === $idAttribute) {
                    continue;
                }

                $dataId = $idAttribute->nodeValue;

                // the replacements are single quoted: a backslash followed by 'n' / 't' is
                // inserted, not a real line feed or tab - presumably on purpose, TODO confirm
                $dataValue = str_replace(
                    [ Placeholder::WHITE_SPACE_PLACEHOLDER, Placeholder::NEW_LINE_PLACEHOLDER, Placeholder::TAB_PLACEHOLDER ],
                    [ ' ', '\n', '\t' ],
                    $data->nodeValue
                );

                if ('' === $dataValue) {
                    continue;
                }

                $content = $this->JSONOrRawContentArray($dataValue, false);

                // restore xliff tags
                foreach ([ 'json', 'raw-content' ] as $contentKey) {
                    if (isset($content[ $contentKey ])) {
                        $content[ $contentKey ] = str_replace($tagPlaceholders, $tagEntities, $content[ $contentKey ]);
                    }
                }

                $collected[] = array_merge($content, [ 'attr' => [ 'id' => $dataId ] ]);
            }
        }

        return $collected;
    }
338
339
    /**
     * Collect the entries nested in the <memsource:additionalTagData> children of a unit.
     *
     * For every node nested in such a container an entry is built with:
     *  - ['attr']['id']           the 'id' attribute, only for <memsource:tag> nodes owning one
     *  - ['raw-content']['tagId'] value of a nested <memsource:tagId>
     *  - ['raw-content']['type']  value of a nested <memsource:type>
     * Entries that end up empty are discarded.
     *
     * @param DOMElement $transUnit
     *
     * @return array
     */
    private function extractTransUnitAdditionalTagData( DOMElement $transUnit)
    {
        $collected = [];

        // loop <memsource:additionalTagData> to get nested content
        foreach ($transUnit->childNodes as $container) {
            if ($container->nodeName !== 'memsource:additionalTagData') {
                continue;
            }

            foreach ($container->childNodes as $data) {
                $entry = [];

                // id
                if ($data->nodeName === 'memsource:tag' && null !== $data->attributes) {
                    $idAttribute = $data->attributes->getNamedItem('id');
                    if (null !== $idAttribute) {
                        $entry[ 'attr' ][ 'id' ] = $idAttribute->nodeValue;
                    }
                }

                // in PHP 7.4 $data->childNodes is an empty DomNodeList, it is iterable with size 0
                // PHP 5.6 check: in php 5.6 $data->childNodes can be null
                $nested = (null !== $data->childNodes) ? $data->childNodes : [];

                // content
                foreach ($nested as $datum) {
                    switch ($datum->nodeName) {
                        case 'memsource:tagId':
                            $entry[ 'raw-content' ][ 'tagId' ] = $datum->nodeValue;
                            break;

                        case 'memsource:type':
                            $entry[ 'raw-content' ][ 'type' ] = $datum->nodeValue;
                            break;
                    }
                }

                if (!empty($entry)) {
                    $collected[] = $entry;
                }
            }
        }

        return $collected;
    }
388
389
    /**
     * Check if segment id is present within tGroupBegin and tGroupEnd attributes
     *
     * Nothing is checked unless both 'tGroupBegin' and 'tGroupEnd' are set in $attr
     * and the segment has an 'id' attribute. An id outside the [tGroupBegin, tGroupEnd]
     * range is only reported as a warning on the logger (when one is set); no
     * exception is thrown.
     *
     * @param DOMElement $segment
     * @param array      $attr attributes of the unit owning the segment
     */
    private function checkSegmentIdConsistency( DOMElement $segment, array $attr)
    {
        if (!isset($attr[ 'tGroupBegin' ], $attr[ 'tGroupEnd' ])) {
            return;
        }

        $idAttribute = $segment->attributes->getNamedItem('id');
        if (null === $idAttribute) {
            return;
        }

        $id  = $idAttribute->nodeValue;
        $min = (int)$attr[ 'tGroupBegin' ];
        $max = (int)$attr[ 'tGroupEnd' ];

        // The id is a string. Comparing it directly with the integer bounds gives a
        // different result for non-numeric ids on PHP >= 8 (string comparison) than on
        // older versions (numeric comparison), so it is converted to a number explicitly:
        // numeric strings keep their value, anything else uses its leading number (or 0).
        $numericId = is_numeric($id) ? $id + 0 : (float)$id;

        if (($numericId < $min || $numericId > $max) && $this->logger) {
            $this->logger->warning('Segment #' . $id . ' is not included within tGroupBegin and tGroupEnd');
        }
    }
409
410
    /**
     * Collect the notes of a unit.
     *
     * Two sources are read, in document order:
     *  - the nodes nested in <notes> children, converted with JSONOrRawContentArray();
     *  - the nodes nested in <mda:metadata>/<mda:metaGroup> that own a 'type'
     *    attribute, converted the same way and completed with
     *    ['attr' => ['type' => <type>]].
     * Nodes whose trimmed value is empty are ignored.
     *
     * @param DOMElement $transUnit
     *
     * @return array
     * @throws \Exception
     */
    private function extractTransUnitNotes( DOMElement $transUnit)
    {
        $collected = [];

        foreach ($transUnit->childNodes as $childNode) {
            if ($childNode->nodeName === 'notes') {
                // loop <notes> to get nested <note> tag
                foreach ($childNode->childNodes as $noteNode) {
                    $noteText = trim($noteNode->nodeValue);
                    if ('' === $noteText) {
                        continue;
                    }

                    $collected[] = $this->JSONOrRawContentArray($noteText);
                }
            } elseif ($childNode->nodeName === 'mda:metadata') {
                foreach ($childNode->childNodes as $group) {
                    if ($group->nodeName !== 'mda:metaGroup') {
                        continue;
                    }

                    foreach ($group->childNodes as $meta) {
                        // nodes without attributes (e.g. text nodes) cannot carry a type
                        if (null === $meta->attributes) {
                            continue;
                        }

                        $typeAttribute = $meta->attributes->getNamedItem('type');
                        if (null === $typeAttribute) {
                            continue;
                        }

                        $metaText = trim($meta->nodeValue);
                        if ('' === $metaText) {
                            continue;
                        }

                        $collected[] = array_merge(
                            $this->JSONOrRawContentArray($metaText),
                            [ 'attr' => [ 'type' => $typeAttribute->nodeValue ] ]
                        );
                    }
                }
            }
        }

        return $collected;
    }
458
}
459