Issues (16)

src/XliffParser/XliffParserV1.php (3 issues)

1
<?php
2
3
namespace Matecat\XliffParser\XliffParser;
4
5
use DOMAttr;
6
use DOMDocument;
7
use DOMElement;
8
use DOMNode;
9
use Exception;
10
use Matecat\XliffParser\Exception\DuplicateTransUnitIdInXliff;
11
use Matecat\XliffParser\Exception\NotFoundIdInTransUnit;
12
use Matecat\XliffParser\Exception\SegmentIdTooLongException;
13
14
class XliffParserV1 extends AbstractXliffParser {
15
    /**
     * Walk every <file> element of the document and collect its metadata,
     * embedded references, external-file link and trans-units into $output.
     *
     * @inheritDoc
     *
     * @param DOMDocument $dom    the XLIFF document to read
     * @param array|null  $output structure to fill; everything is written under the 'files' key
     *
     * @return array the filled structure
     * @throws DuplicateTransUnitIdInXliff when the trans-unit ids collected for a <file> are not unique
     * @throws Exception
     */
    public function parse( DOMDocument $dom, ?array $output = [] ): array {
        $i = 1;
        /** @var DOMElement $file */
        foreach ( $dom->getElementsByTagName( 'file' ) as $file ) {

            // metadata
            $output[ 'files' ][ $i ][ 'attr' ] = $this->extractMetadata( $file );

            // reference: walk the DOM once and reuse the result
            $reference = $this->extractReference( $file );
            if ( !empty( $reference ) ) {
                $output[ 'files' ][ $i ][ 'reference' ] = $reference;
            }

            // trans-units
            $transUnitIdArrayForUniquenessCheck = [];
            $j                                  = 1;
            foreach ( $file->childNodes as $body ) {

                // external-file
                if ( $body->nodeName === 'header' ) {
                    foreach ( $body->childNodes as $header ) {
                        $this->extractExternalFile( $header, $i, $output );
                    }
                }

                if ( $body->nodeName === 'body' ) {
                    foreach ( $body->childNodes as $childNode ) {
                        $this->extractTuFromNode( $childNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j );
                    }

                    // trans-unit re-count check: any id seen twice shrinks the de-duplicated list
                    $totalTransUnitsId  = count( $transUnitIdArrayForUniquenessCheck );
                    $transUnitsUniqueId = count( array_unique( $transUnitIdArrayForUniquenessCheck ) );
                    if ( $totalTransUnitsId !== $transUnitsUniqueId ) {
                        throw new DuplicateTransUnitIdInXliff( "Invalid trans-unit id, duplicate found.", 400 );
                    }

                    // The file index only advances once a <body> has been processed.
                    // NOTE(review): a <file> without <body> therefore shares its index with the next <file> - confirm this is intended.
                    $i++;
                }
            }
        }

        return $output;
    }
63
64
    /**
     * Store the href of an <external-file> found under a header child node.
     *
     * Three layouts are recognised:
     *  - <skl><external-file/></skl>
     *  - <skl><reference><external-file/></reference></skl>
     *  - <reference><external-file/></reference>
     *
     * When several matches exist the last one visited wins, because each match
     * overwrites the same 'external-file' entry.
     *
     * @param DOMNode    $header a child node of the <header> element
     * @param            $i      index of the current file inside $output[ 'files' ]
     * @param            $output structure being filled (modified in place)
     */
    private function extractExternalFile( DOMNode $header, $i, &$output ) {

        $containerName = $header->nodeName;

        // only <skl> and <reference> containers can carry an external-file link
        if ( $containerName !== "skl" && $containerName !== "reference" ) {
            return;
        }

        foreach ( $header->childNodes as $node ) {

            // direct child, valid for both containers
            if ( $node->nodeName === "external-file" ) {
                $output[ 'files' ][ $i ][ 'attr' ][ 'external-file' ] = $node->getAttribute( "href" );
                continue;
            }

            // one extra nesting level is only looked up inside <skl>
            if ( $containerName === "skl" && $node->nodeName === "reference" ) {
                foreach ( $node->childNodes as $nested ) {
                    if ( $nested->nodeName === "external-file" ) {
                        $output[ 'files' ][ $i ][ 'attr' ][ 'external-file' ] = $nested->getAttribute( "href" );
                    }
                }
            }
        }
    }
94
95
    /**
     * Read the attributes of a <file> element.
     *
     * The attributes original, source-language, datatype (stored as 'data-type')
     * and target-language are copied into the result. Attributes whose local name
     * contains "x-", or whose qualified name contains "mtc:", are collected under
     * the 'custom' key. Both patterns are case-insensitive and unanchored, so the
     * token may appear anywhere in the name.
     *
     * @param DOMElement $file
     *
     * @return array
     */
    private function extractMetadata( DOMElement $file ): array {
        // attribute local name => key used in the returned array
        $knownAttributes = [
                'original'        => 'original',
                'source-language' => 'source-language',
                'datatype'        => 'data-type',
                'target-language' => 'target-language',
        ];

        $result = [];
        $custom = [];

        /** @var DOMAttr $attr */
        foreach ( $file->attributes as $attr ) {
            $localName = $attr->localName;

            if ( isset( $knownAttributes[ $localName ] ) ) {
                $result[ $knownAttributes[ $localName ] ] = $attr->value;
            }

            // Custom MateCat x-Attribute
            preg_match( '|x-(.*?)|si', $localName, $xMatch );
            if ( isset( $xMatch[ 1 ] ) ) {
                $custom[ $localName ] = $attr->value;
            }

            // Custom MateCat namespace Attribute mtc:
            preg_match( '|mtc:(.*?)|si', $attr->nodeName, $mtcMatch );
            if ( isset( $mtcMatch[ 1 ] ) ) {
                $custom[ $attr->nodeName ] = $attr->value;
            }

            // refreshed on every pass so 'custom' always holds everything collected so far
            if ( !empty( $custom ) ) {
                $result[ 'custom' ] = $custom;
            }
        }

        return $result;
    }
149
150
    /**
     * Collect every <internal-file> found directly under a <reference>
     * descendant of the given <file>.
     *
     * Each entry holds:
     *  - 'form-type': value of the 'form' attribute, or null when the attribute is missing
     *  - 'base64':    trimmed text content of the node
     *
     * @param DOMElement $file
     *
     * @return array zero-based list of entries, in document order
     */
    private function extractReference( DOMElement $file ): array {
        $reference = [];

        foreach ( $file->getElementsByTagName( 'reference' ) as $ref ) {
            /** @var DOMNode $childNode */
            foreach ( $ref->childNodes as $childNode ) {
                if ( $childNode->nodeName !== 'internal-file' ) {
                    continue;
                }

                // getNamedItem() yields null for a missing attribute: check before reading nodeValue
                $form = $childNode->attributes->getNamedItem( 'form' );

                $reference[] = [
                        'form-type' => null !== $form ? $form->nodeValue : null,
                        'base64'    => trim( $childNode->nodeValue ),
                ];
            }
        }

        return $reference;
    }
172
173
    /**
     * Extract and populate 'trans-units' array
     *
     * Writes one entry at $output[ 'files' ][ $i ][ 'trans-units' ][ $j ] holding
     * 'attr' and 'notes', plus - when the matching nodes exist - 'source',
     * 'seg-source', 'target', 'seg-target', 'locked', 'context-group' and
     * 'alt-trans'. $j is incremented before returning.
     *
     * @param DOMElement  $transUnit
     * @param array       $transUnitIdArrayForUniquenessCheck list of ids seen so far (appended to)
     * @param DOMDocument $dom
     * @param array       $output                             structure being filled (modified in place)
     * @param int         $i                                  index of the current file
     * @param int         $j                                  index of the current trans-unit (incremented)
     * @param array|null  $contextGroups                      context-group elements supplied by the caller, emitted before the ones found inside the trans-unit
     *
     * @throws Exception
     */
    protected function extractTransUnit( DOMElement $transUnit, array &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, array &$output, int &$i, int &$j, ?array $contextGroups = [] ) {
        // metadata: extracted before the output slot is aliased, so a failure here leaves $output untouched
        $attr = $this->extractTransUnitMetadata( $transUnit, $transUnitIdArrayForUniquenessCheck );

        // alias of the slot of the current trans-unit, to avoid repeating the full path on every write
        $unit = &$output[ 'files' ][ $i ][ 'trans-units' ][ $j ];

        $unit[ 'attr' ] = $attr;

        // notes
        $unit[ 'notes' ] = $this->extractTransUnitNotes( $dom, $transUnit );

        // content
        /** @var DOMElement $childNode */
        foreach ( $transUnit->childNodes as $childNode ) {
            $nodeName = $childNode->nodeName;

            if ( $nodeName === 'source' ) {
                // source
                $unit[ 'source' ] = $this->extractContent( $dom, $childNode );

            } elseif ( $nodeName === 'seg-source' ) {
                // seg-source
                // <source> may be missing or come later: fall back to null instead of reading an undefined index.
                // NOTE(review): static analysis reports that the callee accepts only two arguments;
                // the third one is kept to leave the call unchanged - confirm and drop it if unused.
                $rawSegment           = $unit[ 'source' ][ 'raw-content' ] ?? null;
                $unit[ 'seg-source' ] = $this->extractContentWithMarksAndExtTags( $dom, $childNode, $rawSegment );

            } elseif ( $nodeName === 'target' ) {
                // target
                $unit[ 'target' ] = $this->extractContent( $dom, $childNode );

                // seg-target: only built when the target has content and a non-empty seg-source was already collected
                $targetRawContent = $unit[ 'target' ][ 'raw-content' ] ?? null;
                $segSource        = $unit[ 'seg-source' ] ?? null;

                if ( !empty( $targetRawContent ) && null !== $segSource && count( $segSource ) > 0 ) {
                    $unit[ 'seg-target' ]                = $this->extractContentWithMarksAndExtTags( $dom, $childNode );
                    $unit[ 'seg-target' ][ 0 ][ 'attr' ] = $this->extractTagAttributes( $childNode );
                }

            } elseif ( $nodeName === 'sdl:seg' ) {
                // locked
                $unit[ 'locked' ] = $this->extractLocked( $childNode );
            }
        }

        // context-group: the ones handed in by the caller first, then the ones inside the trans-unit
        if ( !empty( $contextGroups ) ) {
            foreach ( $contextGroups as $contextGroup ) {
                $unit[ 'context-group' ][] = $this->extractTransUnitContextGroup( $dom, $contextGroup );
            }
        }

        foreach ( $transUnit->getElementsByTagName( 'context-group' ) as $contextGroup ) {
            $unit[ 'context-group' ][] = $this->extractTransUnitContextGroup( $dom, $contextGroup );
        }

        // alt-trans
        foreach ( $transUnit->getElementsByTagName( 'alt-trans' ) as $altTrans ) {
            $unit[ 'alt-trans' ][] = $this->extractTransUnitAltTrans( $altTrans );
        }

        // release the alias before the index moves on
        unset( $unit );

        $j++;
    }
245
246
    /**
     * Read the attributes of a trans-unit.
     *
     * - 'id' is mandatory, limited to 100 bytes (strlen) and appended to the uniqueness list
     * - 'approved' is converted to a boolean
     * - 'maxwidth' is reduced to its integer characters and also exposed as 'sizeRestriction'
     * - every other attribute is copied as is, keyed by its node name
     *
     * @param DOMElement $transUnit
     * @param array      $transUnitIdArrayForUniquenessCheck list of ids seen so far (appended to)
     *
     * @return array
     * @throws NotFoundIdInTransUnit when the trans-unit has no id attribute
     * @throws SegmentIdTooLongException when the id is longer than 100 bytes
     * @throws Exception
     */
    private function extractTransUnitMetadata( DOMElement $transUnit, array &$transUnitIdArrayForUniquenessCheck ): array {
        // id MUST NOT be null.
        // hasAttribute() is used so the check does not dereference ->attributes, which static analysis flags as possibly null.
        // NOTE(review): only presence is checked; an empty id="" is accepted - confirm this is intended.
        if ( !$transUnit->hasAttribute( 'id' ) ) {
            throw new NotFoundIdInTransUnit( 'Invalid trans-unit id found. EMPTY value', 400 );
        }

        $metadata = [];

        /**
         * @var DOMAttr $element
         */
        foreach ( $transUnit->attributes as $element ) {
            $name  = $element->nodeName;
            $value = $element->nodeValue;

            if ( $name === "id" ) {

                if ( strlen( $value ) > 100 ) {
                    throw new SegmentIdTooLongException( 'Segment-id too long. Max 100 characters allowed', 400 );
                }

                $transUnitIdArrayForUniquenessCheck[] = $value;
                $metadata[ 'id' ]                     = $value;

            } elseif ( $name === "approved" ) {
                // approved as BOOLEAN
                // http://docs.oasis-open.org/xliff/v1.2/os/xliff-core.html#approved
                $metadata[ $name ] = filter_var( $value, FILTER_VALIDATE_BOOLEAN );

            } elseif ( $name === "maxwidth" ) {
                // we ignore ( but we get ) the attribute size-unit="char" assuming that a restriction is always expressed in characters
                // we duplicate the info to allow Xliff V1 and V2 to work the same
                $width = filter_var( $value, FILTER_SANITIZE_NUMBER_INT );

                $metadata[ 'sizeRestriction' ] = $width;
                $metadata[ $name ]             = $width;

            } else {
                $metadata[ $name ] = $value;
            }
        }

        return $metadata;
    }
294
295
    /**
     * Collect the <note> descendants of a trans-unit.
     *
     * Notes whose extracted content is an empty string are skipped. For the others,
     * the array built from the content is completed with the attributes of the note;
     * an attribute replaces an entry that has the same key.
     *
     * @param DOMDocument $dom
     * @param DOMElement  $transUnit
     *
     * @return array
     * @throws Exception
     */
    private function extractTransUnitNotes( DOMDocument $dom, DOMElement $transUnit ): array {
        $collected = [];

        foreach ( $transUnit->getElementsByTagName( 'note' ) as $noteNode ) {

            $content = $this->extractTagContent( $dom, $noteNode );

            // nothing to keep for an empty note
            if ( '' === $content ) {
                continue;
            }

            $entry = $this->JSONOrRawContentArray( $content );

            // extract all the attributes
            foreach ( $noteNode->attributes as $noteAttribute ) {
                $entry[ $noteAttribute->name ] = $noteAttribute->value;
            }

            $collected[] = $entry;
        }

        return $collected;
    }
323
324
    /**
     * Describe a <context-group>: its attributes under 'attr' and, when it has
     * <context> children, their extracted content under 'contexts'.
     *
     * @param DOMDocument $dom
     * @param DOMElement  $contextGroup
     *
     * @return array
     */
    private function extractTransUnitContextGroup( DOMDocument $dom, DOMElement $contextGroup ): array {
        $group = [ 'attr' => $this->extractTagAttributes( $contextGroup ) ];

        /** @var DOMNode $node */
        foreach ( $contextGroup->childNodes as $node ) {
            // only direct <context> children are of interest
            if ( $node->nodeName !== 'context' ) {
                continue;
            }

            $group[ 'contexts' ][] = $this->extractContent( $dom, $node );
        }

        return $group;
    }
343
344
    /**
     * Describe an <alt-trans>: its attributes under 'attr', plus the text of the
     * first <source> and of the first <target> descendant when they exist.
     *
     * A key is only set when the matching element is present. Testing the node
     * list itself is not enough, because the list object is truthy even when it
     * is empty.
     *
     * @param DOMElement $altTrans
     *
     * @return array
     */
    private function extractTransUnitAltTrans( DOMElement $altTrans ): array {
        $at = [ 'attr' => $this->extractTagAttributes( $altTrans ) ];

        foreach ( [ 'source', 'target' ] as $tagName ) {
            $nodes = $altTrans->getElementsByTagName( $tagName );

            if ( $nodes->length > 0 ) {
                $at[ $tagName ] = $nodes->item( 0 )->nodeValue;
            }
        }

        return $at;
    }
363
364
    /**
     * Tell whether the given element carries a 'locked' attribute.
     *
     * getAttribute() returns an empty string - never null - for a missing
     * attribute, so comparing its result with null is always true;
     * hasAttribute() performs the intended presence check.
     *
     * NOTE(review): the attribute value is not inspected, so locked="false"
     * is still reported as locked - confirm this is intended.
     *
     * @param DOMElement $locked
     *
     * @return bool
     */
    private function extractLocked( DOMElement $locked ): bool {
        return $locked->hasAttribute( 'locked' );
    }
372
}
373