Passed
Pull Request — master (#90)
by Domenico
03:06
created

AbstractXliffReplacer::setFileDescriptors()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 2
eloc 4
c 3
b 0
f 0
nc 2
nop 2
dl 0
loc 7
rs 10
1
<?php
2
3
namespace Matecat\XliffParser\XliffReplacer;
4
5
use Psr\Log\LoggerInterface;
6
use RuntimeException;
7
8
abstract class AbstractXliffReplacer {
9
    protected $originalFP;
10
    protected $outputFP;                  // output stream pointer
11
12
    protected string $tuTagName;                 // <trans-unit> (forXliff v 1.*) or <unit> (forXliff v 2.*)
13
    protected bool   $inTU             = false;  // flag to check whether we are in a <trans-unit>
14
    protected bool   $inTarget         = false;  // flag to check whether we are in a <target>, to ignore everything
15
    protected bool   $isEmpty          = false;  // flag to check whether we are in an empty tag (<tag/>)
16
    protected bool   $targetWasWritten = false;  // flag to check is <target> was written in the current unit
17
    protected string $CDATABuffer      = "";       // buffer for special tag
18
    protected bool   $bufferIsActive   = false;    // buffer for special tag
19
20
    protected int $offset = 0;         // offset for SAX pointer
21
22
    protected string  $currentBuffer;             // the current piece of text it's been parsed
23
    protected int     $len;                       // length of the currentBuffer
24
    protected array   $segments;                  // array of translations
25
    protected array   $lastTransUnit                  = [];
26
    protected int     $segmentInUnitPosition          = 0;
27
    protected ?string $currentTransUnitId             = null;        // id of current <trans-unit>
28
    protected ?string $currentTransUnitIsTranslatable = null; // 'translate' attribute of current <trans-unit>
29
    protected bool    $unitContainsMda                = false;   // check if <unit> already contains a <mda:metadata> (forXliff v 2.*)
30
    protected bool    $hasWrittenCounts               = false;  // check if <unit> already wrote segment counts (forXliff v 2.*)
31
    protected string  $targetLang;
32
    protected bool    $sourceInTarget                 = false;
33
34
    protected array $nodesToBuffer;
35
36
    protected array $transUnits;
37
38
    /** @var int */
39
    protected int $xliffVersion;
40
41
    /**
42
     * @var XliffReplacerCallbackInterface|null
43
     */
44
    protected ?XliffReplacerCallbackInterface $callback;
45
46
    protected ?LoggerInterface $logger;
47
48
    protected static $INTERNAL_TAG_PLACEHOLDER;
49
50
    protected $counts = [
51
            'raw_word_count' => 0,
52
            'eq_word_count'  => 0,
53
    ];
54
55
    /**
56
     * AbstractXliffReplacer constructor.
57
     *
58
     * @param string                              $originalXliffPath
59
     * @param int                                 $xliffVersion
60
     * @param array                               $segments
61
     * @param array                               $transUnits
62
     * @param string                              $trgLang
63
     * @param string                              $outputFilePath
64
     * @param bool                                $setSourceInTarget
65
     * @param LoggerInterface|null                $logger
66
     * @param XliffReplacerCallbackInterface|null $callback
67
     */
68
    public function __construct(
            string                          $originalXliffPath,
            int                             $xliffVersion,
            array                           $segments,
            array                           $transUnits,
            string                          $trgLang,
            string                          $outputFilePath,
            bool                            $setSourceInTarget,
            ?LoggerInterface                $logger = null,
            ?XliffReplacerCallbackInterface $callback = null
    ) {
        // $logger and $callback are declared explicitly nullable: relying on a "= null" default
        // to make a typed parameter nullable is deprecated as of PHP 8.4.

        // The placeholder is regenerated on every construction and stored statically.
        self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();

        // Both streams are opened here, before any other state is set;
        // setFileDescriptors() throws when the input file cannot be opened.
        $this->createOutputFileIfDoesNotExist( $outputFilePath );
        $this->setFileDescriptors( $originalXliffPath, $outputFilePath );

        $this->xliffVersion   = $xliffVersion;
        // XLIFF 2 wraps translatable content in <unit>; every other version uses <trans-unit>.
        $this->tuTagName      = ( $this->xliffVersion === 2 ) ? 'unit' : 'trans-unit';
        $this->segments       = $segments;
        $this->targetLang     = $trgLang;
        $this->sourceInTarget = $setSourceInTarget;
        $this->transUnits     = $transUnits;
        $this->logger         = $logger;
        $this->callback       = $callback;
    }
91
92
    public function replaceTranslation() {
        // Streams the original XLIFF through the SAX parser in 4096-byte chunks; the registered
        // handlers write the rewritten document to the output stream.
        //
        // Throws RuntimeException when the parser reports an XML error.
        fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

        //create Sax parser
        $xmlParser = $this->initSaxParser();

        // Entities are obfuscated as <placeholder>name<placeholder> because sax automatically
        // does html_entity_decode; postProcAndFlush() restores them on output.
        $entityPattern     = "/&(.*?);/";
        $entityReplacement = self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER;

        while ( $this->currentBuffer = fread( $this->originalFP, 4096 ) ) {

            $probeBuffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );

            // Avoid cutting entities in half: the last fread could have truncated an entity
            // (say, '&lt;' in '&l'). 9 is the max length of an entity, so while an unescaped '&'
            // remains in the last 9 bytes, fetch some more and repeat the escaping.
            // An '&' followed by more than 9 bytes can't be an entity: exit the loop.
            while ( true ) {
                $ampPos = strpos( $probeBuffer, '&' );

                if ( $ampPos === false || strlen( $probeBuffer ) - $ampPos > 9 ) {
                    break;
                }

                $extraBytes = fread( $this->originalFP, 9 );

                // At end of file (or on a read error) the buffer can no longer grow: without this
                // exit a bare '&' near the end of the file would make the loop spin forever.
                // The dangling '&' is escaped as 'amp' below.
                if ( $extraBytes === false || $extraBytes === '' ) {
                    break;
                }

                $this->currentBuffer .= $extraBytes;
                $probeBuffer         = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
            }

            //free stuff outside the loop
            unset( $probeBuffer );

            $this->currentBuffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
            // any '&' still present is not part of an entity: escape it as 'amp'
            $this->currentBuffer = str_replace( "&", self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER, $this->currentBuffer );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            //parse chunk of text
            if ( !xml_parse( $xmlParser, $this->currentBuffer, feof( $this->originalFP ) ) ) {
                //if unable, raise an exception
                throw new RuntimeException( sprintf(
                        "XML error: %s at line %d",
                        xml_error_string( xml_get_error_code( $xmlParser ) ),
                        xml_get_current_line_number( $xmlParser )
                ) );
            }

            /*
             * Accumulate the total bytes handed to the parser so far: getLastCharacter() subtracts
             * this offset from the parser's global byte index to address the current buffer.
             */
            $this->offset += $this->len;
        }

        // close Sax parser
        $this->closeSaxParser( $xmlParser );
    }
154
155
    /**
156
     * @param resource $parser
157
     *
158
     * @return string
159
     */
160
    protected function getLastCharacter( $parser ): string {
        // Used to detect empty tags: the parser's byte index is global to the whole stream read
        // so far, so subtract the bytes consumed by previous chunks to get a position local to
        // the current buffer.
        $localIndex = xml_get_current_byte_index( $parser ) - $this->offset;

        // The tag may have been truncated by the current read, in which case its end lies in the
        // next buffer; if so, simply use the last character of the current chunk. When the tag's
        // entire length fits in the buffer, the character at the local index is returned (for an
        // empty tag it is assumed to be '/').
        $position = isset( $this->currentBuffer[ $localIndex ] ) ? $localIndex : $this->len - 1;

        return $this->currentBuffer[ $position ];
    }
183
184
    /**
185
     * @return string
186
     */
187
    private function getInternalTagPlaceholder(): string {
        // Builds the marker used to obfuscate entities: "§" followed by the first four characters
        // of a random base64 string from which '+' and '/' have been stripped.
        // random_bytes() is part of the PHP core, so the openssl extension is not required.
        $randomToken = str_replace( [ '+', '/' ], '', base64_encode( random_bytes( 10 ) ) );

        return "§" . substr( $randomToken, 0, 4 );
    }
199
200
    private function createOutputFileIfDoesNotExist( string $outputFilePath ) {
        // Nothing to do when the output file is already there.
        if ( file_exists( $outputFilePath ) ) {
            return;
        }

        // create output file
        touch( $outputFilePath );
    }
206
207
    /**
208
     * @param string $originalXliffPath
209
     * @param string $outputFilePath
210
     */
211
    private function setFileDescriptors( string $originalXliffPath, string $outputFilePath ) {
        // Opens the output stream for writing (truncating it) and the original XLIFF for reading.
        // Throws RuntimeException when either stream cannot be opened.
        $outputFP = fopen( $outputFilePath, 'w+' );

        // Fail here rather than letting a false handle reach the first fwrite().
        if ( $outputFP === false ) {
            throw new RuntimeException( "could not open XML output" );
        }

        $this->outputFP = $outputFP;

        // The input is opened with a fresh stream context, as before.
        if ( !( $this->originalFP = fopen( $originalXliffPath, "r", false, stream_context_create() ) ) ) {
            throw new RuntimeException( "could not open XML input" );
        }
    }
220
221
    /**
222
     * AbstractXliffReplacer destructor.
223
     */
224
    public function __destruct() {
        // The input stream can be closed outside the class to permit multiple concurrent
        // downloads, and either handle may never have been opened if the constructor failed.
        // On PHP 8 fclose() throws a TypeError for a closed or non-resource value, which the "@"
        // operator cannot suppress, so check each handle explicitly: is_resource() returns false
        // for closed resources as well as for null/false.
        if ( is_resource( $this->originalFP ) ) {
            fclose( $this->originalFP );
        }

        if ( is_resource( $this->outputFP ) ) {
            fclose( $this->outputFP );
        }
    }
230
231
    /**
232
     * Init Sax parser
233
     *
234
     * @return resource|\XMLParser
235
     */
236
    protected function initSaxParser() {
        // Creates a UTF-8 SAX parser that keeps element names as written (no case folding).
        // Returns a resource on PHP 7 and an XMLParser object on PHP 8.
        $parser = xml_parser_create( 'UTF-8' );
        xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );

        // Handlers are resolved as methods of this instance.
        xml_set_object( $parser, $this );
        xml_set_element_handler( $parser, 'tagOpen', 'tagClose' );
        xml_set_character_data_handler( $parser, 'characterData' );

        return $parser;
    }
245
246
    /**
247
     * @param resource $xmlSaxParser
248
     */
249
    protected function closeSaxParser( $xmlSaxParser ) {
        // Releases the parser created by initSaxParser(). Only PHP 7 resource parsers need an
        // explicit free; on PHP 8 the parser is an XMLParser object and xml_parser_free() has
        // no effect, so the call is skipped there.
        if ( is_resource( $xmlSaxParser ) ) {
            xml_parser_free( $xmlSaxParser );
        }
    }
252
253
    /**
254
     * @param resource $parser
255
     * @param string   $name
256
     * @param array    $attr
257
     *
258
     * @return mixed
259
     */
260
    abstract protected function tagOpen( $parser, string $name, array $attr ); // registered in initSaxParser() as the SAX start-element handler
261
262
    /**
263
     * @param resource $parser
264
     * @param string   $name
265
     *
266
     * @return mixed
267
     */
268
    abstract protected function tagClose( $parser, string $name ); // registered in initSaxParser() as the SAX end-element handler
269
270
    /**
271
     * @param resource $parser
272
     * @param string   $data
273
     *
274
     * @return void
275
     */
276
    protected function characterData( $parser, string $data ): void {
        // SAX character-data handler; $parser is required by the handler signature but unused.

        // While a buffer is active the text is accumulated instead of being written out.
        if ( $this->bufferIsActive ) {
            $this->CDATABuffer .= $data;

            return;
        }

        // don't write <target> data; everything else goes straight to the output
        if ( !$this->inTarget ) {
            $this->postProcAndFlush( $this->outputFP, $data );
        }
    }
284
285
    /**
286
     * postprocess escaped data and write to disk
287
     *
288
     * @param resource $fp
289
     * @param string   $data
290
     * @param ?bool    $treatAsCDATA
291
     */
292
    protected function postProcAndFlush( $fp, string $data, ?bool $treatAsCDATA = false ) {
        // Turn <placeholder>name<placeholder> back into the entity &name; it was obfuscated from.
        $placeholder = self::$INTERNAL_TAG_PLACEHOLDER;
        $data        = preg_replace( "/" . $placeholder . '(.*?)' . $placeholder . "/", '&$1;', $data );
        $data        = str_replace( '&nbsp;', ' ', $data );

        // unix2dos, skipped for CDATA content: the pairs are applied in order, so every
        // "\r\n" and "\n" first collapses to "\r", then each "\r" expands to "\r\n".
        if ( !$treatAsCDATA ) {
            $data = str_replace( [ "\r\n", "\n", "\r" ], [ "\r", "\r", "\r\n" ], $data );
        }

        //flush to disk
        fwrite( $fp, $data );
    }
306
307
    /**
308
     * @param string $name
309
     * @param array  $attr
310
     *
311
     * @return void
312
     */
313
    protected function handleOpenUnit( string $name, array $attr ) {

        // Only act when entering a <trans-unit> (xliff v1.*) or <unit> (xliff v2.*).
        if ( $this->tuTagName !== $name ) {
            return;
        }

        $this->inTU = true;

        // get id
        // trim to first 100 characters because this is the limit on Matecat's DB.
        // A unit without an 'id' attribute yields an empty id instead of raising an
        // undefined-index warning and passing null to substr().
        $this->currentTransUnitId = substr( (string)( $attr[ 'id' ] ?? '' ), 0, 100 );

        // `translate` attribute can be only yes or no; a missing or empty one counts as 'yes'
        $this->currentTransUnitIsTranslatable = empty( $attr[ 'translate' ] ) ? 'yes' : $attr[ 'translate' ];

        $this->setLastTransUnitSegments();
    }
331
332
    /**
333
     * @param string $name
334
     * @param array  $attr
335
     * @param string $tag
336
     *
337
     * @return string
338
     */
339
    protected function handleOpenXliffTag( string $name, array $attr, string $tag ): string {

        // Every tag other than <xliff> is returned untouched.
        if ( $name !== 'xliff' ) {
            return $tag;
        }

        // Add MateCat specific namespace unless the tag already declares it.
        if ( !array_key_exists( 'xmlns:mtc', $attr ) ) {
            $tag .= ' xmlns:mtc="https://www.matecat.com" ';
        }

        // Point any trgLang attribute at the replacer's target language.
        return preg_replace( '/trgLang="(.*?)"/', 'trgLang="' . $this->targetLang . '"', $tag );
    }
353
354
    /**
355
     * @param string $name
356
     *
357
     * @return void
358
     */
359
    protected function checkSetInTarget( string $name ) {
        // Only a <target> tag changes the flag.
        if ( 'target' !== $name ) {
            return;
        }

        // We are "in target" unless the current unit is marked translate="no".
        $this->inTarget = ( $this->currentTransUnitIsTranslatable !== 'no' );
    }
369
370
    /**
371
     * @param string $name
372
     *
373
     * @return void
374
     */
375
    protected function setInBuffer( string $name ) {
        // Tags that are not listed in nodesToBuffer leave the flag as it is.
        if ( !in_array( $name, $this->nodesToBuffer ) ) {
            return;
        }

        $this->bufferIsActive = true;
    }
380
381
    /**
382
     * @param array $seg
383
     */
384
    protected function updateSegmentCounts( array $seg = [] ) {

        // Counts for this single segment; the equivalent word count is truncated to two decimals.
        $segmentCounts = [
                'raw_word_count' => $seg[ 'raw_word_count' ],
                'eq_word_count'  => ( floor( $seg[ 'eq_word_count' ] * 100 ) / 100 ),
        ];

        // Record them under the segment id...
        $this->counts[ 'segments_count_array' ][ $seg[ 'sid' ] ] = $segmentCounts;

        // ...and add them to the running totals.
        foreach ( $segmentCounts as $counter => $amount ) {
            $this->counts[ $counter ] += $amount;
        }
    }
397
398
    protected function resetCounts() {
        // Clear the per-segment counts and zero both totals; any other key is left untouched.
        $blankCounters = [
                'segments_count_array' => [],
                'raw_word_count'       => 0,
                'eq_word_count'        => 0,
        ];

        foreach ( $blankCounters as $counter => $initialValue ) {
            $this->counts[ $counter ] = $initialValue;
        }
    }
403
404
    /**
405
     * @param resource $parser
406
     * @param string   $tag
407
     *
408
     * @return void
409
     */
410
    protected function checkForSelfClosedTagAndFlush( $parser, string $tag ) {

        $closingChar = $this->getLastCharacter( $parser );

        //detect empty tag: the character reported by getLastCharacter() is '/'
        $this->isEmpty = ( $closingChar == '/' );

        //trim last space, re-add the '/' of an empty tag, then add tag ending
        $tag = rtrim( $tag ) . ( $this->isEmpty ? $closingChar : '' ) . ">";

        if ( !$this->bufferIsActive ) {
            $this->postProcAndFlush( $this->outputFP, $tag );

            return;
        }

        // A buffer is active: the tag is appended to it instead of being written out.
        $this->CDATABuffer .= $tag;
    }
435
436
    /**
437
     * A trans-unit can contain a list of segments because of mrk tags
438
     * Copy the segment's list for this trans-unit in a different structure
439
     *
440
     * @return void
441
     */
442
    protected function setLastTransUnitSegments() {

        /*
         * Copy the segments that belong to the current trans-unit into lastTransUnit,
         * so they remain available after the unit itself has been processed.
         */
        $this->lastTransUnit = [];

        $segmentIds = $this->transUnits[ $this->currentTransUnitId ] ?? null;
        if ( $segmentIds === null ) {
            return;
        }

        $previousId = null;

        for ( $position = 0, $total = count( $segmentIds ); $position < $total; $position++ ) {
            $segmentId = $segmentIds[ $position ];

            // skip ids with no segment data
            if ( !isset( $this->segments[ $segmentId ] ) ) {
                continue;
            }

            // after the first position, keep only ids that directly follow the last accepted one
            if ( $position != 0 && $previousId + 1 != $segmentId ) {
                continue;
            }

            $previousId            = $segmentId;
            $this->lastTransUnit[] = $this->segments[ $segmentId ];
        }
    }
472
473
    /**
474
     * @return array
475
     */
476
    protected function getCurrentSegment(): array {
        // Units marked translate="no", or unknown to transUnits, have no current segment.
        if ( $this->currentTransUnitIsTranslatable === 'no' || !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return [];
        }

        // Fall back to an empty array when no segment exists at the current position, so the
        // declared array return type is honoured instead of failing on a null value.
        return $this->segments[ $this->segmentInUnitPosition ] ?? [];
    }
483
484
}