Passed
Pull Request — master (#97)
by Mauro
03:42
created

AbstractXliffReplacer::handleOpenUnit()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 15
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
eloc 5
nc 3
nop 2
dl 0
loc 15
rs 10
c 1
b 0
f 0
1
<?php
2
3
namespace Matecat\XliffParser\XliffReplacer;
4
5
use Psr\Log\LoggerInterface;
6
use RuntimeException;
7
8
abstract class AbstractXliffReplacer {
9
    // input stream pointer (assigned by setFileDescriptors() from the original XLIFF path)
    protected $originalFP;
10
    protected $outputFP;                  // output stream pointer
11
12
    protected string $tuTagName;                 // <trans-unit> (for XLIFF v1.*) or <unit> (for XLIFF v2.*)
13
    protected bool   $inTU                  = false;  // flag to check whether we are in a <trans-unit>
14
    protected bool   $inTarget              = false;  // flag to check whether we are in a <target>, to ignore everything
15
    protected bool   $inAltTrans            = false;  // flag to check whether we are in an <alt-trans> (xliff 1.2) or <mtc:matches> (xliff 2.0)
16
    protected string $alternativeMatchesTag = ""; // polymorphic tag name for xliff 1.2 and 2.0
17
    protected bool   $isEmpty               = false;  // flag to check whether we are in an empty tag (<tag/>)
18
    protected bool   $targetWasWritten      = false;  // flag to check whether <target> was written in the current unit
19
    protected string $CDATABuffer           = "";       // buffer filled while bufferIsActive is true
20
    protected string $namespace             = "";       // Custom namespace
21
    protected bool   $bufferIsActive        = false;    // flag for buffering
22
23
    protected int $offset = 0;         // offset for SAX pointer (total bytes handed to the parser so far)
24
25
    protected string  $currentBuffer;             // the current piece of text being parsed
26
    protected int     $len;                       // length of the currentBuffer
27
    protected array   $segments;                  // array of translations
28
    // segments belonging to the current unit, rebuilt by setLastTransUnitSegments()
    protected array   $lastTransUnit                  = [];
29
    protected int     $segmentInUnitPosition          = 0;
30
    protected ?string $currentTransUnitId             = null;        // id of current <trans-unit>
31
    protected ?string $currentTransUnitIsTranslatable = null; // 'translate' attribute of current <trans-unit>
32
    protected bool    $hasWrittenCounts               = false;  // check if <unit> already wrote segment counts (for XLIFF v2.*)
33
    protected string  $targetLang;
34
    protected bool    $sourceInTarget                 = false;
35
36
    // tag names whose content is collected in CDATABuffer (see setInBuffer())
    protected array $nodesToBuffer;
37
38
    // map: unit id => list of segment ids (see setLastTransUnitSegments())
    protected array $transUnits;
39
40
    /** @var int */
41
    protected int $xliffVersion;
42
43
    /**
     * @var XliffReplacerCallbackInterface|null
     */
46
    protected ?XliffReplacerCallbackInterface $callback;
47
48
    protected ?LoggerInterface $logger;
49
50
    // marker used to mask XML entities; regenerated by every constructor call
    protected static string $INTERNAL_TAG_PLACEHOLDER;
51
52
    // running totals; 'segments_count_array' is added by resetCounts()/updateSegmentCounts()
    protected $counts = [
53
            'raw_word_count' => 0,
54
            'eq_word_count'  => 0,
55
    ];
56
57
    // map: unit id => ( mrk id => whitespace found after that </mrk> ), filled by postProcAndFlush()
    protected $mrkTagsMap = [];
58
59
    /**
     * AbstractXliffReplacer constructor.
     *
     * Generates a fresh entity placeholder, makes sure the output file exists,
     * opens the input/output streams and stores the replacement configuration.
     *
     * @param string                              $originalXliffPath
     * @param int                                 $xliffVersion
     * @param array                               $segments
     * @param array                               $transUnits
     * @param string                              $trgLang
     * @param string                              $outputFilePath
     * @param bool                                $setSourceInTarget
     * @param LoggerInterface|null                $logger
     * @param XliffReplacerCallbackInterface|null $callback
     *
     * @throws RuntimeException when the original XLIFF cannot be opened
     */
    public function __construct(
            string                          $originalXliffPath,
            int                             $xliffVersion,
            array                           $segments,
            array                           $transUnits,
            string                          $trgLang,
            string                          $outputFilePath,
            bool                            $setSourceInTarget,
            ?LoggerInterface                $logger = null,
            ?XliffReplacerCallbackInterface $callback = null
    ) {
        // the placeholder is static: every new instance overwrites it for the whole class
        self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();
        $this->createOutputFileIfDoesNotExist( $outputFilePath );
        $this->setFileDescriptors( $originalXliffPath, $outputFilePath );
        $this->xliffVersion   = $xliffVersion;
        $this->segments       = $segments;
        $this->targetLang     = $trgLang;
        $this->sourceInTarget = $setSourceInTarget;
        $this->transUnits     = $transUnits;
        $this->logger         = $logger;
        $this->callback       = $callback;
    }
94
95
    /**
     * Streams the original XLIFF through the SAX parser in 4096-byte chunks.
     *
     * Before each chunk is parsed, its entities are masked with the internal
     * placeholder (the parser would otherwise decode them); postProcAndFlush()
     * turns the masks back into entities when writing.
     *
     * @return void
     *
     * @throws RuntimeException when the parser rejects a chunk (see runParser())
     */
    public function replaceTranslation() {
        fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

        //create Sax parser
        $xmlParser = $this->initSaxParser();

        // loop-invariant pieces of the entity masking
        $entityPattern = "/&(.*?);/";
        $entityMask    = self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER;

        while ( $this->currentBuffer = fread( $this->originalFP, 4096 ) ) {
            // obfuscate entities because sax automatically does html_entity_decode
            $temporary_check_buffer = preg_replace( $entityPattern, $entityMask, $this->currentBuffer );

            // Avoid cutting entities in half: the last fread could have truncated an entity
            // (say, '&lt;' read as '&l'). 9 is taken as the max length of an entity, so while an
            // unmasked '&' sits within the last 9 bytes we read 9 more bytes and mask again.
            // An '&' followed by more than 9 bytes cannot be an entity: stop looking.
            while ( true ) {
                $_ampPos = strpos( $temporary_check_buffer, '&' );

                if ( $_ampPos === false || strlen( $temporary_check_buffer ) - $_ampPos > 9 ) {
                    break;
                }

                $extraBytes = fread( $this->originalFP, 9 );

                // Nothing left to read: the trailing '&' can never be completed. Without this
                // exit the buffer would stay unchanged and the loop would spin forever.
                if ( $extraBytes === false || $extraBytes === '' ) {
                    break;
                }

                $this->currentBuffer    .= $extraBytes;
                $temporary_check_buffer = preg_replace( $entityPattern, $entityMask, $this->currentBuffer );
            }

            //free stuff outside the loop
            unset( $temporary_check_buffer );

            // mask complete entities first, then any leftover bare '&'
            $this->currentBuffer = preg_replace( $entityPattern, $entityMask, $this->currentBuffer );
            $this->currentBuffer = str_replace( "&", self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER, $this->currentBuffer );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            /*
             * Accumulate the offset in the document: as the SAX pointer advances, we keep track
             * of the total bytes handed to it so far, so that its global byte index can be
             * translated into a position local to the current buffer (see getLastCharacter()).
             */
            $this->offset += $this->len;

            //parse chunk of text
            $this->runParser( $xmlParser );
        }

        // close Sax parser
        $this->closeSaxParser( $xmlParser );
    }
152
153
    /**
     * Hands the current buffer to the SAX parser.
     *
     * The chunk is flagged as the final one when the input stream is at EOF.
     *
     * @param resource $xmlParser parser created by initSaxParser()
     *
     * @return void
     *
     * @throws RuntimeException when xml_parse() reports a failure
     */
    protected function runParser( $xmlParser ) {
        $isFinalChunk = feof( $this->originalFP );

        if ( xml_parse( $xmlParser, $this->currentBuffer, $isFinalChunk ) ) {
            return;
        }

        // parsing failed: report the parser's own message and position
        $errorMessage = sprintf(
                "XML error: %s at line %d",
                xml_error_string( xml_get_error_code( $xmlParser ) ),
                xml_get_current_line_number( $xmlParser )
        );

        throw new RuntimeException( $errorMessage );
    }
169
170
    /**
     * Returns the character of the current buffer at the parser's byte position;
     * callers use it to detect empty tags (<tag/>).
     *
     * @param resource $parser
     *
     * @return string
     */
    protected function getLastCharacter( $parser ): string {
        // The parser index is global to all the data read so far, while $this->offset already
        // includes the current buffer, so the difference is a position local to that buffer.
        $localIndex = xml_get_current_byte_index( $parser ) - $this->offset;

        // The tag may have been cut in half by the current read, in which case the position
        // falls outside the buffer: fall back to the buffer's last character.
        if ( isset( $this->currentBuffer[ $localIndex ] ) ) {
            return $this->currentBuffer[ $localIndex ];
        }

        return $this->currentBuffer[ $this->len - 1 ];
    }
189
190
    /**
     * Builds the random marker used to mask XML entities while parsing.
     *
     * The marker is "§" followed by the first four characters of a base64-encoded
     * random string from which '+' and '/' have been removed.
     *
     * @return string
     */
    private function getInternalTagPlaceholder(): string {
        // random_bytes() is part of the PHP core and needs no out-parameter
        $randomToken = str_replace( [ '+', '/' ], '', base64_encode( random_bytes( 10 ) ) );

        return "§" . substr( $randomToken, 0, 4 );
    }
205
206
    /**
     * Creates an empty output file unless something already exists at that path.
     *
     * @param string $outputFilePath
     */
    private function createOutputFileIfDoesNotExist( string $outputFilePath ) {
        if ( file_exists( $outputFilePath ) ) {
            return;
        }

        touch( $outputFilePath );
    }
212
213
    /**
     * Opens the output stream (read/write, truncated) and the input stream (read-only).
     *
     * @param string $originalXliffPath
     * @param string $outputFilePath
     *
     * @throws RuntimeException when either stream cannot be opened
     */
    private function setFileDescriptors( string $originalXliffPath, string $outputFilePath ) {
        $outputHandle = fopen( $outputFilePath, 'w+' );

        // fail here rather than on the first fwrite() to an invalid handle
        if ( $outputHandle === false ) {
            throw new RuntimeException( "could not open XML output" );
        }

        $this->outputFP = $outputHandle;

        // the input is opened with a fresh stream context that carries no options
        $inputHandle = fopen( $originalXliffPath, "r", false, stream_context_create() );

        if ( !$inputHandle ) {
            throw new RuntimeException( "could not open XML input" );
        }

        $this->originalFP = $inputHandle;
    }
226
227
    /**
     * AbstractXliffReplacer destructor.
     *
     * Closes the input and output streams that are still open.
     */
    public function __destruct() {
        // a stream may already have been closed outside the class (to permit multiple
        // concurrent downloads), hence the is_resource() guard before each fclose()
        foreach ( [ $this->originalFP, $this->outputFP ] as $handle ) {
            if ( is_resource( $handle ) ) {
                fclose( $handle );
            }
        }
    }
242
243
    /**
     * Init Sax parser
     *
     * Creates a UTF-8 parser with case folding disabled and binds the element and
     * character-data handlers to this instance's tagOpen/tagClose/characterData methods.
     *
     * @return resource|\XMLParser the static analyser notes that an XMLParser object may be returned
     */
    protected function initSaxParser() {
        $saxParser = xml_parser_create( 'UTF-8' );

        // handler names given below are resolved as methods of this object
        xml_set_object( $saxParser, $this );

        // keep tag names exactly as written in the document
        xml_parser_set_option( $saxParser, XML_OPTION_CASE_FOLDING, false );

        xml_set_element_handler( $saxParser, 'tagOpen', 'tagClose' );
        xml_set_character_data_handler( $saxParser, 'characterData' );

        return $saxParser;
    }
257
258
    /**
     * Releases the SAX parser once the whole input has been processed.
     *
     * @param resource $xmlSaxParser parser returned by initSaxParser()
     */
    protected function closeSaxParser( $xmlSaxParser ) {
        xml_parser_free( $xmlSaxParser );
    }
264
265
    /**
     * Start-element handler: initSaxParser() registers it with xml_set_element_handler().
     *
     * @param resource $parser the SAX parser invoking the handler
     * @param string   $name   name of the opened tag
     * @param array    $attr   attributes of the opened tag
     *
     * @return mixed
     */
271
    abstract protected function tagOpen( $parser, string $name, array $attr );
273
274
    /**
     * End-element handler: initSaxParser() registers it with xml_set_element_handler().
     *
     * @param resource $parser the SAX parser invoking the handler
     * @param string   $name   name of the closed tag
     *
     * @return mixed
     */
280
    abstract protected function tagClose( $parser, string $name );
281
282
    /**
     * Character-data handler registered in initSaxParser().
     *
     * While buffering is active the text is appended to the CDATA buffer; otherwise it
     * is written to the output, except inside a <target>, whose text is dropped.
     *
     * @param resource $parser not used by this handler
     * @param string   $data
     *
     * @return void
     */
    protected function characterData( $parser, string $data ): void {
        if ( $this->bufferIsActive ) {
            $this->CDATABuffer .= $data;

            return;
        }

        // don't write <target> data
        if ( !$this->inTarget ) {
            $this->postProcAndFlush( $this->outputFP, $data );
        }
    }
296
297
    /**
     * Post-processes escaped data and writes it to the given stream.
     *
     * Masked entities are restored, '&nbsp;' is replaced, the whitespace following each
     * <mrk> is optionally recorded, and line endings are converted to CRLF unless the
     * data is treated as CDATA.
     *
     * @param resource $fp
     * @param string   $data
     * @param bool     $treatAsCDATA when true, line endings are left untouched
     * @param bool     $parseMarks   when true, fill $this->mrkTagsMap for the current unit
     */
    protected function postProcAndFlush($fp, string $data, bool $treatAsCDATA = false, $parseMarks = false ) {
        // turn the masked entities back into real ones
        $placeholder = self::$INTERNAL_TAG_PLACEHOLDER;
        $data        = preg_replace( "/" . $placeholder . '(.*?)' . $placeholder . "/", '&$1;', $data );

        // NOTE(review): the replacement reads as a plain space in this copy of the file;
        // confirm whether a non-breaking space (U+00A0) was intended
        $data = str_replace( '&nbsp;', ' ', $data );

        // extract <mrk> map only for <seg-source> tag
        if ( $parseMarks ) {
            // group 0: the complete tag, group 1: its text, group 2: the spaces after </mrk>
            preg_match_all( '/<mrk \b[^>]*>(.*?)<\/mrk>(\s+)/', $data, $mrkMatches );

            // the mrk identifier attribute differs between Xliff20 and the other replacers
            $idPattern = $this instanceof Xliff20 ? '/id="(\d+)"/' : '/mid="(\d+)"/';

            foreach ( $mrkMatches[ 0 ] ?? [] as $position => $mrkTag ) {
                if ( preg_match( $idPattern, $mrkTag, $idMatch ) !== 1 ) {
                    continue;
                }

                // remember which whitespace followed this mrk inside the current unit
                $this->mrkTagsMap[ $this->currentTransUnitId ][ $idMatch[ 1 ] ] = $mrkMatches[ 2 ][ $position ];
            }
        }

        if ( !$treatAsCDATA ) {
            //unix2dos: collapse every line ending to "\r", then expand each "\r" to "\r\n"
            $data = str_replace( [ "\r\n", "\n", "\r" ], [ "\r", "\r", "\r\n" ], $data );
        }

        //flush to disk
        fwrite( $fp, $data );
    }
351
352
    /**
     * Records the state of a newly opened translation unit.
     *
     * Does nothing unless $name is the unit tag of this replacer
     * (<trans-unit> for xliff v1.*, <unit> for xliff v2.*).
     *
     * @param string $name name of the opened tag
     * @param array  $attr attributes of the opened tag
     *
     * @return void
     */
    protected function handleOpenUnit( string $name, array $attr ) {

        if ( $this->tuTagName !== $name ) {
            return;
        }

        $this->inTU = true;

        // Keep only the first 100 bytes of the id: the original comment gives 100 characters
        // as the limit on Matecat's DB. A unit without an id yields an empty string instead
        // of an undefined-key warning and a null passed to substr().
        // NOTE(review): substr() counts bytes, not characters — confirm for multi-byte ids.
        $this->currentTransUnitId = substr( (string)( $attr[ 'id' ] ?? '' ), 0, 100 );

        // `translate` attribute can be only yes or no; a missing/empty one means 'yes'
        $this->currentTransUnitIsTranslatable = empty( $attr[ 'translate' ] ) ? 'yes' : $attr[ 'translate' ];

        $this->setLastTransUnitSegments();
    }
376
377
    /**
     * Adjusts the opening <xliff> tag; any other tag is returned untouched.
     *
     * For <xliff>, the MateCat namespace declaration is appended when the tag does not
     * declare it yet, and the value of trgLang is replaced with the target language.
     *
     * @param string $name name of the opened tag
     * @param array  $attr attributes of the opened tag
     * @param string $tag  the tag text built so far
     *
     * @return string
     */
    protected function handleOpenXliffTag( string $name, array $attr, string $tag ): string {

        if ( $name !== 'xliff' ) {
            return $tag;
        }

        // Add MateCat specific namespace.
        $alreadyDeclared = array_key_exists( 'xmlns:' . $this->namespace, $attr );
        if ( !$alreadyDeclared ) {
            $tag .= ' xmlns:' . $this->namespace . '="https://www.matecat.com" ';
        }

        // Add trgLang
        return preg_replace( '/trgLang="(.*?)"/', 'trgLang="' . $this->targetLang . '"', $tag );
    }
398
399
    /**
     * Updates the inTarget flag when a <target> outside of the alternative-matches
     * section is opened.
     *
     * @param string $name name of the opened tag
     *
     * @return void
     */
    protected function checkSetInTarget( string $name ) {

        if ( 'target' != $name || $this->inAltTrans ) {
            return;
        }

        // the <target> of a unit marked translate="no" is not flagged as a target
        $this->inTarget = $this->currentTransUnitIsTranslatable !== 'no';
    }
416
417
    /**
     * Raises the inAltTrans flag when the alternative-matches tag is opened;
     * a flag that is already raised is left as is.
     *
     * @param string $name name of the opened tag
     */
    protected function trySetAltTrans( string $name ) {
        if ( !$this->inAltTrans && $this->alternativeMatchesTag == $name ) {
            $this->inAltTrans = true;
        }
    }
420
421
    /**
     * Clears the inAltTrans flag when the alternative-matches tag is closed.
     *
     * @param string $name name of the closed tag
     */
    protected function tryUnsetAltTrans( string $name ) {
        // any other tag leaves the flag untouched
        $this->inAltTrans = $this->inAltTrans && $this->alternativeMatchesTag != $name;
    }
426
427
    /**
     * Activates buffering when the opened tag requires it.
     *
     * @param string $name name of the opened tag
     *
     * @return void
     */
    protected function setInBuffer( string $name ) {
        $isNodeToBuffer = in_array( $name, $this->nodesToBuffer );

        // A <target> of a unit with translate="no" is buffered too: otherwise its text can be
        // delivered in pieces by the character-data handler, and postProcAndFlush() could
        // rebuild a wrong string if an internal placeholder is split across two pieces.
        $isUntranslatableTarget = $name === 'target' && $this->currentTransUnitIsTranslatable === 'no';

        if ( $isNodeToBuffer || $isUntranslatableTarget ) {
            $this->bufferIsActive = true;
        }
    }
444
445
    /**
     * Adds the word counts of one segment to the running totals.
     *
     * Missing count keys are treated as 0, and the per-segment entry is recorded only
     * when the segment carries a 'sid', so calling the method with the default empty
     * array neither raises warnings nor stores an entry under an empty key.
     *
     * @param array $seg segment data; reads 'sid', 'raw_word_count' and 'eq_word_count'
     */
    protected function updateSegmentCounts( array $seg = [] ) {

        $raw_word_count = $seg[ 'raw_word_count' ] ?? 0;

        // truncate (not round) the equivalent word count to two decimals
        $eq_word_count = floor( ( $seg[ 'eq_word_count' ] ?? 0 ) * 100 ) / 100;

        if ( isset( $seg[ 'sid' ] ) ) {
            $this->counts[ 'segments_count_array' ][ $seg[ 'sid' ] ] = [
                    'raw_word_count' => $raw_word_count,
                    'eq_word_count'  => $eq_word_count,
            ];
        }

        $this->counts[ 'raw_word_count' ] += $raw_word_count;
        $this->counts[ 'eq_word_count' ]  += $eq_word_count;
    }
461
462
    /**
     * Clears the per-segment counts and zeroes both running totals.
     */
    protected function resetCounts() {
        $initialValues = [
                'segments_count_array' => [],
                'raw_word_count'       => 0,
                'eq_word_count'        => 0,
        ];

        // assign key by key so that any other entry of $this->counts is preserved
        foreach ( $initialValues as $key => $initialValue ) {
            $this->counts[ $key ] = $initialValue;
        }
    }
467
468
    /**
     * Completes an opening tag (detecting the <tag/> form) and emits it.
     *
     * The finished tag goes to the CDATA buffer while buffering is active,
     * otherwise straight to the output.
     *
     * @param resource $parser
     * @param string   $tag the tag text built so far, without its closing ">"
     *
     * @return void
     */
    protected function checkForSelfClosedTagAndFlush( $parser, string $tag ) {

        $closingChar = $this->getLastCharacter( $parser );

        //detect empty tag
        $this->isEmpty = $closingChar == '/';

        // drop the trailing space, restore the slash of an empty tag, add the tag ending
        $tag = rtrim( $tag ) . ( $this->isEmpty ? $closingChar : '' ) . ">";

        if ( !$this->bufferIsActive ) {
            $this->postProcAndFlush( $this->outputFP, $tag );

            return;
        }

        // we are inside a buffered section: the tag is part of its content
        $this->CDATABuffer .= $tag;
    }
499
500
    /**
     * A trans-unit can contain a list of segments because of mrk tags.
     * Copies the segments of the current trans-unit into $this->lastTransUnit.
     *
     * The list is reset first and stays empty when the current unit id is unknown.
     * A segment is copied when it exists in $this->segments and is either the first
     * id of the list or immediately follows the last id that was copied.
     *
     * @return void
     */
    protected function setLastTransUnitSegments() {

        // the grouping of the previous unit is discarded at every call
        $this->lastTransUnit = [];

        $segmentIds = $this->transUnits[ $this->currentTransUnitId ] ?? null;
        if ( $segmentIds === null ) {
            return;
        }

        $previousId = null;
        $total      = count( $segmentIds );

        for ( $i = 0; $i < $total; $i++ ) {
            $id = $segmentIds[ $i ];

            if ( !isset( $this->segments[ $id ] ) ) {
                continue;
            }

            // past the first position, only an id consecutive to the last copied one is accepted
            if ( $i != 0 && $previousId + 1 != $id ) {
                continue;
            }

            $previousId            = $id;
            $this->lastTransUnit[] = $this->segments[ $id ];
        }
    }
535
536
    /**
     * Returns the segment at the current position, or an empty array when the unit is
     * not translatable, the unit id is unknown, or no segment exists at that position.
     *
     * @return array
     */
    protected function getCurrentSegment(): array {
        if ( $this->currentTransUnitIsTranslatable === 'no' ) {
            return [];
        }

        if ( !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return [];
        }

        return $this->segments[ $this->segmentInUnitPosition ] ?? [];
    }
550
}