Passed
Push — master ( f74067...a562ef )
by Mauro
03:20
created

AbstractXliffReplacer::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 21
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 1
eloc 10
c 3
b 0
f 0
nc 1
nop 9
dl 0
loc 21
rs 9.9332

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
namespace Matecat\XliffParser\XliffReplacer;
4
5
use Psr\Log\LoggerInterface;
6
use RuntimeException;
7
8
abstract class AbstractXliffReplacer {
9
    /** @var resource|false|null input stream, opened read-only on the original XLIFF by setFileDescriptors() */
    protected $originalFP;

    /** @var resource|false|null output stream pointer */
    protected $outputFP;

    /** @var string name of the unit tag: <trans-unit> (XLIFF v1.*) or <unit> (XLIFF v2.*) */
    protected string $tuTagName;

    /** @var bool true while the parser is inside a <trans-unit> */
    protected bool $inTU = false;

    /** @var bool true while the parser is inside a <target>, whose content is ignored */
    protected bool $inTarget = false;

    /** @var bool true while the parser is inside an <alt-trans> (XLIFF 1.2) or <mtc:matches> (XLIFF 2.0) */
    protected bool $inAltTrans = false;

    /** @var string polymorphic tag name of the alternative-matches container for XLIFF 1.2 and 2.0 */
    protected string $alternativeMatchesTag = "";

    /** @var bool true when the tag being handled is an empty one (<tag/>) */
    protected bool $isEmpty = false;

    /** @var bool true once <target> has been written for the current unit */
    protected bool $targetWasWritten = false;

    /** @var string buffer for the content of the special (buffered) tags */
    protected string $CDATABuffer = "";

    /** @var string custom namespace prefix */
    protected string $namespace = "";

    /** @var bool true while output is being collected in $CDATABuffer instead of being flushed */
    protected bool $bufferIsActive = false;

    /** @var int accumulated byte offset used to localise the SAX pointer inside the current buffer */
    protected int $offset = 0;

    /** @var string the chunk of text currently being parsed */
    protected string $currentBuffer;

    /** @var int byte length of $currentBuffer */
    protected int $len;

    /** @var array the translations, indexed by segment id */
    protected array $segments;

    /** @var array segments of the unit opened last, filled by setLastTransUnitSegments() */
    protected array $lastTransUnit = [];

    /** @var int key into $segments read by getCurrentSegment() */
    protected int $segmentInUnitPosition = 0;

    /** @var string|null id of the current <trans-unit> */
    protected ?string $currentTransUnitId = null;

    /** @var string|null 'translate' attribute of the current <trans-unit> */
    protected ?string $currentTransUnitIsTranslatable = null;

    /** @var bool whether the current <unit> already wrote its segment counts (XLIFF v2.*) */
    protected bool $hasWrittenCounts = false;

    /** @var string target language, written into the trgLang attribute by handleOpenXliffTag() */
    protected string $targetLang;

    /** @var bool value of the constructor's $setSourceInTarget flag */
    protected bool $sourceInTarget = false;

    /** @var array tag names whose content setInBuffer() diverts into the buffer */
    protected array $nodesToBuffer;

    /** @var array map of unit id => list of segment ids, as read by setLastTransUnitSegments() */
    protected array $transUnits;

    /** @var int XLIFF version passed to the constructor */
    protected int $xliffVersion;

    /** @var XliffReplacerCallbackInterface|null optional callback passed to the constructor */
    protected ?XliffReplacerCallbackInterface $callback;

    /** @var LoggerInterface|null optional logger passed to the constructor */
    protected ?LoggerInterface $logger;

    /** @var string placeholder used to mask entities; static, re-generated by every constructor call */
    protected static string $INTERNAL_TAG_PLACEHOLDER;

    /** @var array running word counters; updateSegmentCounts() also adds a 'segments_count_array' key */
    protected $counts = [
            'raw_word_count' => 0,
            'eq_word_count'  => 0,
    ];
56
57
    /**
     * AbstractXliffReplacer constructor.
     *
     * Generates the entity placeholder, makes sure the output file exists, opens the
     * input and output streams and stores the data needed for the replacement.
     *
     * @param string                              $originalXliffPath path of the XLIFF to read
     * @param int                                 $xliffVersion
     * @param array                               $segments
     * @param array                               $transUnits
     * @param string                              $trgLang
     * @param string                              $outputFilePath    path of the file to write
     * @param bool                                $setSourceInTarget
     * @param LoggerInterface|null                $logger
     * @param XliffReplacerCallbackInterface|null $callback
     *
     * @throws RuntimeException when the original XLIFF cannot be opened
     */
    public function __construct(
            string                          $originalXliffPath,
            int                             $xliffVersion,
            array                           $segments,
            array                           $transUnits,
            string                          $trgLang,
            string                          $outputFilePath,
            bool                            $setSourceInTarget,
            ?LoggerInterface                $logger = null,
            ?XliffReplacerCallbackInterface $callback = null
    ) {
        // the placeholder is static: every new instance overwrites it for the whole class
        self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();

        $this->createOutputFileIfDoesNotExist( $outputFilePath );
        $this->setFileDescriptors( $originalXliffPath, $outputFilePath );

        $this->xliffVersion   = $xliffVersion;
        $this->segments       = $segments;
        $this->targetLang     = $trgLang;
        $this->sourceInTarget = $setSourceInTarget;
        $this->transUnits     = $transUnits;
        $this->logger         = $logger;
        $this->callback       = $callback;
    }
92
93
    /**
     * Reads the original XLIFF in chunks, feeds every chunk to the SAX parser and lets
     * the handlers write the result to the output stream.
     *
     * Entities are masked with the internal placeholder before parsing, because the SAX
     * parser decodes them automatically; postProcAndFlush() restores them on output.
     *
     * @return void
     *
     * @throws RuntimeException when a chunk cannot be masked or the XML cannot be parsed
     */
    public function replaceTranslation() {
        fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

        //create Sax parser
        $xmlParser = $this->initSaxParser();

        while ( $this->currentBuffer = fread( $this->originalFP, 4096 ) ) {

            // obfuscate entities because sax automatically does html_entity_decode
            $maskedBuffer = $this->maskEntities( $this->currentBuffer );

            // Avoid cutting entities in half: the last fread could have truncated an entity
            // (say, '&lt;' in '&l'). 9 is the max length of an entity, so while an unmasked '&'
            // lies within the last 9 bytes, fetch 9 more bytes and mask again.
            while ( true ) {
                // only the LAST '&' can belong to an entity truncated by the chunk boundary;
                // an earlier bare '&' must not hide it
                $lastAmpPos = strrpos( $maskedBuffer, '&' );

                if ( $lastAmpPos === false || strlen( $maskedBuffer ) - $lastAmpPos > 9 ) {
                    break;
                }

                // at end of file nothing more can complete the entity: stop instead of looping forever
                $extraBytes = fread( $this->originalFP, 9 );
                if ( $extraBytes === false || $extraBytes === '' ) {
                    break;
                }

                $this->currentBuffer .= $extraBytes;
                $maskedBuffer        = $this->maskEntities( $this->currentBuffer );
            }

            // whatever '&' is still there is not an entity: mask it as an escaped ampersand
            $this->currentBuffer = str_replace( "&", self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER, $maskedBuffer );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            /*
             * Get the accumulated offset in the document:
             * as long as the SAX pointer advances, we keep track of the total bytes it has seen so far;
             * this way, its global pointer can be translated into an address local to the current buffer
             * to retrieve the last char of a tag
             */
            $this->offset += $this->len;

            //parse chunk of text
            $this->runParser( $xmlParser );
        }

        // close Sax parser
        $this->closeSaxParser( $xmlParser );
    }

    /**
     * Replaces every "&...;" sequence of the chunk with the placeholder-wrapped entity name.
     *
     * @param string $chunk
     *
     * @return string
     *
     * @throws RuntimeException when the regular expression engine fails
     */
    private function maskEntities( string $chunk ): string {
        $masked = preg_replace( "/&(.*?);/", self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER, $chunk );

        if ( $masked === null ) {
            throw new RuntimeException( 'Unable to mask XML entities, PCRE error code: ' . preg_last_error() );
        }

        return $masked;
    }
150
151
    /**
     * Feeds the current buffer to the SAX parser, flagging it as the final chunk
     * when the input stream has reached its end.
     *
     * @param resource|\XMLParser $xmlParser
     *
     * @return void
     *
     * @throws RuntimeException when the parser rejects the chunk
     */
    protected function runParser( $xmlParser ) {
        $isFinalChunk = feof( $this->originalFP );

        if ( xml_parse( $xmlParser, $this->currentBuffer, $isFinalChunk ) ) {
            return;
        }

        // parsing failed: report the parser's own message and line
        $errorMessage = xml_error_string( xml_get_error_code( $xmlParser ) );
        $errorLine    = xml_get_current_line_number( $xmlParser );

        throw new RuntimeException( sprintf( "XML error: %s at line %d", $errorMessage, $errorLine ) );
    }
167
168
    /**
     * Returns the character of the current buffer found at the SAX pointer position;
     * callers use it to detect self-closed tags.
     *
     * @param resource|\XMLParser $parser
     *
     * @return string
     */
    protected function getLastCharacter( $parser ): string {

        // position of the SAX pointer in the whole stream read so far
        $globalIndex = xml_get_current_byte_index( $parser );

        // same position, expressed relative to the accumulated offset
        $localIndex = $globalIndex - $this->offset;

        // a tag may have been cut in half by the current read: when the computed
        // position is not available in this buffer, use the buffer's last character
        $fallbackIndex = $this->len - 1;

        return $this->currentBuffer[ $localIndex ] ?? $this->currentBuffer[ $fallbackIndex ];
    }
187
188
    /**
     * Builds the random placeholder used to mask entities: "§" followed by 4 characters
     * taken from the base64 encoding of 10 random bytes ('+' and '/' removed).
     *
     * random_bytes() is used so that a missing randomness source raises an exception
     * instead of silently yielding a shorter, predictable placeholder.
     *
     * @return string
     *
     * @throws \Exception when no suitable source of randomness is available
     */
    private function getInternalTagPlaceholder(): string {
        $randomToken = str_replace( [ '+', '/' ], '', base64_encode( random_bytes( 10 ) ) );

        return "§" . substr( $randomToken, 0, 4 );
    }
203
204
    /**
     * Creates an empty output file unless something already exists at that path.
     *
     * @param string $outputFilePath
     *
     * @return void
     */
    private function createOutputFileIfDoesNotExist( string $outputFilePath ) {
        if ( file_exists( $outputFilePath ) ) {
            return;
        }

        touch( $outputFilePath );
    }
210
211
    /**
     * Opens the output file for writing (truncating it) and the original XLIFF for reading.
     *
     * @param string $originalXliffPath
     * @param string $outputFilePath
     *
     * @return void
     *
     * @throws RuntimeException when either stream cannot be opened
     */
    private function setFileDescriptors( string $originalXliffPath, string $outputFilePath ) {
        // fail right away: a false handle would only surface later, on the first write
        $outputHandle = fopen( $outputFilePath, 'w+' );
        if ( $outputHandle === false ) {
            throw new RuntimeException( "could not open XML output" );
        }
        $this->outputFP = $outputHandle;

        // the input is opened with a fresh stream context carrying no options
        if ( !( $this->originalFP = fopen( $originalXliffPath, "r", false, stream_context_create() ) ) ) {
            throw new RuntimeException( "could not open XML input" );
        }
    }
224
225
    /**
     * AbstractXliffReplacer destructor.
     *
     * Closes the input and the output stream. The streams can be closed outside the class
     * (to permit multiple concurrent downloads), so each one is closed only if it is
     * still a valid resource.
     */
    public function __destruct() {
        foreach ( [ $this->originalFP, $this->outputFP ] as $streamHandle ) {
            if ( is_resource( $streamHandle ) ) {
                fclose( $streamHandle );
            }
        }
    }
240
241
    /**
     * Init Sax parser
     *
     * Creates a UTF-8 parser with case folding disabled and registers this object's
     * tagOpen(), tagClose() and characterData() methods as handlers.
     *
     * @return resource|\XMLParser a resource on PHP 7, an XMLParser instance on PHP 8
     */
    protected function initSaxParser() {
        $saxParser = xml_parser_create( 'UTF-8' );

        // keep tag names exactly as they are written in the document
        xml_parser_set_option( $saxParser, XML_OPTION_CASE_FOLDING, false );

        // handlers are given by name and resolved against this object
        xml_set_object( $saxParser, $this );
        xml_set_element_handler( $saxParser, 'tagOpen', 'tagClose' );
        xml_set_character_data_handler( $saxParser, 'characterData' );

        return $saxParser;
    }
255
256
    /**
     * Releases the SAX parser.
     *
     * @param resource|\XMLParser $xmlSaxParser
     *
     * @return void
     */
    protected function closeSaxParser( $xmlSaxParser ) {
        xml_parser_free( $xmlSaxParser );
    }
262
263
    /**
     * SAX handler registered by initSaxParser() for every opening tag.
     *
     * @param resource|\XMLParser $parser
     * @param string              $name tag name
     * @param array               $attr tag attributes
     *
     * @return mixed
     */
    abstract protected function tagOpen( $parser, string $name, array $attr );
271
272
    /**
     * SAX handler registered by initSaxParser() for every closing tag.
     *
     * @param resource|\XMLParser $parser
     * @param string              $name tag name
     *
     * @return mixed
     */
    abstract protected function tagClose( $parser, string $name );
279
280
    /**
     * SAX handler for character data: appends the text to the buffer while buffering
     * is active, otherwise writes it out unless the parser is inside a <target>.
     *
     * @param resource|\XMLParser $parser unused, but part of the handler signature registered in initSaxParser()
     * @param string              $data
     *
     * @return void
     */
    protected function characterData( $parser, string $data ): void {
        if ( $this->bufferIsActive ) {
            $this->CDATABuffer .= $data;

            return;
        }

        // don't write <target> data
        if ( !$this->inTarget ) {
            $this->postProcAndFlush( $this->outputFP, $data );
        }
    }
294
295
    /**
     * Restores the masked entities of the given data and writes it to the stream.
     *
     * @param resource $fp           stream to write to
     * @param string   $data
     * @param bool     $treatAsCDATA when true, line endings are left untouched
     *
     * @return void
     */
    protected function postProcAndFlush( $fp, string $data, bool $treatAsCDATA = false ) {
        $placeholder         = self::$INTERNAL_TAG_PLACEHOLDER;
        $maskedEntityPattern = "/" . $placeholder . '(.*?)' . $placeholder . "/";

        // turn placeholder-wrapped names back into entities
        $data = preg_replace( $maskedEntityPattern, '&$1;', $data );
        $data = str_replace( '&nbsp;', ' ', $data );

        if ( !$treatAsCDATA ) {
            // unix2dos: the pairs are applied in order, so every line ending
            // is first reduced to "\r" and then expanded to "\r\n"
            $data = str_replace( [ "\r\n", "\n", "\r" ], [ "\r", "\r", "\r\n" ], $data );
        }

        //flush to disk
        fwrite( $fp, $data );
    }
316
317
    /**
     * Records the state of a newly opened <trans-unit> (xliff v1.*) or <unit> (xliff v2.*);
     * any other tag is ignored.
     *
     * @param string $name
     * @param array  $attr
     *
     * @return void
     */
    protected function handleOpenUnit( string $name, array $attr ) {

        if ( $this->tuTagName !== $name ) {
            return;
        }

        $this->inTU = true;

        // the id is trimmed to 100 because this is the limit on Matecat's DB
        $this->currentTransUnitId = substr( $attr[ 'id' ], 0, 100 );

        // `translate` can be only yes or no; a missing or empty attribute counts as 'yes'
        $this->currentTransUnitIsTranslatable = !empty( $attr[ 'translate' ] ) ? $attr[ 'translate' ] : 'yes';

        $this->setLastTransUnitSegments();
    }
341
342
    /**
     * For the <xliff> tag, appends the MateCat namespace declaration when it is missing
     * and sets the trgLang attribute to the target language; other tags are returned as they are.
     *
     * @param string $name
     * @param array  $attr
     * @param string $tag the opening tag built so far
     *
     * @return string
     */
    protected function handleOpenXliffTag( string $name, array $attr, string $tag ): string {

        if ( $name !== 'xliff' ) {
            return $tag;
        }

        // Add MateCat specific namespace.
        $namespaceAttribute = 'xmlns:' . $this->namespace;
        if ( !array_key_exists( $namespaceAttribute, $attr ) ) {
            $tag .= ' ' . $namespaceAttribute . '="https://www.matecat.com" ';
        }

        // Add trgLang
        return preg_replace( '/trgLang="(.*?)"/', 'trgLang="' . $this->targetLang . '"', $tag );
    }
363
364
    /**
     * Updates the inTarget flag when a <target> is opened outside the alternative matches:
     * the flag is raised unless the current unit is marked as not translatable.
     *
     * @param string $name
     *
     * @return void
     */
    protected function checkSetInTarget( string $name ) {

        if ( $this->inAltTrans || 'target' !== $name ) {
            return;
        }

        $this->inTarget = $this->currentTransUnitIsTranslatable !== 'no';
    }
381
382
    /**
     * Raises the inAltTrans flag when the given tag is the alternative-matches tag;
     * a flag that is already raised is left as it is.
     *
     * @param string $name
     *
     * @return void
     */
    protected function trySetAltTrans( string $name ) {
        if ( $this->alternativeMatchesTag == $name ) {
            $this->inAltTrans = true;
        }
    }
385
386
    /**
     * Lowers the inAltTrans flag when the given tag is the alternative-matches tag.
     *
     * @param string $name
     *
     * @return void
     */
    protected function tryUnsetAltTrans( string $name ) {
        $this->inAltTrans = $this->inAltTrans && $this->alternativeMatchesTag != $name;
    }
391
392
    /**
     * Activates buffering when the opened tag is one of the nodes to buffer, or when it is
     * the <target> of a unit marked as not translatable.
     *
     * @param string $name
     *
     * @return void
     */
    protected function setInBuffer( string $name ) {
        // strict comparison: a tag name must never match a non-string entry by type juggling
        $isNodeToBuffer = in_array( $name, $this->nodesToBuffer, true );

        // We need bufferIsActive for <target> nodes with currentTransUnitIsTranslatable = 'no'
        // because otherwise the target can be chunked into pieces by the character data handler
        // and this can potentially lead to a wrong string rebuild by postProcAndFlush()
        // if the internal placeholders are split
        $isUntranslatableTarget = $name === 'target' && $this->currentTransUnitIsTranslatable === 'no';

        if ( $isNodeToBuffer || $isUntranslatableTarget ) {
            $this->bufferIsActive = true;
        }
    }
409
410
    /**
     * Stores the word counts of one segment under its 'sid' and adds them to the running totals.
     * The equivalent word count is truncated to two decimals.
     *
     * @param array $seg segment data; the 'sid', 'raw_word_count' and 'eq_word_count' keys are read
     *
     * @return void
     */
    protected function updateSegmentCounts( array $seg = [] ) {

        $rawWords = $seg[ 'raw_word_count' ];
        $eqWords  = floor( $seg[ 'eq_word_count' ] * 100 ) / 100;

        $segmentId = $seg[ 'sid' ];

        $this->counts[ 'segments_count_array' ][ $segmentId ] = [
                'raw_word_count' => $rawWords,
                'eq_word_count'  => $eqWords,
        ];

        $this->counts[ 'raw_word_count' ] += $rawWords;
        $this->counts[ 'eq_word_count' ]  += $eqWords;
    }
426
427
    /**
     * Empties the per-segment counts and zeroes the running totals.
     *
     * @return void
     */
    protected function resetCounts() {
        $initialValues = [
                'segments_count_array' => [],
                'raw_word_count'       => 0,
                'eq_word_count'        => 0,
        ];

        foreach ( $initialValues as $counterName => $initialValue ) {
            $this->counts[ $counterName ] = $initialValue;
        }
    }
432
433
    /**
     * Closes the given opening tag with "/>" or ">", depending on whether the original
     * tag was self-closed, then buffers it or writes it out.
     *
     * @param resource|\XMLParser $parser
     * @param string              $tag the opening tag built so far, without its ending
     *
     * @return void
     */
    protected function checkForSelfClosedTagAndFlush( $parser, string $tag ) {

        //detect empty tag
        $this->isEmpty = $this->getLastCharacter( $parser ) === '/';

        //trim last space and add tag ending
        $tag = rtrim( $tag ) . ( $this->isEmpty ? '/>' : '>' );

        if ( !$this->bufferIsActive ) {
            $this->postProcAndFlush( $this->outputFP, $tag );

            return;
        }

        // buffering is active: the tag belongs to the content being collected
        $this->CDATABuffer .= $tag;
    }
464
465
    /**
     * A trans-unit can contain a list of segments because of mrk tags.
     * Copies the segments of the current trans-unit into $lastTransUnit, so that they
     * stay available for the next tagOpen ( possible sdl:seg-defs ).
     *
     * A segment is copied only when it exists in $segments and it is either the first
     * of the list or its id directly follows the id copied last.
     *
     * @return void
     */
    protected function setLastTransUnitSegments() {

        $this->lastTransUnit = [];

        if ( !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return;
        }

        $segmentIds   = $this->transUnits[ $this->currentTransUnitId ];
        $previousId   = null;
        $totalOfUnits = count( $segmentIds );

        for ( $position = 0; $position < $totalOfUnits; $position++ ) {
            $segmentId = $segmentIds[ $position ];

            if ( !isset( $this->segments[ $segmentId ] ) ) {
                continue;
            }

            if ( $position != 0 && $previousId + 1 != $segmentId ) {
                continue;
            }

            $previousId            = $segmentId;
            $this->lastTransUnit[] = $this->segments[ $segmentId ];
        }
    }
500
501
    /**
     * Returns the segment stored at the current position, or an empty array when the unit
     * is not translatable, is unknown, or has no segment at that position.
     *
     * @return array
     */
    protected function getCurrentSegment(): array {
        if ( $this->currentTransUnitIsTranslatable === 'no' ) {
            return [];
        }

        if ( !isset( $this->transUnits[ $this->currentTransUnitId ], $this->segments[ $this->segmentInUnitPosition ] ) ) {
            return [];
        }

        return $this->segments[ $this->segmentInUnitPosition ];
    }
515
}