Passed
Pull Request — master (#90)
by Domenico
02:57
created

AbstractXliffReplacer::__construct()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 22
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 2
eloc 11
nc 2
nop 9
dl 0
loc 22
rs 9.9
c 3
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent as soon as you need more or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
namespace Matecat\XliffParser\XliffReplacer;
4
5
use Psr\Log\LoggerInterface;
6
use RuntimeException;
7
8
abstract class AbstractXliffReplacer {
9
    protected $originalFP;                // input stream pointer (opened read-only in setFileDescriptors())
    protected $outputFP;                  // output stream pointer

    protected string $tuTagName;                 // 'trans-unit' (for XLIFF v1.*) or 'unit' (for XLIFF v2.*)
    protected bool   $inTU             = false;  // flag to check whether we are in a <trans-unit>
    protected bool   $inTarget         = false;  // flag to check whether we are in a <target>, to ignore everything
    protected bool   $isEmpty          = false;  // flag to check whether we are in an empty tag (<tag/>)
    protected bool   $targetWasWritten = false;  // flag to check if <target> was written in the current unit
    protected string $CDATABuffer      = "";       // accumulates tags/text while $bufferIsActive is true
    protected bool   $bufferIsActive   = false;    // true once a tag listed in $nodesToBuffer has been opened

    protected int $offset = 0;         // total number of bytes handed to the SAX parser so far

    protected string  $currentBuffer;             // the chunk of text currently being parsed
    protected int     $len;                       // length (in bytes) of $currentBuffer
    protected array   $segments;                  // array of translations
    protected array   $lastTransUnit                  = [];     // segments of the current unit, filled by setLastTransUnitSegments()
    protected int     $segmentInUnitPosition          = 0;      // index into $segments used by getCurrentSegment()
    protected ?string $currentTransUnitId             = null;        // id of current <trans-unit>
    protected ?string $currentTransUnitIsTranslatable = null; // 'translate' attribute of current <trans-unit>
    protected bool    $hasWrittenCounts               = false;  // check if <unit> already wrote segment counts (for XLIFF v2.*)
    protected string  $targetLang;                              // value written into the trgLang attribute of <xliff>
    protected bool    $sourceInTarget                 = false;  // constructor flag $setSourceInTarget; consumed outside this class - TODO confirm usage

    // tag names that switch buffering on (see setInBuffer()); not initialised in this class
    protected array $nodesToBuffer;

    // map: trans-unit id => list of segment ids (see setLastTransUnitSegments())
    protected array $transUnits;

    /** XLIFF major version; the value 2 selects the <unit> tag name, anything else <trans-unit>. */
    protected int $xliffVersion;

    /**
     * Optional callback supplied to the constructor; not invoked in this class.
     */
    protected ?XliffReplacerCallbackInterface $callback;

    // optional PSR-3 logger supplied to the constructor
    protected ?LoggerInterface $logger;

    // random marker used to hide XML entities from the SAX parser; regenerated by every constructor call
    protected static $INTERNAL_TAG_PLACEHOLDER;

    // running word-count totals; 'segments_count_array' is added by updateSegmentCounts()/resetCounts()
    protected $counts = [
            'raw_word_count' => 0,
            'eq_word_count'  => 0,
    ];
53
54
    /**
     * AbstractXliffReplacer constructor.
     *
     * Regenerates the (static) entity placeholder, makes sure the output file exists,
     * opens the input and output streams and stores the replacement configuration.
     *
     * The two optional dependencies are declared with explicit nullable types:
     * relying on the implicit nullability of a `= null` default is deprecated as of PHP 8.4.
     *
     * @param string                              $originalXliffPath path of the XLIFF file to read
     * @param int                                 $xliffVersion      2 selects <unit>, any other value <trans-unit>
     * @param array                               $segments          array of translations
     * @param array                               $transUnits        map of trans-unit id => list of segment ids
     * @param string                              $trgLang           target language written into trgLang
     * @param string                              $outputFilePath    path of the file to write
     * @param bool                                $setSourceInTarget stored in $sourceInTarget
     * @param LoggerInterface|null                $logger
     * @param XliffReplacerCallbackInterface|null $callback
     *
     * @throws RuntimeException when the original XLIFF cannot be opened
     */
    public function __construct(
            string                          $originalXliffPath,
            int                             $xliffVersion,
            array                           $segments,
            array                           $transUnits,
            string                          $trgLang,
            string                          $outputFilePath,
            bool                            $setSourceInTarget,
            ?LoggerInterface                $logger = null,
            ?XliffReplacerCallbackInterface $callback = null
    ) {
        self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();
        $this->createOutputFileIfDoesNotExist( $outputFilePath );
        $this->setFileDescriptors( $originalXliffPath, $outputFilePath );
        $this->xliffVersion   = $xliffVersion;
        $this->tuTagName      = ( $this->xliffVersion === 2 ) ? 'unit' : 'trans-unit';
        $this->segments       = $segments;
        $this->targetLang     = $trgLang;
        $this->sourceInTarget = $setSourceInTarget;
        $this->transUnits     = $transUnits;
        $this->logger         = $logger;
        $this->callback       = $callback;
    }
90
91
    /**
     * Streams the original XLIFF through the SAX parser in 4096-byte chunks.
     *
     * Before a chunk is parsed, every `&...;` entity is rewritten as
     * PLACEHOLDER + name + PLACEHOLDER so that the parser does not decode it;
     * postProcAndFlush() performs the inverse substitution when writing.
     *
     * @return void
     * @throws RuntimeException when a chunk cannot be parsed (see runParser())
     */
    public function replaceTranslation() {
        fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

        //create Sax parser
        $xmlParser = $this->initSaxParser();

        $entityPattern     = "/&(.*?);/";
        $entityReplacement = self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER;

        while ( $this->currentBuffer = fread( $this->originalFP, 4096 ) ) {
            // obfuscate entities because sax automatically does html_entity_decode
            $escapedBuffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );

            // Avoid cutting entities in half: the read may have truncated an entity
            // (say, '&lt;' cut down to '&l'), which would defeat the escaping above.
            // 9 bytes is assumed to be the maximum length of an entity: while an
            // unescaped '&' lies within the last 9 bytes of the chunk, fetch 9 more
            // bytes and escape again. An '&' followed by more than 9 bytes cannot be
            // a truncated entity, so the loop stops there.
            while ( true ) {
                $ampPos = strpos( $escapedBuffer, '&' );

                if ( $ampPos === false || strlen( $escapedBuffer ) - $ampPos > 9 ) {
                    break;
                }

                // At end of file nothing more can be fetched: without this guard a bare '&'
                // in the last bytes of the file would keep the loop spinning forever.
                $extraBytes = fread( $this->originalFP, 9 );
                if ( $extraBytes === false || $extraBytes === '' ) {
                    break;
                }

                $this->currentBuffer .= $extraBytes;
                $escapedBuffer       = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
            }

            // $escapedBuffer is always the escaped form of the current buffer at this point;
            // any '&' still present is not an entity and is escaped as 'amp'.
            $this->currentBuffer = str_replace( "&", self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER, $escapedBuffer );
            unset( $escapedBuffer );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            /*
             * Keep track of the total number of bytes handed to the SAX parser so far:
             * this lets getLastCharacter() translate the parser's global byte index
             * into a position inside the current buffer.
             */
            $this->offset += $this->len;

            //parse chunk of text
            $this->runParser( $xmlParser );
        }

        // close Sax parser
        $this->closeSaxParser( $xmlParser );
    }
148
149
    /**
     * Feeds the current buffer to the SAX parser.
     *
     * The chunk is flagged as final when the input stream reports end-of-file.
     *
     * @param resource|\XMLParser $xmlParser parser created by initSaxParser()
     *
     * @return void
     * @throws RuntimeException when the parser rejects the chunk
     */
    protected function runParser( $xmlParser ) {
        $isFinalChunk = feof( $this->originalFP );

        if ( xml_parse( $xmlParser, $this->currentBuffer, $isFinalChunk ) ) {
            return;
        }

        // parsing failed: report the parser's own diagnosis
        $errorDescription = xml_error_string( xml_get_error_code( $xmlParser ) );
        $errorLine        = xml_get_current_line_number( $xmlParser );

        throw new RuntimeException( sprintf( "XML error: %s at line %d", $errorDescription, $errorLine ) );
    }
165
166
    /**
     * Returns the character used to detect empty (self-closed) tags.
     *
     * The parser's global byte index is translated into a position of the current
     * buffer by subtracting the accumulated offset. When that position exists in the
     * buffer its character is returned; otherwise (the tag may have been cut in half
     * by the chunked read) the last character of the chunk is used.
     *
     * @param resource|\XMLParser $parser
     *
     * @return string
     */
    protected function getLastCharacter( $parser ): string {

        // position of the SAX pointer, relative to the current buffer
        $position = xml_get_current_byte_index( $parser ) - $this->offset;

        if ( !isset( $this->currentBuffer[ $position ] ) ) {
            // out of the buffer bounds: simply use the last character of the chunk
            $position = $this->len - 1;
        }

        return $this->currentBuffer[ $position ];

    }
194
195
    /**
     * Builds a random placeholder: "§" followed by the first 4 characters of the
     * base64 encoding of 10 random bytes, with '+' and '/' stripped out.
     *
     * @return string
     */
    private function getInternalTagPlaceholder(): string {
        $randomBytes = openssl_random_pseudo_bytes( 10, $_crypto_strong );
        $encoded     = base64_encode( $randomBytes );
        $cleaned     = str_replace( [ '+', '/' ], '', $encoded );

        return "§" . substr( $cleaned, 0, 4 );
    }
210
211
    /**
     * Creates an empty output file unless something already exists at that path.
     *
     * @param string $outputFilePath
     */
    private function createOutputFileIfDoesNotExist( string $outputFilePath ) {
        if ( file_exists( $outputFilePath ) ) {
            return;
        }

        touch( $outputFilePath );
    }
217
218
    /**
     * Opens the output stream (read/write, truncating) and the input stream (read-only).
     *
     * Both handles are validated: a failed fopen() returns false, and using that value
     * later would only surface as an unrelated error at the first read or write.
     *
     * @param string $originalXliffPath
     * @param string $outputFilePath
     *
     * @throws RuntimeException when either file cannot be opened
     */
    private function setFileDescriptors( string $originalXliffPath, string $outputFilePath ) {
        $this->outputFP = fopen( $outputFilePath, 'w+' );

        if ( !$this->outputFP ) {
            throw new RuntimeException( "could not open XML output" );
        }

        // the input is opened with a fresh, option-less stream context
        $this->originalFP = fopen( $originalXliffPath, "r", false, stream_context_create() );

        if ( !$this->originalFP ) {
            throw new RuntimeException( "could not open XML input" );
        }
    }
231
232
    /**
     * AbstractXliffReplacer destructor.
     *
     * Closes both streams if they are still open.
     *
     * The input stream can be closed outside the class (to permit multiple concurrent
     * downloads) and either handle may be missing when the constructor failed.
     * On PHP 8 fclose() throws a TypeError for a closed or non-resource handle, which
     * the `@` operator cannot suppress, so the state is checked explicitly instead.
     */
    public function __destruct() {
        if ( is_resource( $this->originalFP ) ) {
            fclose( $this->originalFP );
        }

        if ( is_resource( $this->outputFP ) ) {
            fclose( $this->outputFP );
        }
    }
241
242
    /**
     * Init Sax parser
     *
     * Creates a UTF-8 parser with case folding disabled and registers this object's
     * tagOpen(), tagClose() and characterData() methods as its handlers.
     *
     * @return resource|\XMLParser a resource on PHP 7, an XMLParser object on PHP 8+
     */
    protected function initSaxParser() {
        $xmlSaxParser = xml_parser_create( 'UTF-8' );

        xml_set_object( $xmlSaxParser, $this );
        xml_parser_set_option( $xmlSaxParser, XML_OPTION_CASE_FOLDING, false );
        xml_set_element_handler( $xmlSaxParser, 'tagOpen', 'tagClose' );
        xml_set_character_data_handler( $xmlSaxParser, 'characterData' );

        return $xmlSaxParser;
    }
256
257
    /**
     * Releases the SAX parser created by initSaxParser().
     *
     * @param resource|\XMLParser $xmlSaxParser
     *
     * @return void
     */
    protected function closeSaxParser( $xmlSaxParser ) {
        xml_parser_free( $xmlSaxParser );
    }
263
264
    /**
     * Start-element handler, registered on the SAX parser by initSaxParser().
     *
     * @param resource|\XMLParser $parser the SAX parser invoking the handler
     * @param string              $name   name of the tag being opened
     * @param array               $attr   attributes of the tag
     *
     * @return mixed
     */
    abstract protected function tagOpen( $parser, string $name, array $attr );
272
273
    /**
     * End-element handler, registered on the SAX parser by initSaxParser().
     *
     * @param resource|\XMLParser $parser the SAX parser invoking the handler
     * @param string              $name   name of the tag being closed
     *
     * @return mixed
     */
    abstract protected function tagClose( $parser, string $name );
280
281
    /**
     * Character-data handler, registered on the SAX parser by initSaxParser().
     *
     * While buffering is active the text is appended to the CDATA buffer; otherwise
     * it is written to the output unless we are inside a <target>.
     *
     * @param resource|\XMLParser $parser unused, but part of the handler signature
     * @param string              $data
     *
     * @return void
     */
    protected function characterData( $parser, string $data ): void {
        if ( $this->bufferIsActive ) {
            $this->CDATABuffer .= $data;

            return;
        }

        // don't write <target> data
        if ( !$this->inTarget ) {
            $this->postProcAndFlush( $this->outputFP, $data );
        }
    }
295
296
    /**
     * Restores the escaped entities and writes the data to the given stream.
     *
     * Every PLACEHOLDER + name + PLACEHOLDER sequence becomes `&name;` again and
     * `&nbsp;` is replaced. Unless the data is treated as CDATA, all line endings
     * are normalised to CRLF.
     *
     * @param resource $fp
     * @param string   $data
     * @param bool     $treatAsCDATA
     */
    protected function postProcAndFlush( $fp, string $data, bool $treatAsCDATA = false ) {
        $marker = self::$INTERNAL_TAG_PLACEHOLDER;

        //postprocess string
        $data = preg_replace( "/" . $marker . '(.*?)' . $marker . "/", '&$1;', $data );
        $data = str_replace( '&nbsp;', ' ', $data );

        if ( !$treatAsCDATA ) {
            //unix2dos: the pairs are applied one after the other, so every line
            //ending is first collapsed to "\r" and then expanded to "\r\n"
            $data = str_replace( [ "\r\n", "\n", "\r" ], [ "\r", "\r", "\r\n" ], $data );
        }

        //flush to disk
        fwrite( $fp, $data );
    }
317
318
    /**
     * Records the state of the unit being opened.
     *
     * Does nothing unless $name is the unit tag of the current XLIFF version
     * (<trans-unit> for v1.*, <unit> for v2.*).
     *
     * @param string $name
     * @param array  $attr
     *
     * @return void
     */
    protected function handleOpenUnit( string $name, array $attr ) {

        if ( $this->tuTagName !== $name ) {
            return;
        }

        $this->inTU = true;

        // get id
        // trim to first 100 characters because this is the limit on Matecat's DB;
        // a unit without an id yields an empty id instead of an undefined-index
        // warning and a null passed to substr()
        $this->currentTransUnitId = substr( (string)( $attr[ 'id' ] ?? '' ), 0, 100 );

        // `translate` attribute can be only yes or no; a missing or empty one counts as 'yes'
        $this->currentTransUnitIsTranslatable = empty( $attr[ 'translate' ] ) ? 'yes' : $attr[ 'translate' ];

        $this->setLastTransUnitSegments();

    }
342
343
    /**
     * Adjusts the opening <xliff> tag; every other tag is returned untouched.
     *
     * Appends the MateCat namespace declaration when the tag does not carry an
     * `xmlns:mtc` attribute and rewrites the value of `trgLang` with the target language.
     *
     * @param string $name
     * @param array  $attr
     * @param string $tag
     *
     * @return string
     */
    protected function handleOpenXliffTag( string $name, array $attr, string $tag ): string {

        if ( $name !== 'xliff' ) {
            return $tag;
        }

        // Add MateCat specific namespace.
        if ( !array_key_exists( 'xmlns:mtc', $attr ) ) {
            $tag .= ' xmlns:mtc="https://www.matecat.com" ';
        }

        // Add trgLang
        return preg_replace( '/trgLang="(.*?)"/', 'trgLang="' . $this->targetLang . '"', $tag );

    }
364
365
    /**
     * Updates the inTarget flag when a <target> is being opened.
     *
     * The flag is raised unless the current unit is marked translate="no".
     *
     * @param string $name
     *
     * @return void
     */
    protected function checkSetInTarget( string $name ) {
        if ( 'target' !== $name ) {
            return;
        }

        $this->inTarget = ( $this->currentTransUnitIsTranslatable !== 'no' );
    }
380
381
    /**
     * Activates buffering when the given tag is one of the nodes to buffer.
     *
     * The lookup is strict: tag names are strings and must match exactly, without
     * PHP's loose-comparison type juggling.
     *
     * @param string $name
     *
     * @return void
     */
    protected function setInBuffer( string $name ) {
        if ( in_array( $name, $this->nodesToBuffer, true ) ) {
            $this->bufferIsActive = true;
        }
    }
391
392
    /**
     * Records the word counts of a segment and adds them to the running totals.
     *
     * The equivalent word count is truncated to two decimals before being used.
     *
     * @param array $seg reads the keys 'sid', 'raw_word_count' and 'eq_word_count'
     */
    protected function updateSegmentCounts( array $seg = [] ) {

        $segmentCounts = [
                'raw_word_count' => $seg[ 'raw_word_count' ],
                'eq_word_count'  => ( floor( $seg[ 'eq_word_count' ] * 100 ) / 100 ),
        ];

        $this->counts[ 'segments_count_array' ][ $seg[ 'sid' ] ] = $segmentCounts;

        // same keys on both sides: add each per-segment value to its total
        foreach ( $segmentCounts as $counter => $value ) {
            $this->counts[ $counter ] += $value;
        }
    }
408
409
    /**
     * Empties the per-segment counts and zeroes both running totals.
     *
     * @return void
     */
    protected function resetCounts() {
        $initialValues = [
                'segments_count_array' => [],
                'raw_word_count'       => 0,
                'eq_word_count'        => 0,
        ];

        foreach ( $initialValues as $counter => $initialValue ) {
            $this->counts[ $counter ] = $initialValue;
        }
    }
414
415
    /**
     * Completes an opening tag and sends it to the CDATA buffer or to the output.
     *
     * The tag is treated as empty (self-closed) when getLastCharacter() returns '/';
     * in that case the slash is appended before the closing '>'.
     *
     * @param resource|\XMLParser $parser
     * @param string              $tag
     *
     * @return void
     */
    protected function checkForSelfClosedTagAndFlush( $parser, string $tag ) {

        //detect empty tag
        $this->isEmpty = ( $this->getLastCharacter( $parser ) === '/' );

        //trim last space, then add the tag ending
        $tag = rtrim( $tag ) . ( $this->isEmpty ? '/' : '' ) . ">";

        if ( $this->bufferIsActive ) {
            // buffering is active: the tag belongs to the buffered content
            $this->CDATABuffer .= $tag;

            return;
        }

        $this->postProcAndFlush( $this->outputFP, $tag );

    }
446
447
    /**
     * A trans-unit can contain a list of segments because of mrk tags.
     * Copies the segments of the current trans-unit into $lastTransUnit, so that
     * they stay available for the next tagOpen ( possible sdl:seg-defs ).
     *
     * A listed id is copied only when it exists in $segments and is either the
     * first id of the list or exactly one more than the last id that was copied.
     *
     * @return void
     */
    protected function setLastTransUnitSegments() {

        $this->lastTransUnit = [];

        $segmentIds = $this->transUnits[ $this->currentTransUnitId ] ?? null;
        if ( $segmentIds === null ) {
            return;
        }

        $previousId = null;
        $total      = count( $segmentIds );

        for ( $position = 0; $position < $total; $position++ ) {
            $id = $segmentIds[ $position ];

            if ( !isset( $this->segments[ $id ] ) ) {
                continue;
            }

            // after the first position, only accept the id following the last accepted one
            if ( $position != 0 && $previousId + 1 != $id ) {
                continue;
            }

            $previousId            = $id;
            $this->lastTransUnit[] = $this->segments[ $id ];
        }

    }
483
484
    /**
     * Returns the segment at the current position, or an empty array.
     *
     * An empty array is returned when the current unit is marked translate="no",
     * when it is not listed in $transUnits, or when no segment exists at the current
     * position (the latter used to violate the array return type).
     *
     * @return array
     */
    protected function getCurrentSegment(): array {
        if ( $this->currentTransUnitIsTranslatable === 'no' || !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return [];
        }

        return $this->segments[ $this->segmentInUnitPosition ] ?? [];
    }
494
495
}