Passed
Pull Request — master (#90)
by Domenico
03:06
created

AbstractXliffReplacer::__construct()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 22
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 2
eloc 11
nc 2
nop 9
dl 0
loc 22
rs 9.9
c 3
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
namespace Matecat\XliffParser\XliffReplacer;
4
5
use Psr\Log\LoggerInterface;
6
use RuntimeException;
7
8
abstract class AbstractXliffReplacer {
9
    protected $originalFP;
10
    protected $outputFP;                  // output stream pointer
11
12
    protected string $tuTagName;                 // <trans-unit> (for Xliff v 1.*) or <unit> (for Xliff v 2.*)
13
    protected bool   $inTU             = false;  // flag to check whether we are in a <trans-unit>
14
    protected bool   $inTarget         = false;  // flag to check whether we are in a <target>, to ignore everything
15
    protected bool   $isEmpty          = false;  // flag to check whether we are in an empty tag (<tag/>)
16
    protected bool   $targetWasWritten = false;  // flag to check whether <target> was written in the current unit
17
    protected string $CDATABuffer      = "";       // buffer for special tag
18
    protected bool   $bufferIsActive   = false;    // buffer for special tag
19
20
    protected int $offset = 0;         // offset for SAX pointer
21
22
    protected string  $currentBuffer;             // the chunk of text currently being parsed
23
    protected int     $len;                       // length of the currentBuffer
24
    protected array   $segments;                  // array of translations
25
    protected array   $lastTransUnit                  = [];
26
    protected int     $segmentInUnitPosition          = 0;
27
    protected ?string $currentTransUnitId             = null;        // id of current <trans-unit>
28
    protected ?string $currentTransUnitIsTranslatable = null; // 'translate' attribute of current <trans-unit>
29
    protected bool    $unitContainsMda                = false;   // check if <unit> already contains a <mda:metadata> (forXliff v 2.*)
30
    protected bool    $hasWrittenCounts               = false;  // check if <unit> already wrote segment counts (forXliff v 2.*)
31
    protected string  $targetLang;
32
    protected bool    $sourceInTarget                 = false;
33
34
    protected array $nodesToBuffer;
35
36
    protected array $transUnits;
37
38
    /** @var int */
39
    protected int $xliffVersion;
40
41
    /**
42
     * @var XliffReplacerCallbackInterface|null
43
     */
44
    protected ?XliffReplacerCallbackInterface $callback;
45
46
    protected ?LoggerInterface $logger;
47
48
    protected static $INTERNAL_TAG_PLACEHOLDER;
49
50
    protected $counts = [
51
            'raw_word_count' => 0,
52
            'eq_word_count'  => 0,
53
    ];
54
55
    /**
     * AbstractXliffReplacer constructor.
     *
     * Regenerates the shared (static) entity placeholder, makes sure the output file
     * exists, opens the input and output streams and stores the replacement settings.
     *
     * @param string                              $originalXliffPath path of the XLIFF file to read
     * @param int                                 $xliffVersion      2 selects <unit>; any other value selects <trans-unit>
     * @param array                               $segments          array of translations
     * @param array                               $transUnits        segment ids, indexed by trans-unit id
     * @param string                              $trgLang           target language
     * @param string                              $outputFilePath    path of the file to write
     * @param bool                                $setSourceInTarget stored in $this->sourceInTarget
     * @param LoggerInterface|null                $logger
     * @param XliffReplacerCallbackInterface|null $callback
     *
     * @throws RuntimeException when the original XLIFF cannot be opened
     */
    public function __construct(
            string                          $originalXliffPath,
            int                             $xliffVersion,
            array                           $segments,
            array                           $transUnits,
            string                          $trgLang,
            string                          $outputFilePath,
            bool                            $setSourceInTarget,
            // explicit nullable types: implicit nullability through "= null" is deprecated since PHP 8.4
            ?LoggerInterface                $logger = null,
            ?XliffReplacerCallbackInterface $callback = null
    ) {
        self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();
        $this->createOutputFileIfDoesNotExist( $outputFilePath );
        $this->setFileDescriptors( $originalXliffPath, $outputFilePath );
        $this->xliffVersion   = $xliffVersion;
        $this->tuTagName      = ( $this->xliffVersion === 2 ) ? 'unit' : 'trans-unit';
        $this->segments       = $segments;
        $this->targetLang     = $trgLang;
        $this->sourceInTarget = $setSourceInTarget;
        $this->transUnits     = $transUnits;
        $this->logger         = $logger;
        $this->callback       = $callback;
    }
91
92
    /**
     * Read the original XLIFF in 4096-byte chunks, obfuscate its entities and feed
     * every chunk to the SAX parser, whose handlers write the output stream.
     *
     * @return void
     *
     * @throws RuntimeException when the SAX parser reports an XML error
     */
    public function replaceTranslation() {
        fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

        //create Sax parser
        $xmlParser = $this->initSaxParser();

        // obfuscate entities because sax automatically does html_entity_decode
        $entityPattern     = "/&(.*?);/";
        $entityReplacement = self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER;

        while ( $this->currentBuffer = fread( $this->originalFP, 4096 ) ) {

            // avoid cutting entities in half:
            // the last fread could have truncated an entity (say, '&lt;' in '&l'), thus invalidating the escaping.
            // 9 is the max length of an entity: while an unescaped '&' sits in the last 9 bytes of the
            // buffer, fetch 9 more bytes and retry; an '&' followed by more than 9 bytes can't be an entity.
            while ( true ) {
                $temporary_check_buffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
                $_ampPos                = strpos( $temporary_check_buffer, '&' );

                //check for real entity or escape it to safely exit from the loop!!!
                if ( $_ampPos === false || ( strlen( $temporary_check_buffer ) - $_ampPos ) > 9 ) {
                    break;
                }

                //if an entity is still present, fetch some more and repeat the escaping
                $extraBytes = fread( $this->originalFP, 9 );

                // nothing left to read: the buffer can no longer grow, so stop here instead of
                // looping forever on a trailing '&' (it is escaped as 'amp' just below)
                if ( $extraBytes === false || $extraBytes === '' ) {
                    break;
                }

                $this->currentBuffer .= $extraBytes;
            }

            $this->currentBuffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
            $this->currentBuffer = str_replace( "&", self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER, $this->currentBuffer );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            //parse chunk of text
            if ( !xml_parse( $xmlParser, $this->currentBuffer, feof( $this->originalFP ) ) ) {
                //if unable, raise an exception
                throw new RuntimeException( sprintf(
                        "XML error: %s at line %d",
                        xml_error_string( xml_get_error_code( $xmlParser ) ),
                        xml_get_current_line_number( $xmlParser )
                ) );
            }

            /*
             * Get the accumulated this->offset in the document:
             * as long as SAX pointer advances, we keep track of total bytes it has seen so far;
             * this way, we can translate its global pointer in an address local to the current buffer of text to retrieve the last char of tag
             */
            $this->offset += $this->len;
        }

        // close Sax parser
        $this->closeSaxParser( $xmlParser );
    }
154
155
    /**
     * Return the character found at the SAX pointer position, used to detect empty tags.
     *
     * The parser byte index is global to the whole stream read so far; subtracting
     * $this->offset turns it into a position inside the current buffer. When that position
     * falls outside the current buffer (the tag was truncated by the chunked read and its
     * end belongs to the next chunk), the last character of the current chunk is used.
     *
     * @param resource $parser
     *
     * @return string
     */
    protected function getLastCharacter( $parser ): string {
        // position of the end of the current tag, local to the current buffer
        $localIndex = xml_get_current_byte_index( $parser ) - $this->offset;

        // if the whole tag fitted in the buffer this is the symbol before '>' ('/' for an empty tag),
        // otherwise simply fall back to the last character of the chunk
        return $this->currentBuffer[ $localIndex ] ?? $this->currentBuffer[ $this->len - 1 ];
    }
183
184
    /**
     * Build the random placeholder used to obfuscate XML entities: "§" followed by
     * exactly 4 random alphanumeric characters.
     *
     * random_int() is used so that no optional extension is required and the suffix
     * always has 4 characters that are safe to embed in a regular expression.
     *
     * @return string
     */
    private function getInternalTagPlaceholder(): string {
        $alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
        $maxIndex = strlen( $alphabet ) - 1;

        $suffix = '';
        for ( $i = 0; $i < 4; $i++ ) {
            $suffix .= $alphabet[ random_int( 0, $maxIndex ) ];
        }

        return "§" . $suffix;
    }
199
200
    /**
     * Create the output file when it is not already on disk.
     *
     * @param string $outputFilePath
     */
    private function createOutputFileIfDoesNotExist( string $outputFilePath ) {
        if ( file_exists( $outputFilePath ) ) {
            // nothing to do, the file is already there
            return;
        }

        touch( $outputFilePath );
    }
206
207
    /**
     * Open the output stream (read/write, truncated) and the original XLIFF (read only).
     *
     * @param string $originalXliffPath
     * @param string $outputFilePath
     *
     * @throws RuntimeException when either stream cannot be opened
     */
    private function setFileDescriptors( string $originalXliffPath, string $outputFilePath ) {
        $this->outputFP = fopen( $outputFilePath, 'w+' );

        // fail right away instead of at the first fwrite() on an invalid handle
        if ( $this->outputFP === false ) {
            throw new RuntimeException( "could not open XML output" );
        }

        $streamArgs = null;

        $this->originalFP = fopen( $originalXliffPath, "r", false, stream_context_create( $streamArgs ) );

        if ( !$this->originalFP ) {
            throw new RuntimeException( "could not open XML input" );
        }
    }
220
221
    /**
     * AbstractXliffReplacer destructor.
     *
     * Closes the input and output streams if they are still open.
     */
    public function __destruct() {
        // the streams can be closed outside the class (to permit multiple concurrent downloads):
        // is_resource() is false for a closed or never opened handle, so fclose() is only called
        // on a live stream. On PHP 8 calling fclose() on a closed stream throws a TypeError,
        // which the @ operator cannot silence.
        foreach ( [ $this->originalFP, $this->outputFP ] as $stream ) {
            if ( is_resource( $stream ) ) {
                fclose( $stream );
            }
        }
    }
230
231
    /**
     * Init Sax parser
     *
     * Creates a UTF-8 parser with case folding disabled and registers tagOpen(),
     * tagClose() and characterData() of this object as its handlers.
     *
     * @return resource|\XMLParser a resource on PHP 7, an XMLParser instance on PHP 8
     */
    protected function initSaxParser() {
        $xmlSaxParser = xml_parser_create( 'UTF-8' );
        xml_parser_set_option( $xmlSaxParser, XML_OPTION_CASE_FOLDING, false );

        // handlers are bound as [ object, method ] callables: xml_set_object() with
        // bare method names is deprecated as of PHP 8.4
        xml_set_element_handler( $xmlSaxParser, [ $this, 'tagOpen' ], [ $this, 'tagClose' ] );
        xml_set_character_data_handler( $xmlSaxParser, [ $this, 'characterData' ] );

        return $xmlSaxParser;
    }
245
246
    /**
     * Release the SAX parser created by initSaxParser().
     *
     * @param resource $xmlSaxParser
     *
     * @return void
     */
    protected function closeSaxParser( $xmlSaxParser ) {
        xml_parser_free( $xmlSaxParser );
    }
252
253
    /**
254
     * @param resource $parser
255
     * @param string   $name
256
     * @param array    $attr
257
     *
258
     * @return mixed
259
     */
260
    abstract protected function tagOpen( $parser, string $name, array $attr );
261
262
    /**
263
     * @param resource $parser
264
     * @param string   $name
265
     *
266
     * @return mixed
267
     */
268
    abstract protected function tagClose( $parser, string $name );
269
270
    /**
     * SAX character data handler.
     *
     * While the buffer is active the text is appended to the CDATA buffer; otherwise it
     * is written to the output stream, unless we are inside a <target> (whose original
     * content is not written).
     *
     * @param resource $parser not used, required by the SAX handler signature
     * @param string   $data
     *
     * @return void
     */
    protected function characterData( $parser, string $data ): void {
        if ( $this->bufferIsActive ) {
            $this->CDATABuffer .= $data;

            return;
        }

        // don't write <target> data
        if ( !$this->inTarget ) {
            $this->postProcAndFlush( $this->outputFP, $data );
        }
    }
284
285
    /**
286
     * postprocess escaped data and write to disk
287
     *
288
     * @param resource $fp
289
     * @param string   $data
290
     * @param ?bool    $treatAsCDATA
291
     */
292
    protected function postProcAndFlush( $fp, string $data, ?bool $treatAsCDATA = false ) {
293
        //postprocess string
294
        $data = preg_replace( "/" . self::$INTERNAL_TAG_PLACEHOLDER . '(.*?)' . self::$INTERNAL_TAG_PLACEHOLDER . "/", '&$1;', $data );
295
        $data = str_replace( '&nbsp;', ' ', $data );
296
        if ( !$treatAsCDATA ) {
0 ignored issues
show
Bug Best Practice introduced by
The expression $treatAsCDATA of type boolean|null is loosely compared to false; this is ambiguous if the boolean can be false. You might want to explicitly use !== null instead.

If an expression can have both false, and null as possible values. It is generally a good practice to always use strict comparison to clearly distinguish between those two values.

$a = canBeFalseAndNull();

// Instead of
if ( ! $a) { }

// Better use one of the explicit versions:
if ($a !== null) { }
if ($a !== false) { }
if ($a !== null && $a !== false) { }
Loading history...
297
            //unix2dos
298
            $data = str_replace( "\r\n", "\r", $data );
299
            $data = str_replace( "\n", "\r", $data );
300
            $data = str_replace( "\r", "\r\n", $data );
301
        }
302
303
        //flush to disk
304
        fwrite( $fp, $data );
305
    }
306
307
    /**
     * When the opening tag is a <trans-unit> (xliff v1.*) or a <unit> (xliff v2.*),
     * record its id and 'translate' attribute and load the segments of the unit.
     *
     * @param string $name tag name
     * @param array  $attr tag attributes
     *
     * @return void
     */
    protected function handleOpenUnit( string $name, array $attr ) {

        // check if we are entering into a <trans-unit> (xliff v1.*) or <unit> (xliff v2.*)
        if ( $this->tuTagName !== $name ) {
            return;
        }

        $this->inTU = true;

        // get id
        // trim to first 100 characters because this is the limit on Matecat's DB;
        // a unit without an id yields an empty id instead of an undefined index warning
        $this->currentTransUnitId = substr( (string)( $attr[ 'id' ] ?? '' ), 0, 100 );

        // `translate` attribute can be only yes or no
        // current 'translate' attribute of the current trans-unit
        $this->currentTransUnitIsTranslatable = empty( $attr[ 'translate' ] ) ? 'yes' : $attr[ 'translate' ];

        $this->setLastTransUnitSegments();
    }
331
332
    /**
     * Decorate the opening <xliff> tag: append the MateCat namespace when it is not
     * declared yet and overwrite the trgLang attribute with the target language.
     * Any other tag is returned untouched.
     *
     * @param string $name tag name
     * @param array  $attr tag attributes
     * @param string $tag  the tag markup built so far
     *
     * @return string
     */
    protected function handleOpenXliffTag( string $name, array $attr, string $tag ): string {

        if ( $name !== 'xliff' ) {
            return $tag;
        }

        // Add MateCat specific namespace.
        $namespaceAlreadyDeclared = array_key_exists( 'xmlns:mtc', $attr );
        if ( !$namespaceAlreadyDeclared ) {
            $tag .= ' xmlns:mtc="https://www.matecat.com" ';
        }

        // Add trgLang
        $trgLangAttribute = 'trgLang="' . $this->targetLang . '"';

        return preg_replace( '/trgLang="(.*?)"/', $trgLangAttribute, $tag );
    }
353
354
    /**
     * Update the inTarget flag when a <target> is opened: the flag is raised unless
     * the current trans-unit is marked as not translatable.
     *
     * @param string $name tag name
     *
     * @return void
     */
    protected function checkSetInTarget( string $name ) {
        // only a <target> can change the flag
        if ( 'target' !== $name ) {
            return;
        }

        $this->inTarget = ( $this->currentTransUnitIsTranslatable !== 'no' );
    }
369
370
    /**
     * Activate the buffer when the given tag is one of the nodes to buffer.
     *
     * @param string $name tag name
     *
     * @return void
     */
    protected function setInBuffer( string $name ) {
        // strict comparison: tag names must match exactly, without type juggling
        if ( in_array( $name, $this->nodesToBuffer, true ) ) {
            $this->bufferIsActive = true;
        }
    }
380
381
    /**
     * Register the word counts of a segment, indexed by its 'sid', and add them to the
     * running totals. The equivalent word count is truncated to two decimals.
     *
     * @param array $seg segment data; reads 'sid', 'raw_word_count' and 'eq_word_count'
     */
    protected function updateSegmentCounts( array $seg = [] ) {

        $rawWords = $seg[ 'raw_word_count' ];
        $eqWords  = floor( $seg[ 'eq_word_count' ] * 100 ) / 100;

        $segmentCounts = [
                'raw_word_count' => $rawWords,
                'eq_word_count'  => $eqWords,
        ];

        $this->counts[ 'segments_count_array' ][ $seg[ 'sid' ] ] = $segmentCounts;

        $this->counts[ 'raw_word_count' ] += $rawWords;
        $this->counts[ 'eq_word_count' ]  += $eqWords;
    }
397
398
    /**
     * Clear the per-segment counts and zero the totals; any other key is left untouched.
     */
    protected function resetCounts() {
        $this->counts = array_replace( $this->counts, [
                'segments_count_array' => [],
                'raw_word_count'       => 0,
                'eq_word_count'        => 0,
        ] );
    }
403
404
    /**
     * Close the given opening tag, as an empty tag ("/>") when the last character
     * read by the parser is a '/', and send it to the CDATA buffer when the buffer is
     * active or to the output stream otherwise.
     *
     * @param resource $parser
     * @param string   $tag the tag markup built so far, without its ending
     *
     * @return void
     */
    protected function checkForSelfClosedTagAndFlush( $parser, string $tag ) {

        //detect empty tag
        $this->isEmpty = ( '/' === $this->getLastCharacter( $parser ) );

        //trim last space, then add tag ending
        $closedTag = rtrim( $tag ) . ( $this->isEmpty ? '/' : '' ) . ">";

        if ( !$this->bufferIsActive ) {
            $this->postProcAndFlush( $this->outputFP, $closedTag );

            return;
        }

        // we are opening a critical CDATA section:
        // these are NOT source/seg-source/value empty tags, THERE IS A CONTENT, write it in buffer
        $this->CDATABuffer .= $closedTag;
    }
435
436
    /**
     * A trans-unit can contain a list of segments because of mrk tags.
     * Copy the list of segments of the current trans-unit into $this->lastTransUnit.
     *
     * The segment grouping information is lost at the end of every cycle, while the
     * data of the last parsed trans-unit (normally more than 1 db row because of mrk
     * tags) is still needed by the next tagOpen (possible sdl:seg-defs), so it is kept
     * in a separate structure.
     *
     * @return void
     */
    protected function setLastTransUnitSegments() {

        $this->lastTransUnit = [];

        $segmentIds = $this->transUnits[ $this->currentTransUnitId ] ?? null;
        if ( $segmentIds === null ) {
            return;
        }

        $previousId = null;
        for ( $position = 0, $total = count( $segmentIds ); $position < $total; $position++ ) {
            $id = $segmentIds[ $position ];

            // skip ids without a known segment
            if ( !isset( $this->segments[ $id ] ) ) {
                continue;
            }

            // after the first position, only an id that directly follows the last accepted one is taken
            if ( $position != 0 && $previousId + 1 != $id ) {
                continue;
            }

            $previousId            = $id;
            $this->lastTransUnit[] = $this->segments[ $id ];
        }
    }
472
473
    /**
     * Return the segment at the current position when the current trans-unit is
     * translatable and known; an empty array otherwise.
     *
     * @return array
     */
    protected function getCurrentSegment(): array {
        $isTranslatable = $this->currentTransUnitIsTranslatable !== 'no';

        if ( !$isTranslatable || !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return [];
        }

        // a missing segment yields an empty array: returning null would break the array return type
        return $this->segments[ $this->segmentInUnitPosition ] ?? [];
    }
483
484
}