Passed
Pull Request — master (#90)
by Domenico
02:53
created

AbstractXliffReplacer::__construct()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 22
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 2
eloc 11
nc 2
nop 9
dl 0
loc 22
rs 9.9
c 3
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent as soon as you need more or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
namespace Matecat\XliffParser\XliffReplacer;
4
5
use Psr\Log\LoggerInterface;
6
use RuntimeException;
7
8
abstract class AbstractXliffReplacer {
9
    protected $originalFP;
10
    protected $outputFP;                  // output stream pointer
11
12
    protected string $tuTagName;                 // <trans-unit> (forXliff v 1.*) or <unit> (forXliff v 2.*)
13
    protected bool   $inTU             = false;  // flag to check whether we are in a <trans-unit>
14
    protected bool   $inTarget         = false;  // flag to check whether we are in a <target>, to ignore everything
15
    protected bool   $isEmpty          = false;  // flag to check whether we are in an empty tag (<tag/>)
16
    protected bool   $targetWasWritten = false;  // flag to check is <target> was written in the current unit
17
    protected string $CDATABuffer      = "";       // buffer for special tag
18
    protected bool   $bufferIsActive   = false;    // buffer for special tag
19
20
    protected int $offset = 0;         // offset for SAX pointer
21
22
    protected string  $currentBuffer;             // the current piece of text it's been parsed
23
    protected int     $len;                       // length of the currentBuffer
24
    protected array   $segments;                  // array of translations
25
    protected array   $lastTransUnit                  = [];
26
    protected int     $segmentInUnitPosition          = 0;
27
    protected ?string $currentTransUnitId             = null;        // id of current <trans-unit>
28
    protected ?string $currentTransUnitIsTranslatable = null; // 'translate' attribute of current <trans-unit>
29
    protected bool    $hasWrittenCounts               = false;  // check if <unit> already wrote segment counts (forXliff v 2.*)
30
    protected string  $targetLang;
31
    protected bool    $sourceInTarget                 = false;
32
33
    protected array $nodesToBuffer;
34
35
    protected array $transUnits;
36
37
    /** @var int */
38
    protected int $xliffVersion;
39
40
    /**
41
     * @var XliffReplacerCallbackInterface|null
42
     */
43
    protected ?XliffReplacerCallbackInterface $callback;
44
45
    protected ?LoggerInterface $logger;
46
47
    protected static $INTERNAL_TAG_PLACEHOLDER;
48
49
    protected $counts = [
50
            'raw_word_count' => 0,
51
            'eq_word_count'  => 0,
52
    ];
53
54
    /**
     * AbstractXliffReplacer constructor.
     *
     * Generates the per-run entity placeholder, makes sure the output file exists,
     * opens the input and output streams and stores the replacement settings.
     *
     * @param string                              $originalXliffPath path of the XLIFF file to read
     * @param int                                 $xliffVersion      2 selects the <unit> tag name, any other value <trans-unit>
     * @param array                               $segments          array of translations
     * @param array                               $transUnits        lists of segment ids indexed by trans-unit id
     * @param string                              $trgLang           target language written into the trgLang attribute
     * @param string                              $outputFilePath    path of the file to write (created if missing)
     * @param bool                                $setSourceInTarget stored as $sourceInTarget; presumably tells subclasses
     *                                                               to copy the source into the target (confirm there)
     * @param LoggerInterface|null                $logger            optional logger, stored for subclasses
     * @param XliffReplacerCallbackInterface|null $callback          optional callback, stored for subclasses
     *
     * @throws RuntimeException when the original XLIFF cannot be opened
     */
    public function __construct(
            string                          $originalXliffPath,
            int                             $xliffVersion,
            array                           $segments,
            array                           $transUnits,
            string                          $trgLang,
            string                          $outputFilePath,
            bool                            $setSourceInTarget,
            ?LoggerInterface                $logger = null,
            ?XliffReplacerCallbackInterface $callback = null
    ) {
        // the placeholder must exist before any chunk is escaped in replaceTranslation()
        self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();
        $this->createOutputFileIfDoesNotExist( $outputFilePath );
        $this->setFileDescriptors( $originalXliffPath, $outputFilePath );
        $this->xliffVersion   = $xliffVersion;
        $this->tuTagName      = ( $this->xliffVersion === 2 ) ? 'unit' : 'trans-unit';
        $this->segments       = $segments;
        $this->targetLang     = $trgLang;
        $this->sourceInTarget = $setSourceInTarget;
        $this->transUnits     = $transUnits;
        $this->logger         = $logger;
        $this->callback       = $callback;
    }
90
91
    /**
     * Streams the original XLIFF through the SAX parser chunk by chunk.
     *
     * Writes the XML declaration to the output stream, then reads the input in
     * 4096-byte chunks. Every entity in a chunk is replaced with a placeholder-wrapped
     * token before parsing, because the SAX parser would otherwise decode it; the
     * handlers turn the tokens back into entities when they flush.
     *
     * @return void
     *
     * @throws RuntimeException when a chunk cannot be parsed (raised by runParser())
     */
    public function replaceTranslation() {
        fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

        //create Sax parser
        $xmlParser = $this->initSaxParser();

        // loop-invariant pieces of the entity obfuscation
        $entityPattern     = "/&(.*?);/";
        $entityReplacement = self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER;

        while ( $this->currentBuffer = fread( $this->originalFP, 4096 ) ) {
            // obfuscate entities because sax automatically does html_entity_decode
            $escapedBuffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );

            // Avoid cutting entities in half: the read may have truncated an entity
            // (say '&lt;' into '&l'). 9 is the max length of an entity, so while an
            // unescaped '&' sits within the last 9 bytes, fetch 9 more bytes and escape again.
            while ( true ) {
                $_ampPos = strpos( $escapedBuffer, '&' );

                // no '&' left, or it is too far from the end to be a truncated entity
                if ( $_ampPos === false || strlen( $escapedBuffer ) - $_ampPos > 9 ) {
                    break;
                }

                $extraBytes = fread( $this->originalFP, 9 );

                // Nothing more can complete the entity once the input is exhausted (or the
                // read failed): stop here instead of looping forever on an unchanged buffer.
                // The stray '&' is escaped as 'amp' below.
                if ( $extraBytes === false || ( $extraBytes === '' && feof( $this->originalFP ) ) ) {
                    break;
                }

                $this->currentBuffer .= $extraBytes;
                $escapedBuffer       = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
            }

            // $escapedBuffer is the escaped form of the current buffer at this point;
            // any '&' still present is not part of an entity
            $this->currentBuffer = str_replace( "&", self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER, $escapedBuffer );
            unset( $escapedBuffer );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            /*
             * Accumulate the offset in the document: as the SAX pointer advances we keep
             * track of the total bytes handed to it so far, so getLastCharacter() can turn
             * its global position into an index local to the current buffer.
             */
            $this->offset += $this->len;

            //parse chunk of text
            $this->runParser( $xmlParser );
        }

        // close Sax parser
        $this->closeSaxParser( $xmlParser );
    }
148
149
    /**
     * Feeds the current buffer to the SAX parser.
     *
     * The chunk is flagged as the final one when the input stream has reached EOF.
     *
     * @param resource|\XMLParser $xmlParser parser created by initSaxParser()
     *
     * @return void
     *
     * @throws RuntimeException with the parser's error string and line number when parsing fails
     */
    protected function runParser( $xmlParser ) {
        $isFinalChunk = feof( $this->originalFP );

        if ( xml_parse( $xmlParser, $this->currentBuffer, $isFinalChunk ) ) {
            return;
        }

        $errorText = xml_error_string( xml_get_error_code( $xmlParser ) );
        $errorLine = xml_get_current_line_number( $xmlParser );

        throw new RuntimeException( sprintf( "XML error: %s at line %d", $errorText, $errorLine ) );
    }
165
166
    /**
     * Returns the character used to detect empty tags (<tag/>).
     *
     * The parser's global byte index is made relative to the current buffer by
     * subtracting the accumulated offset. When that position exists in the buffer its
     * character is returned; otherwise (the tag may have been cut in half by the
     * current read) the last character of the chunk is used instead.
     * NOTE(review): the relative index is presumably zero or negative, i.e. counted
     * from the end of the buffer, because the offset already includes the current chunk.
     *
     * @param resource|\XMLParser $parser
     *
     * @return string
     */
    protected function getLastCharacter( $parser ): string {
        $relativeIndex = xml_get_current_byte_index( $parser ) - $this->offset;

        return isset( $this->currentBuffer[ $relativeIndex ] )
                ? $this->currentBuffer[ $relativeIndex ]
                : $this->currentBuffer[ $this->len - 1 ];
    }
194
195
    /**
     * Builds the random marker used to wrap obfuscated entities.
     *
     * The marker is "§" followed by up to four characters taken from the base64
     * encoding of 10 random bytes, with '+' and '/' removed.
     *
     * @return string
     *
     * @throws \Exception when no source of randomness is available
     */
    private function getInternalTagPlaceholder(): string {
        // random_bytes() is part of the PHP core, so ext-openssl is not needed here
        $encoded = base64_encode( random_bytes( 10 ) );

        return "§" . substr( str_replace( [ '+', '/' ], '', $encoded ), 0, 4 );
    }
210
211
    /**
     * Creates an empty output file unless something already exists at that path.
     *
     * @param string $outputFilePath
     *
     * @return void
     */
    private function createOutputFileIfDoesNotExist( string $outputFilePath ) {
        if ( file_exists( $outputFilePath ) ) {
            return;
        }

        touch( $outputFilePath );
    }
217
218
    /**
     * Opens the output stream (read/write, truncating) and the input stream (read only).
     *
     * @param string $originalXliffPath
     * @param string $outputFilePath
     *
     * @return void
     *
     * @throws RuntimeException when either stream cannot be opened
     */
    private function setFileDescriptors( string $originalXliffPath, string $outputFilePath ) {
        $outputFP = fopen( $outputFilePath, 'w+' );

        // fail here rather than on the first fwrite() against a false handle
        if ( $outputFP === false ) {
            throw new RuntimeException( "could not open XML output" );
        }

        $this->outputFP = $outputFP;

        $streamArgs = null;

        if ( !( $this->originalFP = fopen( $originalXliffPath, "r", false, stream_context_create( $streamArgs ) ) ) ) {
            throw new RuntimeException( "could not open XML input" );
        }
    }
231
232
    /**
     * AbstractXliffReplacer destructor.
     *
     * Closes the input stream and then the output stream. Each handle is checked
     * first because it can already have been closed outside the class (to permit
     * multiple concurrent downloads).
     */
    public function __destruct() {
        foreach ( [ $this->originalFP, $this->outputFP ] as $stream ) {
            if ( is_resource( $stream ) ) {
                fclose( $stream );
            }
        }
    }
247
248
    /**
     * Creates the UTF-8 SAX parser and binds this object's handlers to it.
     *
     * Case folding is switched off, and tagOpen(), tagClose() and characterData()
     * are registered as the element and character-data callbacks.
     *
     * @return resource|\XMLParser a resource on PHP 7, an XMLParser object on PHP 8
     */
    protected function initSaxParser() {
        $sax = xml_parser_create( 'UTF-8' );

        xml_parser_set_option( $sax, XML_OPTION_CASE_FOLDING, false );

        // handler names given below are resolved as methods of this object
        xml_set_object( $sax, $this );
        xml_set_element_handler( $sax, 'tagOpen', 'tagClose' );
        xml_set_character_data_handler( $sax, 'characterData' );

        return $sax;
    }
262
263
    /**
     * Releases the SAX parser once the whole input has been processed.
     *
     * @param resource|\XMLParser $xmlSaxParser parser created by initSaxParser()
     *
     * @return void
     */
    protected function closeSaxParser( $xmlSaxParser ) {
        xml_parser_free( $xmlSaxParser );
    }
269
270
    /**
     * Start-element handler, registered with the SAX parser by initSaxParser().
     *
     * Concrete replacers implement the handling of every opening tag here.
     *
     * @param resource $parser the SAX parser invoking the handler
     * @param string   $name   name of the element being opened
     * @param array    $attr   attributes of the element
     *
     * @return mixed
     */
277
    abstract protected function tagOpen( $parser, string $name, array $attr );
278
279
    /**
     * End-element handler, registered with the SAX parser by initSaxParser().
     *
     * Concrete replacers implement the handling of every closing tag here.
     *
     * @param resource $parser the SAX parser invoking the handler
     * @param string   $name   name of the element being closed
     *
     * @return mixed
     */
285
    abstract protected function tagClose( $parser, string $name );
286
287
    /**
     * Character-data handler, registered with the SAX parser by initSaxParser().
     *
     * While a buffer is active the text is appended to the CDATA buffer. Otherwise it
     * is written to the output, unless we are inside a <target>, whose original
     * content is dropped.
     *
     * @param resource|\XMLParser $parser unused here, but it is the first argument of the handler signature
     * @param string              $data   text found between tags
     *
     * @return void
     */
    protected function characterData( $parser, string $data ): void {
        if ( $this->bufferIsActive ) {
            $this->CDATABuffer .= $data;

            return;
        }

        // don't write <target> data
        if ( !$this->inTarget ) {
            $this->postProcAndFlush( $this->outputFP, $data );
        }
    }
301
302
    /**
     * Restores the obfuscated entities in a piece of text and writes it to a stream.
     *
     * Placeholder-wrapped tokens become `&name;` again, `&nbsp;` is replaced by the
     * literal below and, unless the text is treated as CDATA, every line ending is
     * normalised to CRLF.
     * NOTE(review): the `&nbsp;` replacement appears as a plain space in this copy of
     * the file; confirm it is not meant to be a U+00A0 character.
     *
     * @param resource $fp           stream to write to
     * @param string   $data         text to restore and write
     * @param bool     $treatAsCDATA when true, line endings are left untouched
     *
     * @return void
     */
    protected function postProcAndFlush( $fp, string $data, bool $treatAsCDATA = false ) {
        $marker        = self::$INTERNAL_TAG_PLACEHOLDER;
        $entityPattern = "/" . $marker . '(.*?)' . $marker . "/";

        $restored = preg_replace( $entityPattern, '&$1;', $data );
        $restored = str_replace( '&nbsp;', ' ', $restored );

        if ( !$treatAsCDATA ) {
            // unix2dos: the pairs are applied one after the other, collapsing every
            // kind of line break to "\r" before expanding each "\r" to "\r\n"
            $restored = str_replace( [ "\r\n", "\n", "\r" ], [ "\r", "\r", "\r\n" ], $restored );
        }

        //flush to disk
        fwrite( $fp, $restored );
    }
323
324
    /**
     * Records the state for a newly opened <trans-unit> (xliff v1.*) or <unit> (xliff v2.*).
     *
     * Does nothing for any other tag. For a unit it raises the in-unit flag, stores
     * the unit id and its `translate` attribute, and refreshes the list of segments
     * belonging to the unit.
     *
     * @param string $name name of the tag being opened
     * @param array  $attr attributes of the tag
     *
     * @return void
     */
    protected function handleOpenUnit( string $name, array $attr ) {
        if ( $this->tuTagName !== $name ) {
            return;
        }

        $this->inTU = true;

        // trim to first 100 characters because this is the limit on Matecat's DB;
        // a unit without an id is stored as an empty id instead of raising a warning
        $this->currentTransUnitId = substr( $attr[ 'id' ] ?? '', 0, 100 );

        // `translate` attribute can be only yes or no; a missing or empty one means 'yes'
        $this->currentTransUnitIsTranslatable = empty( $attr[ 'translate' ] ) ? 'yes' : $attr[ 'translate' ];

        $this->setLastTransUnitSegments();
    }
348
349
    /**
     * Adjusts the opening <xliff> tag; any other tag is returned unchanged.
     *
     * Appends the MateCat namespace declaration when the tag does not already have an
     * `xmlns:mtc` attribute, and rewrites an existing `trgLang` attribute to the
     * configured target language.
     *
     * @param string $name name of the tag being opened
     * @param array  $attr attributes of the tag
     * @param string $tag  the tag as rebuilt so far
     *
     * @return string
     */
    protected function handleOpenXliffTag( string $name, array $attr, string $tag ): string {
        if ( $name !== 'xliff' ) {
            return $tag;
        }

        $hasMatecatNamespace = array_key_exists( 'xmlns:mtc', $attr );
        if ( !$hasMatecatNamespace ) {
            $tag .= ' xmlns:mtc="https://www.matecat.com" ';
        }

        $targetLangAttribute = 'trgLang="' . $this->targetLang . '"';

        return preg_replace( '/trgLang="(.*?)"/', $targetLangAttribute, $tag );
    }
370
371
    /**
     * Updates the in-target flag when a <target> tag is opened.
     *
     * The flag is raised unless the current unit is marked `translate="no"`;
     * other tags leave it untouched.
     *
     * @param string $name name of the tag being opened
     *
     * @return void
     */
    protected function checkSetInTarget( string $name ) {
        if ( 'target' !== $name ) {
            return;
        }

        $this->inTarget = ( $this->currentTransUnitIsTranslatable !== 'no' );
    }
386
387
    /**
     * Activates buffering when the opened tag is one of the nodes to buffer.
     *
     * @param string $name name of the tag being opened
     *
     * @return void
     */
    protected function setInBuffer( string $name ) {
        $mustBeBuffered = in_array( $name, $this->nodesToBuffer );

        if ( $mustBeBuffered ) {
            $this->bufferIsActive = true;
        }
    }
397
398
    /**
     * Adds one segment's word counts to the running totals.
     *
     * The equivalent word count is truncated to two decimals. Both values are stored
     * under the segment's `sid` and added to the overall totals.
     *
     * @param array $seg segment data; the keys `sid`, `raw_word_count` and `eq_word_count` are read
     *
     * @return void
     */
    protected function updateSegmentCounts( array $seg = [] ) {
        $rawWords = $seg[ 'raw_word_count' ];
        $eqWords  = floor( $seg[ 'eq_word_count' ] * 100 ) / 100;

        $segmentCounts = [
                'raw_word_count' => $rawWords,
                'eq_word_count'  => $eqWords,
        ];

        $this->counts[ 'segments_count_array' ][ $seg[ 'sid' ] ] = $segmentCounts;

        $this->counts[ 'raw_word_count' ] += $rawWords;
        $this->counts[ 'eq_word_count' ]  += $eqWords;
    }
414
415
    /**
     * Clears the per-segment counts and sets both word-count totals back to zero.
     *
     * @return void
     */
    protected function resetCounts() {
        $initialValues = [
                'segments_count_array' => [],
                'raw_word_count'       => 0,
                'eq_word_count'        => 0,
        ];

        foreach ( $initialValues as $key => $initialValue ) {
            $this->counts[ $key ] = $initialValue;
        }
    }
420
421
    /**
     * Completes an opening tag, detecting the self-closed form, and emits it.
     *
     * The tag is right-trimmed, gets a '/' when the parser position shows an empty
     * tag, and is closed with '>'. It then goes into the CDATA buffer when buffering
     * is active, otherwise straight to the output stream.
     *
     * @param resource|\XMLParser $parser
     * @param string              $tag the tag rebuilt so far, without its closing '>'
     *
     * @return void
     */
    protected function checkForSelfClosedTagAndFlush( $parser, string $tag ) {
        $closingChar = $this->getLastCharacter( $parser );

        //detect empty tag
        $this->isEmpty = ( '/' === $closingChar );

        $completeTag = rtrim( $tag ) . ( $this->isEmpty ? $closingChar : '' ) . ">";

        if ( !$this->bufferIsActive ) {
            $this->postProcAndFlush( $this->outputFP, $completeTag );

            return;
        }

        // a buffered section is open: these are NOT empty source/seg-source/value tags,
        // there is content, so keep the tag in the buffer
        $this->CDATABuffer .= $completeTag;
    }
452
453
    /**
     * Copies the segments of the current trans-unit into $lastTransUnit.
     *
     * A trans-unit can contain a list of segments because of mrk tags. The grouping
     * information is lost at the end of every cycle, so the segments are copied here
     * to keep them available for the next tagOpen (possible sdl:seg-defs).
     *
     * A segment is copied only when it exists in $segments and is either the first id
     * of the list or exactly one more than the last id that was copied.
     *
     * @return void
     */
    protected function setLastTransUnitSegments() {
        $this->lastTransUnit = [];

        if ( !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return;
        }

        $segmentIds = $this->transUnits[ $this->currentTransUnitId ];
        $previousId = null;
        $total      = count( $segmentIds );

        for ( $position = 0; $position < $total; $position++ ) {
            $segmentId = $segmentIds[ $position ];

            if ( !isset( $this->segments[ $segmentId ] ) ) {
                continue;
            }

            // past the first position only ids that directly follow the last copied one are taken
            if ( $position != 0 && $previousId + 1 != $segmentId ) {
                continue;
            }

            $previousId            = $segmentId;
            $this->lastTransUnit[] = $this->segments[ $segmentId ];
        }
    }
489
490
    /**
     * Returns the segment at the current position within the unit.
     *
     * An empty array is returned when the unit is marked `translate="no"`, when the
     * unit id is not present in $transUnits, or when no segment is stored at the
     * current position.
     *
     * @return array
     */
    protected function getCurrentSegment(): array {
        $isTranslatable = $this->currentTransUnitIsTranslatable !== 'no';

        if ( !$isTranslatable || !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return [];
        }

        // a missing position would otherwise return null and violate the array return type
        return $this->segments[ $this->segmentInUnitPosition ] ?? [];
    }
500
501
}