Passed
Push — master ( cd421c...e31be6 )
by Domenico
02:52
created

AbstractXliffReplacer::__construct()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 22
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 2
eloc 11
nc 2
nop 9
dl 0
loc 22
rs 9.9
c 3
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent as soon as you need more or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
namespace Matecat\XliffParser\XliffReplacer;
4
5
use Psr\Log\LoggerInterface;
6
use RuntimeException;
7
8
abstract class AbstractXliffReplacer {
9
    protected $originalFP;
10
    protected $outputFP;                  // output stream pointer
11
12
    protected string $tuTagName;                 // <trans-unit> (forXliff v 1.*) or <unit> (forXliff v 2.*)
13
    protected bool   $inTU             = false;  // flag to check whether we are in a <trans-unit>
14
    protected bool   $inTarget         = false;  // flag to check whether we are in a <target>, to ignore everything
15
    protected bool   $isEmpty          = false;  // flag to check whether we are in an empty tag (<tag/>)
16
    protected bool   $targetWasWritten = false;  // flag to check is <target> was written in the current unit
17
    protected string $CDATABuffer      = "";       // buffer for special tag
18
    protected bool   $bufferIsActive   = false;    // buffer for special tag
19
20
    protected int $offset = 0;         // offset for SAX pointer
21
22
    protected string  $currentBuffer;             // the current piece of text it's been parsed
23
    protected int     $len;                       // length of the currentBuffer
24
    protected array   $segments;                  // array of translations
25
    protected array   $lastTransUnit                  = [];
26
    protected int     $segmentInUnitPosition          = 0;
27
    protected ?string $currentTransUnitId             = null;        // id of current <trans-unit>
28
    protected ?string $currentTransUnitIsTranslatable = null; // 'translate' attribute of current <trans-unit>
29
    protected bool    $hasWrittenCounts               = false;  // check if <unit> already wrote segment counts (forXliff v 2.*)
30
    protected string  $targetLang;
31
    protected bool    $sourceInTarget                 = false;
32
33
    protected array $nodesToBuffer;
34
35
    protected array $transUnits;
36
37
    /** @var int */
38
    protected int $xliffVersion;
39
40
    /**
41
     * @var XliffReplacerCallbackInterface|null
42
     */
43
    protected ?XliffReplacerCallbackInterface $callback;
44
45
    protected ?LoggerInterface $logger;
46
47
    protected static $INTERNAL_TAG_PLACEHOLDER;
48
49
    protected $counts = [
50
            'raw_word_count' => 0,
51
            'eq_word_count'  => 0,
52
    ];
53
54
    /**
     * AbstractXliffReplacer constructor.
     *
     * Generates the entity placeholder, makes sure the output file exists, opens the
     * input and output file handles and stores the replacement configuration.
     *
     * @param string                              $originalXliffPath path of the XLIFF file to read
     * @param int                                 $xliffVersion      2 selects <unit> as unit tag, any other value selects <trans-unit>
     * @param array                               $segments          array of translations
     * @param array                               $transUnits        map of trans-unit id => list of segment ids
     * @param string                              $trgLang           target language written into the xliff tag
     * @param string                              $outputFilePath    path of the file to write (created when missing)
     * @param bool                                $setSourceInTarget
     * @param LoggerInterface|null                $logger
     * @param XliffReplacerCallbackInterface|null $callback
     *
     * @throws RuntimeException when the original XLIFF cannot be opened
     */
    public function __construct(
            string                          $originalXliffPath,
            int                             $xliffVersion,
            array                           $segments,
            array                           $transUnits,
            string                          $trgLang,
            string                          $outputFilePath,
            bool                            $setSourceInTarget,
            ?LoggerInterface                $logger = null,
            ?XliffReplacerCallbackInterface $callback = null
    ) {
        // the optional dependencies are declared explicitly nullable: relying on the
        // "= null" default to imply nullability is deprecated as of PHP 8.4
        self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();
        $this->createOutputFileIfDoesNotExist( $outputFilePath );
        $this->setFileDescriptors( $originalXliffPath, $outputFilePath );
        $this->xliffVersion   = $xliffVersion;
        $this->tuTagName      = ( $this->xliffVersion === 2 ) ? 'unit' : 'trans-unit';
        $this->segments       = $segments;
        $this->targetLang     = $trgLang;
        $this->sourceInTarget = $setSourceInTarget;
        $this->transUnits     = $transUnits;
        $this->logger         = $logger;
        $this->callback       = $callback;
    }
90
91
    /**
     * Stream the original XLIFF through the SAX parser, chunk by chunk, writing the
     * result to the output file.
     *
     * Before a chunk is parsed, every entity (&name;) is rewritten as
     * <placeholder>name<placeholder> so that the parser does not decode it;
     * postProcAndFlush() restores the entities when data is written.
     *
     * @return void
     * @throws RuntimeException when the parser rejects a chunk (see runParser())
     */
    public function replaceTranslation() {
        fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

        //create Sax parser
        $xmlParser = $this->initSaxParser();

        $entityPattern     = "/&(.*?);/";
        $entityReplacement = self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER;

        while ( true ) {
            $chunk = fread( $this->originalFP, 4096 );

            // compare strictly: a chunk made of the single character "0" is falsy but is still data
            if ( $chunk === false || $chunk === '' ) {
                break;
            }

            $this->currentBuffer = $chunk;

            /*
               preprocess file
             */
            // obfuscate entities because sax automatically does html_entity_decode
            $escapedBuffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );

            // avoid cutting entities in half:
            // the last fread could have truncated an entity (say, '&lt;' in '&l'), thus invalidating the escaping.
            // 9 is taken as the max length of an entity: when an '&' survives the escaping and fewer than
            // 10 bytes follow it, read 9 more bytes and escape again. An '&' followed by more than that
            // cannot be a truncated entity, so the loop stops.
            while ( true ) {
                $ampPos = strpos( $escapedBuffer, '&' );

                if ( $ampPos === false || strlen( $escapedBuffer ) - $ampPos > 9 ) {
                    break;
                }

                $extraBytes = fread( $this->originalFP, 9 );

                // end of input: nothing can complete the entity any more. Without this exit the
                // buffer would never change and the loop would spin forever on a dangling '&'.
                if ( $extraBytes === false || $extraBytes === '' ) {
                    break;
                }

                $this->currentBuffer .= $extraBytes;
                $escapedBuffer       = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
            }

            // $escapedBuffer already holds the escaped form of the (possibly extended) buffer;
            // any '&' still present is not an entity and is escaped as 'amp'
            $this->currentBuffer = str_replace( "&", self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER, $escapedBuffer );

            //free stuff outside the loop
            unset( $escapedBuffer );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            /*
             * Get the accumulated this->offset in the document:
             * as long as SAX pointer advances, we keep track of total bytes it has seen so far;
             * this way, we can translate its global pointer in an address local to the current buffer of text to retrieve the last char of tag
             */
            $this->offset += $this->len;

            //parse chunk of text
            $this->runParser( $xmlParser );
        }

        // close Sax parser
        $this->closeSaxParser( $xmlParser );
    }
148
149
    /**
     * Feed the current buffer to the SAX parser.
     *
     * The chunk is flagged as the final one when the input stream reports EOF.
     *
     * @param resource $xmlParser parser created by initSaxParser()
     *
     * @return void
     * @throws RuntimeException carrying the parser error message and line when parsing fails
     */
    protected function runParser( $xmlParser ) {
        $isFinalChunk = feof( $this->originalFP );

        if ( xml_parse( $xmlParser, $this->currentBuffer, $isFinalChunk ) ) {
            return;
        }

        // parsing failed: report what the parser says and where
        $errorMessage = xml_error_string( xml_get_error_code( $xmlParser ) );
        $errorLine    = xml_get_current_line_number( $xmlParser );

        throw new RuntimeException( sprintf( "XML error: %s at line %d", $errorMessage, $errorLine ) );
    }
165
166
    /**
     * Return the character of the current buffer the SAX pointer is on, used to detect empty tags.
     *
     * @param resource $parser
     *
     * @return string the character at the parser position, or the last character of the buffer
     *                when that position is not addressable inside the current buffer
     */
    protected function getLastCharacter( $parser ): string {

        // global position of the SAX pointer in the whole stream read so far,
        // translated into an address relative to the current buffer
        $positionInBuffer = xml_get_current_byte_index( $parser ) - $this->offset;

        // a tag may have been cut in half by the current read, its other half arriving with the
        // next buffer: in that case the position is not set here and the last character is used
        $lastPosition = $this->len - 1;

        return $this->currentBuffer[ $positionInBuffer ] ?? $this->currentBuffer[ $lastPosition ];

    }
185
186
    /**
     * Build the random marker used to obfuscate entities while the file is parsed.
     *
     * The marker is "§" followed by the first 4 characters of a base64-encoded random
     * string from which '+' and '/' have been stripped.
     *
     * @return string
     * @throws \Exception when no source of randomness is available (raised by random_bytes())
     */
    private function getInternalTagPlaceholder(): string {
        // random_bytes() is part of the PHP core, so no openssl extension (nor its unused
        // "crypto strong" out-parameter) is needed to get 10 random bytes
        $randomToken = str_replace( [ '+', '/' ], '', base64_encode( random_bytes( 10 ) ) );

        return "§" . substr( $randomToken, 0, 4 );
    }
201
202
    /**
     * Create the output file when nothing exists at the given path yet.
     *
     * @param string $outputFilePath
     *
     * @return void
     */
    private function createOutputFileIfDoesNotExist( string $outputFilePath ) {
        if ( file_exists( $outputFilePath ) ) {
            // already there, nothing to create
            return;
        }

        touch( $outputFilePath );
    }
208
209
    /**
     * Open the output file for writing (truncating it) and the original XLIFF for reading.
     *
     * @param string $originalXliffPath
     * @param string $outputFilePath
     *
     * @return void
     * @throws RuntimeException when either file cannot be opened
     */
    private function setFileDescriptors( string $originalXliffPath, string $outputFilePath ) {
        $outputFP = fopen( $outputFilePath, 'w+' );

        // fail here rather than later on the first fwrite() against an invalid handle
        if ( $outputFP === false ) {
            throw new RuntimeException( "could not open XML output" );
        }

        $this->outputFP = $outputFP;

        // an empty context is still passed explicitly; calling stream_context_create()
        // without arguments avoids handing it null, which was not accepted before PHP 8.0
        $originalFP = fopen( $originalXliffPath, "r", false, stream_context_create() );

        if ( $originalFP === false ) {
            throw new RuntimeException( "could not open XML input" );
        }

        $this->originalFP = $originalFP;
    }
222
223
    /**
     * AbstractXliffReplacer destructor.
     *
     * Closes the input and the output handle, in this order, when they are still open.
     */
    public function __destruct() {
        // the streams can be closed outside the class (to permit multiple concurrent
        // downloads), so only handles that are still valid resources are closed
        foreach ( [ $this->originalFP, $this->outputFP ] as $filePointer ) {
            if ( is_resource( $filePointer ) ) {
                fclose( $filePointer );
            }
        }
    }
238
239
    /**
     * Init Sax parser
     *
     * Creates a UTF-8 parser bound to this object, turns case folding off and registers
     * tagOpen/tagClose as element handlers and characterData as character data handler.
     *
     * @return resource|\XMLParser a resource before PHP 8.0, an XMLParser instance from PHP 8.0 on
     */
    protected function initSaxParser() {
        $xmlSaxParser = xml_parser_create( 'UTF-8' );
        xml_set_object( $xmlSaxParser, $this );
        xml_parser_set_option( $xmlSaxParser, XML_OPTION_CASE_FOLDING, false );
        xml_set_element_handler( $xmlSaxParser, 'tagOpen', 'tagClose' );
        xml_set_character_data_handler( $xmlSaxParser, 'characterData' );

        return $xmlSaxParser;
    }
253
254
    /**
     * Release the SAX parser.
     *
     * @param resource|\XMLParser $xmlSaxParser parser to free (an XMLParser instance from PHP 8.0 on)
     *
     * @return void
     */
    protected function closeSaxParser( $xmlSaxParser ) {
        xml_parser_free( $xmlSaxParser );
    }
260
261
    /**
     * SAX handler registered for opening tags.
     *
     * @param resource|\XMLParser $parser parser invoking the handler (an XMLParser instance from PHP 8.0 on)
     * @param string              $name   tag name
     * @param array               $attr   attributes of the tag
     *
     * @return mixed
     */
    abstract protected function tagOpen( $parser, string $name, array $attr );
269
270
    /**
     * SAX handler registered for closing tags.
     *
     * @param resource|\XMLParser $parser parser invoking the handler (an XMLParser instance from PHP 8.0 on)
     * @param string              $name   tag name
     *
     * @return mixed
     */
    abstract protected function tagClose( $parser, string $name );
277
278
    /**
     * SAX handler registered for character data.
     *
     * While a buffer is active the data is appended to the CDATA buffer; otherwise it is
     * written to the output unless we are inside a <target>.
     *
     * @param resource|\XMLParser $parser unused, required by the handler signature
     * @param string              $data
     *
     * @return void
     */
    protected function characterData( $parser, string $data ): void {
        if ( $this->bufferIsActive ) {
            $this->CDATABuffer .= $data;

            return;
        }

        // don't write <target> data
        if ( !$this->inTarget ) {
            $this->postProcAndFlush( $this->outputFP, $data );
        }
    }
292
293
    /**
     * Restore the obfuscated entities in a piece of data and write it to a stream.
     *
     * @param resource $fp           stream to write to
     * @param string   $data         text still containing the internal entity placeholders
     * @param bool     $treatAsCDATA when true the line endings are left untouched
     */
    protected function postProcAndFlush( $fp, string $data, bool $treatAsCDATA = false ) {
        $placeholder = self::$INTERNAL_TAG_PLACEHOLDER;

        // turn <placeholder>name<placeholder> back into &name;
        $restored = preg_replace( "/" . $placeholder . '(.*?)' . $placeholder . "/", '&$1;', $data );

        // NOTE(review): the replacement is reproduced as it appears in this view (a plain space);
        // confirm against the repository that it is not meant to be a literal U+00A0
        $restored = str_replace( '&nbsp;', ' ', $restored );

        if ( !$treatAsCDATA ) {
            // unix2dos: the pairs are applied one after the other, so every \r\n, \n and \r ends up as \r\n
            $restored = str_replace( [ "\r\n", "\n", "\r" ], [ "\r", "\r", "\r\n" ], $restored );
        }

        //flush to disk
        fwrite( $fp, $restored );
    }
314
315
    /**
     * Track the opening of a <trans-unit> (xliff v1.*) or <unit> (xliff v2.*).
     *
     * For any other tag nothing happens. For a unit tag the unit id and its 'translate'
     * attribute are stored and the list of segments of the unit is refreshed.
     *
     * @param string $name tag name
     * @param array  $attr attributes of the tag
     *
     * @return void
     */
    protected function handleOpenUnit( string $name, array $attr ) {

        // check if we are entering into a <trans-unit> (xliff v1.*) or <unit> (xliff v2.*)
        if ( $this->tuTagName !== $name ) {
            return;
        }

        $this->inTU = true;

        // get id
        // trim to first 100 characters because this is the limit on Matecat's DB;
        // a unit without an id yields an empty id instead of an undefined-key warning
        $this->currentTransUnitId = substr( (string)( $attr[ 'id' ] ?? '' ), 0, 100 );

        // `translate` attribute can be only yes or no
        // current 'translate' attribute of the current trans-unit
        $this->currentTransUnitIsTranslatable = empty( $attr[ 'translate' ] ) ? 'yes' : $attr[ 'translate' ];

        $this->setLastTransUnitSegments();
    }
339
340
    /**
     * Adjust the opening <xliff> tag; every other tag is returned untouched.
     *
     * @param string $name tag name
     * @param array  $attr attributes of the tag
     * @param string $tag  the tag as built so far
     *
     * @return string
     */
    protected function handleOpenXliffTag( string $name, array $attr, string $tag ): string {

        if ( $name !== 'xliff' ) {
            return $tag;
        }

        // Add MateCat specific namespace, unless the tag already declares it
        $hasMatecatNamespace = array_key_exists( 'xmlns:mtc', $attr );
        if ( !$hasMatecatNamespace ) {
            $tag .= ' xmlns:mtc="https://www.matecat.com" ';
        }

        // Add trgLang: overwrite the value of an existing trgLang attribute with the target language
        $targetLangAttribute = 'trgLang="' . $this->targetLang . '"';

        return preg_replace( '/trgLang="(.*?)"/', $targetLangAttribute, $tag );

    }
361
362
    /**
     * Update the inTarget flag when a <target> tag is opened.
     *
     * The flag is raised unless the current unit is marked translate="no"; tags other
     * than <target> leave it unchanged.
     *
     * @param string $name tag name
     *
     * @return void
     */
    protected function checkSetInTarget( string $name ) {
        if ( 'target' !== $name ) {
            return;
        }

        // entering a <target>: it counts as such only for translatable units
        $this->inTarget = ( $this->currentTransUnitIsTranslatable !== 'no' );
    }
377
378
    /**
     * Activate the buffer when the opened tag is one of the nodes to buffer.
     *
     * The flag is only ever raised here; resetting it is left to other code.
     *
     * @param string $name tag name
     *
     * @return void
     */
    protected function setInBuffer( string $name ) {
        // strict comparison: tag names are matched as exact strings, never through type juggling
        if ( in_array( $name, $this->nodesToBuffer, true ) ) {
            $this->bufferIsActive = true;
        }
    }
388
389
    /**
     * Record the word counts of a segment and add them to the running totals.
     *
     * The segment is read through its 'sid', 'raw_word_count' and 'eq_word_count' keys;
     * the equivalent word count is truncated to two decimals.
     *
     * @param array $seg segment data; an empty array is ignored
     *
     * @return void
     */
    protected function updateSegmentCounts( array $seg = [] ) {

        // the declared default is an empty array: there is nothing to count, and reading
        // its keys would only raise warnings and store an entry under an empty key
        if ( $seg === [] ) {
            return;
        }

        $raw_word_count = $seg[ 'raw_word_count' ] ?? 0;
        $eq_word_count  = ( floor( ( $seg[ 'eq_word_count' ] ?? 0 ) * 100 ) / 100 );

        $this->counts[ 'segments_count_array' ][ $seg[ 'sid' ] ] = [
                'raw_word_count' => $raw_word_count,
                'eq_word_count'  => $eq_word_count,
        ];

        $this->counts[ 'raw_word_count' ] += $raw_word_count;
        $this->counts[ 'eq_word_count' ]  += $eq_word_count;
    }
405
406
    /**
     * Reset the per-segment counts and both word count totals.
     *
     * @return void
     */
    protected function resetCounts() {
        $initialValues = [
                'segments_count_array' => [],
                'raw_word_count'       => 0,
                'eq_word_count'        => 0,
        ];

        // assign key by key so that any other entry of the counts array is left alone
        foreach ( $initialValues as $countKey => $initialValue ) {
            $this->counts[ $countKey ] = $initialValue;
        }
    }
411
412
    /**
     * Close an opening tag, as a self-closed tag when needed, and emit it.
     *
     * The tag goes to the CDATA buffer while a buffer is active, to the output file otherwise.
     * The isEmpty flag is updated as a side effect.
     *
     * @param resource $parser
     * @param string   $tag tag built so far, without its closing bracket
     *
     * @return void
     */
    protected function checkForSelfClosedTagAndFlush( $parser, string $tag ) {

        $lastChar = $this->getLastCharacter( $parser );

        //detect empty tag
        $this->isEmpty = ( $lastChar == '/' );

        //trim last space, keep the slash of an empty tag, add tag ending
        $closedTag = rtrim( $tag ) . ( $this->isEmpty ? $lastChar : '' ) . ">";

        //set a Buffer for the segSource Source tag
        if ( $this->bufferIsActive ) { // we are opening a critical CDATA section
            //these are NOT source/seg-source/value empty tags, THERE IS A CONTENT, write it in buffer
            $this->CDATABuffer .= $closedTag;

            return;
        }

        $this->postProcAndFlush( $this->outputFP, $closedTag );

    }
443
444
    /**
     * A trans-unit can contain a list of segments because of mrk tags.
     * Copy the list of segments of the current trans-unit into lastTransUnit.
     *
     * The list always starts empty. Segment ids of the unit are walked in order and a
     * segment is copied when it exists in the segments array and is either in first
     * position or has an id following the id of the previously copied segment.
     *
     * @return void
     */
    protected function setLastTransUnitSegments() {

        /*
         * At the end of every cycle the segment grouping information is lost: unset( 'matecat|' . $this->currentId )
         *
         * We need to take the info about the last segment parsed
         *          ( normally more than 1 db row because of mrk tags )
         *
         * So, copy the current segment data group into another structure to keep the last segment
         * for the next tagOpen ( possible sdl:seg-defs )
         */
        $this->lastTransUnit = [];

        if ( !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return;
        }

        $segmentIds   = $this->transUnits[ $this->currentTransUnitId ];
        $previousId   = null;
        $totalOfIds   = count( $segmentIds );

        for ( $position = 0; $position < $totalOfIds; $position++ ) {
            $segmentId = $segmentIds[ $position ];

            // unknown segment: skip it
            if ( !isset( $this->segments[ $segmentId ] ) ) {
                continue;
            }

            // past the first position only an id following the previous one is accepted
            if ( $position != 0 && $previousId + 1 != $segmentId ) {
                continue;
            }

            $previousId            = $segmentId;
            $this->lastTransUnit[] = $this->segments[ $segmentId ];
        }

    }
480
481
    /**
     * Return the segment at the current position, for a known and translatable unit.
     *
     * @return array the segment, or an empty array when the current unit is marked
     *               translate="no", is not among the trans-units, or no segment exists
     *               at the current position
     */
    protected function getCurrentSegment(): array {
        $unitIsKnown = isset( $this->transUnits[ $this->currentTransUnitId ] );

        if ( $this->currentTransUnitIsTranslatable === 'no' || !$unitIsKnown ) {
            return [];
        }

        // a missing position must not leak null out of a method declared to return an array
        return $this->segments[ $this->segmentInUnitPosition ] ?? [];
    }
491
492
}