Passed
Pull Request — master (#90)
by Domenico
02:57
created

AbstractXliffReplacer::__construct()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 22
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 2
eloc 11
nc 2
nop 9
dl 0
loc 22
rs 9.9
c 3
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
namespace Matecat\XliffParser\XliffReplacer;
4
5
use Psr\Log\LoggerInterface;
6
use RuntimeException;
7
8
abstract class AbstractXliffReplacer {
9
    /** @var resource|null|false read handle on the original XLIFF, assigned by setFileDescriptors() */
    protected $originalFP;
    /** @var resource|null|false output stream pointer, assigned by setFileDescriptors() */
    protected $outputFP;

    protected string $tuTagName;                 // 'trans-unit' (for XLIFF v 1.*) or 'unit' (for XLIFF v 2.*)
    protected bool   $inTU             = false;  // flag to check whether we are in a <trans-unit>
    protected bool   $inTarget         = false;  // flag to check whether we are in a <target>, to ignore everything
    protected bool   $isEmpty          = false;  // flag to check whether we are in an empty tag (<tag/>)
    protected bool   $targetWasWritten = false;  // flag to check whether <target> was written in the current unit
    protected string $CDATABuffer      = "";     // buffer for special tags (filled while $bufferIsActive is true)
    protected bool   $bufferIsActive   = false;  // whether tags/data are currently diverted into $CDATABuffer

    protected int $offset = 0;         // bytes already handed to the SAX parser before the current buffer

    protected string  $currentBuffer;             // the current piece of text being parsed
    protected int     $len;                       // length (in bytes) of the currentBuffer
    protected array   $segments;                  // array of translations
    protected array   $lastTransUnit                  = [];     // segments collected by setLastTransUnitSegments()
    protected int     $segmentInUnitPosition          = 0;      // index into $segments read by getCurrentSegment()
    protected ?string $currentTransUnitId             = null;   // id of current <trans-unit>
    protected ?string $currentTransUnitIsTranslatable = null;   // 'translate' attribute of current <trans-unit>
    protected bool    $hasWrittenCounts               = false;  // check if <unit> already wrote segment counts (for XLIFF v 2.*)
    protected string  $targetLang;
    protected bool    $sourceInTarget                 = false;

    // Names of the tags whose content must be buffered (see setInBuffer()).
    // Nothing in this class assigns it, so it defaults to an empty list instead of being left
    // uninitialized: reading an uninitialized typed property raises an Error.
    protected array $nodesToBuffer = [];

    protected array $transUnits;       // unit id => list of segment ids (see setLastTransUnitSegments())

    protected int $xliffVersion;

    /**
     * @var XliffReplacerCallbackInterface|null
     */
    protected ?XliffReplacerCallbackInterface $callback;

    protected ?LoggerInterface $logger;

    // random marker used to obfuscate XML entities before they reach the SAX parser
    protected static $INTERNAL_TAG_PLACEHOLDER;

    protected $counts = [
            'raw_word_count' => 0,
            'eq_word_count'  => 0,
    ];
53
54
    /**
     * AbstractXliffReplacer constructor.
     *
     * Generates the entity placeholder, makes sure the output file exists, opens both
     * streams and stores the replacement configuration.
     *
     * @param string                              $originalXliffPath path of the XLIFF to read
     * @param int                                 $xliffVersion      2 selects the <unit> tag, any other value <trans-unit>
     * @param array                               $segments          array of translations
     * @param array                               $transUnits        unit id => list of segment ids
     * @param string                              $trgLang           target language written into the output
     * @param string                              $outputFilePath    path of the file to write (created if missing)
     * @param bool                                $setSourceInTarget stored in $sourceInTarget
     * @param LoggerInterface|null                $logger
     * @param XliffReplacerCallbackInterface|null $callback
     *
     * @throws RuntimeException when setFileDescriptors() cannot open the original XLIFF
     */
    public function __construct(
            string                          $originalXliffPath,
            int                             $xliffVersion,
            array                           $segments,
            array                           $transUnits,
            string                          $trgLang,
            string                          $outputFilePath,
            bool                            $setSourceInTarget,
            // explicit nullable types: relying on "= null" to make a type nullable is deprecated in PHP 8.4
            ?LoggerInterface                $logger = null,
            ?XliffReplacerCallbackInterface $callback = null
    ) {
        self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();
        $this->createOutputFileIfDoesNotExist( $outputFilePath );
        $this->setFileDescriptors( $originalXliffPath, $outputFilePath );
        $this->xliffVersion   = $xliffVersion;
        $this->tuTagName      = ( $this->xliffVersion === 2 ) ? 'unit' : 'trans-unit';
        $this->segments       = $segments;
        $this->targetLang     = $trgLang;
        $this->sourceInTarget = $setSourceInTarget;
        $this->transUnits     = $transUnits;
        $this->logger         = $logger;
        $this->callback       = $callback;
    }
90
91
    /**
     * Reads the original XLIFF in chunks, obfuscates its entities and feeds every chunk
     * to the SAX parser, whose handlers write the output stream.
     *
     * @return void
     *
     * @throws RuntimeException when the SAX parser rejects a chunk
     */
    public function replaceTranslation() {
        fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

        //create Sax parser
        $xmlParser = $this->initSaxParser();

        // obfuscate entities because sax automatically does html_entity_decode
        $entityPattern     = "/&(.*?);/";
        $entityReplacement = self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER;

        while ( $this->currentBuffer = fread( $this->originalFP, 4096 ) ) {
            /*
               preprocess file
             */
            $escapedBuffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );

            //avoid cutting entities in half:
            //the last fread could have truncated an entity (say, '&lt;' in '&l'), thus invalidating the escaping.
            // 9 is assumed to be the max length of an entity. So, while an unescaped & lies within the last
            // 9 bytes of the buffer, add 9 more bytes and substitute the entities again; if the & is present,
            // and it is not that close to the end, it can't be an entity: exit the loop
            while ( true ) {
                $_ampPos = strpos( $escapedBuffer, '&' );

                //check for real entity or escape it to safely exit from the loop!!!
                if ( $_ampPos === false || ( strlen( $escapedBuffer ) - $_ampPos ) > 9 ) {
                    break;
                }

                //if an entity is still present, fetch some more and repeat the escaping
                $extraBytes = fread( $this->originalFP, 9 );

                //nothing left to read: the buffer cannot grow any more, so stop here instead of
                //spinning forever on a stray & placed near the end of the file
                if ( $extraBytes === false || $extraBytes === '' ) {
                    break;
                }

                $this->currentBuffer .= $extraBytes;
                $escapedBuffer       = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
            }

            // $escapedBuffer always holds the escaped form of the whole current buffer at this point;
            // any & still left is not an entity, so it is obfuscated as &amp;
            $this->currentBuffer = str_replace( "&", self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER, $escapedBuffer );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            //parse chunk of text
            if ( !xml_parse( $xmlParser, $this->currentBuffer, feof( $this->originalFP ) ) ) {
                //if unable, raise an exception
                throw new RuntimeException( sprintf(
                        "XML error: %s at line %d",
                        xml_error_string( xml_get_error_code( $xmlParser ) ),
                        xml_get_current_line_number( $xmlParser )
                ) );
            }
            /*
             * Get the accumulated this->offset in the document:
             * as long as SAX pointer advances, we keep track of total bytes it has seen so far;
             * this way, we can translate its global pointer in an address local to the current buffer of text to retrieve the last char of tag
             */
            $this->offset += $this->len;
        }

        // close Sax parser
        $this->closeSaxParser( $xmlParser );

    }
153
154
    /**
     * Picks the character used to detect empty tags (<tag/>).
     *
     * The SAX byte index counts every byte parsed so far, so $offset is subtracted to get a
     * position local to the current buffer. When that position is not inside the buffer (the tag
     * may have been cut in half by the chunked read) the last character of the buffer is used.
     *
     * @param resource $parser
     *
     * @return string
     */
    protected function getLastCharacter( $parser ): string {

        // position of the SAX pointer, translated into an index of the current buffer
        $localIndex = xml_get_current_byte_index( $parser ) - $this->offset;

        // `??` performs the same isset() test on the string offset: take the character at the
        // pointer when it exists, otherwise simply fall back to the last character of the chunk
        return $this->currentBuffer[ $localIndex ] ?? $this->currentBuffer[ $this->len - 1 ];

    }
182
183
    /**
     * Builds the random marker used to obfuscate XML entities: "§" followed by the first four
     * characters of a base64-encoded random string from which '+' and '/' have been removed.
     *
     * Uses random_bytes(), which is part of the PHP core, so no optional extension is required.
     *
     * @return string
     */
    private function getInternalTagPlaceholder(): string {
        $encoded = base64_encode( random_bytes( 10 ) );

        return "§" . substr( str_replace( [ '+', '/' ], '', $encoded ), 0, 4 );
    }
198
199
    /**
     * Creates the output file with touch() unless something already exists at that path.
     *
     * @param string $outputFilePath
     */
    private function createOutputFileIfDoesNotExist( string $outputFilePath ) {
        // nothing to do when the path is already there
        if ( file_exists( $outputFilePath ) ) {
            return;
        }

        touch( $outputFilePath );
    }
205
206
    /**
     * Opens the output file for writing ('w+') and the original XLIFF for reading,
     * storing the handles in $outputFP and $originalFP.
     *
     * @param string $originalXliffPath
     * @param string $outputFilePath
     *
     * @throws RuntimeException when either stream cannot be opened
     */
    private function setFileDescriptors( string $originalXliffPath, string $outputFilePath ) {
        // the output is opened first, as before; a failure is reported here rather than
        // surfacing later as an error on the first fwrite()
        $outputFP = fopen( $outputFilePath, 'w+' );
        if ( $outputFP === false ) {
            throw new RuntimeException( "could not open XML output" );
        }
        $this->outputFP = $outputFP;

        $originalFP = fopen( $originalXliffPath, "r", false, stream_context_create() );
        if ( $originalFP === false ) {
            throw new RuntimeException( "could not open XML input" );
        }
        $this->originalFP = $originalFP;
    }
219
220
    /**
     * AbstractXliffReplacer destructor.
     *
     * Closes the input and output streams that are still open.
     */
    public function __destruct() {
        //a stream can be closed outside the class to permit multiple concurrent downloads,
        //or may never have been opened if the constructor failed: check the handle explicitly
        //instead of suppressing errors, because `@` does not silence the TypeError that
        //fclose() throws on an invalid handle in PHP 8
        if ( is_resource( $this->originalFP ) ) {
            fclose( $this->originalFP );
        }

        if ( is_resource( $this->outputFP ) ) {
            fclose( $this->outputFP );
        }
    }
229
230
    /**
     * Init Sax parser
     *
     * Creates a UTF-8 parser bound to this object, with case folding disabled, and registers
     * tagOpen / tagClose as element handlers and characterData as character data handler.
     *
     * @return resource|\XMLParser a resource on PHP 7, an XMLParser object on PHP 8
     */
    protected function initSaxParser() {
        $xmlSaxParser = xml_parser_create( 'UTF-8' );
        // handlers below are given as method names and resolved against this object
        xml_set_object( $xmlSaxParser, $this );
        // keep tag names exactly as they are written in the file
        xml_parser_set_option( $xmlSaxParser, XML_OPTION_CASE_FOLDING, false );
        xml_set_element_handler( $xmlSaxParser, 'tagOpen', 'tagClose' );
        xml_set_character_data_handler( $xmlSaxParser, 'characterData' );

        return $xmlSaxParser;
    }
244
245
    /**
     * Releases the SAX parser created by initSaxParser().
     *
     * @param resource|\XMLParser $xmlSaxParser
     */
    protected function closeSaxParser( $xmlSaxParser ) {
        xml_parser_free( $xmlSaxParser );
    }
251
252
    /**
     * Start-element handler registered by initSaxParser().
     *
     * @param resource|\XMLParser $parser
     * @param string              $name tag name (case folding is disabled, so it is not upper-cased)
     * @param array               $attr attributes of the tag
     *
     * @return mixed
     */
    abstract protected function tagOpen( $parser, string $name, array $attr );
260
261
    /**
     * End-element handler registered by initSaxParser().
     *
     * @param resource|\XMLParser $parser
     * @param string              $name tag name (case folding is disabled, so it is not upper-cased)
     *
     * @return mixed
     */
    abstract protected function tagClose( $parser, string $name );
268
269
    /**
     * Character data handler registered by initSaxParser().
     *
     * While buffering is active the data is appended to $CDATABuffer; otherwise it is written
     * to the output, except inside a <target>, whose data is dropped.
     *
     * @param resource $parser not used here, but it is part of the handler signature
     * @param string   $data
     *
     * @return void
     */
    protected function characterData( $parser, string $data ): void {
        // buffering wins over everything else
        if ( $this->bufferIsActive ) {
            $this->CDATABuffer .= $data;

            return;
        }

        // don't write <target> data
        if ( $this->inTarget ) {
            return;
        }

        $this->postProcAndFlush( $this->outputFP, $data );
    }
283
284
    /**
     * Restores the obfuscated entities in $data and writes the result to $fp.
     *
     * Unless $treatAsCDATA is true, every line ending (\r\n, \n or \r) is written as \r\n.
     *
     * @param resource $fp
     * @param string   $data
     * @param bool     $treatAsCDATA
     */
    protected function postProcAndFlush( $fp, string $data, bool $treatAsCDATA = false ) {
        // turn PLACEHOLDERnamePLACEHOLDER back into &name;
        $marker   = self::$INTERNAL_TAG_PLACEHOLDER;
        $restored = preg_replace( "/" . $marker . '(.*?)' . $marker . "/", '&$1;', $data );

        // NOTE(review): the replacement literal is reproduced as it appears here; confirm against
        // the repository whether it is a plain space or a non-breaking space (U+00A0)
        $restored = str_replace( '&nbsp;', ' ', $restored );

        if ( !$treatAsCDATA ) {
            //unix2dos: the pairs are applied in order, so everything collapses to \r first
            $restored = str_replace( [ "\r\n", "\n", "\r" ], [ "\r", "\r", "\r\n" ], $restored );
        }

        //flush to disk
        fwrite( $fp, $restored );
    }
305
306
    /**
     * Records the state of a unit that is being opened: marks that we are inside it, stores its
     * id and `translate` attribute and collects its segments. Any other tag is ignored.
     *
     * @param string $name
     * @param array  $attr
     *
     * @return void
     */
    protected function handleOpenUnit( string $name, array $attr ) {

        // only <trans-unit> (xliff v1.*) or <unit> (xliff v2.*) are relevant here
        if ( $name !== $this->tuTagName ) {
            return;
        }

        $this->inTU = true;

        // keep only the first 100 bytes of the id because this is the limit on Matecat's DB
        $unitId                   = $attr[ 'id' ];
        $this->currentTransUnitId = substr( $unitId, 0, 100 );

        // `translate` can be only yes or no; a missing or empty attribute counts as yes
        $translatable = 'yes';
        if ( !empty( $attr[ 'translate' ] ) ) {
            $translatable = $attr[ 'translate' ];
        }
        $this->currentTransUnitIsTranslatable = $translatable;

        $this->setLastTransUnitSegments();

    }
330
331
    /**
     * Adjusts the opening <xliff> tag: appends the MateCat namespace when the tag does not
     * declare it and rewrites the value of trgLang with the target language.
     * Tags with any other name are returned untouched.
     *
     * @param string $name
     * @param array  $attr
     * @param string $tag
     *
     * @return string
     */
    protected function handleOpenXliffTag( string $name, array $attr, string $tag ): string {

        if ( $name !== 'xliff' ) {
            return $tag;
        }

        // Add MateCat specific namespace.
        $declaresMatecatNamespace = array_key_exists( 'xmlns:mtc', $attr );
        if ( !$declaresMatecatNamespace ) {
            $tag .= ' xmlns:mtc="https://www.matecat.com" ';
        }

        // Add trgLang
        return preg_replace( '/trgLang="(.*?)"/', 'trgLang="' . $this->targetLang . '"', $tag );

    }
352
353
    /**
     * Updates $inTarget when a <target> is being opened: the flag is raised unless the current
     * unit is marked translate="no". Other tags leave the flag as it is.
     *
     * @param string $name
     *
     * @return void
     */
    protected function checkSetInTarget( string $name ) {
        if ( 'target' !== $name ) {
            return;
        }

        // we are entering into a <target>
        $this->inTarget = ( $this->currentTransUnitIsTranslatable !== 'no' );
    }
368
369
    /**
     * Activates buffering when the tag is one of $nodesToBuffer.
     * The flag is never lowered here.
     *
     * @param string $name
     *
     * @return void
     */
    protected function setInBuffer( string $name ) {
        // strict comparison: a tag name must match an entry exactly, without type juggling
        if ( in_array( $name, $this->nodesToBuffer, true ) ) {
            $this->bufferIsActive = true;
        }
    }
379
380
    /**
     * Adds the word counts of a segment to $counts: one entry per segment id in
     * 'segments_count_array', plus the running totals.
     *
     * @param array $seg reads the keys 'sid', 'raw_word_count' and 'eq_word_count'
     *
     * @return void
     */
    protected function updateSegmentCounts( array $seg = [] ) {

        // called without a segment (the default value): there is nothing to count, and going on
        // would only register an entry under an empty segment id
        if ( $seg === [] ) {
            return;
        }

        $raw_word_count = $seg[ 'raw_word_count' ] ?? 0;
        // keep two decimals, rounding down
        $eq_word_count  = ( floor( ( $seg[ 'eq_word_count' ] ?? 0 ) * 100 ) / 100 );

        $this->counts[ 'segments_count_array' ][ $seg[ 'sid' ] ?? '' ] = [
                'raw_word_count' => $raw_word_count,
                'eq_word_count'  => $eq_word_count,
        ];

        $this->counts[ 'raw_word_count' ] += $raw_word_count;
        $this->counts[ 'eq_word_count' ]  += $eq_word_count;
    }
396
397
    /**
     * Puts the per-segment list and both totals of $counts back to their empty values.
     * Other keys of $counts, if any, are left alone.
     */
    protected function resetCounts() {
        $emptyValues = [
                'segments_count_array' => [],
                'raw_word_count'       => 0,
                'eq_word_count'        => 0,
        ];

        foreach ( $emptyValues as $key => $emptyValue ) {
            $this->counts[ $key ] = $emptyValue;
        }
    }
402
403
    /**
     * Completes an opening tag and emits it: trailing whitespace is trimmed, a '/' is added when
     * the tag is detected as empty (<tag/>), then '>' is appended. The result goes to
     * $CDATABuffer while buffering is active, otherwise to the output stream.
     *
     * @param resource $parser
     * @param string   $tag
     *
     * @return void
     */
    protected function checkForSelfClosedTagAndFlush( $parser, string $tag ) {

        //detect empty tag
        $closingChar   = $this->getLastCharacter( $parser );
        $this->isEmpty = ( '/' == $closingChar );

        //trim last space, re-add the slash of an empty tag, add tag ending
        $completeTag = rtrim( $tag ) . ( $this->isEmpty ? $closingChar : '' ) . ">";

        if ( !$this->bufferIsActive ) {
            $this->postProcAndFlush( $this->outputFP, $completeTag );

            return;
        }

        // we are opening a critical CDATA section:
        //these are NOT source/seg-source/value empty tags, THERE IS A CONTENT, write it in buffer
        $this->CDATABuffer .= $completeTag;

    }
434
435
    /**
     * A trans-unit can contain a list of segments because of mrk tags.
     * Rebuilds $lastTransUnit with the segments of the current trans-unit, so that they are
     * still available for the next tagOpen ( possible sdl:seg-defs ).
     *
     * A segment is copied only when it exists in $segments and is either at the first position
     * of the list or has the id that follows the last one copied.
     *
     * @return void
     */
    protected function setLastTransUnitSegments() {

        // always start from scratch, also when the unit is unknown
        $this->lastTransUnit = [];

        $segmentIds = $this->transUnits[ $this->currentTransUnitId ] ?? null;
        if ( $segmentIds === null ) {
            return;
        }

        $previousId = null;
        $total      = count( $segmentIds );
        for ( $position = 0; $position < $total; $position++ ) {
            $segmentId = $segmentIds[ $position ];

            // unknown segment: skip it
            if ( !isset( $this->segments[ $segmentId ] ) ) {
                continue;
            }

            // after the first position only consecutive ids are accepted
            if ( $position != 0 && $previousId + 1 != $segmentId ) {
                continue;
            }

            $previousId            = $segmentId;
            $this->lastTransUnit[] = $this->segments[ $segmentId ];
        }

    }
471
472
    /**
     * Returns the segment stored at $segmentInUnitPosition when the current unit is translatable
     * and known to $transUnits; an empty array in every other case.
     *
     * @return array
     */
    protected function getCurrentSegment(): array {
        if ( $this->currentTransUnitIsTranslatable !== 'no' && isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            // a missing position would evaluate to null and break the declared return type:
            // answer with the same empty array used when there is no segment
            return $this->segments[ $this->segmentInUnitPosition ] ?? [];
        }

        return [];
    }
482
483
}