Passed
Branch fix-lack-of-untranslated-when-... (eeef8d)
by Domenico
04:05
created

AbstractXliffReplacer::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 22
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 1
eloc 11
nc 1
nop 9
dl 0
loc 22
rs 9.9
c 3
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
namespace Matecat\XliffParser\XliffReplacer;
4
5
use Psr\Log\LoggerInterface;
6
use RuntimeException;
7
8
abstract class AbstractXliffReplacer {
9
    protected $originalFP;
10
11
    protected $tuTagName;                 // <trans-unit> (for Xliff v 1.*) or <unit> (for Xliff v 2.*)
12
    protected $inTU                = false;  // flag to check whether we are in a <trans-unit>
13
    protected $inTarget            = false;  // flag to check whether we are in a <target>, to ignore everything
14
    protected $isEmpty             = false;  // flag to check whether we are in an empty tag (<tag/>)
15
    protected $targetWasWritten    = false;  // flag to check whether <target> was written in the current unit
16
    protected $segmentPositionInTu = -1;  // the current position of segment in the current <unit> (for Xliff v 2.*)
17
18
    protected $CDATABuffer    = "";       // buffer for special tag
19
    protected $bufferIsActive = false;    // buffer for special tag
20
21
    protected     $offset                = 0;         // offset for SAX pointer
22
    protected     $outputFP;                  // output stream pointer
23
    protected     $currentBuffer;             // the piece of text currently being parsed
24
    protected     $len;                       // length of the currentBuffer
25
    protected     $segments;                  // array of translations
26
    protected     $lastTransUnit         = [];
27
    protected int $segmentInUnitPosition = 0;
28
    protected     $currentTransUnitId;        // id of current <trans-unit>
29
    protected     $currentTransUnitIsTranslatable; // 'translate' attribute of current <trans-unit>
30
    protected     $unitContainsMda       = false;   // check if <unit> already contains a <mda:metadata> (for Xliff v 2.*)
31
    protected     $hasWrittenCounts      = false;  // check if <unit> already wrote segment counts (for Xliff v 2.*)
32
33
    protected $targetLang;
34
35
    protected $sourceInTarget;
36
37
    protected $transUnits;
38
39
    /** @var int */
40
    protected $xliffVersion;
41
42
    protected $callback;
43
44
    protected $logger;
45
46
    protected static $INTERNAL_TAG_PLACEHOLDER;
47
48
    protected $counts = [
49
            'raw_word_count' => 0,
50
            'eq_word_count'  => 0,
51
    ];
52
53
    /**
     * AbstractXliffReplacer constructor.
     *
     * Generates the entity placeholder, makes sure the output file exists, opens the
     * input and output streams and stores the data used while the file is parsed.
     *
     * NOTE(review): the placeholder is stored in a static property, so constructing a
     * second replacer overwrites the placeholder seen by an earlier instance.
     *
     * @param string                              $originalXliffPath path of the XLIFF file to read
     * @param int                                 $xliffVersion      the value 2 selects <unit>, any other value <trans-unit>
     * @param array                               $segments          array of translations
     * @param array                               $transUnits        map of trans-unit id to a list of keys into $segments
     * @param string                              $trgLang
     * @param string                              $outputFilePath    path of the file to write (created when missing, truncated on open)
     * @param bool                                $setSourceInTarget
     * @param LoggerInterface|null                $logger
     * @param XliffReplacerCallbackInterface|null $callback
     *
     * @throws RuntimeException when the original XLIFF cannot be opened
     */
    public function __construct(
            $originalXliffPath,
            $xliffVersion,
            $segments,
            $transUnits,
            $trgLang,
            $outputFilePath,
            $setSourceInTarget,
            ?LoggerInterface $logger = null,
            ?XliffReplacerCallbackInterface $callback = null
    ) {
        self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();
        $this->createOutputFileIfDoesNotExist( $outputFilePath );
        $this->setFileDescriptors( $originalXliffPath, $outputFilePath );

        // the version must be stored before setTuTagName(), which reads it
        $this->xliffVersion = $xliffVersion;
        $this->setTuTagName();

        $this->segments       = $segments;
        $this->targetLang     = $trgLang;
        $this->sourceInTarget = $setSourceInTarget;
        $this->transUnits     = $transUnits;
        $this->logger         = $logger;
        $this->callback       = $callback;
    }
89
90
    /**
     * Stream the original XLIFF through the SAX parser and write the result to the output stream.
     *
     * The input is read in 4096-byte chunks. Before a chunk is handed to the parser every
     * `&...;` sequence is rewritten as PLACEHOLDER...PLACEHOLDER (the SAX parser would
     * otherwise decode entities), and any `&` still left is rewritten as an escaped `amp`.
     * postProcAndFlush() turns the placeholders back into entities on output.
     *
     * @throws RuntimeException when the parser reports an XML error
     */
    public function replaceTranslation() {
        fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

        //create Sax parser
        $xmlParser = $this->initSaxParser();

        $entityPattern     = "/&(.*?);/";
        $entityReplacement = self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER;

        while ( $this->currentBuffer = fread( $this->originalFP, 4096 ) ) {
            // obfuscate entities because sax automatically does html_entity_decode
            $temporary_check_buffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );

            // Avoid cutting entities in half: the read may have truncated an entity
            // (say, '&lt;' into '&l'). The code assumes an entity is at most 9 bytes long,
            // so an '&' with more than 9 bytes after it cannot be a truncated entity.
            // NOTE(review): only the FIRST remaining '&' is inspected; a truncated entity
            // that follows an earlier bare '&' is not detected - confirm whether intended.
            while ( true ) {
                $_ampPos = strpos( $temporary_check_buffer, '&' );

                if ( $_ampPos === false || strlen( $temporary_check_buffer ) - $_ampPos > 9 ) {
                    break;
                }

                // a possibly truncated entity is still present: fetch some more bytes
                $extraBytes = fread( $this->originalFP, 9 );

                if ( $extraBytes === false || $extraBytes === '' ) {
                    // end of input (or read failure): nothing more can complete the entity.
                    // Without this exit the buffer never changes and the loop spins forever.
                    break;
                }

                $this->currentBuffer    .= $extraBytes;
                $temporary_check_buffer = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
            }

            // $temporary_check_buffer is always the escaped form of the current buffer,
            // so it can be reused instead of running the same replacement again
            $this->currentBuffer = $temporary_check_buffer;
            unset( $temporary_check_buffer );

            // every '&' left at this point is not part of an entity: escape it
            $this->currentBuffer = str_replace( "&", self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER, $this->currentBuffer );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            //parse chunk of text
            if ( !xml_parse( $xmlParser, $this->currentBuffer, feof( $this->originalFP ) ) ) {
                //if unable, raise an exception
                throw new RuntimeException( sprintf(
                        "XML error: %s at line %d",
                        xml_error_string( xml_get_error_code( $xmlParser ) ),
                        xml_get_current_line_number( $xmlParser )
                ) );
            }

            // accumulate the number of bytes handed to the parser so far: getLastCharacter()
            // subtracts it from the parser's global byte index to address the current chunk
            $this->offset += $this->len;
        }

        // close Sax parser
        $this->closeSaxParser( $xmlParser );
    }
155
156
    /**
     * Return the character used to detect whether the tag just reported by the parser is empty (<tag/>).
     *
     * @param resource|\XMLParser $parser
     *
     * @return string
     */
    protected function getLastCharacter( $parser ) {

        // the parser's byte index counts over the whole stream read so far; subtracting
        // the accumulated offset turns it into a position inside the current chunk
        $positionInChunk = xml_get_current_byte_index( $parser ) - $this->offset;

        if ( !isset( $this->currentBuffer[ $positionInChunk ] ) ) {
            // the position lies outside the current chunk (the tag may have been cut
            // in half by the read): simply use the last character of the chunk
            return $this->currentBuffer[ $this->len - 1 ];
        }

        // the tag fitted in the chunk: this is the symbol before the '>',
        // which is assumed to be a '/' for an empty tag
        return $this->currentBuffer[ $positionInChunk ];
    }
179
180
    /**
     * Build the random marker used to hide entities from the SAX parser.
     *
     * The marker is "§" followed by the first 4 characters of a base64-encoded random
     * value from which '+' and '/' have been removed.
     *
     * @return string
     *
     * @throws \Exception when no source of randomness is available (see random_bytes())
     */
    private function getInternalTagPlaceholder() {
        // random_bytes() is part of the PHP core, so no extension is needed
        $encoded = base64_encode( random_bytes( 10 ) );

        return "§" . substr( str_replace( [ '+', '/' ], '', $encoded ), 0, 4 );
    }
195
196
    /**
     * Create an empty output file unless something already exists at that path.
     *
     * @param string $outputFilePath
     */
    private function createOutputFileIfDoesNotExist( $outputFilePath ) {
        if ( file_exists( $outputFilePath ) ) {
            return;
        }

        touch( $outputFilePath );
    }
202
203
    /**
     * Open the output stream (read/write, truncated) and the input stream (read only).
     *
     * @param string $originalXliffPath
     * @param string $outputFilePath
     *
     * @throws RuntimeException when either stream cannot be opened
     */
    private function setFileDescriptors( $originalXliffPath, $outputFilePath ) {
        $outputFP = fopen( $outputFilePath, 'w+' );

        // fail here instead of at the first fwrite() on an invalid handle
        if ( $outputFP === false ) {
            throw new RuntimeException( "could not open output file" );
        }

        $this->outputFP = $outputFP;

        $streamArgs = null;

        if ( !( $this->originalFP = fopen( $originalXliffPath, "r", false, stream_context_create( $streamArgs ) ) ) ) {
            throw new RuntimeException( "could not open XML input" );
        }
    }
216
217
    /**
     * Choose the unit tag name from the XLIFF version:
     * <unit> when the version is exactly the integer 2, <trans-unit> otherwise.
     */
    private function setTuTagName() {
        if ( $this->xliffVersion === 2 ) {
            $this->tuTagName = 'unit';

            return;
        }

        $this->tuTagName = 'trans-unit';
    }
224
225
    /**
     * AbstractXliffReplacer destructor: close the streams that are still open.
     */
    public function __destruct() {
        // The input stream can be closed outside the class to permit multiple
        // concurrent downloads. On PHP 8 fclose() on a closed stream throws a TypeError
        // that '@' cannot suppress, so check the handle instead of silencing the call.
        if ( is_resource( $this->originalFP ) ) {
            fclose( $this->originalFP );
        }

        if ( is_resource( $this->outputFP ) ) {
            fclose( $this->outputFP );
        }
    }
234
235
    /**
     * Init Sax parser
     *
     * Creates a UTF-8 parser with case folding disabled and registers this object's
     * tagOpen(), tagClose() and characterData() methods as its handlers.
     *
     * @return resource|\XMLParser a resource on PHP 7, an XMLParser object on PHP 8
     */
    protected function initSaxParser() {
        $xmlSaxParser = xml_parser_create( 'UTF-8' );
        xml_set_object( $xmlSaxParser, $this );
        xml_parser_set_option( $xmlSaxParser, XML_OPTION_CASE_FOLDING, false );
        xml_set_element_handler( $xmlSaxParser, 'tagOpen', 'tagClose' );
        xml_set_character_data_handler( $xmlSaxParser, 'characterData' );

        return $xmlSaxParser;
    }
249
250
    /**
     * Release the parser created by initSaxParser().
     *
     * @param resource|\XMLParser $xmlSaxParser
     */
    protected function closeSaxParser( $xmlSaxParser ) {
        xml_parser_free( $xmlSaxParser );
    }
256
257
    /**
     * Start-element handler, registered on the parser by initSaxParser().
     *
     * @param resource|\XMLParser $parser
     * @param string              $name element name
     * @param array               $attr element attributes
     *
     * @return mixed
     */
    abstract protected function tagOpen( $parser, $name, $attr );
265
266
    /**
     * End-element handler, registered on the parser by initSaxParser().
     *
     * @param resource|\XMLParser $parser
     * @param string              $name element name
     *
     * @return mixed
     */
    abstract protected function tagClose( $parser, $name );
273
274
    /**
     * Character-data handler, registered on the parser by initSaxParser().
     *
     * While the special-tag buffer is active the text is appended to it; otherwise the
     * text is written to the output unless we are inside a <target>.
     *
     * @param resource|\XMLParser $parser not used here, but required by the handler signature
     * @param string              $data
     *
     * @return void
     */
    protected function characterData( $parser, $data ): void {
        if ( $this->bufferIsActive ) {
            $this->CDATABuffer .= $data;

            return;
        }

        // don't write <target> data
        if ( !$this->inTarget ) {
            $this->postProcAndFlush( $this->outputFP, $data );
        }
    }
288
289
    /**
     * Restore the escaped entities in $data and write it to $fp.
     *
     * @param resource $fp
     * @param string   $data
     * @param bool     $treatAsCDATA when true, line endings are left untouched
     */
    protected function postProcAndFlush( $fp, $data, $treatAsCDATA = false ) {
        $placeholder = self::$INTERNAL_TAG_PLACEHOLDER;

        // turn PLACEHOLDER...PLACEHOLDER back into &...;
        $data = preg_replace( "/" . $placeholder . '(.*?)' . $placeholder . "/", '&$1;', $data );
        // NOTE(review): confirm which whitespace character the replacement literal holds
        $data = str_replace( '&nbsp;', ' ', $data );

        if ( !$treatAsCDATA ) {
            // unix2dos: the pairs are applied one after the other, so every
            // "\r\n", "\n" and "\r" first collapses to "\r" and then becomes "\r\n"
            $data = str_replace( [ "\r\n", "\n", "\r" ], [ "\r", "\r", "\r\n" ], $data );
        }

        //flush to disk
        fwrite( $fp, $data );
    }
310
311
    /**
     * Record the state of a newly opened <trans-unit> (xliff v1.*) or <unit> (xliff v2.*).
     *
     * Does nothing when $name is not the unit tag of the current XLIFF version.
     *
     * @param string $name name of the tag being opened
     * @param array  $attr attributes of the tag
     */
    protected function handleOpenUnit( string $name, array $attr ) {

        if ( $this->tuTagName !== $name ) {
            return;
        }

        $this->inTU = true;

        // get id
        // trim to first 100 characters because this is the limit on Matecat's DB
        // (a missing id is read as an empty string instead of raising a warning)
        $this->currentTransUnitId = substr( $attr[ 'id' ] ?? '', 0, 100 );

        // `translate` attribute can be only yes or no; it defaults to yes when absent or empty
        $this->currentTransUnitIsTranslatable = empty( $attr[ 'translate' ] ) ? 'yes' : $attr[ 'translate' ];

        $this->setLastTransUnitSegments();
    }
329
330
    /**
     * Add the word counts of one segment to the running totals and store them under the segment's 'sid'.
     *
     * @param array $seg reads the keys 'raw_word_count', 'eq_word_count' and 'sid'
     */
    protected function updateSegmentCounts( array $seg = [] ) {

        $rawWords = $seg[ 'raw_word_count' ];
        // cut the equivalent word count down to two decimals
        $eqWords = floor( $seg[ 'eq_word_count' ] * 100 ) / 100;

        $segmentCounts = [
                'raw_word_count' => $rawWords,
                'eq_word_count'  => $eqWords,
        ];

        $this->counts[ 'segments_count_array' ][ $seg[ 'sid' ] ] = $segmentCounts;

        $this->counts[ 'raw_word_count' ] += $rawWords;
        $this->counts[ 'eq_word_count' ]  += $eqWords;
    }
346
347
    /**
     * Clear the per-segment counts and set both totals back to zero.
     */
    protected function resetCounts() {
        // array_replace keeps any other key of $this->counts untouched
        $this->counts = array_replace( $this->counts, [
                'segments_count_array' => [],
                'raw_word_count'       => 0,
                'eq_word_count'        => 0,
        ] );
    }
352
353
    /**
     * Close the textual form of an opening tag ('/>' for an empty tag, '>' otherwise)
     * and send it either to the special-tag buffer or to the output stream.
     *
     * @param resource|\XMLParser $parser
     * @param string              $tag the tag text built so far, without its ending
     */
    protected function checkForSelfClosedTagAndFlush( $parser, $tag ) {

        $lastChar = $this->getLastCharacter( $parser );

        //detect empty tag
        $this->isEmpty = ( $lastChar == '/' );

        // drop the trailing space, then add the '/' of an empty tag and the tag ending
        $tag = rtrim( $tag ) . ( $this->isEmpty ? $lastChar : '' ) . ">";

        if ( !$this->bufferIsActive ) {
            $this->postProcAndFlush( $this->outputFP, $tag );

            return;
        }

        // we are inside a critical CDATA section: these are NOT source/seg-source/value
        // empty tags, THERE IS A CONTENT, write it in buffer
        $this->CDATABuffer .= $tag;
    }
378
379
    /**
     * A trans-unit can contain a list of segments because of mrk tags.
     * Copy the segments of the current trans-unit into $this->lastTransUnit, so that they
     * are still available to the next tagOpen ( possible sdl:seg-defs ).
     *
     * An entry is copied only when its id exists in $this->segments and it is either the
     * first entry of the list or its id follows the last copied id by exactly one.
     *
     * @return void
     */
    protected function setLastTransUnitSegments() {

        $this->lastTransUnit = [];

        if ( !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return;
        }

        $segmentIds   = $this->transUnits[ $this->currentTransUnitId ];
        $lastCopiedId = null;
        $total        = count( $segmentIds );

        for ( $position = 0; $position < $total; $position++ ) {
            $id = $segmentIds[ $position ];

            if ( !isset( $this->segments[ $id ] ) ) {
                continue;
            }

            // after the first entry, skip ids that do not follow the last copied one
            if ( $position != 0 && $lastCopiedId + 1 != $id ) {
                continue;
            }

            $lastCopiedId          = $id;
            $this->lastTransUnit[] = $this->segments[ $id ];
        }
    }
415
416
    /**
     * Return the entry of $this->segments at the current segment position.
     *
     * An empty array is returned when the current unit is marked translate="no", when
     * its id is not in $this->transUnits, or when no segment exists at that position.
     *
     * @return array
     */
    protected function getCurrentSegment(): array {
        if ( $this->currentTransUnitIsTranslatable === 'no' ) {
            return [];
        }

        if ( !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
            return [];
        }

        // a missing entry would otherwise return null and violate the array return type
        return $this->segments[ $this->segmentInUnitPosition ] ?? [];
    }
426
427
}