Passed
Push — master ( a19423...673e99 )
by Domenico
02:59 queued 29s
created

DataRefReplacer::removeAngleBrackets()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 2
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 2
rs 10
1
<?php
2
/**
3
 * Created by PhpStorm.
4
 * @author hashashiyyin [email protected] / [email protected]
5
 * Date: 22/04/24
6
 * Time: 15:13
7
 *
8
 */
9
10
namespace Matecat\SubFiltering\Utils;
11
12
use DOMException;
13
use Matecat\XmlParser\Exception\InvalidXmlException;
14
use Matecat\XmlParser\Exception\XmlParsingException;
15
use Matecat\XmlParser\XmlParser;
16
17
class DataRefReplacer {
    /**
     * Original data map: dataRef identifier => original data (string or null).
     *
     * @var array
     */
    private $map;

    /**
     * DataRefReplacer constructor.
     *
     * @param array|null $map original data map; null is treated as an empty map
     */
    public function __construct( array $map = null ) {
        // always store an array, so that array_key_exists() and friends never receive null
        $this->map = ( $map === null ) ? [] : $map;
    }

    /**
     * Adds an 'equiv-text' attribute (base64 encoded original data) to the <ph>, <sc>, <ec> and <pc>
     * tags of $string whose dataRef/dataRefStart/dataRefEnd attribute is resolved against the map.
     *
     * <sc>/<ec> tags and <pc> tags are converted into <ph> tags carrying a 'dataType' marker
     * (and, for <pc>, an 'originalData' attribute) so that restore() can rebuild the original tags.
     *
     * For a complete reference see:
     *
     * http://docs.oasis-open.org/xliff/xliff-core/v2.1/os/xliff-core-v2.1-os.html#dataref
     *
     * @param string $string
     *
     * @return string the string unchanged when the map is empty or no dataRef-like attribute is present
     * @throws DOMException
     * @throws InvalidXmlException
     * @throws XmlParsingException
     */
    public function replace( $string ) {

        // nothing to resolve: empty map, or no dataRef/dataRefStart/dataRefEnd attribute at all
        if ( empty( $this->map ) || !$this->hasAnyDataRefAttribute( $string ) ) {
            return $string;
        }

        // drop any equiv-text already present, it is going to be recalculated
        $string = $this->cleanFromEquivText( $string );

        $html = XmlParser::parse( $string, true );

        // 1. <ph>, <sc>, <ec> tags
        foreach ( $html as $node ) {
            $string = $this->recursiveAddEquivTextToPhTag( $node, $string );
        }

        // 2. <pc> tags
        if ( $this->stringContainsPcTags( $string ) ) {

            // self-closed <pc/> first: afterwards every remaining <pc ...> is an opening tag
            $string = $this->replaceSelfClosedPcTags( $string );

            // one entry per closing </pc>, in the order the closing tags appear in the string
            $dataRefEndMap = $this->buildDataRefEndMap( $html );

            $string = $this->replaceOpeningPcTags( $string );
            $string = $this->replaceClosingPcTags( $string, $dataRefEndMap );
        }

        return $string;
    }

    /**
     * Tells whether the string contains at least one dataRef, dataRefStart or dataRefEnd attribute.
     *
     * @param string $string
     *
     * @return bool
     */
    private function hasAnyDataRefAttribute( $string ) {
        return (bool)preg_match( '/(dataRef|dataRefStart|dataRefEnd)=[\'"].*?[\'"]/', $string );
    }

    /**
     * Removes the equiv-text attribute from every leaf tag whose dataRef is present in the map.
     *
     * @param string $string
     *
     * @return string
     * @throws DOMException
     * @throws InvalidXmlException
     * @throws XmlParsingException
     */
    private function cleanFromEquivText( $string ) {
        $html = XmlParser::parse( $string, true );

        foreach ( $html as $node ) {
            $string = $this->recursiveCleanFromEquivText( $node, $string );
        }

        return $string;
    }

    /**
     * Adds the equiv-text attribute to <ph>, <ec> and <sc> leaf tags.
     *
     * <ec> and <sc> tags are converted to <ph> tags (needed by Matecat); in this case
     * a dataType attribute holding the original tag name is added just before equiv-text.
     *
     * If the tag has no id attribute, the id is copied from dataRef and flagged with removeId="true".
     *
     * @param object $node   parsed node (exposes tagName, attributes, has_children, inner_html, node)
     * @param string $string
     *
     * @return string
     */
    private function recursiveAddEquivTextToPhTag( $node, $string ) {
        if ( $node->has_children ) {
            foreach ( $node->inner_html as $childNode ) {
                $string = $this->recursiveAddEquivTextToPhTag( $childNode, $string );
            }

            return $string;
        }

        if ( !in_array( $node->tagName, [ 'ph', 'sc', 'ec' ], true ) || !isset( $node->attributes[ 'dataRef' ] ) ) {
            return $string;
        }

        $a = $node->node;                    // complete match. Eg:  <ph id="source1" dataRef="source1"/>
        $b = $node->attributes[ 'dataRef' ]; // map identifier. Eg: source1

        // unknown reference: leave the tag untouched
        // (array_key_exists instead of a loose in_array on the keys, which on PHP < 8 matches any string against key 0)
        if ( !array_key_exists( $b, $this->map ) ) {
            return $string;
        }

        // a null original data is rendered as the 'NULL' string
        $value              = is_null( $this->map[ $b ] ) ? 'NULL' : $this->map[ $b ];
        $base64EncodedValue = base64_encode( $value );

        // empty original data: nothing to show
        if ( empty( $base64EncodedValue ) ) {
            return $string;
        }

        // if there is no id copy it from dataRef
        $id = ( !isset( $node->attributes[ 'id' ] ) ) ? ' id="' . $b . '" removeId="true"' : '';

        // introduce dataType for <ec>/<sc> tag handling
        $dataType = ( $this->isAEcOrScTag( $node ) ) ? ' dataType="' . $node->tagName . '"' : '';

        // inject the new attributes right before the self-closing slash only:
        // attribute values may legitimately contain other slashes
        $d            = $a;
        $closingSlash = strrpos( $a, '/' );
        if ( $closingSlash !== false ) {
            $d = substr_replace( $a, $id . $dataType . ' equiv-text="base64:' . $base64EncodedValue . '"/', $closingSlash, 1 );
        }

        // compare tag content only, so that both plain (<ph/>) and encoded (&lt;ph/&gt;) tags are replaced
        $a = $this->removeAngleBrackets( $a );
        $d = $this->removeAngleBrackets( $d );

        // convert <ec>/<sc> into <ph>
        if ( $this->isAEcOrScTag( $node ) ) {
            $d = 'ph' . substr( $d, 2 );
            $d = trim( $d );
        }

        return str_replace( $a, $d, $string );
    }

    /**
     * Tells whether the string contains at least one <pc ...> tag (opening or self-closed).
     *
     * @param string $string
     *
     * @return bool
     */
    private function stringContainsPcTags( $string ) {
        return preg_match( '/<pc [^>]+?>/iu', $string ) === 1;
    }

    /**
     * Converts every self-closed <pc/> whose dataRefStart is in the map into a
     * <ph dataType="pcSelf"> tag; the original tag is kept base64 encoded in originalData.
     *
     * @param string $string
     *
     * @return string
     * @throws DOMException
     * @throws InvalidXmlException
     * @throws XmlParsingException
     */
    private function replaceSelfClosedPcTags( $string ) {

        preg_match_all( '|<pc[^>]+?/>|iu', $string, $selfClosedPcMatches );

        foreach ( $selfClosedPcMatches[ 0 ] as $match ) {

            $html       = XmlParser::parse( $match, true );
            $attributes = $html[ 0 ]->attributes;

            if ( !isset( $attributes[ 'dataRefStart' ] ) || !array_key_exists( $attributes[ 'dataRefStart' ], $this->map ) ) {
                continue;
            }

            $dataRefStart = $attributes[ 'dataRefStart' ];

            // a missing id is rendered as an empty id attribute
            $id = isset( $attributes[ 'id' ] ) ? $attributes[ 'id' ] : '';

            $replacement = '<ph id="' . $id . '" dataType="pcSelf" originalData="' . base64_encode( $match ) . '" dataRef="' . $dataRefStart
                    . '" equiv-text="base64:' . base64_encode( $this->pcOriginalData( $dataRefStart ) ) . '"/>';

            $string = str_replace( $match, $replacement, $string );
        }

        return $string;
    }

    /**
     * Build the DataRefEndMap needed by replaceClosingPcTags function
     * (only for <pc> tags handling).
     *
     * The whole tree is visited, so that the list holds exactly one entry for every
     * non self-closed <pc>, in the same order as the closing </pc> tags appear in the string.
     *
     * @param iterable $html parsed nodes
     *
     * @return array list of [ 'id' => string|null, 'dataRefEnd' => string|null ]
     */
    private function buildDataRefEndMap( $html ) {
        $dataRefEndMap = [];

        foreach ( $html as $node ) {
            $this->extractDataRefMapRecursively( $node, $dataRefEndMap );
        }

        return $dataRefEndMap;
    }

    /**
     * Extract (recursively) the dataRefEnd map from single nodes.
     *
     * Children are visited before the node itself (post-order): an inner </pc>
     * always comes before the </pc> of the tag that contains it.
     *
     * @param object $node
     * @param array  $dataRefEndMap filled by reference
     */
    private function extractDataRefMapRecursively( $node, &$dataRefEndMap ) {
        if ( $node->has_children ) {
            foreach ( $node->inner_html as $nestedNode ) {
                $this->extractDataRefMapRecursively( $nestedNode, $dataRefEndMap );
            }
        }

        // only non self-closed <pc> own a closing tag
        if ( $node->tagName !== 'pc' || $node->self_closed !== false ) {
            return;
        }

        // dataRefEnd falls back to dataRefStart
        if ( isset( $node->attributes[ 'dataRefEnd' ] ) ) {
            $dataRefEnd = $node->attributes[ 'dataRefEnd' ];
        } elseif ( isset( $node->attributes[ 'dataRefStart' ] ) ) {
            $dataRefEnd = $node->attributes[ 'dataRefStart' ];
        } else {
            // entry kept anyway (closing tag left untouched) to preserve the alignment with </pc> tags
            $dataRefEnd = null;
        }

        $dataRefEndMap[] = [
                'id'         => isset( $node->attributes[ 'id' ] ) ? $node->attributes[ 'id' ] : null,
                'dataRefEnd' => $dataRefEnd,
        ];
    }

    /**
     * Removes (recursively) the equiv-text attribute from leaf nodes whose dataRef is in the map.
     *
     * @param object $node
     * @param string $string
     *
     * @return string
     */
    private function recursiveCleanFromEquivText( $node, $string ) {

        if ( $node->tagName == '#text' ) {
            return $string;
        }

        if ( $node->has_children ) {
            foreach ( $node->inner_html as $childNode ) {
                $string = $this->recursiveCleanFromEquivText( $childNode, $string );
            }

            return $string;
        }

        if ( isset( $node->attributes[ 'dataRef' ] ) && array_key_exists( $node->attributes[ 'dataRef' ], $this->map ) ) {
            $cleaned = preg_replace( '/ equiv-text="(.*?)"/', '', $node->node );
            $string  = str_replace( $node->node, $cleaned, $string );
        }

        return $string;
    }

    /**
     * Replace opening <pc> tags with a <ph dataType="pcStart"> tag; the original
     * opening tag is kept base64 encoded in the originalData attribute.
     *
     * NOTE(review): the matching regex excludes any '/' inside the tag, so an opening <pc>
     * with a slash in an attribute value is not converted - confirm whether this is wanted.
     *
     * @param string $string
     *
     * @return string
     */
    private function replaceOpeningPcTags( $string ) {

        preg_match_all( '|<pc ([^>/]+?)>|iu', $string, $openingPcMatches );

        foreach ( $openingPcMatches[ 0 ] as $index => $match ) {

            preg_match_all( '|([a-zA-Z]+?)\s*=\s*[\'"](.+?)[\'"]|', $openingPcMatches[ 1 ][ $index ], $_attr, PREG_SET_ORDER );

            $attr = [];
            foreach ( $_attr as $attrGroup ) {
                $attr[ $attrGroup[ 1 ] ] = $attrGroup[ 2 ];
            }

            // missing `dataRefStart`: fall back to `dataRefEnd`
            if ( !isset( $attr[ 'dataRefStart' ] ) && isset( $attr[ 'dataRefEnd' ] ) ) {
                $attr[ 'dataRefStart' ] = $attr[ 'dataRefEnd' ];
            }

            // no reference at all: leave the tag as is
            if ( !isset( $attr[ 'dataRefStart' ] ) ) {
                continue;
            }

            $base64EncodedStartValue = base64_encode( $this->pcOriginalData( $attr[ 'dataRefStart' ] ) );
            $base64StartOriginalData = base64_encode( $match ); // opening <pc>

            // conversion for opening <pc> tag
            $openingPcConverted = '<ph ' . ( ( isset( $attr[ 'id' ] ) ) ? 'id="' . $attr[ 'id' ] . '_1"' : '' ) . ' dataType="pcStart" originalData="' . $base64StartOriginalData . '" dataRef="'
                    . $attr[ 'dataRefStart' ] . '" equiv-text="base64:'
                    . $base64EncodedStartValue . '"/>';

            $string = str_replace( $match, $openingPcConverted, $string );
        }

        return $string;
    }

    /**
     * Replace closing </pc> tags with a <ph dataType="pcEnd"> tag, pairing the n-th
     * closing tag of the string with the n-th entry of $dataRefEndMap.
     *
     * Closing tags without a matching entry (or whose entry has no dataRefEnd) are left untouched.
     *
     * @param string $string
     * @param array  $dataRefEndMap as built by buildDataRefEndMap()
     *
     * @return string
     */
    private function replaceClosingPcTags( $string, $dataRefEndMap = [] ) {
        preg_match_all( '|</pc>|iu', $string, $closingPcMatches, PREG_OFFSET_CAPTURE );

        // offsets were captured on the original string: $delta tracks how much
        // the replacements done so far have shifted the remaining ones
        $delta = 0;

        foreach ( $closingPcMatches[ 0 ] as $index => $match ) {

            if ( !isset( $dataRefEndMap[ $index ][ 'dataRefEnd' ] ) ) {
                continue;
            }

            $attr            = $dataRefEndMap[ $index ];
            $endOriginalData = $match[ 0 ]; // </pc>
            $offset          = $match[ 1 ];
            $length          = strlen( $endOriginalData );

            $base64EncodedEndValue = base64_encode( $this->pcOriginalData( $attr[ 'dataRefEnd' ] ) );
            $base64EndOriginalData = base64_encode( $endOriginalData );

            // conversion for closing <pc> tag
            $closingPcConverted = '<ph ' . ( ( isset( $attr[ 'id' ] ) ) ? 'id="' . $attr[ 'id' ] . '_2"' : '' ) . ' dataType="pcEnd" originalData="' . $base64EndOriginalData . '" dataRef="'
                    . $attr[ 'dataRefEnd' ] . '" equiv-text="base64:' . $base64EncodedEndValue . '"/>';

            $string = substr_replace( $string, $closingPcConverted, $offset + $delta, $length );
            $delta  += strlen( $closingPcConverted ) - $length;
        }

        return $string;
    }

    /**
     * Original data shown in the equiv-text of a converted <pc> tag.
     *
     * A reference that is missing from the map, null or an empty string is rendered
     * as the 'NULL' string; any other value (including "0") is returned as is.
     *
     * @param string $dataRef
     *
     * @return string
     */
    private function pcOriginalData( $dataRef ) {
        if ( !isset( $this->map[ $dataRef ] ) || $this->map[ $dataRef ] === '' ) {
            return 'NULL';
        }

        return $this->map[ $dataRef ];
    }

    /**
     * Reverts replace(): removes the equiv-text (and the other helper attributes) added to the tags
     * and rebuilds the original <sc>, <ec> and <pc> tags.
     *
     * @param string $string
     *
     * @return string the string unchanged when the map is empty
     * @throws DOMException
     * @throws InvalidXmlException
     * @throws XmlParsingException
     */
    public function restore( $string ) {
        // if map is empty return string as is
        if ( empty( $this->map ) ) {
            return $string;
        }

        // replace eventual empty equiv-text=""
        $string = str_replace( ' equiv-text=""', '', $string );
        $html   = XmlParser::parse( $string, true );

        foreach ( $html as $node ) {
            $string = $this->recursiveRemoveOriginalData( $node, $string );
        }

        return $string;
    }

    /**
     * Restores (recursively) the original form of every leaf tag whose dataRef is in the map.
     *
     * @param object $node
     * @param string $string
     *
     * @return string
     */
    private function recursiveRemoveOriginalData( $node, $string ) {
        if ( $node->has_children ) {
            foreach ( $node->inner_html as $childNode ) {
                $string = $this->recursiveRemoveOriginalData( $childNode, $string );
            }

            return $string;
        }

        if ( !isset( $node->attributes[ 'dataRef' ] ) ) {
            return $string;
        }

        $a = $node->node;                    // complete match. Eg:  <ph id="source1" dataRef="source1"/>
        $b = $node->attributes[ 'dataRef' ]; // map identifier. Eg: source1

        // unknown reference: leave the tag untouched
        if ( !array_key_exists( $b, $this->map ) ) {
            return $string;
        }

        // a null original data was rendered as the 'NULL' string
        $value = is_null( $this->map[ $b ] ) ? 'NULL' : $this->map[ $b ];

        // the id was copied from dataRef: it has to be removed together with its flag
        $removeId = ( isset( $node->attributes[ 'removeId' ] ) && $node->attributes[ 'removeId' ] === "true" ) ? ' id="' . $b . '" removeId="true"' : '';

        // grab dataType attribute for <ec>/<sc> tag handling
        $dataType = ( $this->wasAEcOrScTag( $node ) ) ? ' dataType="' . $node->attributes[ 'dataType' ] . '"' : '';

        $d = str_replace( $removeId . $dataType . ' equiv-text="base64:' . base64_encode( $value ) . '"/>', '/>', $a );

        // replace original <ec>/<sc> tag ($d still starts with '<ph' here)
        if ( $this->wasAEcOrScTag( $node ) ) {
            $d = $node->attributes[ 'dataType' ] . substr( $d, 3 );
            $d = trim( $d );
        }

        // replace only content tag, no matter if the string is encoded or not
        // in this way we can handle string with mixed tags (encoded and not-encoded)
        // in the same string
        $a = $this->removeAngleBrackets( $a );
        $d = $this->removeAngleBrackets( $d );

        $string = str_replace( $a, $d, $string );

        // restoring <pc/>, <pc> and </pc> here:
        // a <ph> flagged as pcSelf, pcStart or pcEnd is replaced with its originalData
        $isConvertedPc = CatUtils::contains( 'dataType="pcSelf"', $d )
                || CatUtils::contains( 'dataType="pcStart"', $d )
                || CatUtils::contains( 'dataType="pcEnd"', $d );

        if ( $isConvertedPc && preg_match( '/\s?originalData="(.*?)"\s?/', $d, $originalDataMatches ) ) {
            $originalData = base64_decode( $originalDataMatches[ 1 ] );
            $originalData = $this->removeAngleBrackets( $originalData );
            $string       = str_replace( $d, $originalData, $string );
        }

        return $string;
    }

    /**
     * Strips plain and entity-encoded angle brackets from the string.
     *
     * @param string $string
     *
     * @return string
     */
    private function removeAngleBrackets( $string ) {
        return str_replace( [ '<', '>', '&lt;', '&gt;' ], '', $string );
    }

    /**
     * This function checks if a node is a tag <ec> or <sc>
     *
     * @param object $node
     *
     * @return bool
     */
    private function isAEcOrScTag( $node ) {
        return ( $node->tagName === 'ec' || $node->tagName === 'sc' );
    }

    /**
     * This function checks if a <ph> tag node
     * was originally a <ec> or <sc>
     *
     * @param object $node
     *
     * @return bool
     */
    private function wasAEcOrScTag( $node ) {
        return ( isset( $node->attributes[ 'dataType' ] ) && ( $node->attributes[ 'dataType' ] === 'ec' || $node->attributes[ 'dataType' ] === 'sc' ) );
    }
}