Passed
Push — master ( 0e1770...775632 )
by Mauro
03:12
created

AbstractXliffParser   A

Complexity

Total Complexity 37

Size/Duplication

Total Lines 283
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 77
dl 0
loc 283
rs 9.44
c 1
b 0
f 0
wmc 37

10 Methods

Rating   Name   Duplication   Size   Complexity  
A extractContent() 0 4 1
A getTuTagName() 0 2 2
B extractContentWithMarksAndExtTags() 0 52 11
A __construct() 0 4 1
A stringContainsMarks() 0 4 1
A JSONOrRawContentArray() 0 22 3
A getDataRefMap() 0 10 3
B extractTuFromNode() 0 27 9
A extractTagAttributes() 0 10 3
A extractTagContent() 0 11 3
1
<?php
2
3
namespace Matecat\XliffParser\XliffParser;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use Matecat\XliffParser\Constants\Placeholder;
9
use Matecat\XliffParser\Utils\Emoji;
10
use Matecat\XliffParser\Utils\Strings;
11
use Matecat\XliffParser\XliffUtils\DataRefReplacer;
12
use Psr\Log\LoggerInterface;
13
14
abstract class AbstractXliffParser {

    /**
     * Maximum nesting depth of <group> elements that extractTuFromNode() descends into.
     * Groups nested deeper than this are ignored, together with the units they contain.
     */
    const MAX_GROUP_RECURSION_LEVEL = 5;

    /**
     * Optional logger; null when none was supplied to the constructor.
     *
     * @var LoggerInterface|null
     */
    protected $logger;

    /**
     * Name of the proprietary XLIFF flavour (this class only checks for 'trados'), or null.
     *
     * @var string|null
     */
    protected $xliffProprietary;

    /**
     * XLIFF major version; the integer 1 selects the 'trans-unit' tag, anything else 'unit'.
     *
     * @var int
     */
    protected $xliffVersion;

    /**
     * AbstractXliffParser constructor.
     *
     * @param int                  $xliffVersion
     * @param string|null          $xliffProprietary
     * @param LoggerInterface|null $logger
     */
    public function __construct( $xliffVersion, $xliffProprietary = null, LoggerInterface $logger = null ) {
        $this->xliffVersion     = $xliffVersion;
        $this->logger           = $logger;
        $this->xliffProprietary = $xliffProprietary;
    }

    /**
     * Name of the element holding a translation unit for the configured XLIFF version.
     *
     * The comparison is strict: only the integer 1 yields 'trans-unit'.
     *
     * @return string 'trans-unit' or 'unit'
     */
    protected function getTuTagName() {
        return ( $this->xliffVersion === 1 ) ? 'trans-unit' : 'unit';
    }

    /**
     * Parse the given document and return the populated output structure.
     *
     * @param DOMDocument $dom
     * @param array       $output initial output structure to populate
     *
     * @return array
     */
    abstract public function parse( DOMDocument $dom, $output = [] );

    /**
     * Extract trans-unit content from the current node.
     *
     * A translation-unit node is handed straight to extractTransUnit(). A <group> node is walked:
     * its direct <context-group> children are appended to $contextGroups, then every nested
     * <group> is processed recursively and every nested translation unit is extracted.
     * Any other node is ignored.
     *
     * @param DOMNode     $childNode                          node to inspect
     * @param array       $transUnitIdArrayForUniquenessCheck passed through (by reference) to extractTransUnit()
     * @param DOMDocument $dom
     * @param array       $output                             passed through (by reference) to extractTransUnit()
     * @param int         $i                                  passed through (by reference) to extractTransUnit()
     * @param int         $j                                  passed through (by reference) to extractTransUnit()
     * @param array       $contextGroups                      context-group nodes inherited from enclosing groups
     * @param int         $recursionLevel                     nesting depth of $childNode (0 for the outermost call)
     */
    protected function extractTuFromNode( $childNode, &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, &$output, &$i, &$j, $contextGroups = [], $recursionLevel = 0 ) {
        $tuTagName = $this->getTuTagName();

        if ( $childNode->nodeName === 'group' ) {

            // add nested context-groups (collected first, so that they apply to every unit of this group)
            foreach ( $childNode->childNodes as $nestedChildNode ) {
                if ( $nestedChildNode->nodeName === 'context-group' ) {
                    $contextGroups[] = $nestedChildNode;
                }
            }

            // The depth of every nested group is the same: one more than the current one.
            // It must NOT be incremented once per sibling, otherwise the 5th and following
            // sibling groups of the same parent would be silently skipped.
            $nestedLevel = $recursionLevel + 1;

            foreach ( $childNode->childNodes as $nestedChildNode ) {

                if ( $nestedChildNode->nodeName === 'group' ) {

                    // nested groups: avoid unbounded recursion
                    if ( $nestedLevel < self::MAX_GROUP_RECURSION_LEVEL ) {
                        $this->extractTuFromNode( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups, $nestedLevel );
                    }

                } elseif ( $nestedChildNode->nodeName === $tuTagName ) {
                    $this->extractTransUnit( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
                }
            }
        } elseif ( $childNode->nodeName === $tuTagName ) {
            $this->extractTransUnit( $childNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
        }
    }

    /**
     * Extract and populate 'trans-units' array
     *
     * @param $transUnit
     * @param $transUnitIdArrayForUniquenessCheck
     * @param $dom
     * @param $output
     * @param $i
     * @param $j
     * @param $contextGroups
     *
     * @return mixed
     */
    abstract protected function extractTransUnit( $transUnit, &$transUnitIdArrayForUniquenessCheck, $dom, &$output, &$i, &$j, $contextGroups = [] );

    /**
     * Build the 'raw-content' / 'attr' pair describing a node.
     *
     * @param DOMDocument $dom
     * @param DOMNode     $node
     *
     * @return array array with the keys 'raw-content' (string) and 'attr' (array)
     */
    protected function extractContent( DOMDocument $dom, DOMNode $node ) {
        return [
                'raw-content' => $this->extractTagContent( $dom, $node ),
                'attr'        => $this->extractTagAttributes( $node )
        ];
    }

    /**
     * Extract attributes if they are present
     *
     * Ex:
     * <p align=center style="font-size: 12px;">some text</p>
     *
     * $attr->nodeName == 'align' :: $attr->nodeValue == 'center'
     * $attr->nodeName == 'style' :: $attr->nodeValue == 'font-size: 12px;'
     *
     * @param DOMNode $element
     *
     * @return array attribute name => attribute value; empty when the node has no attributes
     */
    protected function extractTagAttributes( DOMNode $element ) {
        $tagAttributes = [];

        if ( $element->hasAttributes() ) {
            foreach ( $element->attributes as $attr ) {
                $tagAttributes[ $attr->nodeName ] = $attr->nodeValue;
            }
        }

        return $tagAttributes;
    }

    /**
     * Extract tag content from DOMDocument node
     *
     * Every child node is serialized with DOMDocument::saveXML(), passed through
     * Strings::fixNonWellFormedXml() and Emoji::toEntity(), and the pieces are concatenated.
     * Occurrences of Placeholder::EMPTY_TAG_PLACEHOLDER are removed from the final string.
     *
     * @param DOMDocument $dom
     * @param DOMNode     $element
     *
     * @return string empty string when the element has no children
     */
    protected function extractTagContent( DOMDocument $dom, DOMNode $element ) {
        $extractedContent = '';

        if ( $element->hasChildNodes() ) {
            foreach ( $element->childNodes as $node ) {
                $extractedContent .= Emoji::toEntity( Strings::fixNonWellFormedXml( $dom->saveXML( $node ) ) );
            }
        }

        return str_replace( Placeholder::EMPTY_TAG_PLACEHOLDER, '', $extractedContent );
    }

    /**
     * Used to extract <seg-source> and <seg-target>
     *
     * The serialized content of $childNode is split on every '<mrk ' opening; one entry is
     * produced per marker, with the keys 'mid', 'ext-prec-tags', 'raw-content', 'ext-succ-tags'
     * and, only when $originalData is not empty, 'replaced-content'.
     *
     * @param DOMDocument $dom
     * @param DOMElement  $childNode
     * @param string      $originalRawContent not used by this implementation; kept so that the
     *                                        signature stays compatible with existing callers
     * @param array       $originalData       entries in the shape expected by getDataRefMap()
     *
     * @return array list of marker descriptors; empty when the content holds no <mrk> tag
     */
    protected function extractContentWithMarksAndExtTags( DOMDocument $dom, DOMElement $childNode, $originalRawContent, array $originalData = [] ) {
        $source = [];

        // example:
        // <g id="1"><mrk mid="0" mtype="seg">An English string with g tags</mrk></g>
        $raw = $this->extractTagContent( $dom, $childNode );

        // $markers[ 0 ] is whatever precedes the first <mrk>, every following item is one marker
        $markers = preg_split( '#<mrk\s#si', $raw, -1 );

        // the dataRef map does not depend on the marker, so it is built only once
        $hasOriginalData = !empty( $originalData );
        $dataRefMap      = $hasOriginalData ? $this->getDataRefMap( $originalData ) : [];

        $mi = 0;
        while ( isset( $markers[ $mi + 1 ] ) ) {
            $markerChunk = $markers[ $mi + 1 ];

            $mid = [];
            preg_match( '|mid\s?=\s?["\'](.*?)["\']|si', $markerChunk, $mid );

            // if it's a Trados file the trailing spaces after </mrk> are meaningful,
            // so we add them back to the raw content (the trim() below would drop them)
            $trailingSpaces = '';
            if ( $this->xliffProprietary === 'trados' ) {
                preg_match_all( '/<\/mrk>[\s]+/iu', $markerChunk, $trailingSpacesMatches );

                // when more than one match is found, the last one wins
                if ( !empty( $trailingSpacesMatches[ 0 ] ) ) {
                    $trailingSpaces = str_replace( '</mrk>', '', end( $trailingSpacesMatches[ 0 ] ) );
                }
            }

            // re-build the mrk tag after the split
            $originalMark = trim( '<mrk ' . $markerChunk );

            // strip the opening tag; at this point we have: ---> 'Test </mrk> </g>'
            $mark_string  = preg_replace( '#^<mrk\s[^>]+>(.*)#', '$1', $originalMark );
            $mark_content = preg_split( '#</mrk>#si', $mark_string );

            $sourceArray = [
                    'mid'           => ( isset( $mid[ 1 ] ) ) ? $mid[ 1 ] : $mi,
                    'ext-prec-tags' => ( $mi == 0 ? $markers[ 0 ] : "" ),
                    'raw-content'   => ( isset( $mark_content[ 0 ] ) ) ? $mark_content[ 0 ] . $trailingSpaces : '',
                    'ext-succ-tags' => ( isset( $mark_content[ 1 ] ) ) ? $mark_content[ 1 ] : '',
            ];

            if ( $hasOriginalData ) {
                $sourceArray[ 'replaced-content' ] = ( new DataRefReplacer( $dataRefMap ) )->replace( $mark_content[ 0 ] );
            }

            $source[] = $sourceArray;

            $mi++;
        }

        return $source;
    }

    /**
     * Build a map from original-data id to its raw content.
     *
     * Entries without an $datum['attr']['id'] value are skipped.
     *
     * @param array $originalData list of arrays with the keys 'attr' (holding 'id') and 'raw-content'
     *
     * @return array id => raw-content
     */
    protected function getDataRefMap( $originalData ) {
        $dataRefMap = [];

        foreach ( $originalData as $datum ) {
            if ( isset( $datum[ 'attr' ][ 'id' ] ) ) {
                $dataRefMap[ $datum[ 'attr' ][ 'id' ] ] = $datum[ 'raw-content' ];
            }
        }

        return $dataRefMap;
    }

    /**
     * Tell whether the string contains at least one opening <mrk> tag followed by a whitespace.
     *
     * @param string $raw
     *
     * @return bool
     */
    protected function stringContainsMarks( $raw ) {
        $markers = preg_split( '#<mrk\s#si', $raw, -1 );

        return isset( $markers[ 1 ] );
    }

    /**
     * Wrap a note value either as JSON or as raw content.
     *
     * @param string $noteValue
     * @param bool   $escapeStrings honoured only for double escaped values, otherwise forced to true
     *
     * @return array [ 'json' => ... ] when the value is JSON, [ 'raw-content' => ... ] otherwise
     * @throws \Exception
     */
    protected function JSONOrRawContentArray( $noteValue, $escapeStrings = true ) {
        //
        // convert double escaped entities
        //
        // Example:
        //
        // &amp;#39;   ---> &#39;
        // &amp;amp;   ---> &amp;
        // &amp;apos;  ---> &apos;
        //
        if ( Strings::isADoubleEscapedEntity( $noteValue ) ) {
            $noteValue = Strings::htmlspecialchars_decode( $noteValue, true );
        } else {
            // for non escaped entities $escapeStrings is always true for security reasons
            $escapeStrings = true;
        }

        if ( Strings::isJSON( $noteValue ) ) {
            return [ 'json' => Strings::cleanCDATA( $noteValue ) ];
        }

        return [ 'raw-content' => Strings::fixNonWellFormedXml( $noteValue, $escapeStrings ) ];
    }
}
299