AbstractXliffParser::extractTagContent()   A
last analyzed

Complexity

Conditions 3
Paths 2

Size

Total Lines 11
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 6
c 0
b 0
f 0
nc 2
nop 2
dl 0
loc 11
rs 10
1
<?php
2
3
namespace Matecat\XliffParser\XliffParser;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use Exception;
9
use Matecat\EmojiParser\Emoji;
10
use Matecat\XliffParser\Constants\Placeholder;
11
use Matecat\XliffParser\Utils\Strings;
12
use OverflowException;
13
use Psr\Log\LoggerInterface;
14
15
abstract class AbstractXliffParser {

    /**
     * Nesting limit for <group> elements: extractTuFromNode() throws an
     * OverflowException instead of descending into a nested group once its
     * recursion counter has reached this value.
     */
    public const MAX_GROUP_RECURSION_LEVEL = 50;

    /**
     * Optional logger supplied to the constructor (not used by this base class itself).
     *
     * @var LoggerInterface|null
     */
    protected ?LoggerInterface $logger;

    /**
     * Optional identifier of a proprietary XLIFF flavour. This class only checks it
     * against the value 'trados' (see extractContentWithMarksAndExtTags()).
     *
     * @var string|null
     */
    protected ?string $xliffProprietary;

    /**
     * XLIFF version; the value 1 selects 'trans-unit' as the unit tag name, any other value 'unit'.
     *
     * @var int
     */
    protected $xliffVersion;

    /**
     * AbstractXliffParser constructor.
     *
     * @param int                  $xliffVersion
     * @param string|null          $xliffProprietary
     * @param LoggerInterface|null $logger
     */
    public function __construct( int $xliffVersion, ?string $xliffProprietary = null, ?LoggerInterface $logger = null ) {
        $this->xliffVersion     = $xliffVersion;
        $this->logger           = $logger;
        $this->xliffProprietary = $xliffProprietary;
    }

    /**
     * Name of the translation-unit element for the configured XLIFF version.
     *
     * @return string 'trans-unit' when the version is exactly 1, 'unit' otherwise
     */
    protected function getTuTagName(): string {
        return ( $this->xliffVersion === 1 ) ? 'trans-unit' : 'unit';
    }

    /**
     * @param DOMDocument $dom
     * @param array|null  $output
     *
     * @return array
     */
    abstract public function parse( DOMDocument $dom, ?array $output = [] ): array;

    /**
     * Extract trans-unit content from the current node.
     *
     * Non-element nodes are ignored. A translation-unit element is handed straight to
     * extractTransUnit(). A <group> element is walked: its direct <context-group> children
     * are appended to $contextGroups, then every direct translation-unit child is extracted
     * and every direct nested <group> is processed recursively with the accumulated
     * context groups. Any other element is ignored.
     *
     * @param DOMNode     $childNode
     * @param array       $transUnitIdArrayForUniquenessCheck
     * @param DOMDocument $dom
     * @param array       $output
     * @param int         $i
     * @param int         $j
     * @param array|null  $contextGroups
     * @param int|null    $recursionLevel
     *
     * @throws OverflowException when a nested <group> is found after the recursion counter
     *                           has reached MAX_GROUP_RECURSION_LEVEL
     */
    protected function extractTuFromNode( DOMNode $childNode, array &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, array &$output, int &$i, int &$j, ?array $contextGroups = [], ?int $recursionLevel = 0 ) {

        if ( $childNode->nodeType !== XML_ELEMENT_NODE ) {
            return;
        }

        // resolved once: it does not change while walking the children
        $tuTagName = $this->getTuTagName();

        if ( $childNode->nodeName === $tuTagName ) {
            $this->extractTransUnit( $childNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );

            return;
        }

        if ( $childNode->nodeName !== 'group' ) {
            return;
        }

        // collect the context-groups of this group BEFORE visiting its units,
        // so that every unit/nested group below receives all of them
        foreach ( $childNode->childNodes as $nestedChildNode ) {
            if ( $nestedChildNode->nodeName === 'context-group' ) {
                $contextGroups[] = $nestedChildNode;
            }
        }

        // avoid infinite recursion
        $recursionLevel++;

        foreach ( $childNode->childNodes as $nestedChildNode ) {

            if ( $nestedChildNode->nodeName === $tuTagName ) {
                $this->extractTransUnit( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
                continue;
            }

            if ( $nestedChildNode->nodeName !== 'group' ) {
                continue;
            }

            // nested groups
            if ( $recursionLevel >= self::MAX_GROUP_RECURSION_LEVEL ) {
                throw new OverflowException( "Maximum tag group nesting level of '" . self::MAX_GROUP_RECURSION_LEVEL . "' reached, aborting!" );
            }

            $this->extractTuFromNode( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups, $recursionLevel );
        }
    }

    /**
     * Extract and populate 'trans-units' array
     *
     * @param DOMElement  $transUnit
     * @param array       $transUnitIdArrayForUniquenessCheck
     * @param DOMDocument $dom
     * @param array       $output
     * @param int         $i
     * @param int         $j
     * @param array|null  $contextGroups
     *
     * @return mixed
     */
    abstract protected function extractTransUnit( DOMElement $transUnit, array &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, array &$output, int &$i, int &$j, ?array $contextGroups = [] );

    /**
     * Bundle the serialized content and the attributes of a node.
     *
     * @param DOMDocument $dom
     * @param DOMNode     $node
     *
     * @return array with the keys 'raw-content' (see extractTagContent()) and 'attr' (see extractTagAttributes())
     */
    protected function extractContent( DOMDocument $dom, DOMNode $node ): array {
        return [
                'raw-content' => $this->extractTagContent( $dom, $node ),
                'attr'        => $this->extractTagAttributes( $node )
        ];
    }

    /**
     * Extract attributes if they are present
     *
     * Ex:
     * <p align=center style="font-size: 12px;">some text</p>
     *
     * $attr->nodeName == 'align' :: $attr->nodeValue == 'center'
     * $attr->nodeName == 'style' :: $attr->nodeValue == 'font-size: 12px;'
     *
     * @param DOMNode $element
     *
     * @return array attribute name => attribute value; empty when the node has no attributes
     */
    protected function extractTagAttributes( DOMNode $element ): array {
        $tagAttributes = [];

        if ( $element->hasAttributes() ) {
            foreach ( $element->attributes as $attr ) {
                $tagAttributes[ $attr->nodeName ] = $attr->nodeValue;
            }
        }

        return $tagAttributes;
    }

    /**
     * Extract tag content from DOMDocument node
     *
     * Each child node is serialized with DOMDocument::saveXML(), passed through
     * Strings::fixNonWellFormedXml() and Emoji::toEntity(), and concatenated; finally every
     * occurrence of Placeholder::EMPTY_TAG_PLACEHOLDER is removed from the result.
     *
     * @param DOMDocument $dom
     * @param DOMNode     $element
     *
     * @return string empty string when the node has no children
     */
    protected function extractTagContent( DOMDocument $dom, DOMNode $element ): string {
        $extractedContent = '';

        if ( $element->hasChildNodes() ) {
            foreach ( $element->childNodes as $node ) {
                $extractedContent .= Emoji::toEntity( Strings::fixNonWellFormedXml( $dom->saveXML( $node ) ) );
            }
        }

        return str_replace( Placeholder::EMPTY_TAG_PLACEHOLDER, '', $extractedContent );
    }

    /**
     * Used to extract <seg-source> and <seg-target>
     *
     * The serialized content of the node is split on every opening "<mrk " tag; one entry
     * is produced per <mrk> found, with the keys:
     *  - 'mid'           : value of the first mid="..." found in the chunk, or the 0-based mrk index when none is found
     *  - 'ext-prec-tags' : text preceding the first <mrk> (only set on the first entry, '' on the others)
     *  - 'raw-content'   : text between the opening <mrk ...> tag and its </mrk>
     *  - 'ext-succ-tags' : text following </mrk> up to the next <mrk> (or the end)
     *
     * @param DOMDocument $dom
     * @param DOMElement  $childNode
     *
     * @return array list of entries as described above; empty when the content holds no <mrk>
     */
    protected function extractContentWithMarksAndExtTags( DOMDocument $dom, DOMElement $childNode ): array {
        $source = [];

        // example:
        // <g id="1"><mrk mid="0" mtype="seg">An English string with g tags</mrk></g>
        $raw = $this->extractTagContent( $dom, $childNode );

        $markers = preg_split( '#<mrk\s#si', $raw, -1 );

        if ( $markers === false ) {
            // the split failed: there is nothing to iterate on
            return $source;
        }

        $markersCount = count( $markers );

        // $markers[ 0 ] is whatever precedes the first <mrk>; every following chunk is one mrk
        for ( $mi = 0; $mi + 1 < $markersCount; $mi++ ) {
            $marker = $markers[ $mi + 1 ];

            $mid = [];
            preg_match( '|mid\s?=\s?["\'](.*?)["\']|si', $marker, $mid );

            // if it's a Trados file the trailing spaces after </mrk> are meaningful,
            // so we append them to the raw content of the mrk
            $trailingSpaces = '';
            if ( $this->xliffProprietary === 'trados' ) {
                preg_match_all( '/<\/mrk>[\s]+/iu', $marker, $trailingSpacesMatches );

                if ( !empty( $trailingSpacesMatches[ 0 ] ) ) {
                    // when several matches exist, the last one wins
                    $lastMatch      = end( $trailingSpacesMatches[ 0 ] );
                    $trailingSpaces = str_replace( '</mrk>', '', $lastMatch );
                }
            }

            //re-build the mrk tag after the split
            $originalMark = trim( '<mrk ' . $marker );

            // strip the opening tag; at this point we have something like: ---> 'Test </mrk> </g>'
            $mark_string  = preg_replace( '#^<mrk\s[^>]+>(.*)#', '$1', $originalMark );
            $mark_content = preg_split( '#</mrk>#si', $mark_string );

            $source[] = [
                    'mid'           => $mid[ 1 ] ?? $mi,
                    'ext-prec-tags' => ( $mi === 0 ) ? $markers[ 0 ] : "",
                    'raw-content'   => isset( $mark_content[ 0 ] ) ? $mark_content[ 0 ] . $trailingSpaces : '',
                    'ext-succ-tags' => $mark_content[ 1 ] ?? '',
            ];
        }

        return $source;
    }

    /**
     * Build a map from the 'id' attribute of each datum to its raw content.
     * Entries without an 'id' attribute are skipped.
     *
     * @param array $originalData list of arrays shaped like the return value of extractContent()
     *
     * @return array id => raw-content
     */
    protected function getDataRefMap( array $originalData ): array {
        // dataRef map
        $dataRefMap = [];
        foreach ( $originalData as $datum ) {
            if ( isset( $datum[ 'attr' ][ 'id' ] ) ) {
                $dataRefMap[ $datum[ 'attr' ][ 'id' ] ] = $datum[ 'raw-content' ];
            }
        }

        return $dataRefMap;
    }

    /**
     * Tell whether a string contains at least one opening "<mrk" tag followed by whitespace
     * (case-insensitive).
     *
     * @param $raw
     *
     * @return bool
     */
    protected function stringContainsMarks( $raw ): bool {
        // a single match is enough: no need to split the whole string
        return preg_match( '#<mrk\s#si', $raw ) === 1;
    }

    /**
     * Wrap a note value either as JSON or as raw content.
     *
     * @param           $noteValue
     * @param bool|null $escapeStrings only honoured for double escaped values, forced to true otherwise
     *
     * @return array [ 'json' => ... ] when Strings::isJSON() accepts the value, [ 'raw-content' => ... ] otherwise
     * @throws Exception
     */
    protected function JSONOrRawContentArray( $noteValue, ?bool $escapeStrings = true ): array {
        //
        // convert double escaped entities
        //
        // Example:
        //
        // &amp;#39;   ---> &#39;
        // &amp;amp;   ---> &amp;
        // &amp;apos;  ---> &apos;
        //
        if ( Strings::isADoubleEscapedEntity( $noteValue ) ) {
            $noteValue = Strings::htmlspecialchars_decode( $noteValue, true );
        } else {
            // for non escaped entities $escapeStrings is always true for security reasons
            $escapeStrings = true;
        }

        if ( Strings::isJSON( $noteValue ) ) {
            return [ 'json' => Strings::cleanCDATA( $noteValue ) ];
        }

        return [ 'raw-content' => Strings::fixNonWellFormedXml( $noteValue, $escapeStrings ) ];
    }
}
302