AbstractXliffParser::extractTuFromNode()   B
last analyzed

Complexity

Conditions 10
Paths 18

Size

Total Lines 35
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 10
eloc 17
nc 18
nop 8
dl 0
loc 35
rs 7.6666
c 0
b 0
f 0

How to fix   Complexity    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

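Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are involved, Replace Temp with Query, Introduce Parameter Object and Preserve Whole Object.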

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

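There are several approaches to avoid long parameter lists, for example Replace Parameter with Method Call, Introduce Parameter Object and Preserve Whole Object.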

1
<?php
2
3
namespace Matecat\XliffParser\XliffParser;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use Exception;
9
use Matecat\EmojiParser\Emoji;
10
use Matecat\XliffParser\Constants\Placeholder;
11
use Matecat\XliffParser\Utils\Strings;
12
use OverflowException;
13
use Psr\Log\LoggerInterface;
14
15
abstract class AbstractXliffParser {
16
17
    const MAX_GROUP_RECURSION_LEVEL = 50;
18
19
    /**
20
     * @var LoggerInterface|null
21
     */
22
    protected ?LoggerInterface $logger;
23
24
    /**
25
     * @var string|null
26
     */
27
    protected ?string $xliffProprietary;
28
29
    /**
30
     * @var int
31
     */
32
    protected $xliffVersion;
33
34
    /**
     * AbstractXliffParser constructor.
     *
     * @param int                  $xliffVersion     XLIFF version of the file being parsed; getTuTagName() uses it
     *                                               to choose between 'trans-unit' (version 1) and 'unit'
     * @param string|null          $xliffProprietary proprietary flavour of the file, if any; compared against
     *                                               'trados' when <mrk> segments are extracted
     * @param LoggerInterface|null $logger           optional logger, stored as-is
     */
    public function __construct( int $xliffVersion, ?string $xliffProprietary = null, ?LoggerInterface $logger = null ) {
        // the nullable type is spelled out: relying on "= null" to make a typed parameter nullable is deprecated since PHP 8.4
        $this->xliffVersion     = $xliffVersion;
        $this->logger           = $logger;
        $this->xliffProprietary = $xliffProprietary;
    }
46
47
    /**
     * Name of the element that holds a translation unit for the configured XLIFF version.
     *
     * @return string 'trans-unit' when the version is exactly 1, 'unit' for every other version
     */
    protected function getTuTagName(): string {
        if ( $this->xliffVersion === 1 ) {
            return 'trans-unit';
        }

        return 'unit';
    }
53
54
    /**
     * Parse the XLIFF content held by the given DOM document.
     *
     * Every concrete parser provides its own implementation.
     *
     * @param DOMDocument $dom    document to read
     * @param array|null  $output starting value for the result; an empty array by default
     *                            (NOTE(review): presumably the array the parsed data is added to — confirm in the concrete parsers)
     *
     * @return array
     */
    abstract public function parse( DOMDocument $dom, ?array $output = [] ): array;
61
62
    /**
     * Extract trans-unit content from the current node.
     *
     * Nodes that are not elements are ignored. A <group> element is walked: its direct <context-group>
     * children are first appended to $contextGroups, then every nested <group> is processed recursively and
     * every translation-unit element (see getTuTagName()) is handed to extractTransUnit(), in document order.
     * An element that is itself a translation unit is handed to extractTransUnit() directly.
     * Any other element is ignored.
     *
     * @param DOMNode     $childNode                          node to inspect
     * @param array       $transUnitIdArrayForUniquenessCheck forwarded by reference to extractTransUnit()
     * @param DOMDocument $dom                                forwarded to extractTransUnit()
     * @param array       $output                             forwarded by reference to extractTransUnit()
     * @param int         $i                                  forwarded by reference to extractTransUnit()
     * @param int         $j                                  forwarded by reference to extractTransUnit()
     * @param array|null  $contextGroups                      context-group nodes collected from the enclosing groups
     * @param int|null    $recursionLevel                     number of enclosing groups already entered
     *
     * @throws OverflowException when a group nested MAX_GROUP_RECURSION_LEVEL levels deep contains another group
     */
    protected function extractTuFromNode( DOMNode $childNode, array &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, array &$output, int &$i, int &$j, ?array $contextGroups = [], ?int $recursionLevel = 0 ) {

        if ( $childNode->nodeType !== XML_ELEMENT_NODE ) {
            return;
        }

        if ( $childNode->nodeName !== 'group' ) {
            if ( $childNode->nodeName === $this->getTuTagName() ) {
                $this->extractTransUnit( $childNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
            }

            return;
        }

        // add nested context-groups; they are collected up front so that every unit of the group
        // receives them, wherever the context-group sits among its siblings
        foreach ( $childNode->childNodes as $nestedChildNode ) {
            // a processing instruction reports its target as nodeName, so the name alone is not enough
            if ( $nestedChildNode->nodeType === XML_ELEMENT_NODE && $nestedChildNode->nodeName === 'context-group' ) {
                $contextGroups[] = $nestedChildNode;
            }
        }

        // avoid infinite recursion
        $recursionLevel++;

        foreach ( $childNode->childNodes as $nestedChildNode ) {

            // only elements can be groups or translation units; without this guard a node such as
            // <?trans-unit ...?> would be passed to extractTransUnit( DOMElement ... ) and raise a TypeError
            if ( $nestedChildNode->nodeType !== XML_ELEMENT_NODE ) {
                continue;
            }

            if ( $nestedChildNode->nodeName === 'group' ) {

                // nested groups
                if ( $recursionLevel >= self::MAX_GROUP_RECURSION_LEVEL ) {
                    throw new OverflowException( "Maximum tag group nesting level of '" . self::MAX_GROUP_RECURSION_LEVEL . "' reached, aborting!" );
                }

                $this->extractTuFromNode( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups, $recursionLevel );

            } elseif ( $nestedChildNode->nodeName === $this->getTuTagName() ) {
                $this->extractTransUnit( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
            }
        }
    }
111
112
    /**
     * Extract and populate 'trans-units' array
     *
     * Implemented by each concrete parser; extractTuFromNode() calls it for every translation-unit element it finds.
     *
     * @param DOMElement  $transUnit                          the translation-unit element
     * @param array       $transUnitIdArrayForUniquenessCheck passed by reference
     * @param DOMDocument $dom
     * @param array       $output                             passed by reference
     * @param int         $i                                  passed by reference
     * @param int         $j                                  passed by reference
     * @param array|null  $contextGroups                      context-group nodes collected from the enclosing groups
     *
     * @return mixed
     */
    abstract protected function extractTransUnit( DOMElement $transUnit, array &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, array &$output, int &$i, int &$j, ?array $contextGroups = [] );
126
127
    /**
     * Bundle a node's serialized inner content together with its attributes.
     *
     * @param DOMDocument $dom
     * @param DOMNode     $node
     *
     * @return array with the keys 'raw-content' (result of extractTagContent()) and 'attr' (result of extractTagAttributes())
     */
    protected function extractContent( DOMDocument $dom, DOMNode $node ): array {
        $rawContent = $this->extractTagContent( $dom, $node );
        $attributes = $this->extractTagAttributes( $node );

        return [
                'raw-content' => $rawContent,
                'attr'        => $attributes,
        ];
    }
139
140
    /**
     * Collect the attributes of a node into a name => value map.
     *
     * Ex:
     * <p align=center style="font-size: 12px;">some text</p>
     *
     * yields [ 'align' => 'center', 'style' => 'font-size: 12px;' ]
     *
     * @param DOMNode $element
     *
     * @return array attribute values keyed by attribute name; empty when the node has no attributes
     */
    protected function extractTagAttributes( DOMNode $element ): array {
        if ( !$element->hasAttributes() ) {
            return [];
        }

        $collected = [];
        foreach ( $element->attributes as $attribute ) {
            $collected[ $attribute->nodeName ] = $attribute->nodeValue;
        }

        return $collected;
    }
164
165
    /**
     * Serialize the children of a node into a single string.
     *
     * Each child is serialized with DOMDocument::saveXML(), passed through Strings::fixNonWellFormedXml()
     * and Emoji::toEntity(), and the pieces are joined in document order. Every occurrence of
     * Placeholder::EMPTY_TAG_PLACEHOLDER is then removed from the result.
     *
     * @param DOMDocument $dom     document used to serialize the children
     * @param DOMNode     $element node whose children are serialized
     *
     * @return string the joined content; an empty string when the node has no children
     */
    protected function extractTagContent( DOMDocument $dom, DOMNode $element ): string {
        $fragments = [];

        if ( $element->hasChildNodes() ) {
            foreach ( $element->childNodes as $child ) {
                $serialized  = $dom->saveXML( $child );
                $fragments[] = Emoji::toEntity( Strings::fixNonWellFormedXml( $serialized ) );
            }
        }

        return str_replace( Placeholder::EMPTY_TAG_PLACEHOLDER, '', implode( '', $fragments ) );
    }
185
186
    /**
     * Used to extract <seg-source> and <seg-target>
     *
     * The node content is split at every opening <mrk> tag. One entry is produced per <mrk>, holding:
     *  - 'mid'           the value of the mid attribute, or the zero-based position of the mark when none is found
     *  - 'ext-prec-tags' whatever precedes the first <mrk> (first entry only, empty string for the others)
     *  - 'raw-content'   the text between the opening tag and the first </mrk>
     *  - 'ext-succ-tags' the text between the first and the second </mrk>, or up to the end of the piece
     *
     * @param DOMDocument $dom
     * @param DOMElement  $childNode
     *
     * @return array list of entries as described above; empty when the content holds no <mrk>
     */
    protected function extractContentWithMarksAndExtTags( DOMDocument $dom, DOMElement $childNode ): array {
        $segments = [];

        // example:
        // <g id="1"><mrk mid="0" mtype="seg">An English string with g tags</mrk></g>
        $raw    = $this->extractTagContent( $dom, $childNode );
        $pieces = preg_split( '#<mrk\s#si', $raw, -1 );

        // $pieces[ 0 ] is the text before the first mark; every following piece is one mark without its '<mrk ' prefix
        for ( $position = 0; isset( $pieces[ $position + 1 ] ); $position++ ) {
            $piece = $pieces[ $position + 1 ];

            $midMatch = [];
            preg_match( '|mid\s?=\s?["\'](.*?)["\']|si', $piece, $midMatch );

            // in a Trados file the whitespace following </mrk> is meaningful, so it is kept
            // and appended to the content of the mark (the last match wins)
            $trailingSpaces = '';
            if ( $this->xliffProprietary === 'trados' ) {
                preg_match_all( '/<\/mrk>[\s]+/iu', $piece, $closingTags );

                if ( !empty( $closingTags[ 0 ] ) ) {
                    foreach ( $closingTags[ 0 ] as $closingTag ) {
                        $trailingSpaces = str_replace( '</mrk>', '', $closingTag );
                    }
                }
            }

            // re-build the mrk tag after the split
            $rebuiltMark = trim( '<mrk ' . $piece );

            // drop the opening tag; at this point we have: ---> 'Test </mrk> </g>>'
            $afterOpeningTag = preg_replace( '#^<mrk\s[^>]+>(.*)#', '$1', $rebuiltMark );
            $aroundClosing   = preg_split( '#</mrk>#si', $afterOpeningTag );

            $rawContent = '';
            if ( isset( $aroundClosing[ 0 ] ) ) {
                $rawContent = $aroundClosing[ 0 ] . $trailingSpaces;
            }

            $segments[] = [
                    'mid'           => $midMatch[ 1 ] ?? $position,
                    'ext-prec-tags' => ( $position == 0 ? $pieces[ 0 ] : "" ),
                    'raw-content'   => $rawContent,
                    'ext-succ-tags' => $aroundClosing[ 1 ] ?? '',
            ];
        }

        return $segments;
    }
242
243
    /**
     * Build the dataRef map: id => raw content.
     *
     * Entries without an 'id' under 'attr' are skipped; when an id occurs more than once the last entry wins.
     *
     * @param array $originalData list of entries shaped like [ 'attr' => [ 'id' => ... ], 'raw-content' => ... ]
     *
     * @return array the 'raw-content' of every entry, keyed by its id
     */
    protected function getDataRefMap( array $originalData ): array {
        $map = [];

        foreach ( $originalData as $entry ) {
            if ( !isset( $entry[ 'attr' ][ 'id' ] ) ) {
                continue;
            }

            $id         = $entry[ 'attr' ][ 'id' ];
            $map[ $id ] = $entry[ 'raw-content' ];
        }

        return $map;
    }
259
260
    /**
     * Tell whether a string contains at least one opening <mrk> tag.
     *
     * The tag name is matched case-insensitively and must be followed by a whitespace character.
     *
     * @param $raw
     *
     * @return bool
     */
    protected function stringContainsMarks( $raw ): bool {
        return preg_match( '#<mrk\s#si', $raw ) === 1;
    }
270
271
    /**
     * Wrap a note value either as JSON or as raw content.
     *
     * @param      $noteValue
     * @param bool $escapeStrings forwarded to Strings::fixNonWellFormedXml(); only honoured when the value
     *                            is a double escaped entity, otherwise it is forced to true
     *
     * @return array [ 'json' => ... ] when the value is JSON, [ 'raw-content' => ... ] otherwise
     * @throws Exception
     */
    protected function JSONOrRawContentArray( $noteValue, ?bool $escapeStrings = true ): array {
        if ( !Strings::isADoubleEscapedEntity( $noteValue ) ) {
            // for non escaped entities $escapeStrings is always true for security reasons
            $escapeStrings = true;
        } else {
            //
            // convert double escaped entities
            //
            // Example:
            //
            // &amp;#39;   ---> &#39;
            // &amp;amp;   ---> &amp;
            // &amp;apos;  ---> &apos;
            //
            $noteValue = Strings::htmlspecialchars_decode( $noteValue, true );
        }

        return Strings::isJSON( $noteValue )
                ? [ 'json' => Strings::cleanCDATA( $noteValue ) ]
                : [ 'raw-content' => Strings::fixNonWellFormedXml( $noteValue, $escapeStrings ) ];
    }
301
}
302