Passed
Pull Request — master (#87)
by Mauro
08:35
created

AbstractXliffParser::extractTuFromNode()   B

Complexity

Conditions 7
Paths 10

Size

Total Lines 17
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 10
nc 10
nop 7
dl 0
loc 17
rs 8.8333
c 0
b 0
f 0
1
<?php
2
3
namespace Matecat\XliffParser\XliffParser;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMNode;
8
use Matecat\XliffParser\Constants\Placeholder;
9
use Matecat\XliffParser\Utils\Emoji;
10
use Matecat\XliffParser\Utils\Strings;
11
use Matecat\XliffParser\XliffUtils\DataRefReplacer;
12
use Psr\Log\LoggerInterface;
13
14
abstract class AbstractXliffParser {
    /**
     * Optional logger; null when no logger was injected.
     *
     * @var LoggerInterface|null
     */
    protected $logger;

    /**
     * Name of the proprietary XLIFF flavour (the code checks for 'trados'), or null.
     *
     * @var string|null
     */
    protected $xliffProprietary;

    /**
     * XLIFF major version; 1 selects 'trans-unit', any other value selects 'unit'.
     *
     * @var int
     */
    protected $xliffVersion;

    /**
     * AbstractXliffParser constructor.
     *
     * @param int                  $xliffVersion
     * @param string|null          $xliffProprietary
     * @param LoggerInterface|null $logger
     */
    public function __construct( $xliffVersion, $xliffProprietary = null, LoggerInterface $logger = null ) {
        $this->xliffVersion     = $xliffVersion;
        $this->logger           = $logger;
        $this->xliffProprietary = $xliffProprietary;
    }

    /**
     * Name of the tag holding a translation unit for the configured XLIFF version.
     *
     * @return string 'trans-unit' when the version is strictly the integer 1, 'unit' otherwise
     */
    protected function getTuTagName() {
        return ( $this->xliffVersion === 1 ) ? 'trans-unit' : 'unit';
    }

    /**
     * Parse the document and return the populated output array.
     *
     * @param DOMDocument $dom
     * @param array       $output
     *
     * @return array
     */
    abstract public function parse( DOMDocument $dom, $output = [] );

    /**
     * Extract trans-unit content from the current node.
     *
     * A translation unit node is forwarded to extractTransUnit(). A 'group' node is walked
     * recursively: the context-groups declared directly under the group are appended to the
     * inherited ones and handed down to every nested group and translation unit.
     * Any other node is ignored.
     *
     * @param DOMNode     $childNode
     * @param array       $transUnitIdArrayForUniquenessCheck
     * @param DOMDocument $dom
     * @param array       $output
     * @param int         $i
     * @param int         $j
     * @param array       $contextGroups context-group nodes inherited from the enclosing groups
     */
    protected function extractTuFromNode( $childNode, &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, &$output, &$i, &$j, $contextGroups = [] ) {
        $tuTagName = $this->getTuTagName();

        if ( $childNode->nodeName === $tuTagName ) {
            $this->extractTransUnit( $childNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );

            return;
        }

        if ( $childNode->nodeName !== 'group' ) {
            return;
        }

        // Collect ONLY the context-groups that are direct children of this group.
        // getElementsByTagName() would return every descendant: the context-groups of nested
        // groups would then be added a second time while recursing, and the ones declared
        // inside a single translation unit would leak to all the other units of the group.
        // This is done in a separate pass because a context-group may follow the units it applies to.
        foreach ( $childNode->childNodes as $nestedChildNode ) {
            if ( $nestedChildNode->nodeName === 'context-group' ) {
                $contextGroups[] = $nestedChildNode;
            }
        }

        foreach ( $childNode->childNodes as $nestedChildNode ) {
            if ( $nestedChildNode->nodeName === 'group' ) {
                $this->extractTuFromNode( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
            } elseif ( $nestedChildNode->nodeName === $tuTagName ) {
                $this->extractTransUnit( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
            }
        }
    }

    /**
     * Extract and populate 'trans-units' array
     *
     * @param $transUnit
     * @param $transUnitIdArrayForUniquenessCheck
     * @param $dom
     * @param $output
     * @param $i
     * @param $j
     * @param $contextGroups
     *
     * @return mixed
     */
    abstract protected function extractTransUnit( $transUnit, &$transUnitIdArrayForUniquenessCheck, $dom, &$output, &$i, &$j, $contextGroups = [] );

    /**
     * Build the standard content structure of a node: its serialized inner content
     * plus its attributes.
     *
     * @param DOMDocument $dom
     * @param DOMNode     $node
     *
     * @return array with the keys 'raw-content' (string) and 'attr' (array)
     */
    protected function extractContent( DOMDocument $dom, DOMNode $node ) {
        return [
                'raw-content' => $this->extractTagContent( $dom, $node ),
                'attr'        => $this->extractTagAttributes( $node )
        ];
    }

    /**
     * Extract attributes if they are present
     *
     * Ex:
     * <p align=center style="font-size: 12px;">some text</p>
     *
     * $attr->nodeName == 'align' :: $attr->nodeValue == 'center'
     * $attr->nodeName == 'style' :: $attr->nodeValue == 'font-size: 12px;'
     *
     * @param DOMNode $element
     *
     * @return array attribute name => attribute value; empty when the node has no attributes
     */
    protected function extractTagAttributes( DOMNode $element ) {
        $tagAttributes = [];

        if ( $element->hasAttributes() ) {
            foreach ( $element->attributes as $attr ) {
                $tagAttributes[ $attr->nodeName ] = $attr->nodeValue;
            }
        }

        return $tagAttributes;
    }

    /**
     * Extract tag content from DOMDocument node
     *
     * Every child node is serialized with DOMDocument::saveXML(), passed through
     * Strings::fixNonWellFormedXml() and Emoji::toEntity(), and concatenated.
     * Occurrences of Placeholder::EMPTY_TAG_PLACEHOLDER are removed from the result.
     *
     * @param DOMDocument $dom
     * @param DOMNode     $element
     *
     * @return string empty string when the node has no children
     */
    protected function extractTagContent( DOMDocument $dom, DOMNode $element ) {
        $extractedContent = '';

        if ( $element->hasChildNodes() ) {
            foreach ( $element->childNodes as $node ) {
                $extractedContent .= Emoji::toEntity( Strings::fixNonWellFormedXml( $dom->saveXML( $node ) ) );
            }
        }

        return str_replace( Placeholder::EMPTY_TAG_PLACEHOLDER, '', $extractedContent );
    }

    /**
     * Used to extract <seg-source> and <seg-target>
     *
     * The inner content of the node is split on every opening <mrk> tag; one entry is
     * returned per <mrk>, each with the keys:
     *  - 'mid'              the value of the first mid="..." found in the segment, or the 0-based position
     *  - 'ext-prec-tags'    the markup preceding the first <mrk> (first entry only, '' otherwise)
     *  - 'raw-content'      the content of the <mrk>
     *  - 'ext-succ-tags'    the markup following the closing </mrk>
     *  - 'replaced-content' only when $originalData is not empty: the <mrk> content processed by DataRefReplacer
     *
     * @param DOMDocument $dom
     * @param DOMElement  $childNode
     * @param string      $originalRawContent not used by this method; kept only to preserve the signature for existing callers
     * @param array       $originalData
     *
     * @return array empty when the content holds no <mrk> tag
     */
    protected function extractContentWithMarksAndExtTags( DOMDocument $dom, DOMElement $childNode, $originalRawContent, array $originalData = [] ) {
        $source = [];

        // example:
        // <g id="1"><mrk mid="0" mtype="seg">An English string with g tags</mrk></g>
        $raw = $this->extractTagContent( $dom, $childNode );

        $markers = preg_split( '#<mrk\s#si', $raw, -1 );

        // the dataRef map only depends on $originalData: build it once instead of once per <mrk>
        $hasOriginalData = !empty( $originalData );
        $dataRefMap      = $hasOriginalData ? $this->getDataRefMap( $originalData ) : [];

        // $markers[ 0 ] is whatever precedes the first <mrk>; every following item is one <mrk>
        for ( $mi = 0; isset( $markers[ $mi + 1 ] ); $mi++ ) {
            $marker = $markers[ $mi + 1 ];

            preg_match( '|mid\s?=\s?["\'](.*?)["\']|si', $marker, $mid );

            // if it's a Trados file the trailing spaces after </mrk> are meaningful
            // so we add them to the raw content
            $trailingSpaces = '';
            if ( $this->xliffProprietary === 'trados' ) {
                preg_match_all( '/<\/mrk>[\s]+/iu', $marker, $trailingSpacesMatches );

                if ( !empty( $trailingSpacesMatches[ 0 ] ) ) {
                    // when several closing tags are followed by spaces the last one wins;
                    // the tag is matched case-insensitively, so it is stripped case-insensitively too
                    $trailingSpaces = str_ireplace( '</mrk>', '', end( $trailingSpacesMatches[ 0 ] ) );
                }
            }

            //re-build the mrk tag after the split
            $originalMark = trim( '<mrk ' . $marker );

            $mark_string  = preg_replace( '#^<mrk\s[^>]+>(.*)#', '$1', $originalMark ); // at this point we have: ---> 'Test </mrk> </g>>'
            $mark_content = preg_split( '#</mrk>#si', $mark_string );

            $sourceArray = [
                    'mid'           => ( isset( $mid[ 1 ] ) ) ? $mid[ 1 ] : $mi,
                    'ext-prec-tags' => ( $mi === 0 ? $markers[ 0 ] : "" ),
                    'raw-content'   => ( isset( $mark_content[ 0 ] ) ) ? $mark_content[ 0 ] . $trailingSpaces : '',
                    'ext-succ-tags' => ( isset( $mark_content[ 1 ] ) ) ? $mark_content[ 1 ] : '',
            ];

            if ( $hasOriginalData ) {
                $sourceArray[ 'replaced-content' ] = ( new DataRefReplacer( $dataRefMap ) )->replace( $mark_content[ 0 ] );
            }

            $source[] = $sourceArray;
        }

        return $source;
    }

    /**
     * Index the raw content of the original data items by their 'id' attribute.
     * Items without an 'id' attribute are skipped.
     *
     * @param array $originalData items shaped like [ 'attr' => [ 'id' => ... ], 'raw-content' => ... ]
     *
     * @return array id => raw-content
     */
    protected function getDataRefMap( $originalData ) {
        $dataRefMap = [];

        foreach ( $originalData as $datum ) {
            if ( isset( $datum[ 'attr' ][ 'id' ] ) ) {
                $dataRefMap[ $datum[ 'attr' ][ 'id' ] ] = $datum[ 'raw-content' ];
            }
        }

        return $dataRefMap;
    }

    /**
     * Tell whether the string holds at least one opening <mrk> tag
     * (case-insensitive, the tag name must be followed by a whitespace).
     *
     * @param $raw
     *
     * @return bool
     */
    protected function stringContainsMarks( $raw ) {
        // a single match is enough: no need to split the whole string
        return preg_match( '#<mrk\s#si', $raw ) === 1;
    }

    /**
     * Wrap a note value either as JSON or as raw content.
     *
     * @param      $noteValue
     * @param bool $escapeStrings honoured only for double escaped values, forced to true otherwise
     *
     * @return array [ 'json' => ... ] when the value is JSON, [ 'raw-content' => ... ] otherwise
     * @throws \Exception
     */
    protected function JSONOrRawContentArray( $noteValue, $escapeStrings = true ) {
        //
        // convert double escaped entities
        //
        // Example:
        //
        // &amp;#39;  ---> &#39;
        // &amp;amp;  ---> &amp;
        // &amp;apos; ---> &apos;
        //
        if ( Strings::isADoubleEscapedEntity( $noteValue ) ) {
            $noteValue = Strings::htmlspecialchars_decode( $noteValue, true );
        } else {
            // for non escaped entities $escapeStrings is always true for security reasons
            $escapeStrings = true;
        }

        if ( Strings::isJSON( $noteValue ) ) {
            return [ 'json' => Strings::cleanCDATA( $noteValue ) ];
        }

        return [ 'raw-content' => Strings::fixNonWellFormedXml( $noteValue, $escapeStrings ) ];
    }
}
285