XliffParser::xliffToArray()   A
last analyzed

Complexity

Conditions 4
Paths 8

Size

Total Lines 31
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 19
nc 8
nop 2
dl 0
loc 31
rs 9.6333
c 0
b 0
f 0
1
<?php
2
3
namespace Matecat\XliffParser;
4
5
use Exception;
6
use Matecat\XliffParser\Constants\Placeholder;
7
use Matecat\XliffParser\Constants\XliffTags;
8
use Matecat\XliffParser\Exception\NotSupportedVersionException;
9
use Matecat\XliffParser\Exception\NotValidFileException;
10
use Matecat\XliffParser\Utils\Strings;
11
use Matecat\XliffParser\XliffParser\XliffParserFactory;
12
use Matecat\XliffParser\XliffReplacer\XliffReplacerCallbackInterface;
13
use Matecat\XliffParser\XliffReplacer\XliffReplacerFactory;
14
use Matecat\XliffParser\XliffUtils\XliffProprietaryDetect;
15
use Matecat\XliffParser\XliffUtils\XliffVersionDetector;
16
use Matecat\XmlParser\Config;
17
use Matecat\XmlParser\Exception\InvalidXmlException;
18
use Matecat\XmlParser\Exception\XmlParsingException;
19
use Matecat\XmlParser\XmlDomLoader;
20
use Psr\Log\LoggerInterface;
21
22
class XliffParser {
    /**
     * Optional PSR-3 logger, forwarded to the parser/replacer factories.
     *
     * @var ?LoggerInterface
     */
    private ?LoggerInterface $logger;

    /**
     * XliffParser constructor.
     *
     * @param ?LoggerInterface $logger
     */
    public function __construct( ?LoggerInterface $logger = null ) {
        $this->logger = $logger;
    }

    /**
     * Replace the translation in a xliff file.
     *
     * The work is delegated to the replacer returned by XliffReplacerFactory.
     * Any Exception raised while building or running the replacer is NOT
     * propagated: it is reported to the logger (when one was injected) and
     * the method returns normally.
     *
     * @param string                              $originalXliffPath
     * @param array                               $data
     * @param array                               $transUnits
     * @param string                              $targetLang
     * @param string                              $outputFile
     * @param bool                                $setSourceInTarget
     * @param XliffReplacerCallbackInterface|null $callback
     */
    public function replaceTranslation( string $originalXliffPath, array $data, array $transUnits, string $targetLang, string $outputFile, bool $setSourceInTarget = false, ?XliffReplacerCallbackInterface $callback = null ) {
        try {
            $parser = XliffReplacerFactory::getInstance( $originalXliffPath, $data, $transUnits, $targetLang, $outputFile, $setSourceInTarget, $this->logger, $callback );
            $parser->replaceTranslation();
        } catch ( Exception $exception ) {
            // Deliberately best-effort: never rethrow, but do not lose the failure either.
            if ( $this->logger !== null ) {
                $this->logger->error(
                        'Xliff translation replacement failed: ' . $exception->getMessage(),
                        [ 'exception' => $exception ]
                );
            }
        }
    }

    /**
     * Parse an xliff file to array.
     *
     * Pre-processing applied to the raw content before it is loaded as a DOM:
     *  1. conversion to UTF-8 (a parser warning is recorded when a conversion happens);
     *  2. version 1 only: <internal-file> payloads are cut out and stored in the result;
     *  3. version 2 only: the content of <data> elements is escaped with placeholders;
     *  4. a placeholder is inserted in empty tags, but only when $collapseEmptyTags
     *     is exactly false (true and null both skip this step).
     *
     * @param string $xliffContent
     * @param bool   $collapseEmptyTags
     *
     * @return array
     * @throws NotSupportedVersionException
     * @throws NotValidFileException
     * @throws InvalidXmlException
     * @throws XmlParsingException
     */
    public function xliffToArray( string $xliffContent, ?bool $collapseEmptyTags = false ): array {
        $xliff        = [];
        $xliffContent = self::forceUft8Encoding( $xliffContent, $xliff );
        $xliffVersion = XliffVersionDetector::detect( $xliffContent );
        $info         = XliffProprietaryDetect::getInfoFromXliffContent( $xliffContent );

        if ( $xliffVersion === 1 ) {
            $xliffContent = self::removeInternalFileTagFromContent( $xliffContent, $xliff );
        }

        if ( $xliffVersion === 2 ) {
            $xliffContent = self::escapeDataInOriginalMap( $xliffContent );
        }

        if ( $collapseEmptyTags === false ) {
            $xliffContent = self::insertPlaceholderInEmptyTags( $xliffContent );
        }

        $xliffProprietary = $info[ 'proprietary_short_name' ] ?? null;
        $parser           = XliffParserFactory::getInstance( $xliffVersion, $xliffProprietary, $this->logger );

        $dom = XmlDomLoader::load(
                $xliffContent,
                new Config(
                        null,
                        false,
                        LIBXML_NONET | LIBXML_PARSEHUGE
                )
        );

        return $parser->parse( $dom, $xliff );
    }

    /**
     * Pre-Processing.
     * Converts the content to UTF-8 when mb_detect_encoding() reports anything else.
     *
     * - Detected as UTF-8: the content is returned untouched, no warning is recorded.
     * - Detection failed: a warning is recorded and the content is returned untouched
     *   (there is no source encoding to convert from).
     * - Any other encoding: a warning is recorded and the content is converted with
     *   iconv(); if the conversion fails an empty string is returned.
     *
     * @param string $xliffContent
     * @param array  $xliff result array; warnings are appended to its 'parser-warnings' key
     *
     * @return string
     */
    private static function forceUft8Encoding( string $xliffContent, array &$xliff ): string {
        $enc = mb_detect_encoding( $xliffContent );

        if ( $enc === 'UTF-8' ) {
            return $xliffContent;
        }

        if ( $enc === false ) {
            // iconv() needs a real source encoding; passing false would only trigger a warning.
            $xliff[ 'parser-warnings' ][] = "Input encoding could not be detected, content left unconverted";

            return $xliffContent;
        }

        $xliff[ 'parser-warnings' ][] = "Input identified as $enc ans converted UTF-8. May not be a problem if the content is English only";
        $converted                    = iconv( $enc, 'UTF-8', $xliffContent );

        return $converted !== false ? $converted : "";
    }

    /**
     * Remove <internal-file> heading tag from xliff content.
     * This allows to parse xliff files with a very large <internal-file>
     * (only for Xliff 1.0).
     *
     * The payload of every removed node is stored under
     * $xliff['files'][<index>]['reference'][].
     *
     * @param string $xliffContent
     * @param array  $xliff
     *
     * @return mixed|string the content without the <internal-file> nodes
     */
    private static function removeInternalFileTagFromContent( string $xliffContent, array &$xliff ) {
        $index      = 1;
        $a          = Strings::preg_split( '|<internal-file[\s>]|i', $xliffContent );
        $tagMatches = count( $a );

        // no match, return original string
        if ( $tagMatches === 1 ) {
            return $a[ 0 ];
        }

        // $b[0] is the payload, $b[1] whatever follows the closing tag.
        // An unterminated node has no $b[1]: treat the trailing part as empty.
        $b                                           = Strings::preg_split( '|</internal-file>|i', $a[ 1 ] );
        $strippedContent                             = $a[ 0 ] . ( $b[ 1 ] ?? '' );
        $xliff[ 'files' ][ $index ][ 'reference' ][] = self::extractBase64( $b[ 0 ] );
        $index++;

        // Sometimes, sdlxliff files can contain more than 2 <internal-file> nodes.
        // In this case loop and extract any other extra <internal-file> node.
        // NOTE(review): $index is not advanced inside the loop, so every extra node
        // is stored under index 2 - confirm this is the intended layout.
        for ( $i = 2; $i < $tagMatches; $i++ ) {
            if ( isset( $a[ $i ] ) ) {
                $c                                           = Strings::preg_split( '|</internal-file[\s>]|i', $a[ $i ] );
                $strippedContent                             .= $c[ 1 ] ?? '';
                $xliff[ 'files' ][ $index ][ 'reference' ][] = self::extractBase64( $c[ 0 ] );
            }
        }

        return $strippedContent;
    }

    /**
     * Build the 'reference' entry for an <internal-file> payload:
     * the leading form="base64"> attribute residue is removed and the rest is trimmed.
     *
     * @param string $base64
     *
     * @return array
     */
    private static function extractBase64( string $base64 ): array {
        return [
                'form-type' => 'base64',
                'base64'    => trim( str_replace( 'form="base64">', '', $base64 ) ),
        ];
    }

    /**
     * This function replaces:
     *
     * - spaces (see replaceSpace())
     * - xliff tags (see XliffTags::$tags for the full list)
     *
     * with placeholders in the <original-data> map to preserve them as they are.
     *
     * XliffParserV2::extractTransUnitOriginalData function will restore them
     *
     * (only for Xliff 2.0)
     *
     * The pattern has no "s" modifier, so only <data> elements that open and
     * close on the same line are processed.
     *
     * @param string $xliffContent
     *
     * @return string
     */
    private static function escapeDataInOriginalMap( string $xliffContent ): string {
        // No "U" modifier here: PCRE_UNGREEDY inverts "?" and would turn ".*?" greedy,
        // so that only the last <data> element of a line would be escaped.
        $pattern = '|<data(.*?)>(.*?)</data>|i';

        // preg_replace_callback() returns null on a PCRE error: keep the previous content then.
        $xliffContent = preg_replace_callback( $pattern, [ self::class, 'replaceSpace' ], $xliffContent ) ?? $xliffContent;
        $xliffContent = preg_replace_callback( $pattern, [ self::class, 'replaceXliffTags' ], $xliffContent ) ?? $xliffContent;

        return $xliffContent;
    }

    /**
     * Insert a placeholder inside empty tags
     * in order to prevent they are collapsed by parser
     *
     * Example:
     *
     * <pc id="12" dataRefStart="d1"></pc> ---> <pc id="12" dataRefStart="d1">###___EMPTY_TAG_PLACEHOLDER___###</pc>
     *
     * AbstractXliffParser::extractTagContent() will cut out ###___EMPTY_TAG_PLACEHOLDER___### to restore original empty tags
     *
     * @param string $xliffContent
     *
     * @return string
     */
    private static function insertPlaceholderInEmptyTags( string $xliffContent ): string {
        $filled = preg_replace_callback(
                '|<([a-zA-Z0-9._-]+)[^>]*></\1>|m',
                static function ( array $match ): string {
                    $closingTag = '</' . $match[ 1 ] . '>';

                    return str_replace( $closingTag, Placeholder::EMPTY_TAG_PLACEHOLDER . $closingTag, $match[ 0 ] );
                },
                $xliffContent
        );

        // null means a PCRE error: leave the content untouched
        return $filled ?? $xliffContent;
    }

    /**
     * Replace <data> value: blanks become placeholders.
     *
     * @param array $matches [1] => attributes of the <data> tag, [2] => its content
     *
     * @return string the rebuilt <data> element
     */
    private static function replaceSpace( array $matches ): string {
        $content = str_replace( ' ', Placeholder::WHITE_SPACE_PLACEHOLDER, $matches[ 2 ] );
        // Single-quoted strings: these two calls target the literal two-character
        // sequences backslash+n and backslash+t, not the control characters.
        // NOTE(review): confirm this is what the restore step expects.
        $content = str_replace( '\n', Placeholder::NEW_LINE_PLACEHOLDER, $content );
        $content = str_replace( '\t', Placeholder::TAB_PLACEHOLDER, $content );

        return '<data' . $matches[ 1 ] . '>' . $content . '</data>';
    }

    /**
     * Replace the &lt; and &gt; entities that delimit a known xliff tag
     * (opening or closing) inside a <data> value with placeholders.
     *
     * @param array $matches [1] => attributes of the <data> tag, [2] => its content
     *
     * @return string the rebuilt <data> element
     */
    private static function replaceXliffTags( array $matches ): string {
        $xliffTags = XliffTags::$tags;
        $content   = $matches[ 2 ];

        foreach ( $xliffTags as $xliffTag ) {
            // preg_replace() returns null on a PCRE error: keep the content as it was.
            $content = preg_replace( '|&lt;(' . $xliffTag . '.*?)&gt;|si', Placeholder::LT_PLACEHOLDER . "$1" . Placeholder::GT_PLACEHOLDER, $content ) ?? $content;
            $content = preg_replace( '|&lt;(/' . $xliffTag . ')&gt;|si', Placeholder::LT_PLACEHOLDER . "$1" . Placeholder::GT_PLACEHOLDER, $content ) ?? $content;
        }

        return '<data' . $matches[ 1 ] . '>' . $content . '</data>';
    }
}
258