1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Matecat\XliffParser\XliffReplacer; |
4
|
|
|
|
5
|
|
|
use Psr\Log\LoggerInterface; |
6
|
|
|
use RuntimeException; |
7
|
|
|
|
8
|
|
|
abstract class AbstractXliffReplacer { |
9
|
|
|
protected $originalFP; |
10
|
|
|
|
11
|
|
|
protected $tuTagName; // <trans-unit> (forXliff v 1.*) or <unit> (forXliff v 2.*) |
12
|
|
|
protected $inTU = false; // flag to check whether we are in a <trans-unit> |
13
|
|
|
protected $inTarget = false; // flag to check whether we are in a <target>, to ignore everything |
14
|
|
|
protected $isEmpty = false; // flag to check whether we are in an empty tag (<tag/>) |
15
|
|
|
protected $targetWasWritten = false; // flag to check is <target> was written in the current unit |
16
|
|
|
protected $segmentPositionInTu = -1; // the current position of segment in the current <unit> (forXliff v 2.*) |
17
|
|
|
|
18
|
|
|
protected $CDATABuffer = ""; // buffer for special tag |
19
|
|
|
protected $bufferIsActive = false; // buffer for special tag |
20
|
|
|
|
21
|
|
|
protected $offset = 0; // offset for SAX pointer |
22
|
|
|
protected $outputFP; // output stream pointer |
23
|
|
|
protected $currentBuffer; // the current piece of text it's been parsed |
24
|
|
|
protected $len; // length of the currentBuffer |
25
|
|
|
protected $segments; // array of translations |
26
|
|
|
protected $lastTransUnit = []; |
27
|
|
|
protected int $segmentInUnitPosition = 0; |
28
|
|
|
protected $currentTransUnitId; // id of current <trans-unit> |
29
|
|
|
protected $currentTransUnitIsTranslatable; // 'translate' attribute of current <trans-unit> |
30
|
|
|
protected $unitContainsMda = false; // check if <unit> already contains a <mda:metadata> (forXliff v 2.*) |
31
|
|
|
protected $hasWrittenCounts = false; // check if <unit> already wrote segment counts (forXliff v 2.*) |
32
|
|
|
|
33
|
|
|
protected $targetLang; |
34
|
|
|
|
35
|
|
|
protected $sourceInTarget; |
36
|
|
|
|
37
|
|
|
protected $transUnits; |
38
|
|
|
|
39
|
|
|
/** @var int */ |
40
|
|
|
protected $xliffVersion; |
41
|
|
|
|
42
|
|
|
protected $callback; |
43
|
|
|
|
44
|
|
|
protected $logger; |
45
|
|
|
|
46
|
|
|
protected static $INTERNAL_TAG_PLACEHOLDER; |
47
|
|
|
|
48
|
|
|
protected $counts = [ |
49
|
|
|
'raw_word_count' => 0, |
50
|
|
|
'eq_word_count' => 0, |
51
|
|
|
]; |
52
|
|
|
|
53
|
|
|
/**
 * AbstractXliffReplacer constructor.
 *
 * Generates the entity placeholder, makes sure the output file exists, opens the
 * input and output streams and stores the replacement settings on the instance.
 *
 * Note: the placeholder is kept in a static property, so it is regenerated (and
 * shared by every instance) each time a replacer is constructed.
 *
 * @param string                              $originalXliffPath path of the XLIFF file to read
 * @param int                                 $xliffVersion      XLIFF version; exactly 2 selects <unit>, anything else <trans-unit>
 * @param array                               $segments          array of translations
 * @param array                               $transUnits        map of trans-unit id => list of keys into $segments
 * @param string                              $trgLang           target language
 * @param string                              $outputFilePath    path of the file to write
 * @param bool                                $setSourceInTarget stored as-is in $this->sourceInTarget
 * @param LoggerInterface|null                $logger            optional logger
 * @param XliffReplacerCallbackInterface|null $callback          optional callback
 *
 * @throws RuntimeException if the original XLIFF cannot be opened
 */
public function __construct(
        $originalXliffPath,
        $xliffVersion,
        $segments,
        $transUnits,
        $trgLang,
        $outputFilePath,
        $setSourceInTarget,
        ?LoggerInterface $logger = null,
        ?XliffReplacerCallbackInterface $callback = null
) {
    self::$INTERNAL_TAG_PLACEHOLDER = $this->getInternalTagPlaceholder();
    $this->createOutputFileIfDoesNotExist( $outputFilePath );
    $this->setFileDescriptors( $originalXliffPath, $outputFilePath );

    // the version must be stored before the tag name is derived from it
    $this->xliffVersion = $xliffVersion;
    $this->setTuTagName();

    $this->segments       = $segments;
    $this->targetLang     = $trgLang;
    $this->sourceInTarget = $setSourceInTarget;
    $this->transUnits     = $transUnits;
    $this->logger         = $logger;
    $this->callback       = $callback;
}
89
|
|
|
|
90
|
|
|
/**
 * Stream the original XLIFF through the SAX parser and write the result to the output stream.
 *
 * The input is read in 4096-byte chunks. Before a chunk is parsed, every `&...;`
 * sequence is rewritten to PLACEHOLDER + name + PLACEHOLDER (the parser would
 * otherwise decode entities), and any `&` left over is rewritten as an escaped
 * `amp`. postProcAndFlush() turns the placeholders back into entities on output.
 *
 * @return void
 *
 * @throws RuntimeException when the parser reports an XML error
 */
public function replaceTranslation() {
    fwrite( $this->outputFP, '<?xml version="1.0" encoding="UTF-8"?>' );

    //create Sax parser
    $xmlParser = $this->initSaxParser();

    $entityPattern     = "/&(.*?);/";
    $entityReplacement = self::$INTERNAL_TAG_PLACEHOLDER . '$1' . self::$INTERNAL_TAG_PLACEHOLDER;
    $escapedAmpersand  = self::$INTERNAL_TAG_PLACEHOLDER . 'amp' . self::$INTERNAL_TAG_PLACEHOLDER;

    try {
        while ( true ) {
            $this->currentBuffer = fread( $this->originalFP, 4096 );

            // explicit comparison: a chunk consisting of the single character "0" is
            // falsy and must not terminate the loop
            if ( $this->currentBuffer === false || $this->currentBuffer === '' ) {
                break;
            }

            // obfuscate entities because sax automatically decodes them
            $obfuscated = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );

            // Avoid cutting entities in half: the read may have truncated an entity
            // (say, '&l' out of '&lt;'). 9 is taken as the maximum length of an entity,
            // so while an unmatched '&' sits within the last 9 bytes, fetch 9 more
            // bytes and obfuscate again. An '&' followed by more than 9 bytes cannot
            // be a truncated entity, so the loop stops there.
            while ( true ) {
                $ampPos = strpos( $obfuscated, '&' );
                if ( $ampPos === false || strlen( $obfuscated ) - $ampPos > 9 ) {
                    break;
                }

                $extraBytes = fread( $this->originalFP, 9 );
                if ( $extraBytes === false || $extraBytes === '' ) {
                    // end of input: nothing more can complete the entity; without
                    // this exit the loop would never terminate
                    break;
                }

                $this->currentBuffer .= $extraBytes;
                $obfuscated          = preg_replace( $entityPattern, $entityReplacement, $this->currentBuffer );
            }

            // $obfuscated is always the obfuscated form of the current buffer here;
            // whatever '&' is still present is not an entity and gets escaped
            $this->currentBuffer = str_replace( "&", $escapedAmpersand, $obfuscated );
            unset( $obfuscated );

            //get length of chunk
            $this->len = strlen( $this->currentBuffer );

            //parse chunk of text
            if ( !xml_parse( $xmlParser, $this->currentBuffer, feof( $this->originalFP ) ) ) {
                //if unable, raise an exception
                throw new RuntimeException( sprintf(
                        "XML error: %s at line %d",
                        xml_error_string( xml_get_error_code( $xmlParser ) ),
                        xml_get_current_line_number( $xmlParser )
                ) );
            }

            // accumulate the number of bytes handed to the parser so far: it lets
            // getLastCharacter() translate the parser's global byte index into an
            // index local to the current buffer
            $this->offset += $this->len;
        }
    } finally {
        // close Sax parser, also when parsing failed
        $this->closeSaxParser( $xmlParser );
    }
}
155
|
|
|
|
156
|
|
|
/**
 * Return the character of the current buffer that sits at the parser's current byte position.
 *
 * The parser's byte index is global to the whole stream, so the number of bytes
 * already consumed ($this->offset) is subtracted to obtain an index into the
 * current chunk. When that index is not inside the current chunk (the tag was
 * split across two reads), the last character of the chunk is returned instead.
 *
 * checkForSelfClosedTagAndFlush() compares the result with '/' to detect empty tags.
 *
 * @param resource $parser the SAX parser
 *
 * @return string a single character
 */
protected function getLastCharacter( $parser ) {
    $positionInChunk = xml_get_current_byte_index( $parser ) - $this->offset;

    if ( !isset( $this->currentBuffer[ $positionInChunk ] ) ) {
        // out of the current chunk: fall back to the last character read
        return $this->currentBuffer[ $this->len - 1 ];
    }

    return $this->currentBuffer[ $positionInChunk ];
}
179
|
|
|
|
180
|
|
|
/**
 * Build a random marker used to wrap entity names while the XML is parsed.
 *
 * The result is "§" followed by exactly four characters taken from base64-encoded
 * random bytes with '+', '/' and '=' stripped, i.e. letters and digits only, so
 * the marker can be embedded in a regular expression without quoting.
 *
 * @return string
 *
 * @throws \Exception if no source of randomness is available (see random_bytes())
 */
private function getInternalTagPlaceholder() {
    $token = '';

    // stripping characters can shorten the encoded string, so keep collecting
    // until four usable characters are available
    while ( strlen( $token ) < 4 ) {
        $token .= str_replace( [ '+', '/', '=' ], '', base64_encode( random_bytes( 10 ) ) );
    }

    return "§" . substr( $token, 0, 4 );
}
195
|
|
|
|
196
|
|
|
/**
 * Create an empty output file when nothing exists yet at the given path.
 *
 * @param string $outputFilePath
 */
private function createOutputFileIfDoesNotExist( $outputFilePath ) {
    if ( file_exists( $outputFilePath ) ) {
        return;
    }

    touch( $outputFilePath );
}
202
|
|
|
|
203
|
|
|
/**
 * Open the output stream (read/write, truncated) and the input stream (read only).
 *
 * @param string $originalXliffPath path of the XLIFF file to read
 * @param string $outputFilePath    path of the file to write
 *
 * @throws RuntimeException if either file cannot be opened
 */
private function setFileDescriptors( $originalXliffPath, $outputFilePath ) {
    $outputHandle = fopen( $outputFilePath, 'w+' );
    if ( $outputHandle === false ) {
        // fail here instead of on the first write to an invalid handle
        throw new RuntimeException( "could not open XML output" );
    }
    $this->outputFP = $outputHandle;

    // an explicit, option-less stream context is passed, as before
    $inputHandle = fopen( $originalXliffPath, "r", false, stream_context_create() );
    if ( $inputHandle === false ) {
        throw new RuntimeException( "could not open XML input" );
    }
    $this->originalFP = $inputHandle;
}
216
|
|
|
|
217
|
|
|
/**
 * Choose the name of the translation-unit element from the XLIFF version:
 * 'unit' when the version is strictly the integer 2 (xliff v2.*),
 * 'trans-unit' otherwise (xliff v1.*).
 */
private function setTuTagName() {
    if ( $this->xliffVersion === 2 ) {
        $this->tuTagName = 'unit';

        return;
    }

    $this->tuTagName = 'trans-unit';
}
224
|
|
|
|
225
|
|
|
/**
 * AbstractXliffReplacer destructor: closes the input and output streams.
 *
 * A stream may already have been closed outside the class (to permit multiple
 * concurrent downloads) or may never have been opened. Each handle is therefore
 * checked with is_resource() instead of silencing fclose() with '@', which cannot
 * suppress the TypeError raised for an invalid stream.
 */
public function __destruct() {
    foreach ( [ $this->originalFP, $this->outputFP ] as $stream ) {
        if ( is_resource( $stream ) ) {
            fclose( $stream );
        }
    }
}
234
|
|
|
|
235
|
|
|
/**
 * Init Sax parser
 *
 * Creates a UTF-8 parser with case folding disabled (element names reach the
 * handlers as written) and registers tagOpen(), tagClose() and characterData()
 * of this object as handlers.
 *
 * The handlers are passed as [object, method] callables rather than through
 * xml_set_object() plus method-name strings. Any further handler registered on
 * this parser must be passed the same way, because no object is bound to it.
 *
 * @return resource|\XMLParser a resource before PHP 8.0, an XMLParser instance afterwards
 */
protected function initSaxParser() {
    $xmlSaxParser = xml_parser_create( 'UTF-8' );
    xml_parser_set_option( $xmlSaxParser, XML_OPTION_CASE_FOLDING, false );
    xml_set_element_handler( $xmlSaxParser, [ $this, 'tagOpen' ], [ $this, 'tagClose' ] );
    xml_set_character_data_handler( $xmlSaxParser, [ $this, 'characterData' ] );

    return $xmlSaxParser;
}
249
|
|
|
|
250
|
|
|
/**
 * Release the SAX parser.
 *
 * Only a parser that is a resource (PHP < 8.0) is freed explicitly. From PHP 8.0
 * the parser is an object and xml_parser_free() has no effect, so the call is skipped.
 *
 * @param resource|\XMLParser $xmlSaxParser
 */
protected function closeSaxParser( $xmlSaxParser ) {
    if ( is_resource( $xmlSaxParser ) ) {
        xml_parser_free( $xmlSaxParser );
    }
}
256
|
|
|
|
257
|
|
|
/**
 * Start-element handler, registered on the SAX parser by initSaxParser().
 *
 * @param resource $parser the SAX parser invoking the handler
 * @param string   $name   element name, case preserved (case folding is disabled)
 * @param array    $attr   attributes of the element
 *
 * @return mixed
 */
abstract protected function tagOpen( $parser, $name, $attr );
265
|
|
|
|
266
|
|
|
/**
 * End-element handler, registered on the SAX parser by initSaxParser().
 *
 * @param resource $parser the SAX parser invoking the handler
 * @param string   $name   element name, case preserved (case folding is disabled)
 *
 * @return mixed
 */
abstract protected function tagClose( $parser, $name );
273
|
|
|
|
274
|
|
|
/**
 * Character-data handler, registered on the SAX parser by initSaxParser().
 *
 * While the buffer is active the text is appended to the CDATA buffer; otherwise
 * it is written to the output, unless the parser is inside a <target>, whose
 * original content is dropped.
 *
 * @param resource $parser the SAX parser invoking the handler
 * @param string   $data   the text chunk
 *
 * @return void
 */
protected function characterData( $parser, $data ): void {
    if ( $this->bufferIsActive ) {
        $this->CDATABuffer .= $data;

        return;
    }

    // don't write <target> data
    if ( !$this->inTarget ) {
        $this->postProcAndFlush( $this->outputFP, $data );
    }
}
288
|
|
|
|
289
|
|
|
/**
 * Post-process escaped data and write it to the given stream.
 *
 * Placeholder-wrapped names (PLACEHOLDER + name + PLACEHOLDER) are turned back
 * into `&name;` entities. Unless the data is treated as CDATA, every line break
 * (\r\n, lone \n or lone \r) is written as \r\n.
 *
 * @param resource $fp           stream to write to
 * @param string   $data         text to post-process
 * @param bool     $treatAsCDATA when true, line breaks are left untouched
 */
protected function postProcAndFlush( $fp, $data, $treatAsCDATA = false ) {
    $placeholder = self::$INTERNAL_TAG_PLACEHOLDER;

    // restore the entities obfuscated before parsing
    $data = preg_replace( "/" . $placeholder . '(.*?)' . $placeholder . "/", '&$1;', $data );

    // NOTE(review): the search string below shows up as a single whitespace
    // character in this copy of the file; confirm whether it is meant to be a
    // non-breaking space or a `&nbsp;` entity that was mangled along the way.
    $data = str_replace( ' ', ' ', $data );

    if ( !$treatAsCDATA ) {
        //unix2dos: strtr tries the longest key first and never rescans its own
        //output, so an existing \r\n pair is kept as one line break
        $data = strtr( $data, [ "\r\n" => "\r\n", "\n" => "\r\n", "\r" => "\r\n" ] );
    }

    //flush to disk
    fwrite( $fp, $data );
}
310
|
|
|
|
311
|
|
|
/**
 * Update the unit state when the opening tag is a <trans-unit> (xliff v1.*) or
 * <unit> (xliff v2.*); any other tag is ignored.
 *
 * @param string $name name of the tag being opened
 * @param array  $attr attributes of the tag
 */
protected function handleOpenUnit( string $name, array $attr ) {
    if ( $this->tuTagName !== $name ) {
        return;
    }

    $this->inTU = true;

    // get id
    // trim to first 100 characters because this is the limit on Matecat's DB;
    // a unit without an id yields an empty id instead of an undefined-index read
    $this->currentTransUnitId = substr( $attr[ 'id' ] ?? '', 0, 100 );

    // `translate` attribute can be only yes or no; a missing or empty one counts as 'yes'
    $this->currentTransUnitIsTranslatable = empty( $attr[ 'translate' ] ) ? 'yes' : $attr[ 'translate' ];

    $this->setLastTransUnitSegments();
}
329
|
|
|
|
330
|
|
|
/**
 * Add the word counts of a segment to the running totals and, when the segment
 * has a 'sid', record them under that id in 'segments_count_array'.
 *
 * The equivalent word count is truncated (not rounded) to two decimals.
 * Missing count keys are treated as 0, so calling the method with the default
 * empty array leaves the counts unchanged.
 *
 * @param array $seg segment data; keys read: 'sid', 'raw_word_count', 'eq_word_count'
 */
protected function updateSegmentCounts( array $seg = [] ) {
    $raw_word_count = $seg[ 'raw_word_count' ] ?? 0;
    $eq_word_count  = floor( ( $seg[ 'eq_word_count' ] ?? 0 ) * 100 ) / 100;

    if ( isset( $seg[ 'sid' ] ) ) {
        $this->counts[ 'segments_count_array' ][ $seg[ 'sid' ] ] = [
                'raw_word_count' => $raw_word_count,
                'eq_word_count'  => $eq_word_count,
        ];
    }

    $this->counts[ 'raw_word_count' ] += $raw_word_count;
    $this->counts[ 'eq_word_count' ]  += $eq_word_count;
}
346
|
|
|
|
347
|
|
|
/**
 * Reset the per-segment counts and both word-count totals to their empty values.
 * Any other key stored in $this->counts is left untouched.
 */
protected function resetCounts() {
    $emptyValues = [
            'segments_count_array' => [],
            'raw_word_count'       => 0,
            'eq_word_count'        => 0,
    ];

    foreach ( $emptyValues as $key => $emptyValue ) {
        $this->counts[ $key ] = $emptyValue;
    }
}
352
|
|
|
|
353
|
|
|
/**
 * Close the tag string being rebuilt (with '/>' for an empty tag, '>' otherwise)
 * and send it to the CDATA buffer when it is active, or to the output otherwise.
 *
 * Sets $this->isEmpty according to whether getLastCharacter() returned '/'.
 *
 * @param resource $parser the SAX parser
 * @param string   $tag    the tag rebuilt so far, without its closing '>'
 */
protected function checkForSelfClosedTagAndFlush( $parser, $tag ) {
    //detect empty tag
    $this->isEmpty = ( $this->getLastCharacter( $parser ) == '/' );

    //trim last space, then add the tag ending
    $closedTag = rtrim( $tag ) . ( $this->isEmpty ? '/' : '' ) . ">";

    if ( !$this->bufferIsActive ) {
        $this->postProcAndFlush( $this->outputFP, $closedTag );

        return;
    }

    // the buffer is active: there is content to keep, write the tag in the buffer
    $this->CDATABuffer .= $closedTag;
}
378
|
|
|
|
379
|
|
|
/**
 * A trans-unit can contain a list of segments because of mrk tags.
 * Copy the segments of the current trans-unit into $this->lastTransUnit, so they
 * are still available to the next tagOpen (possible sdl:seg-defs) after the
 * grouping information of the unit is gone.
 *
 * $this->lastTransUnit is always emptied first. An id is copied only when it is a
 * key of $this->segments and it is either the first of the list or exactly one
 * more than the last id copied.
 *
 * @return void
 */
protected function setLastTransUnitSegments() {
    $this->lastTransUnit = [];

    $segmentIds = $this->transUnits[ $this->currentTransUnitId ] ?? null;
    if ( $segmentIds === null ) {
        return;
    }

    $previousId = null;
    $total      = count( $segmentIds );

    for ( $index = 0; $index < $total; $index++ ) {
        $segmentId = $segmentIds[ $index ];

        if ( !isset( $this->segments[ $segmentId ] ) ) {
            continue;
        }

        // after the first position, only ids contiguous with the last copied one are kept
        if ( $index != 0 && $previousId + 1 != $segmentId ) {
            continue;
        }

        $previousId            = $segmentId;
        $this->lastTransUnit[] = $this->segments[ $segmentId ];
    }
}
415
|
|
|
|
416
|
|
|
/**
 * Return the segment stored at the current position ($this->segmentInUnitPosition)
 * of $this->segments.
 *
 * An empty array is returned when the current trans-unit is marked translate="no",
 * when its id is not a key of $this->transUnits, or when no segment exists at the
 * current position (this last case would otherwise return null from a method
 * declared to return an array).
 *
 * @return array
 */
protected function getCurrentSegment(): array {
    if ( $this->currentTransUnitIsTranslatable === 'no' ) {
        return [];
    }

    if ( !isset( $this->transUnits[ $this->currentTransUnitId ] ) ) {
        return [];
    }

    return $this->segments[ $this->segmentInUnitPosition ] ?? [];
}
426
|
|
|
|
427
|
|
|
} |
If you suppress an error, we recommend checking for the error condition explicitly: