1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Matecat\SubFiltering; |
4
|
|
|
|
5
|
|
|
use Exception; |
6
|
|
|
use Matecat\SubFiltering\Commons\Pipeline; |
7
|
|
|
use Matecat\SubFiltering\Filters\CtrlCharsPlaceHoldToAscii; |
8
|
|
|
use Matecat\SubFiltering\Filters\DataRefReplace; |
9
|
|
|
use Matecat\SubFiltering\Filters\DataRefRestore; |
10
|
|
|
use Matecat\SubFiltering\Filters\EmojiToEntity; |
11
|
|
|
use Matecat\SubFiltering\Filters\EncodeControlCharsInXliff; |
12
|
|
|
use Matecat\SubFiltering\Filters\EntityToEmoji; |
13
|
|
|
use Matecat\SubFiltering\Filters\FromLayer2ToRawXML; |
14
|
|
|
use Matecat\SubFiltering\Filters\LtGtEncode; |
15
|
|
|
use Matecat\SubFiltering\Filters\PlaceHoldXliffTags; |
16
|
|
|
use Matecat\SubFiltering\Filters\RemoveDangerousChars; |
17
|
|
|
use Matecat\SubFiltering\Filters\RestorePlaceHoldersToXLIFFLtGt; |
18
|
|
|
use Matecat\SubFiltering\Filters\RestoreXliffTagsContent; |
19
|
|
|
use Matecat\SubFiltering\Filters\SpecialEntitiesToPlaceholdersForView; |
20
|
|
|
|
21
|
|
|
/** |
22
|
|
|
* Class Filter |
23
|
|
|
* |
24
|
|
|
* This class is meant to create subfiltering layers to allow data to be safely sent and received from 2 different Layers and real file |
25
|
|
|
* |
26
|
|
|
* # Definitions |
27
|
|
|
* |
28
|
|
|
* - Raw file, the real XML file in input, with data in XML |
29
|
|
|
* - Layer 0 is defined to be the Database. The data stored in the database should be in the same form (sanitized if needed) they come from Xliff file |
30
|
|
|
* - Layer 1 is defined to be external services and resources, for example, MT/TM server. This layer is different from layer 0, HTML subfiltering is applied here |
31
|
|
|
* - Layer 2 is defined to be the MateCat UI. |
32
|
|
|
* |
33
|
|
|
* # Constraints |
34
|
|
|
* - We have to maintain the compatibility with PH tags placed inside the XLIff in the form <ph id="[0-9+]" equiv-text="<br/>"/> . |
35
|
|
|
* Those tags are placed into the database as XML. |
36
|
|
|
* - HTML and other variables like android tags and custom features are placed into the database as encoded HTML <br/> |
37
|
|
|
* |
38
|
|
|
* - Data sent to the external services like MT/TM are sub-filtered: |
39
|
|
|
* -- <br/> become <ph id="mtc_[0-9]+" equiv-text="base64:Jmx0O2JyLyZndDs="/> |
40
|
|
|
* -- Existing tags in the XLIFF like <ph id="[0-9+]" equiv-text="<br/>"/> are left as is |
41
|
|
|
* |
42
|
|
|
* |
43
|
|
|
* @package SubFiltering |
44
|
|
|
*/ |
45
|
|
|
class MateCatFilter extends AbstractFilter {

    /**
     * Transforms database content (Layer 0) into the sub-filtered form (Layer 1).
     *
     * A fresh pipeline is built, configured through
     * configureFromLayer0ToLayer1Pipeline(), offered to the feature set so that
     * features can alter it, and finally applied to the segment.
     *
     * @param string      $segment The segment in Layer 0 format.
     * @param string|null $cid     Optional context identifier. It is not read by this
     *                             implementation and is kept for signature compatibility.
     *
     * @return string The segment in Layer 1 format.
     * @throws Exception If the transformation process fails.
     */
    public function fromLayer0ToLayer1( string $segment, ?string $cid = null ): string {
        $channel = new Pipeline( $this->source, $this->target, $this->dataRefMap );
        $this->configureFromLayer0ToLayer1Pipeline( $channel );

        // Allow the feature set to modify the pipeline for this specific transformation.
        /** @var Pipeline $channel */
        $channel = $this->featureSet->filter( 'fromLayer0ToLayer1', $channel );

        return $channel->transform( $segment );
    }

    /**
     * Transforms database content (Layer 0) into the UI structures (Layer 2).
     *
     * This is the composition of fromLayer0ToLayer1() followed by fromLayer1ToLayer2().
     *
     * @param string $segment The segment in Layer 0 format.
     *
     * @return string The segment in Layer 2 format.
     * @throws Exception
     */
    public function fromLayer0ToLayer2( string $segment ): string {
        $layer1 = $this->fromLayer0ToLayer1( $segment );

        return $this->fromLayer1ToLayer2( $layer1 );
    }

    /**
     * Transforms sub-filtered content (Layer 1) into the UI structures (Layer 2).
     *
     * Handlers run in this order: SpecialEntitiesToPlaceholdersForView,
     * EntityToEmoji, DataRefReplace. The feature set may alter the pipeline
     * through the 'fromLayer1ToLayer2' hook before it is applied.
     *
     * @param string $segment The segment in Layer 1 format.
     *
     * @return string The segment in Layer 2 format.
     * @throws Exception
     */
    public function fromLayer1ToLayer2( string $segment ): string {
        $channel = new Pipeline( $this->source, $this->target, $this->dataRefMap );
        $channel->addLast( SpecialEntitiesToPlaceholdersForView::class );
        $channel->addLast( EntityToEmoji::class );
        $channel->addLast( DataRefReplace::class );

        /** @var Pipeline $channel */
        $channel = $this->featureSet->filter( 'fromLayer1ToLayer2', $channel );

        return $channel->transform( $segment );
    }

    /**
     * Transforms UI data (Layer 2) into the sub-filtered XML structures (Layer 1).
     *
     * The handler order is significant and must not be changed. The feature set
     * may alter the pipeline through the 'fromLayer2ToLayer1' hook before it is applied.
     *
     * @param string $segment The segment in Layer 2 format.
     *
     * @return string The segment in Layer 1 format.
     * @throws Exception
     */
    public function fromLayer2ToLayer1( string $segment ): string {
        $channel = new Pipeline( $this->source, $this->target, $this->dataRefMap );
        $channel->addLast( CtrlCharsPlaceHoldToAscii::class );
        $channel->addLast( PlaceHoldXliffTags::class );
        // Spelled exactly like the imported class name (was "FromLayer2TorawXML").
        $channel->addLast( FromLayer2ToRawXML::class );
        $channel->addLast( EmojiToEntity::class );
        $channel->addLast( RestoreXliffTagsContent::class );
        $channel->addLast( RestorePlaceHoldersToXLIFFLtGt::class );
        $channel->addLast( DataRefRestore::class );

        /** @var Pipeline $channel */
        $channel = $this->featureSet->filter( 'fromLayer2ToLayer1', $channel );

        return $channel->transform( $segment );
    }

    /**
     * Transforms the UI structures (Layer 2) so they can be stored in the database (Layer 0).
     *
     * It is assumed that the UI sends strings having XLF tags not encoded and HTML
     * in XML encoding representation:
     *  - &lt;b&gt;de <ph id="mtc_1" equiv-text="base64:JTEkcw=="/>, <x id="1" /> &lt;/b&gt;que
     *
     * This is the composition of fromLayer2ToLayer1() followed by fromLayer1ToLayer0().
     *
     * @param string $segment The segment in Layer 2 format.
     *
     * @return string The segment in Layer 0 format.
     * @throws Exception
     */
    public function fromLayer2ToLayer0( string $segment ): string {
        $layer1 = $this->fromLayer2ToLayer1( $segment );

        return $this->fromLayer1ToLayer0( $layer1 );
    }

    /**
     * Transforms sub-filtered content (Layer 1) back to database content (Layer 0).
     *
     * The work is delegated unchanged to the parent implementation.
     *
     * @param string $segment The segment in Layer 1 format.
     *
     * @return string The segment in Layer 0 format.
     * @throws Exception
     */
    public function fromLayer1ToLayer0( string $segment ): string {
        return parent::fromLayer1ToLayer0( $segment );
    }

    /**
     * Converts the raw XLIFF content coming from the file into XML for the database (Layer 0).
     *
     * Handlers run in this order: RemoveDangerousChars, PlaceHoldXliffTags,
     * EncodeControlCharsInXliff, RestoreXliffTagsContent, RestorePlaceHoldersToXLIFFLtGt.
     * The feature set may alter the pipeline through the 'fromRawXliffToLayer0' hook.
     *
     * @param string $segment The raw XLIFF segment.
     *
     * @return string The segment in Layer 0 format.
     * @throws Exception
     */
    public function fromRawXliffToLayer0( string $segment ): string {
        $channel = new Pipeline( $this->source, $this->target, $this->dataRefMap );
        $channel->addLast( RemoveDangerousChars::class );
        $channel->addLast( PlaceHoldXliffTags::class );
        $channel->addLast( EncodeControlCharsInXliff::class );
        $channel->addLast( RestoreXliffTagsContent::class );
        $channel->addLast( RestorePlaceHoldersToXLIFFLtGt::class );

        /** @var Pipeline $channel */
        $channel = $this->featureSet->filter( 'fromRawXliffToLayer0', $channel );

        return $channel->transform( $segment );
    }

    /**
     * Converts a database string (Layer 0) back to raw XLIFF content.
     *
     * Used to export database XML strings (e.g. into TMX files) as valid XML.
     * Handlers run in this order: PlaceHoldXliffTags, RemoveDangerousChars,
     * RestoreXliffTagsContent, RestorePlaceHoldersToXLIFFLtGt, LtGtEncode.
     * The feature set may alter the pipeline through the 'fromLayer0ToRawXliff' hook.
     *
     * @param string $segment The segment in Layer 0 format.
     *
     * @return string The raw XLIFF representation of the segment.
     * @throws Exception
     */
    public function fromLayer0ToRawXliff( string $segment ): string {
        $channel = new Pipeline( $this->source, $this->target, $this->dataRefMap );
        $channel->addLast( PlaceHoldXliffTags::class );
        $channel->addLast( RemoveDangerousChars::class );
        $channel->addLast( RestoreXliffTagsContent::class );
        $channel->addLast( RestorePlaceHoldersToXLIFFLtGt::class );
        $channel->addLast( LtGtEncode::class );

        /** @var Pipeline $channel */
        $channel = $this->featureSet->filter( 'fromLayer0ToRawXliff', $channel );

        return $channel->transform( $segment );
    }

    /**
     * Realigns the ids of the `mtc_*` placeholder tags of a Layer 1 target with those of its source.
     *
     * When data coming from the database is converted from Layer 0 to Layer 1, the HTML
     * placeholders may sit in different positions in source and target, and their ids
     * differ because they are plain sequences. The source holds the truth: every target
     * tag whose `equiv-text` attribute text is identical to that of a not-yet-used source
     * tag is replaced by that whole source tag, so the target ends up carrying the source id.
     *
     * The target is returned unchanged when source and target contain a different number
     * of matching tags (a tag mismatch the user has to fix). Target tags whose `equiv-text`
     * has no counterpart in the source are left untouched.
     *
     * @param string $source The source segment in Layer 1 format.
     * @param string $target The target segment in Layer 1 format.
     *
     * @return string The target segment with realigned placeholder tags.
     * @see getSegmentsController in matecat
     */
    public function realignIDInLayer1( string $source, string $target ): string {
        // Group 1 captures the id, group 2 the whole equiv-text attribute (used as the matching key).
        $pattern = '|<ph id ?= ?["\'](mtc_[0-9]+)["\'] ?(equiv-text=["\'].+?["\'] ?)/>|ui';
        preg_match_all( $pattern, $source, $src_tags, PREG_PATTERN_ORDER );
        preg_match_all( $pattern, $target, $trg_tags, PREG_PATTERN_ORDER );

        if ( count( $src_tags[ 0 ] ) !== count( $trg_tags[ 0 ] ) ) {
            return $target; // wrong number of tags: there is a tag mismatch in the translation, let the user fix it
        }

        $start_offset = 0;
        foreach ( $trg_tags[ 2 ] as $trg_tag_position => $b64 ) {

            $src_tag_position = array_search( $b64, $src_tags[ 2 ], true );

            if ( $src_tag_position === false ) {
                // The content of this tag was changed in the translation: nothing to align it with.
                continue;
            }

            // Consume the source tag so that an identical following target tag matches the next equal one, if any.
            unset( $src_tags[ 2 ][ $src_tag_position ] );

            $trg_tag = $trg_tags[ 0 ][ $trg_tag_position ];
            $src_tag = $src_tags[ 0 ][ $src_tag_position ];

            // Replace ONLY ONE element AND the EXACT ONE: search from the end of the previous replacement.
            $tag_position_in_string = strpos( $target, $trg_tag, $start_offset );
            if ( $tag_position_in_string === false ) {
                // Defensive: never let a failed lookup be coerced to offset 0 and corrupt the string.
                continue;
            }

            $target       = substr_replace( $target, $src_tag, $tag_position_in_string, strlen( $trg_tag ) );
            $start_offset = $tag_position_in_string + strlen( $src_tag ); // set the next starting point
        }

        return $target;
    }

}