1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* MediaWiki import plugin for phpMyAdmin |
4
|
|
|
*/ |
5
|
|
|
|
6
|
|
|
declare(strict_types=1); |
7
|
|
|
|
8
|
|
|
namespace PhpMyAdmin\Plugins\Import; |
9
|
|
|
|
10
|
|
|
use PhpMyAdmin\Current; |
11
|
|
|
use PhpMyAdmin\File; |
12
|
|
|
use PhpMyAdmin\Http\ServerRequest; |
13
|
|
|
use PhpMyAdmin\Import\ImportSettings; |
14
|
|
|
use PhpMyAdmin\Import\ImportTable; |
15
|
|
|
use PhpMyAdmin\Message; |
16
|
|
|
use PhpMyAdmin\Plugins\ImportPlugin; |
17
|
|
|
use PhpMyAdmin\Properties\Plugins\ImportPluginProperties; |
18
|
|
|
|
19
|
|
|
use function __; |
20
|
|
|
use function count; |
21
|
|
|
use function explode; |
22
|
|
|
use function mb_strlen; |
23
|
|
|
use function mb_substr; |
24
|
|
|
use function pathinfo; |
25
|
|
|
use function preg_match; |
26
|
|
|
use function str_contains; |
27
|
|
|
use function str_replace; |
28
|
|
|
use function str_starts_with; |
29
|
|
|
use function strlen; |
30
|
|
|
use function trim; |
31
|
|
|
|
32
|
|
|
use const PATHINFO_FILENAME; |
33
|
|
|
|
34
|
|
|
/** |
35
|
|
|
* Handles the import for the MediaWiki format |
36
|
|
|
*/ |
37
|
|
|
class ImportMediawiki extends ImportPlugin |
38
|
|
|
{ |
39
|
|
|
/**
 * Whether parsed tables are analyzed so that their SQL can be generated.
 *
 * Assigned in setProperties() (true unless the import type is 'table') and
 * read in importDataOneTable() to decide whether the analysis/SQL-building
 * step runs.
 */
private bool $analyze = false;
43
|
|
|
|
44
|
|
|
/**
 * Returns the identifier of this import plugin.
 *
 * @psalm-return non-empty-lowercase-string
 */
public function getName(): string
{
    return 'mediawiki';
}
49
|
|
|
|
50
|
12 |
|
/**
 * Builds the descriptive properties of this plugin (label, file extension,
 * MIME type and options caption).
 *
 * As a side effect it (re)computes the analyze flag: analysis is enabled for
 * every import type except 'table'.
 */
protected function setProperties(): ImportPluginProperties
{
    // Equivalent to "reset to false, then enable unless importing into a table"
    $this->analyze = ImportSettings::$importType !== 'table';

    $properties = new ImportPluginProperties();
    $properties->setText(__('MediaWiki Table'));
    $properties->setExtension('txt');
    $properties->setMimeType('text/plain');
    $properties->setOptionsText(__('Options'));

    return $properties;
}
65
|
|
|
|
66
|
|
|
/**
 * Intentionally a no-op: this method reads nothing from the request.
 */
public function setImportOptions(ServerRequest $request): void
{
    // Nothing to configure
}
69
|
|
|
|
70
|
|
|
/**
 * Handles the whole import logic
 *
 * Reads the import chunk by chunk, splits it on "\n" and feeds every
 * non-empty, trimmed line to a small state machine:
 *  - a line starting with "<!--" opens a comment and a line starting with
 *    "-->" closes it; inside a comment, "Table data for `name`" supplies the
 *    name of the next table and "Table structure for `name`" opens a
 *    structure section that stays skipped until a data comment begins;
 *  - "{|" starts a table; "|-", "|+" and "|}" finish the row collected so
 *    far, and "|}" additionally hands the table to importDataOneTable();
 *  - any other line starting with "|" or "!" contributes data or header cells;
 *  - every other line sets $GLOBALS['error'], which ends the read loop.
 *
 * @param File|null $importHandle handle forwarded to getNextChunk()
 *
 * @return string[] the SQL statements collected during the import
 */
public function doImport(File|null $importHandle = null): array
{
    $GLOBALS['error'] ??= null;

    $sqlStatements = [];

    // Holds the chunk most recently read from the imported file
    $buffer = '';

    // Incomplete trailing line of the data read so far. It is prepended to the
    // next chunk so that a line split across two chunks is parsed as a whole.
    $lastChunkLine = '';

    // Comment tracking: inside any comment / a data comment / a structure comment
    $insideComment = false;
    $insideDataComment = false;
    $insideStructureComment = false;

    // MediaWiki only accepts "\n" as row terminator
    $mediawikiNewLine = "\n";

    // Name, header cells and data rows of the table currently being parsed
    $curTableName = '';
    $curTempTableHeaders = [];
    $curTempTable = [];

    // Whether the row currently being collected is made of header cells ('!')
    $inTableHeader = false;

    /**
     * Cells of the row currently being collected. This lives outside the read
     * loop on purpose: resetting it for every chunk would discard the cells
     * already read when a row's cell lines straddle a chunk boundary.
     *
     * @var list<string> $curTempLine
     */
    $curTempLine = [];

    /** @infection-ignore-all */
    while (! ImportSettings::$finished && ! $GLOBALS['error'] && ! ImportSettings::$timeoutPassed) {
        $data = $this->import->getNextChunk($importHandle);

        if ($data === false) {
            // Subtract data we didn't handle yet and stop processing
            ImportSettings::$offset -= mb_strlen($buffer);
            break;
        }

        if ($data !== true) {
            $buffer = $data;
            unset($data);

            // A chunk without any line break cannot be parsed yet, unless it is
            // the final one. Keep it as pending line data instead of letting
            // the next chunk overwrite it.
            if (! ImportSettings::$finished && ! str_contains($buffer, $mediawikiNewLine)) {
                $lastChunkLine .= $buffer;
                continue;
            }
        }

        // The first line of this chunk may be the continuation of the pending
        // line left over from the previous chunk(s)
        $buffer = $lastChunkLine . $buffer;

        // Process the buffer line by line
        $bufferLines = explode($mediawikiNewLine, $buffer);
        $fullBufferLinesCount = count($bufferLines);

        // While more data is expected, the final line may be incomplete:
        // set it aside and exclude it from this round
        if (! ImportSettings::$finished) {
            $lastChunkLine = $bufferLines[--$fullBufferLinesCount];
        }

        for ($lineNr = 0; $lineNr < $fullBufferLinesCount; ++$lineNr) {
            $curBufferLine = trim($bufferLines[$lineNr]);

            // If the line is empty, go to the next one
            if ($curBufferLine === '') {
                continue;
            }

            $firstCharacter = $curBufferLine[0];

            // Check beginning of comment
            if (str_starts_with($curBufferLine, '<!--')) {
                $insideComment = true;
                continue;
            }

            if ($insideComment) {
                if (str_starts_with($curBufferLine, '-->')) {
                    // Only data comments are closed here. A structure comment
                    // stays open until a data comment begins, so that the
                    // structure table in between is skipped.
                    $insideDataComment = false;

                    if (! $insideStructureComment) {
                        $insideComment = false;
                    }
                } else {
                    // Check table name
                    $matchTableName = [];
                    if (preg_match('/^Table data for `(.*)`$/', $curBufferLine, $matchTableName)) {
                        $curTableName = $matchTableName[1];
                        $insideDataComment = true;
                        $insideStructureComment = false;
                    } elseif (preg_match('/^Table structure for `(.*)`$/', $curBufferLine)) {
                        // The structure comments will be ignored
                        $insideStructureComment = true;
                    }
                }

                continue;
            }

            if (preg_match('/^\{\|(.*)$/', $curBufferLine)) {
                // Start of table: reset the row storage, the current row and
                // the header marker. The line itself carries no cell data.
                $curTempTable = [];
                $curTempLine = [];
                $inTableHeader = false;
            } elseif (
                str_starts_with($curBufferLine, '|-')
                || str_starts_with($curBufferLine, '|+')
                || str_starts_with($curBufferLine, '|}')
            ) {
                // Row boundary or end of table: store the row collected so far
                if ($curTempLine !== []) {
                    if ($inTableHeader) {
                        // Rows made of '!' cells become the column headers
                        $curTempTableHeaders = $curTempLine;
                    } else {
                        // Normal line, add it to the table
                        $curTempTable[] = $curTempLine;
                    }
                }

                // Empty the temporary buffer
                $curTempLine = [];

                if (str_starts_with($curBufferLine, '|}')) {
                    // Import the current table data into the database
                    $this->importDataOneTable(
                        new ImportTable($curTableName, $curTempTableHeaders, $curTempTable),
                        $sqlStatements,
                    );

                    // Reset table name
                    $curTableName = '';
                }

                // Whatever follows the row tag on this line is only attributes
            } elseif ($firstCharacter === '|' || $firstCharacter === '!') {
                // Cell line
                if ($firstCharacter === '!') {
                    // Header cells use '!!' as separator; normalize it so the
                    // line can be split like a data row
                    $curBufferLine = str_replace('!!', '||', $curBufferLine);
                    $inTableHeader = true;
                } else {
                    $inTableHeader = false;
                }

                // Loop through each table cell
                $cells = $this->explodeMarkup($curBufferLine);
                foreach ($cells as $cell) {
                    $cell = $this->getCellData($cell);

                    // Delete the beginning of the column, if there is one
                    $cell = trim($cell);
                    foreach (['|', '!'] as $colStartChar) {
                        $cell = $this->getCellContent($cell, $colStartChar);
                    }

                    // Add the cell to the row
                    $curTempLine[] = $cell;
                }
            } else {
                // If it's none of the above, then the current line has a bad
                // format
                // NOTE(review): $message is built but never stored or returned
                // in this method, so the text does not leave it - confirm
                // whether it should be published somewhere.
                $message = Message::error(
                    __('Invalid format of mediawiki input on line: <br>%s.'),
                );
                $message->addParam($curBufferLine);
                $GLOBALS['error'] = true;
            }
        }
    }

    return $sqlStatements;
}
288
|
|
|
|
289
|
|
|
/**
 * Imports data from a single table
 *
 * When analysis is enabled and the table has any content, this fills in a
 * missing table name and missing column names, analyzes the table, creates a
 * database when none is selected and appends the generated SQL to
 * $sqlStatements. In every case it finishes by running an empty query to
 * commit whatever is still buffered.
 *
 * @param ImportTable $table         table parsed from the import file
 * @param string[]    $sqlStatements List of SQL statements to be executed
 */
private function importDataOneTable(ImportTable $table, array &$sqlStatements): void
{
    // A table with neither header cells nor data rows (e.g. "{|" directly
    // followed by "|}") has nothing to analyze. Without this guard the
    // generic-header step below would evaluate count($table->rows[0]) on a
    // missing element.
    $hasContent = $table->columns !== [] || $table->rows !== [];

    if ($this->analyze && $hasContent) {
        // Set the table name
        if ($table->tableName === '') {
            $table->tableName = $this->import->getNextAvailableTableName(
                Current::$database,
                pathinfo(ImportSettings::$importFileName, PATHINFO_FILENAME),
            );
        }

        // Set generic names for table headers if they don't exist.
        // $hasContent guarantees at least one row here; rows are assumed to
        // be a list, so the first one is at index 0.
        if ($table->columns === []) {
            $table->columns = $this->setTableHeaders(count($table->rows[0]));
        }

        // Obtain the best-fit MySQL types for each column
        $analysis = $this->import->analyzeTable($table);

        // Fall back to a default database name when none is selected, and
        // create that database first
        $dbName = Current::$database !== '' ? Current::$database : 'mediawiki_DB';

        if (Current::$database === '') {
            $sqlStatements = $this->import->createDatabase($dbName, 'utf8', 'utf8_general_ci', $sqlStatements);
        }

        $this->import->buildSql(
            $dbName,
            [$table],
            [$analysis],
            sqlData: $sqlStatements,
        );
    }

    // Commit any possible data in buffers
    $this->import->runQuery('', $sqlStatements);
}
330
|
|
|
|
331
|
|
|
/**
 * Generates placeholder column names for a table that has no header row
 *
 * @param int $numCols how many column names to generate
 *
 * @return list<string> 'COL 1', 'COL 2', ... up to 'COL <numCols>'
 */
private function setTableHeaders(int $numCols): array
{
    $genericNames = [];

    // Names are numbered from 1; a count of zero or less yields an empty list
    for ($position = 1; $position <= $numCols; ++$position) {
        $genericNames[] = 'COL ' . $position;
    }

    return $genericNames;
}
348
|
|
|
|
349
|
|
|
/**
 * Replaces every '||' that occurs inside a quoted tag attribute with the
 * given placeholder, leaving all other characters untouched
 *
 * The text is scanned byte by byte while tracking whether the scanner is
 * inside a tag ('<' ... '>') and inside a quoted attribute of that tag.
 * Outside attributes every '|' is copied as is. Inside an attribute a pair
 * '||' is emitted as $replace, while a lone '|' is held back for one step
 * and then copied unchanged.
 *
 * @param string $replace the string to be replaced with
 * @param string $subject the text to be replaced
 *
 * @return string with replacements
 */
private function delimiterReplace(string $replace, string $subject): string
{
    // String that will be returned
    $cleaned = '';

    // Possible states of current character
    $insideTag = false;
    $insideAttribute = false;

    // Quote character (" or ') that opened the current attribute, if any
    $startAttributeCharacter = null;

    // The full separator is "||"; this remembers if the previous character
    // was a '|' that has not yet been paired
    $partialSeparator = false;

    // Parse text char by char
    /** @infection-ignore-all */
    for ($i = 0, $iMax = strlen($subject); $i < $iMax; $i++) {
        $curChar = $subject[$i];

        if ($curChar === '|') {
            if (! $insideAttribute) {
                // Outside attributes this is (part of) a real separator
                $cleaned .= $curChar;

                // A complete real separator also ends any tag that was open
                if ($partialSeparator) {
                    $insideTag = false;
                }
            } elseif ($partialSeparator) {
                // Second '|' of a pair inside an attribute: emit the
                // placeholder for the whole pair (the first '|' was held back)
                $cleaned .= $replace;
            }

            // If the previous character was also '|', then this ends a
            // full separator. If not, this may be the beginning of one
            $partialSeparator = ! $partialSeparator;

            continue;
        }

        // A lone '|' inside an attribute was held back; emit it now
        if ($partialSeparator && $insideAttribute) {
            $cleaned .= '|';
        }

        // If the char is different from "|", no separator can be formed
        $partialSeparator = false;
        $cleaned .= $curChar;

        if ($curChar === '<' && ! $insideAttribute) {
            // start of a tag
            $insideTag = true;
        } elseif ($curChar === '>' && ! $insideAttribute) {
            // end of a tag
            $insideTag = false;
        } elseif (($curChar === '"' || $curChar === "'") && $insideTag) {
            if (! $insideAttribute) {
                // start of an attribute; remember its quote character
                $insideAttribute = true;
                $startAttributeCharacter = $curChar;
            } elseif ($curChar === $startAttributeCharacter) {
                // matching quote: end of the attribute
                $insideAttribute = false;
                $startAttributeCharacter = null;
            }
        }
    }

    // A lone '|' held back at the very end of the text (unterminated
    // attribute) would otherwise be lost
    if ($partialSeparator && $insideAttribute) {
        $cleaned .= '|';
    }

    return $cleaned;
}
432
|
|
|
|
433
|
|
|
/**
 * Splits a line into cells on the '||' separator, similarly to explode()
 *
 * Separators that delimiterReplace() recognizes as lying inside a quoted tag
 * attribute do not split the line; they are kept as part of their cell.
 * Used for buffer lines that contain data or header cells.
 *
 * @param string $text text to be split
 *
 * @return string[]
 */
private function explodeMarkup(string $text): array
{
    $cellSeparator = '||';
    $marker = "\x00";

    // Drop NUL bytes already present so they cannot be mistaken for the marker
    $withoutMarkers = str_replace($marker, '', $text);

    // Protected separators are swapped for the marker, so explode() only
    // splits on the remaining, real ones
    $pieces = explode($cellSeparator, $this->delimiterReplace($marker, $withoutMarkers));

    // Put the protected separators back into every piece
    return str_replace($marker, $cellSeparator, $pieces);
}
462
|
|
|
|
463
|
8 |
|
/**
 * Extracts the data part of a cell that may carry parameters
 *
 * A cell can have the form "parameters | data". The cell is split at its
 * first '|' and the part after it is returned; a cell without any '|' is
 * returned unchanged. When the text before the first '|' contains '[[', that
 * '|' belongs to a link rather than to cell parameters, so the cell is
 * returned whole.
 *
 * The previous condition was inverted: it split exactly the cells containing
 * a link and left "parameters | data" cells unsplit.
 *
 * @param string $cell raw cell text, possibly starting with its '|' / '!' marker
 *
 * @return string the cell data, not trimmed
 */
private function getCellData(string $cell): string
{
    // A cell could contain both parameters and data
    $cellData = explode('|', $cell, 2);

    // A '|' inside an invalid link should not
    // be mistaken as delimiting cell parameters
    if (str_contains($cellData[0], '[[')) {
        return $cell;
    }

    // No '|' at all: the only part is the data; otherwise the data follows
    // the first '|'
    return $cellData[1] ?? $cellData[0];
}
480
|
|
|
|
481
|
8 |
|
/**
 * Removes a leading column-start marker from a cell
 *
 * When the cell begins with $colStartChar, its first character is dropped
 * and the remainder is trimmed; otherwise the cell is returned unchanged.
 *
 * @param string $cell         cell text
 * @param string $colStartChar marker to look for at the start of the cell
 */
private function getCellContent(string $cell, string $colStartChar): string
{
    return str_starts_with($cell, $colStartChar)
        ? trim(mb_substr($cell, 1))
        : $cell;
}
489
|
|
|
} |
490
|
|
|
|