ImportMediawiki::doImport()   D
last analyzed

Complexity

Conditions 29
Paths 2

Size

Total Lines 212
Code Lines 89

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 76
CRAP Score 30.6983

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 89
c 1
b 0
f 0
dl 0
loc 212
ccs 76
cts 87
cp 0.8736
rs 4.1666
cc 29
nc 2
nop 1
crap 30.6983

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/**
3
 * MediaWiki import plugin for phpMyAdmin
4
 */
5
6
declare(strict_types=1);
7
8
namespace PhpMyAdmin\Plugins\Import;
9
10
use PhpMyAdmin\Current;
11
use PhpMyAdmin\File;
12
use PhpMyAdmin\Http\ServerRequest;
13
use PhpMyAdmin\Import\ImportSettings;
14
use PhpMyAdmin\Import\ImportTable;
15
use PhpMyAdmin\Message;
16
use PhpMyAdmin\Plugins\ImportPlugin;
17
use PhpMyAdmin\Properties\Plugins\ImportPluginProperties;
18
19
use function __;
20
use function count;
21
use function explode;
22
use function mb_strlen;
23
use function mb_substr;
24
use function pathinfo;
25
use function preg_match;
26
use function str_contains;
27
use function str_replace;
28
use function str_starts_with;
29
use function strlen;
30
use function trim;
31
32
use const PATHINFO_FILENAME;
33
34
/**
35
 * Handles the import for the MediaWiki format
36
 */
37
class ImportMediawiki extends ImportPlugin
38
{
39
    /**
40
     * Whether to analyze tables
41
     */
42
    private bool $analyze = false;
43
44
    /**
     * Returns the identifier of this import plugin.
     *
     * @psalm-return non-empty-lowercase-string
     */
    public function getName(): string
    {
        $pluginName = 'mediawiki';

        return $pluginName;
    }
49
50 12
    /**
     * Builds the plugin description (label, extension, MIME type, options title)
     * and decides whether imported tables are analyzed.
     */
    protected function setProperties(): ImportPluginProperties
    {
        // Analysis is switched on for every import type except 'table'
        $this->analyze = ImportSettings::$importType !== 'table';

        $properties = new ImportPluginProperties();
        $properties->setText(__('MediaWiki Table'));
        $properties->setExtension('txt');
        $properties->setMimeType('text/plain');
        $properties->setOptionsText(__('Options'));

        return $properties;
    }
65
66
    public function setImportOptions(ServerRequest $request): void
    {
        // Intentionally empty: nothing is read from the request by this plugin.
    }
69
70
    /**
     * Handles the whole import logic
     *
     * Reads the input chunk by chunk, reassembles complete lines across chunk
     * boundaries and runs them through a line-oriented parser for MediaWiki
     * table markup. Each time a table terminator ("|}") is met, the header and
     * rows collected so far are passed to importDataOneTable().
     *
     * Parser state (comment flags, current table name, headers, rows and the
     * row being assembled) lives outside the read loop so that a table or a row
     * may span several chunks.
     *
     * @param File|null $importHandle handle forwarded to getNextChunk()
     *
     * @return string[] the SQL statements accumulated while importing
     */
    public function doImport(File|null $importHandle = null): array
    {
        $GLOBALS['error'] ??= null;

        $sqlStatements = [];

        // The buffer that will be used to store chunks read from the imported file
        $buffer = '';

        // Holds the trailing, not yet terminated line of the data read so far.
        // It is prepended to the next chunk before that chunk is split in lines.
        $lastChunkLine = '';

        // Remembers whether the current buffer line is part of a comment
        $insideComment = false;
        // Remembers whether the current buffer line is part of a data comment
        $insideDataComment = false;
        // Remembers whether the current buffer line is part of a structure comment
        $insideStructureComment = false;

        // MediaWiki only accepts "\n" as row terminator
        $mediawikiNewLine = "\n";

        // Initialize the name of the current table
        $curTableName = '';

        /** @var list<string> $curTempTableHeaders Header cells of the current table */
        $curTempTableHeaders = [];
        $curTempTable = [];

        // Cells of the row currently being assembled. Kept outside the read
        // loop: a row whose cell lines are split over two chunks would
        // otherwise lose the cells read from the first chunk.
        /** @var list<string> $curTempLine Temporary storage of cell values */
        $curTempLine = [];

        $inTableHeader = false;

        /** @infection-ignore-all */
        while (! ImportSettings::$finished && ! $GLOBALS['error'] && ! ImportSettings::$timeoutPassed) {
            $data = $this->import->getNextChunk($importHandle);

            if ($data === false) {
                // Subtract data we didn't handle yet and stop processing
                ImportSettings::$offset -= mb_strlen($buffer);
                break;
            }

            if ($data !== true) {
                $buffer = $data;
                unset($data);

                // Don't parse the chunk if we're not at the end and it holds no
                // new line: it is only a fragment of a line. Keep it as pending
                // text instead of dropping it, so that the next chunk completes it.
                // At the end of the input the fragment is the last line and
                // must be parsed.
                if (! ImportSettings::$finished && ! str_contains($buffer, $mediawikiNewLine)) {
                    $lastChunkLine .= $buffer;
                    continue;
                }
            }

            // Because of reading chunk by chunk, the first line from the buffer
            // contains only a portion of an actual line from the imported file.
            // Therefore, we have to append it to the last line from the previous
            // chunk. If we are at the first chunk, $lastChunkLine is empty.
            $buffer = $lastChunkLine . $buffer;

            // Process the buffer line by line
            $bufferLines = explode($mediawikiNewLine, $buffer);

            $fullBufferLinesCount = count($bufferLines);
            // If the reading is not finalized, the final line of the current chunk
            // will not be complete
            if (! ImportSettings::$finished) {
                $lastChunkLine = $bufferLines[--$fullBufferLinesCount];
            }

            for ($lineNr = 0; $lineNr < $fullBufferLinesCount; ++$lineNr) {
                $curBufferLine = trim($bufferLines[$lineNr]);

                // If the line is empty, go to the next one
                if ($curBufferLine === '') {
                    continue;
                }

                $firstCharacter = $curBufferLine[0];

                // Check beginning of comment
                if (str_starts_with($curBufferLine, '<!--')) {
                    $insideComment = true;
                    continue;
                }

                if ($insideComment) {
                    // Check end of comment
                    if (str_starts_with($curBufferLine, '-->')) {
                        // Only data comments are closed. The structure comments
                        // will be closed when a data comment begins (in order to
                        // skip structure tables)
                        $insideDataComment = false;

                        // End comments that are not related to table structure
                        if (! $insideStructureComment) {
                            $insideComment = false;
                        }
                    } else {
                        // Check table name
                        $matchTableName = [];
                        if (preg_match('/^Table data for `(.*)`$/', $curBufferLine, $matchTableName)) {
                            $curTableName = $matchTableName[1];
                            $insideDataComment = true;

                            $insideStructureComment = false;
                        } elseif (preg_match('/^Table structure for `(.*)`$/', $curBufferLine, $matchTableName)) {
                            // The structure comments will be ignored
                            $insideStructureComment = true;
                        }
                    }

                    continue;
                }

                if (str_starts_with($curBufferLine, '{|')) {
                    // Start of table: the rest of the line holds only
                    // attributes, no column information.

                    // This will store all the column info on all rows from
                    // the current table read from the buffer
                    $curTempTable = [];

                    // A table without a header row must not inherit the
                    // header cells of the table parsed before it
                    $curTempTableHeaders = [];

                    // Will be used as storage for the current row in the buffer
                    // Once all its columns are read, it will be added to
                    // $curTempTable and then it will be emptied
                    $curTempLine = [];

                    // Helps us differentiate the header columns
                    // from the normal columns
                    $inTableHeader = false;
                } elseif (
                    str_starts_with($curBufferLine, '|-')
                    || str_starts_with($curBufferLine, '|+')
                    || str_starts_with($curBufferLine, '|}')
                ) {
                    // Begin of a row, caption, or end of the table.
                    // What's after the tag is only attributes.

                    // Flush the row assembled so far
                    if ($curTempLine !== []) {
                        if ($inTableHeader) {
                            // The cells were marked with '!': they are the header columns
                            $curTempTableHeaders = $curTempLine;
                        } else {
                            // Normal line, add it to the table
                            $curTempTable[] = $curTempLine;
                        }
                    }

                    // Empty the temporary buffer
                    $curTempLine = [];

                    if (str_starts_with($curBufferLine, '|}')) {
                        // Import the current table data into the database
                        $this->importDataOneTable(
                            new ImportTable($curTableName, $curTempTableHeaders, $curTempTable),
                            $sqlStatements,
                        );

                        // Reset table name
                        $curTableName = '';
                    }
                } elseif ($firstCharacter === '|' || $firstCharacter === '!') {
                    // Cell elements

                    if ($firstCharacter === '!') {
                        // Header cells: mark as table header, but treat as normal row
                        $curBufferLine = str_replace('!!', '||', $curBufferLine);
                        // Will be used to set $curTempLine as table header
                        $inTableHeader = true;
                    } else {
                        $inTableHeader = false;
                    }

                    // Loop through each table cell
                    $cells = $this->explodeMarkup($curBufferLine);
                    foreach ($cells as $cell) {
                        $cell = $this->getCellData($cell);

                        // Delete the beginning of the column, if there is one
                        $cell = trim($cell);
                        foreach (['|', '!'] as $colStartChar) {
                            $cell = $this->getCellContent($cell, $colStartChar);
                        }

                        // Add the cell to the row
                        $curTempLine[] = $cell;
                    }
                } else {
                    // If it's none of the above, then the current line has a bad
                    // format
                    // NOTE(review): $message is built but never stored or returned
                    // in this method, so the text does not reach the user from
                    // here - confirm where it is meant to be reported.
                    $message = Message::error(
                        __('Invalid format of mediawiki input on line: <br>%s.'),
                    );
                    $message->addParam($curBufferLine);
                    $GLOBALS['error'] = true;
                }
            }
        }

        return $sqlStatements;
    }
288
289
    /**
     * Imports data from a single table
     *
     * When analysis is enabled, fills in a missing table name and missing
     * column names, lets the import helper analyze the column types and builds
     * the SQL for the table. A table that has neither header cells nor rows
     * carries nothing to analyze or create and is skipped. In every case the
     * statement buffer is flushed at the end.
     *
     * @param ImportTable $table         table name, columns and rows parsed from the input
     * @param string[]    $sqlStatements List of SQL statements to be executed
     */
    private function importDataOneTable(ImportTable $table, array &$sqlStatements): void
    {
        // Without headers and without rows there is no first row to derive the
        // column count from; reading $table->rows[0] would fail.
        $hasContent = $table->columns !== [] || $table->rows !== [];

        if ($this->analyze && $hasContent) {
            // Set the table name
            if ($table->tableName === '') {
                $table->tableName = $this->import->getNextAvailableTableName(
                    Current::$database,
                    pathinfo(ImportSettings::$importFileName, PATHINFO_FILENAME),
                );
            }

            // Set generic names for table headers if they don't exist.
            // $table->rows is non-empty here because of the $hasContent guard.
            if ($table->columns === []) {
                $table->columns = $this->setTableHeaders(count($table->rows[0]));
            }

            // Obtain the best-fit MySQL types for each column
            $analysis = $this->import->analyzeTable($table);

            $dbName = Current::$database;
            if ($dbName === '') {
                // No database selected: fall back to a default one and create it
                $dbName = 'mediawiki_DB';
                $sqlStatements = $this->import->createDatabase($dbName, 'utf8', 'utf8_general_ci', $sqlStatements);
            }

            $this->import->buildSql(
                $dbName,
                [$table],
                [$analysis],
                sqlData: $sqlStatements,
            );
        }

        // Commit any possible data in buffers
        $this->import->runQuery('', $sqlStatements);
    }
330
331
    /**
     * Generates placeholder column names for a table without a header row
     *
     * The names are 1-based: "COL 1", "COL 2", ... up to "COL $numCols".
     *
     * @param int $numCols number of names to generate
     *
     * @return list<string>
     */
    private function setTableHeaders(int $numCols): array
    {
        $genericNames = [];

        $position = 1;
        while ($position <= $numCols) {
            $genericNames[] = 'COL ' . $position;
            ++$position;
        }

        return $genericNames;
    }
348
349
    /**
     * Masks the '||' separators that occur inside quoted tag attributes
     *
     * Scans the text byte by byte while tracking whether the cursor is inside
     * an HTML-like tag ('<' ... '>') and inside a quoted attribute value of
     * such a tag. Outside attribute values every '|' is copied unchanged.
     * Inside one, each '||' pair is emitted as $replace, and a single '|' is
     * emitted once the next non-'|' character is reached.
     *
     * @param string $replace the string to be replaced with
     * @param string $subject the text to be replaced
     *
     * @return string with replacements
     */
    private function delimiterReplace(string $replace, string $subject): string
    {
        $result = '';

        $inTag = false;
        $inAttribute = false;
        // Quote character (" or ') that opened the current attribute value
        $openingQuote = false;
        // True when the previous character was the first '|' of a possible '||'
        $pipePending = false;

        $length = strlen($subject);

        /** @infection-ignore-all */
        for ($offset = 0; $offset < $length; ++$offset) {
            $char = $subject[$offset];

            if ($char === '|') {
                if ($inAttribute) {
                    // Second '|' of a pair inside an attribute: emit the
                    // placeholder for the whole pair. The first '|' is held back.
                    if ($pipePending) {
                        $result .= $replace;
                    }
                } else {
                    // Real separator material, copied as is
                    $result .= '|';
                    if ($pipePending) {
                        // A complete '||' outside attributes also leaves any open tag
                        $inTag = false;
                    }
                }

                // A second '|' completes the pair, a first one may start it
                $pipePending = ! $pipePending;
                continue;
            }

            // A held back single '|' inside an attribute is emitted now
            if ($pipePending && $inAttribute) {
                $result .= '|';
            }

            $pipePending = false;
            $result .= $char;

            $isQuote = $char === '"' || $char === "'";
            if (! $inAttribute && $char === '<') {
                // start of a tag
                $inTag = true;
            } elseif (! $inAttribute && $char === '>') {
                // end of a tag
                $inTag = false;
            } elseif ($isQuote && $inTag) {
                if (! $inAttribute) {
                    // start of an attribute value; remember which quote opened it
                    $inAttribute = true;
                    $openingQuote = $char;
                } elseif ($char === $openingQuote) {
                    // the matching quote closes the attribute value
                    $inAttribute = false;
                    $openingQuote = false;
                }
            }
        }

        return $result;
    }
432
433
    /**
     * Splits a buffer line into its cells, similarly to explode
     *
     * The '||' separator (standard in the mediawiki format) is used, except
     * where delimiterReplace() masked it. The masked separators are restored
     * in the returned pieces.
     *
     * @param string $text text to be split
     *
     * @return string[]
     */
    private function explodeMarkup(string $text): array
    {
        $delimiter = '||';
        $mask = "\x00";

        // Strip mask characters already present in the input, then mask the
        // separators that must not split the text
        $masked = $this->delimiterReplace($mask, str_replace($mask, '', $text));

        // str_replace() processes every element of an array subject, which puts
        // the original separators back into each piece
        return str_replace($mask, $delimiter, explode($delimiter, $masked));
    }
462
463 8
    /**
     * Extracts the data part of a cell that may carry parameters
     *
     * A cell has the form "parameters | data" or just "data". The text is
     * split at the first '|', unless the part before it contains '[[': in that
     * case the '|' belongs to a link and the cell is returned untouched.
     *
     * @param string $cell raw cell text
     *
     * @return string the data part of the cell
     */
    private function getCellData(string $cell): string
    {
        // A cell could contain both parameters and data
        $cellData = explode('|', $cell, 2);

        // A '|' inside an invalid link should not
        // be mistaken as delimiting cell parameters
        if (str_contains($cellData[0], '[[')) {
            return $cell;
        }

        // No '|' at all: the whole cell is data
        if (count($cellData) === 1) {
            return $cellData[0];
        }

        return $cellData[1];
    }
480
481 8
    /**
     * Strips a leading cell marker from a cell
     *
     * @param string $cell         cell text
     * @param string $colStartChar marker to look for at the start of the cell
     *
     * @return string the cell without its first character, trimmed, when it
     *                starts with the marker; the unchanged cell otherwise
     */
    private function getCellContent(string $cell, string $colStartChar): string
    {
        if (! str_starts_with($cell, $colStartChar)) {
            return $cell;
        }

        $withoutMarker = mb_substr($cell, 1);

        return trim($withoutMarker);
    }
489
}
490