Issues (38)

src/ImportField.php (5 issues)

1
<?php
2
3
namespace SilverStripe\DocumentConverter;
4
5
use DOMAttr;
6
use DOMDocument;
7
use DOMElement;
8
use DOMXPath;
9
use Page;
10
use SilverStripe\AssetAdmin\Forms\UploadField;
11
use SilverStripe\Assets\File;
12
use SilverStripe\Assets\FileNameFilter;
13
use SilverStripe\Assets\Folder;
14
use SilverStripe\Assets\Image;
15
use SilverStripe\Assets\Upload;
16
use SilverStripe\Core\Config\Config;
17
use SilverStripe\Core\Convert;
18
use SilverStripe\Core\Injector\Injector;
19
use SilverStripe\Control\Director;
20
use SilverStripe\Control\HTTPRequest;
21
use SilverStripe\Control\HTTPResponse;
22
use SilverStripe\Forms\HTMLEditor\HTMLEditorConfig;
23
use SilverStripe\Forms\HTMLEditor\HTMLEditorSanitiser;
24
use SilverStripe\ORM\DataObject;
25
use SilverStripe\Versioned\Versioned;
26
use SilverStripe\View\Parsers\HTMLValue;
27
use Tidy;
28
29
/**
30
 * ImportField is built on top of UploadField to access a document
31
 * conversion capabilities. The original field is stripped down to allow only
32
 * uploads from the user's computer, and triggers the conversion when the upload
33
 * is completed.
34
 *
35
 * The file upload has additional parameters injected. They are set by the user
36
 * through the fields provided on the ImportField:
37
 *
38
 * * SplitHeader: if enabled, scans the document looking for H1 or H2 headers and
39
 *   puts each subsection into separate page. The first part of the document until
40
 *   the first header occurrence is added to the current page.
41
 * * KeepSource: prevents the removal of the uploaded document, and stores its ID
42
 *   in the has_one relationship on the parent page (see the
43
 *   DocumentImportField::__construct for how to configure the name of this has_one)
44
 * * ChosenFolderID: directory to be used for storing the original document and the
45
 *   image files that come along with the document.
46
 * * PublishPages: whether the current and the chapter pages should be published.
47
 * * IncludeTOC: builds a table of contents and puts it into the parent page. This
48
 *   could potentially replace the document content from before the first heading.
49
 *   Also, if the KeepSource is enabled, it will inject the document link into this
50
 *   page.
51
 *
52
 *  Caveat: there is some coupling between the above parameters.
53
 */
54
class ImportField extends UploadField
55
{
56
57
    /**
     * Request handler actions that may be invoked on this field.
     *
     * @var array
     */
    private static $allowed_actions = ['upload'];

    /**
     * Class instantiated through the Injector to convert the uploaded document
     * (read via the "importer_class" config in importFromPOST()).
     *
     * @var string
     */
    private static $importer_class = ServiceConnector::class;

    /**
     * Child pages of the current record, keyed by ID, that have not been
     * matched to a chapter during the running import. importFromPOST() fills
     * the list, writeContent() removes the pages it writes to, and whatever is
     * left is deleted once the import is done.
     *
     * Declared explicitly so the class does not rely on a dynamically created
     * property.
     *
     * @var array
     */
    protected $unusedChildren = [];
60
61
    /**
     * Handles the "upload" action: validates the request, optionally keeps the
     * source document, runs the import and reports the outcome as JSON.
     *
     * @param HTTPRequest $request Request carrying the "Upload" file entry plus the
     *   SplitHeader, KeepSource, ChosenFolderID, PublishPages and IncludeTOC options.
     * @return HTTPResponse JSON response holding a single-element list; the element
     *   carries the file's name/size/type and an "error" entry. The results of
     *   httpError(403) / httpError(400) are returned for a disabled or read-only
     *   field and for a failed security token check respectively.
     */
    public function upload(HTTPRequest $request)
    {
        if ($this->isDisabled() || $this->isReadonly()) {
            return $this->httpError(403);
        }

        // Protect against CSRF on destructive action
        if (!$this->getForm()->getSecurityToken()->checkRequest($request)) {
            return $this->httpError(400);
        }

        $uploaded = $request->postVar('Upload');

        // Describe the upload, or report that nothing reached the temporary storage.
        if ($uploaded) {
            $result = [
                'name' => $uploaded['name'],
                'size' => $uploaded['size'],
                'type' => $uploaded['type'],
                'error' => $uploaded['error']
            ];
        } else {
            $result = [
                'error' => _t(
                    'SilverStripe\\AssetAdmin\\Forms\\UploadField.FIELDNOTSET',
                    'File information not found'
                )
            ];
        }

        if (!$result['error']) {
            // Options chosen by the user for this import.
            $headingLevel = (int)$request->postVar('SplitHeader');
            $keepOriginal = (bool)$request->postVar('KeepSource');
            $folderID = (int)$request->postVar('ChosenFolderID');
            $publish = (bool)$request->postVar('PublishPages');
            $withTOC = (bool)$request->postVar('IncludeTOC');

            // The original document is stored before the conversion runs.
            $storedSource = $keepOriginal
                ? $this->preserveSourceDocument($uploaded, $folderID)
                : null;

            $outcome = $this->importFromPOST($uploaded, $headingLevel, $publish, $folderID);
            $failed = is_array($outcome) && isset($outcome['error']);

            if ($failed) {
                $result['error'] = $outcome['error'];
            } elseif ($withTOC) {
                $this->writeTOC($publish, $storedSource);
            }
        }

        $response = HTTPResponse::create(Convert::raw2json([$result]));
        $response->addHeader('Content-Type', 'application/json');

        return $response;
    }
121
122
    /**
     * Preserves the source file by storing it as a File record and linking it
     * to the current page through ImportedFromFileID.
     *
     * @param array $tmpfile Temporary file data structure (PHP upload entry).
     * @param int $chosenFolderID Target folder, handed on to Upload::loadIntoFile().
     * @return File|null Stored file, or null when the upload could not be stored.
     */
    protected function preserveSourceDocument($tmpfile, $chosenFolderID = null)
    {
        $upload = Upload::create();
        $file = File::create();

        // NOTE(review): the third argument of Upload::loadIntoFile() is documented
        // as a folder path, while a folder ID is passed here - confirm that the
        // document ends up in the intended folder.
        $stored = $upload->loadIntoFile($tmpfile, $file, $chosenFolderID);

        // A rejected upload leaves the File without an ID; do not link the page
        // to it and do not hand it out to be rendered as a download link.
        if (!$stored || !$file->ID) {
            return null;
        }

        $page = $this->form->getRecord();
        if ($page) {
            $page->ImportedFromFileID = $file->ID;
            $page->write();
        }

        return $file;
    }
142
143
    /**
     * Builds and writes the table of contents for the document.
     *
     * When the page has children, its content is replaced by a list of links to
     * them. Otherwise the h1/h2 elements directly below the body of the page
     * content receive numbered ids ("h1.N" / "h2.N") and a list of anchors
     * pointing at them is put in front of the content.
     *
     * @param bool $publishPages Should the parent page be published.
     * @param File $preservedDocument Set if the link to the original document should be added.
     */
    protected function writeTOC($publishPages = false, $preservedDocument = null)
    {
        $page = $this->form->getRecord();
        if (!$page) {
            return;
        }

        $items = '';
        $children = $page->Children();

        if ($children->Count() > 0) {
            foreach ($children as $child) {
                // Titles and links are plain text; escape them for the HTML context.
                $items .= '<li><a href="' . Convert::raw2att($child->Link()) . '">'
                    . Convert::raw2xml($child->Title)
                    . '</a></li>';
            }
            $page->Content = '<ul>' . $items . '</ul>';
        } else {
            $markup = '';
            $source = (string)$page->Content;

            // DOMDocument::loadHTML() cannot parse an empty string, so the DOM
            // pass is skipped for pages without content.
            if (trim($source) !== '') {
                $doc = new DOMDocument();
                $doc->loadHTML($source);
                $body = $doc->getElementsByTagName('body')->item(0);

                // Separate running numbers for the h1 and the h2 anchors.
                $counters = ['h1' => 1, 'h2' => 1];
                $node = $body ? $body->firstChild : null;
                while ($node) {
                    if ($node instanceof DOMElement && isset($counters[$node->tagName])) {
                        $tag = $node->tagName;
                        $anchor = $tag . '.' . $counters[$tag];
                        $label = trim(preg_replace('/\n|\r/', '', Convert::html2raw($node->textContent)));

                        $items .= ($tag == 'h2' ? '<li class="menu-h2">' : '<li>')
                            . '<a href="#' . $anchor . '">'
                            . Convert::raw2xml($label)
                            . '</a></li>';

                        $node->setAttribute('id', $anchor);
                        $counters[$tag]++;
                    }
                    $node = $node->nextSibling;
                }

                $markup = $doc->saveHTML();
            }

            $page->Content = '<ul>' . $items . '</ul>' . $markup;
        }

        // Add in the link to the original document, if provided.
        if ($preservedDocument) {
            $page->Content = '<a href="' .
                Convert::raw2att($preservedDocument->Link()) .
                '" title="download original document">download original document (' .
                $preservedDocument->getSize() .
                ')</a>' .
                $page->Content;
        }

        // Store the result
        $page->write();
        if ($publishPages) {
            $page->publishRecursive();
        }
    }
202
203
    /**
     * Serialises a node to HTML and strips the wrapper markup around it.
     *
     * The node is imported into a throw-away document below an html element;
     * everything up to and including the opening body tag, and everything from
     * the closing body tag on, is then cut from the serialised output.
     *
     * @param DOMDocument $doc Not used by this method.
     * @param \DOMNode $node Node to serialise (callers pass a body element).
     * @return string
     */
    protected function getBodyText($doc, $node)
    {
        $wrapper = new DOMDocument();
        $root = $wrapper->appendChild($wrapper->createElement('html'));
        $root->appendChild($wrapper->importNode($node, true));

        // Patterns are applied in order: leading wrapper first, trailing wrapper second.
        return preg_replace(
            ['/^.*<body>/', '/<\/body>.*$/'],
            '',
            $wrapper->saveHTML()
        );
    }
220
221
    /**
     * Used only when writing the document that has been split by headers.
     * Can write both to the chapter pages as well as the master page.
     *
     * A chapter is written to the child page of the current record carrying the
     * same title; the page is created when no such child exists yet. Pages
     * written here are removed from the list of unused children.
     *
     * @param string $subtitle Title of the chapter - if missing, it will write to the master page.
     * @param DOMDocument $subdoc Document owning $subnode.
     * @param \DOMNode $subnode Node holding the content to write.
     * @param int $sort Order of the chapter page.
     * @param bool $publishPages Whether to publish the resulting child/master pages.
     */
    protected function writeContent($subtitle, $subdoc, $subnode, $sort = null, $publishPages = false)
    {
        $record = $this->form->getRecord();
        $html = $this->getBodyText($subdoc, $subnode);

        if (!$subtitle) {
            // Write to the master page.
            $record->Content = $html;
            $record->write();

            if ($publishPages) {
                $record->publishRecursive();
            }

            return;
        }

        // Write the chapter page to a subpage. The title stems from the imported
        // document, so it is matched through the ORM filter instead of being
        // spliced into an SQL string.
        $page = Page::get()->filter([
            'Title' => $subtitle,
            'ParentID' => (int)$record->ID,
        ])->first();

        if (!$page) {
            $page = Page::create();
            $page->ParentID = $record->ID;
            $page->Title = $subtitle;
        }

        unset($this->unusedChildren[$page->ID]);

        if ($sort) {
            $page->Sort = $sort;
        }
        $page->Content = $html;
        $page->write();

        if ($publishPages) {
            $page->publishRecursive();
        }
    }
268
269
    /**
     * Imports a document at a certain path onto the current page and writes it.
     * CAUTION: Overwrites any existing content on the page!
     *
     * The converted HTML is tidied, sanitised and stripped of fonts, empty
     * elements and style/class/lang attributes before it is written. Child
     * pages of the current page that received no chapter are deleted from both
     * the draft and the live stage.
     *
     * @param array $tmpFile Array as received from PHP's POST upload.
     * @param int|bool $splitHeader Heading level to split by (1 or 2); any other
     *   value keeps the whole document on the current page.
     * @param bool $publishPages Whether the underlying pages should be published after import.
     * @param int $chosenFolderID ID of the working folder - here the converted file and images will be stored.
     * @return array|null Array with an "error" entry when the conversion or the
     *   writing of the pages failed, null otherwise.
     */
    public function importFromPOST($tmpFile, $splitHeader = false, $publishPages = false, $chosenFolderID = null)
    {
        $fileDescriptor = [
            'name' => $tmpFile['name'],
            'path' => $tmpFile['tmp_name'],
            'mimeType' => $tmpFile['type']
        ];

        $sourcePage = $this->form->getRecord();
        $importerClass = $this->config()->get('importer_class');
        $importer = Injector::inst()->create($importerClass, $fileDescriptor, $chosenFolderID);
        $content = $importer->import();

        if (is_array($content) && isset($content['error'])) {
            return $content;
        }

        // Clean up with tidy (requires tidy module)
        $tidy = new Tidy();
        $tidy->parseString($content, ['output-xhtml' => true], 'utf8');
        $tidy->cleanRepair();

        // A body without children exposes no iterable child list.
        $fragment = [];
        $tidyBody = $tidy->body();
        if ($tidyBody && $tidyBody->child) {
            foreach ($tidyBody->child as $child) {
                $fragment[] = $child->value;
            }
        }

        $htmlValue = Injector::inst()->create(HTMLValue::class, implode("\n", $fragment));

        // Sanitise
        $sanitiser = Injector::inst()->create(HTMLEditorSanitiser::class, HTMLEditorConfig::get_active());
        $sanitiser->sanitise($htmlValue);

        // Load in the HTML
        $doc = $htmlValue->getDocument();
        $xpath = new DOMXPath($doc);

        // make sure any images are added as Image records with a relative link to assets.
        // The folder is resolved from the $chosenFolderID argument; no chosenFolderID
        // property is ever assigned on this field.
        // NOTE(review): presumably the importer extracts the images into that
        // folder - confirm against the configured importer_class.
        $chosenFolder = $chosenFolderID ? DataObject::get_by_id(Folder::class, $chosenFolderID) : null;
        $folderName = ($chosenFolder) ? '/' . $chosenFolder->Name : '';
        $imgs = $xpath->query('//img');
        for ($i = 0; $i < $imgs->length; $i++) {
            $img = $imgs->item($i);
            $originalPath = 'assets/' . $folderName . '/' . $img->getAttribute('src');
            $name = FileNameFilter::create()->filter(basename($originalPath));

            $image = Image::get()->filter([
                'Name' => $name,
                'ParentID' => (int)$chosenFolderID
            ])->first();
            if (!($image && $image->exists())) {
                $image = Image::create();
                $image->ParentID = (int)$chosenFolderID;
                $image->Name = $name;
                $image->write();
            }

            // make sure it's put in place correctly so Image record knows where it is.
            // e.g. in the case of underscores being renamed to dashes.
            // Best effort: a failing rename is deliberately ignored.
            @rename(Director::getAbsFile($originalPath), Director::getAbsFile($image->getFilename()));

            $img->setAttribute('src', $image->getFilename());
        }

        $remove_rules = [
            // Change any headers that contain font tags (other than font face tags) into p elements
            '//h1[.//font[not(@face)]]' => 'p',
            // Remove any font tags
            '//font'
        ];

        foreach ($remove_rules as $rule => $parenttag) {
            // Entries without a key only name the rule; their children are
            // moved up into the existing parent.
            if (is_numeric($rule)) {
                $rule = $parenttag;
                $parenttag = null;
            }

            // Collect first: the node list changes while nodes are being removed.
            $nodes = [];
            foreach ($xpath->query($rule) as $node) {
                $nodes[] = $node;
            }

            foreach ($nodes as $node) {
                $parent = $node->parentNode;

                if ($parenttag) {
                    // Put the replacement element right behind the node.
                    $parent = $doc->createElement($parenttag);
                    if ($node->nextSibling) {
                        $node->parentNode->insertBefore($parent, $node->nextSibling);
                    } else {
                        $node->parentNode->appendChild($parent);
                    }
                }

                while ($node->firstChild) {
                    $parent->appendChild($node->firstChild);
                }
                $node->parentNode->removeChild($node);
            }
        }

        // Strip style, class, lang attributes.
        $els = $doc->getElementsByTagName('*');
        for ($i = 0; $i < $els->length; $i++) {
            $el = $els->item($i);
            $el->removeAttribute('class');
            $el->removeAttribute('style');
            $el->removeAttribute('lang');
        }

        $headingXPath = [
            'self::h1',
            'self::h2',
            'self::h3',
            'self::h4',
            'self::h5',
            'self::h6',
        ];
        // Remove a bunch of unwanted elements
        $clean = [
            // Empty paragraphs
            '//p[not(descendant-or-self::text() | descendant-or-self::img)]',
            // Empty headers
            '//*[' . implode(' | ', $headingXPath) . '][not(descendant-or-self::text() | descendant-or-self::img)]',
            // Anchors
            '//a[not(@href)]',
            // BR tags
            '//br'
        ];

        foreach ($clean as $query) {
            // First get all the nodes. Need to build array, as they'll disappear from the
            // nodelist while we're deleting them, causing the indexing to screw up.
            $nodes = [];
            foreach ($xpath->query($query) as $node) {
                $nodes[] = $node;
            }

            // Then remove them all
            foreach ($nodes as $node) {
                if ($node->parentNode) {
                    $node->parentNode->removeChild($node);
                }
            }
        }

        // Now split the document into portions by H1
        $body = $doc->getElementsByTagName('body')->item(0);

        $this->unusedChildren = [];
        foreach ($sourcePage->Children() as $child) {
            $this->unusedChildren[$child->ID] = $child;
        }

        $documentImporterFieldError = false;

        // The flag is captured by reference, otherwise the handler would only
        // ever change its own copy and failures would go unnoticed.
        $documentImporterFieldErrorHandler = function (
            $errno,
            $errstr,
            $errfile,
            $errline
        ) use (&$documentImporterFieldError) {
            // Deprecation notices are swallowed like everything else, but do
            // not mark the import as failed.
            if (!($errno & (E_DEPRECATED | E_USER_DEPRECATED))) {
                $documentImporterFieldError = _t(
                    'SilverStripe\\DocumentConverter\\ServiceConnector.PROCESSFAILED',
                    'Could not process document, please double-check you uploaded a .doc or .docx format.',
                    'Document Converter processes Word documents into HTML.'
                );
            }

            // Do not cascade the error through other handlers
            return true;
        };

        set_error_handler($documentImporterFieldErrorHandler);

        try {
            $subtitle = null;
            $subdoc = new DOMDocument();
            $subnode = $subdoc->createElement('body');
            $node = $body->firstChild;
            $sort = 1;
            if ($splitHeader == 1 || $splitHeader == 2) {
                while ($node && !$documentImporterFieldError) {
                    if ($node instanceof DOMElement && $node->tagName == 'h' . $splitHeader) {
                        // A heading closes the section collected so far.
                        if ($subnode->hasChildNodes()) {
                            $this->writeContent($subtitle, $subdoc, $subnode, $sort, $publishPages);
                            $sort++;
                        }

                        $subdoc = new DOMDocument();
                        $subnode = $subdoc->createElement('body');
                        $subtitle = trim(preg_replace('/\n|\r/', '', Convert::html2raw($node->textContent)));
                    } else {
                        $subnode->appendChild($subdoc->importNode($node, true));
                    }

                    $node = $node->nextSibling;
                }
            } else {
                $this->writeContent($subtitle, $subdoc, $body, null, $publishPages);
            }

            // Write whatever followed the last heading.
            // NOTE(review): this final section is written without a sort value,
            // unlike the sections before it - confirm that is intended.
            if ($subnode->hasChildNodes() && !$documentImporterFieldError) {
                $this->writeContent($subtitle, $subdoc, $subnode, null, $publishPages);
            }
        } finally {
            // Always hand error handling back, also when writing a page throws.
            restore_error_handler();
        }

        if ($documentImporterFieldError) {
            return ['error' => $documentImporterFieldError];
        }

        // Remove the children that did not receive a chapter from both stages.
        foreach ($this->unusedChildren as $child) {
            $origStage = Versioned::get_stage();

            Versioned::set_stage(Versioned::DRAFT);
            $draft = clone $child;
            $draft->delete();

            Versioned::set_stage(Versioned::LIVE);
            $published = clone $child;
            $published->delete();

            Versioned::set_stage($origStage);
        }

        $sourcePage->write();
    }
504
}
505