Checks if an incompatible expression is used in output or concatenation.
1 | <?php |
||
2 | |||
3 | namespace SilverStripe\DocumentConverter; |
||
4 | |||
5 | use DOMAttr; |
||
6 | use DOMDocument; |
||
7 | use DOMElement; |
||
8 | use DOMXPath; |
||
9 | use Page; |
||
10 | use SilverStripe\AssetAdmin\Forms\UploadField; |
||
11 | use SilverStripe\Assets\File; |
||
12 | use SilverStripe\Assets\FileNameFilter; |
||
13 | use SilverStripe\Assets\Folder; |
||
14 | use SilverStripe\Assets\Image; |
||
15 | use SilverStripe\Assets\Upload; |
||
16 | use SilverStripe\Core\Config\Config; |
||
17 | use SilverStripe\Core\Convert; |
||
18 | use SilverStripe\Core\Injector\Injector; |
||
19 | use SilverStripe\Control\Director; |
||
20 | use SilverStripe\Control\HTTPRequest; |
||
21 | use SilverStripe\Control\HTTPResponse; |
||
22 | use SilverStripe\Forms\HTMLEditor\HTMLEditorConfig; |
||
23 | use SilverStripe\Forms\HTMLEditor\HTMLEditorSanitiser; |
||
24 | use SilverStripe\ORM\DataObject; |
||
25 | use SilverStripe\Versioned\Versioned; |
||
26 | use SilverStripe\View\Parsers\HTMLValue; |
||
27 | use Tidy; |
||
28 | |||
29 | /** |
||
30 | * DocumentImporterField is built on top of UploadField to access a document |
||
31 | * conversion capabilities. The original field is stripped down to allow only |
||
32 | * uploads from the user's computer, and triggers the conversion when the upload |
||
33 | * is completed. |
||
34 | * |
||
35 | * The file upload has additional parameters injected. They are set by the user |
||
36 | * through the fields provided on the DocumentImportField: |
||
37 | * |
||
38 | * * SplitHeader: if enabled, scans the document looking for H1 or H2 headers and |
||
39 | * puts each subsection into separate page. The first part of the document until |
||
40 | * the first header occurrence is added to the current page. |
||
41 | * * KeepSource: prevents the removal of the uploaded document, and stores its ID |
||
42 | * in the has_one relationship on the parent page (see the |
||
43 | * DocumentImportField::__construct for how to configure the name of this has_one) |
||
44 | * * ChosenFolderID: directory to be used for storing the original document and the |
||
45 | * image files that come along with the document. |
||
46 | * * PublishPages: whether the current and the chapter pages should be published. |
||
47 | * * IncludeTOC: builds a table of contents and puts it into the parent page. This |
||
48 | * could potentially replace the document content from before the first heading. |
||
49 | * Also, if the KeepSource is enabled, it will inject the document link into this |
||
50 | * page. |
||
51 | * |
||
52 | * Caveat: there is some coupling between the above parameters. |
||
53 | */ |
||
54 | class ImportField extends UploadField |
||
55 | { |
||
56 | |||
57 | private static $allowed_actions = ['upload']; |
||
58 | |||
59 | private static $importer_class = ServiceConnector::class; |
||
60 | |||
/**
 * Process the document immediately upon upload.
 *
 * Responds with 403 when the field is disabled or read-only and with 400 when the
 * security token check fails. Otherwise reads the 'Upload' post variable, optionally
 * preserves the source document, runs the import and, when requested and the import
 * reported no error, writes the table of contents.
 *
 * @param HTTPRequest $request Request carrying the 'Upload' file entry plus the options
 *                             SplitHeader, KeepSource, ChosenFolderID, PublishPages and IncludeTOC.
 * @return HTTPResponse JSON response holding a one-element list: either the upload details
 *                      (name, size, type, error) or only an error message.
 */
public function upload(HTTPRequest $request)
{
    if ($this->isDisabled() || $this->isReadonly()) {
        return $this->httpError(403);
    }

    // Protect against CSRF on destructive action
    $token = $this->getForm()->getSecurityToken();
    if (!$token->checkRequest($request)) {
        return $this->httpError(400);
    }

    $tmpfile = $request->postVar('Upload');

    // Check if the file has been uploaded into the temporary storage.
    // A value that is not an array (e.g. a plain string posted under the same name)
    // cannot be an upload descriptor, so it is reported the same way as a missing file
    // instead of being indexed like an array below.
    if (!$tmpfile || !is_array($tmpfile)) {
        $return = [
            'error' => _t(
                'SilverStripe\\AssetAdmin\\Forms\\UploadField.FIELDNOTSET',
                'File information not found'
            )
        ];
    } else {
        $return = [
            'name' => $tmpfile['name'],
            'size' => $tmpfile['size'],
            'type' => $tmpfile['type'],
            'error' => $tmpfile['error']
        ];
    }

    if (!$return['error']) {
        // Get options for this import.
        $splitHeader = (int)$request->postVar('SplitHeader');
        $keepSource = (bool)$request->postVar('KeepSource');
        $chosenFolderID = (int)$request->postVar('ChosenFolderID');
        $publishPages = (bool)$request->postVar('PublishPages');
        $includeTOC = (bool)$request->postVar('IncludeTOC');

        // Keep a copy of the source document first, if requested. The variable stays
        // null otherwise, so it can be handed to writeTOC() unconditionally.
        $preservedDocument = null;
        if ($keepSource) {
            $preservedDocument = $this->preserveSourceDocument($tmpfile, $chosenFolderID);
        }

        // Process the document and write the page.
        $importResult = $this->importFromPOST($tmpfile, $splitHeader, $publishPages, $chosenFolderID);
        if (is_array($importResult) && isset($importResult['error'])) {
            $return['error'] = $importResult['error'];
        } elseif ($includeTOC) {
            $this->writeTOC($publishPages, $preservedDocument);
        }
    }

    $response = HTTPResponse::create(Convert::raw2json([$return]));
    $response->addHeader('Content-Type', 'application/json');
    return $response;
}
||
121 | |||
/**
 * Preserves the source file by storing it as a File record and recording its ID
 * in the ImportedFromFileID field of the form's record.
 *
 * @param array $tmpfile Temporary file data structure.
 * @param int $chosenFolderID ID of the target folder.
 * @return File|null Stored file, or null when the upload could not be stored.
 */
protected function preserveSourceDocument($tmpfile, $chosenFolderID = null)
{
    // Upload::loadIntoFile() expects a folder *path* as its third argument, not a
    // folder ID, so resolve the ID to the folder's path first. Passing false lets
    // the uploader fall back to its own default location.
    // NOTE(review): assumes Folder::getFilename() returns the path relative to the
    // assets root - confirm against the installed framework version.
    $folderPath = false;
    if ($chosenFolderID) {
        $folder = DataObject::get_by_id(Folder::class, (int)$chosenFolderID);
        if ($folder && $folder->exists()) {
            $folderPath = $folder->getFilename();
        }
    }

    $upload = Upload::create();
    $file = File::create();

    // Do not link a file that was never stored.
    if (!$upload->loadIntoFile($tmpfile, $file, $folderPath)) {
        return null;
    }

    $page = $this->form->getRecord();
    $page->ImportedFromFileID = $file->ID;
    $page->write();

    return $file;
}
||
142 | |||
/**
 * Builds and writes the table of contents for the document.
 *
 * When the form's record has children, the TOC lists links to them and replaces the
 * record's content. Otherwise the record's own content is scanned for top-level h1/h2
 * elements, each of which receives an id and a matching TOC entry, and the TOC is
 * prepended to the content.
 *
 * @param bool $publishPages Should the parent page be published.
 * @param File $preservedDocument Set if the link to the original document should be added.
 */
protected function writeTOC($publishPages = false, $preservedDocument = null)
{
    $page = $this->form->getRecord();
    if (!$page) {
        return;
    }

    $content = '<ul>';

    if ($page->Children()->Count() > 0) {
        foreach ($page->Children() as $child) {
            // Titles and links are plain values, so escape them for their HTML context.
            $content .= '<li><a href="' . Convert::raw2att($child->Link()) . '">' .
                Convert::raw2xml($child->Title) . '</a></li>';
        }
        $page->Content = $content . '</ul>';
    } elseif (trim((string)$page->Content) === '') {
        // Nothing to scan: DOMDocument::loadHTML() cannot parse an empty string.
        $page->Content = $content . '</ul>';
    } else {
        // NOTE(review): saveHTML() below serialises the whole document, and loadHTML()
        // is given no charset hint - confirm that the stored markup and non-ASCII
        // text come out as intended.
        $doc = new DOMDocument();
        $doc->loadHTML($page->Content);
        $body = $doc->getElementsByTagName('body')->item(0);
        $node = $body ? $body->firstChild : null;

        // Independent running numbers for h1 and h2 anchors.
        $counters = ['h1' => 1, 'h2' => 1];
        while ($node) {
            if ($node instanceof DOMElement && isset($counters[$node->tagName])) {
                $tag = $node->tagName;
                $anchor = $tag . '.' . $counters[$tag];
                $label = trim(preg_replace('/\n|\r/', '', Convert::html2raw($node->textContent)));

                $content .= ($tag === 'h2' ? '<li class="menu-h2">' : '<li>') .
                    '<a href="#' . $anchor . '">' .
                    Convert::raw2xml($label) .
                    '</a></li>';
                $node->setAttributeNode(new DOMAttr('id', $anchor));
                $counters[$tag]++;
            }
            $node = $node->nextSibling;
        }
        $page->Content = $content . '</ul>' . $doc->saveHTML();
    }

    // Add in the link to the original document, if provided.
    if ($preservedDocument) {
        $label = 'download original document';

        // getSize() may not yield a usable value; only append the size when there is one.
        $size = $preservedDocument->getSize();
        if ($size !== false && $size !== null && $size !== '') {
            $label .= ' (' . Convert::raw2xml((string)$size) . ')';
        }

        $page->Content = '<a href="' .
            Convert::raw2att($preservedDocument->Link()) .
            '" title="download original document">' .
            $label .
            '</a>' .
            $page->Content;
    }

    // Store the result
    $page->write();
    if ($publishPages) {
        $page->publishRecursive();
    }
}
||
202 | |||
/**
 * Serialises a node to an HTML string without the surrounding document wrapper.
 *
 * The node is deep-copied into a fresh document under an <html> root element, the
 * document is serialised, and the text leading up to and including "<body>" as well as
 * the text from "</body>" onwards is removed. Both patterns run without the "s"
 * modifier, so "." does not cross line breaks.
 *
 * @param DOMDocument $doc Not used by this method.
 * @param \DOMNode $node Node to serialise; callers in this class pass a body element.
 * @return string
 */
protected function getBodyText($doc, $node)
{
    // Fresh document: <html> root with a deep copy of the node beneath it.
    $wrapper = new DOMDocument();
    $root = $wrapper->appendChild($wrapper->createElement('html'));
    $root->appendChild($wrapper->importNode($node, true));

    // Drop the opening and closing wrapper tags, in that order.
    return preg_replace(
        ['/^.*<body>/', '/<\/body>.*$/'],
        '',
        $wrapper->saveHTML()
    );
}
||
220 | |||
/**
 * Used only when writing the document that has been split by headers.
 * Can write both to the chapter pages as well as the master page.
 *
 * With a subtitle, the content goes to the child page of the form's record that has
 * that title (created if it does not exist yet), and that page is removed from the
 * list of unused children. Without a subtitle, the content goes to the form's record.
 *
 * @param string $subtitle Title of the chapter - if missing, it will write to the master page.
 * @param DOMDocument $subdoc Passed through to getBodyText().
 * @param \DOMNode $subnode Node whose serialised content becomes the page content.
 * @param int $sort Order of the chapter page.
 * @param bool $publishPages Whether to publish the resulting child/master pages.
 */
protected function writeContent($subtitle, $subdoc, $subnode, $sort = null, $publishPages = false)
{
    $record = $this->form->getRecord();
    $bodyText = $this->getBodyText($subdoc, $subnode);

    if (!$subtitle) {
        // Write to the master page.
        $record->Content = $bodyText;
        $record->write();

        if ($publishPages) {
            $record->publishRecursive();
        }
        return;
    }

    // Write the chapter page to a subpage. The subtitle comes from the uploaded
    // document, so it is matched through the ORM filter rather than being
    // interpolated into a raw SQL WHERE clause.
    $page = Page::get()->filter([
        'Title' => $subtitle,
        'ParentID' => $record->ID,
    ])->first();
    if (!$page) {
        $page = Page::create();
        $page->ParentID = $record->ID;
        $page->Title = $subtitle;
    }

    // This chapter is still in use, so it must not be deleted after the import.
    unset($this->unusedChildren[$page->ID]);

    // NOTE(review): this looks like a debugging leftover - it writes every chapter's
    // HTML into the assets directory regardless of $publishPages. Kept to preserve
    // existing side effects; confirm whether it can be removed.
    file_put_contents(ASSETS_PATH . '/index-' . $sort . '.html', $bodyText);

    if ($sort) {
        $page->Sort = $sort;
    }
    $page->Content = $bodyText;
    $page->write();
    if ($publishPages) {
        $page->publishRecursive();
    }
}
||
268 | |||
/**
 * Imports a document at a certain path onto the current page and writes it.
 * CAUTION: Overwrites any existing content on the page!
 *
 * Steps: run the configured importer, clean its output with Tidy, sanitise it with the
 * active HTML editor configuration, register images, strip unwanted markup, then write
 * the result either to the current page or - when splitting - to one child page per
 * heading. Child pages that were not written during a successful import are deleted.
 *
 * @param array $tmpFile Array as received from PHP's POST upload.
 * @param int|bool $splitHeader Heading level to split by (1 or 2). Any other value writes
 *                              the whole document to the current page.
 * @param bool $publishPages Whether the underlying pages should be published after import.
 * @param int $chosenFolderID ID of the working folder - here the converted file and images will be stored.
 * @return array|null Array with an 'error' key on failure, nothing otherwise.
 */
public function importFromPOST($tmpFile, $splitHeader = false, $publishPages = false, $chosenFolderID = null)
{
    $fileDescriptor = [
        'name' => $tmpFile['name'],
        'path' => $tmpFile['tmp_name'],
        'mimeType' => $tmpFile['type']
    ];

    $sourcePage = $this->form->getRecord();
    $importerClass = $this->config()->get('importer_class');
    $importer = Injector::inst()->create($importerClass, $fileDescriptor, $chosenFolderID);
    $content = $importer->import();

    if (is_array($content) && isset($content['error'])) {
        return $content;
    }

    // Clean up with tidy (requires tidy module)
    $tidy = new Tidy();
    $tidy->parseString($content, ['output-xhtml' => true], 'utf8');
    $tidy->cleanRepair();

    // The body node, or its child list, may be missing for an empty document.
    $fragment = [];
    $tidyBody = $tidy->body();
    if ($tidyBody && $tidyBody->child) {
        foreach ($tidyBody->child as $child) {
            $fragment[] = $child->value;
        }
    }

    $htmlValue = Injector::inst()->create(HTMLValue::class, implode("\n", $fragment));

    // Sanitise
    $santiser = Injector::inst()->create(HTMLEditorSanitiser::class, HTMLEditorConfig::get_active());
    $santiser->sanitise($htmlValue);

    // Load in the HTML
    $doc = $htmlValue->getDocument();
    $xpath = new DOMXPath($doc);

    // make sure any images are added as Image records with a relative link to assets.
    // The folder is resolved from the $chosenFolderID argument; the class has no
    // chosenFolderID property that would ever be populated.
    $chosenFolder = $chosenFolderID ? DataObject::get_by_id(Folder::class, $chosenFolderID) : null;
    $folderPrefix = $chosenFolder ? $chosenFolder->Name . '/' : '';
    foreach ($xpath->query('//img') as $img) {
        $originalPath = 'assets/' . $folderPrefix . $img->getAttribute('src');
        $name = FileNameFilter::create()->filter(basename($originalPath));

        $image = Image::get()->filter([
            'Name' => $name,
            'ParentID' => (int)$chosenFolderID
        ])->first();
        if (!($image && $image->exists())) {
            $image = Image::create();
            $image->ParentID = (int)$chosenFolderID;
            $image->Name = $name;
            $image->write();
        }

        // make sure it's put in place correctly so Image record knows where it is.
        // e.g. in the case of underscores being renamed to dashes.
        // Deliberately best-effort: a failed rename is ignored.
        @rename(Director::getAbsFile($originalPath), Director::getAbsFile($image->getFilename()));

        $img->setAttribute('src', $image->getFilename());
    }

    // Rules keyed by XPath map to a replacement tag; rules with a numeric key simply
    // unwrap the matched element, keeping its children in place.
    $remove_rules = [
        // Change any headers that contain font tags (other than font face tags) into p elements
        '//h1[.//font[not(@face)]]' => 'p',
        // Remove any font tags
        '//font'
    ];

    foreach ($remove_rules as $rule => $parenttag) {
        if (is_numeric($rule)) {
            $rule = $parenttag;
            $parenttag = null;
        }

        // Snapshot the matches first, as the document is modified while processing them.
        $nodes = [];
        foreach ($xpath->query($rule) as $node) {
            $nodes[] = $node;
        }

        foreach ($nodes as $node) {
            $parent = $node->parentNode;

            if ($parenttag) {
                // Insert the replacement element right after the node being replaced.
                $parent = $doc->createElement($parenttag);
                if ($node->nextSibling) {
                    $node->parentNode->insertBefore($parent, $node->nextSibling);
                } else {
                    $node->parentNode->appendChild($parent);
                }
            }

            while ($node->firstChild) {
                $parent->appendChild($node->firstChild);
            }
            $node->parentNode->removeChild($node);
        }
    }

    // Strip style, class, lang attributes.
    $els = $doc->getElementsByTagName('*');
    for ($i = 0; $i < $els->length; $i++) {
        $el = $els->item($i);
        $el->removeAttribute('class');
        $el->removeAttribute('style');
        $el->removeAttribute('lang');
    }

    $headingXPath = [
        'self::h1',
        'self::h2',
        'self::h3',
        'self::h4',
        'self::h5',
        'self::h6',
    ];
    // Remove a bunch of unwanted elements
    $clean = [
        // Empty paragraphs
        '//p[not(descendant-or-self::text() | descendant-or-self::img)]',
        // Empty headers
        '//*[' . implode(' | ', $headingXPath) . '][not(descendant-or-self::text() | descendant-or-self::img)]',
        // Anchors
        '//a[not(@href)]',
        // BR tags
        '//br'
    ];

    foreach ($clean as $query) {
        // First get all the nodes. Need to build array, as they'll disappear from the
        // nodelist while we're deleting them, causing the indexing to screw up.
        $nodes = [];
        foreach ($xpath->query($query) as $node) {
            $nodes[] = $node;
        }

        // Then remove them all
        foreach ($nodes as $node) {
            if ($node->parentNode) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    // Now split the document into portions by the chosen heading level
    $body = $doc->getElementsByTagName('body')->item(0);

    // Every existing child starts out as "unused"; writeContent() removes the ones it writes to.
    $this->unusedChildren = [];
    foreach ($sourcePage->Children() as $child) {
        $this->unusedChildren[$child->ID] = $child;
    }

    $documentImporterFieldError = false;

    // The flag is captured by reference; captured by value, the assignment inside the
    // handler would never reach the checks below.
    $documentImporterFieldErrorHandler = function (
        $errno,
        $errstr,
        $errfile,
        $errline
    ) use (&$documentImporterFieldError) {
        // Deprecation notices are swallowed like everything else, but they do not
        // indicate a broken document, so they do not fail the import.
        if (($errno & (E_DEPRECATED | E_USER_DEPRECATED)) === 0) {
            $documentImporterFieldError = _t(
                'SilverStripe\\DocumentConverter\\ServiceConnector.PROCESSFAILED',
                'Could not process document, please double-check you uploaded a .doc or .docx format.',
                'Document Converter processes Word documents into HTML.'
            );
        }

        // Do not cascade the error through other handlers
        return true;
    };

    set_error_handler($documentImporterFieldErrorHandler);

    try {
        $subtitle = null;
        $subdoc = new DOMDocument();
        $subnode = $subdoc->createElement('body');
        $node = $body->firstChild;
        $sort = 1;
        if ($splitHeader == 1 || $splitHeader == 2) {
            while ($node && !$documentImporterFieldError) {
                if ($node instanceof DOMElement && $node->tagName == 'h' . $splitHeader) {
                    // A new chapter starts: flush what has been collected so far.
                    if ($subnode->hasChildNodes()) {
                        $this->writeContent($subtitle, $subdoc, $subnode, $sort, $publishPages);
                        $sort++;
                    }

                    $subdoc = new DOMDocument();
                    $subnode = $subdoc->createElement('body');
                    $subtitle = trim(preg_replace('/\n|\r/', '', Convert::html2raw($node->textContent)));
                } else {
                    $subnode->appendChild($subdoc->importNode($node, true));
                }

                $node = $node->nextSibling;
            }
        } else {
            $this->writeContent($subtitle, $subdoc, $body, null, $publishPages);
        }

        // Flush the trailing chapter, if any.
        if ($subnode->hasChildNodes() && !$documentImporterFieldError) {
            $this->writeContent($subtitle, $subdoc, $subnode, null, $publishPages);
        }
    } finally {
        // Always put the previous handler back, even when writing throws.
        restore_error_handler();
    }

    if ($documentImporterFieldError) {
        return ['error' => $documentImporterFieldError];
    }

    // Remove the children that were not written by this import, from both stages.
    foreach ($this->unusedChildren as $child) {
        $origStage = Versioned::current_stage();

        Versioned::set_stage(Versioned::DRAFT);
        $draft = clone $child;
        $draft->delete();

        Versioned::set_stage(Versioned::LIVE);
        $published = clone $child;
        $published->delete();

        Versioned::set_stage($origStage);
    }

    $sourcePage->write();
}
||
504 | } |
||
505 |