Total Complexity | 49 |
Total Lines | 534 |
Duplicated Lines | 0 % |
Changes | 3 | ||
Bugs | 0 | Features | 0 |
Complex classes like StaticSiteContentSource often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use StaticSiteContentSource, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
43 | class StaticSiteContentSource extends ExternalContentSource |
||
44 | { |
||
45 | /** |
||
46 | * @var string |
||
47 | */ |
||
48 | public const CACHE_DIR_PREFIX = 'static-site-0'; // Default (The zero-suffix is used by test-suite) |
||
49 | |||
50 | /** |
||
51 | * @var string |
||
52 | */ |
||
53 | private static $table_name = 'StaticSiteContentSource'; |
||
|
|||
54 | |||
55 | /** |
||
56 | * @var string
||
57 | */ |
||
58 | private static $singular_name = 'Migration Profile'; |
||
59 | |||
60 | /** |
||
61 | * @var string
||
62 | */ |
||
63 | private static $plural_name = 'Migration Profiles'; |
||
64 | |||
65 | /** |
||
66 | * |
||
67 | * @var array |
||
68 | */ |
||
69 | private static $db = [ |
||
70 | 'BaseUrl' => DBVarchar::class, |
||
71 | 'UrlProcessor' => DBVarchar::class, |
||
72 | 'ExtraCrawlUrls' => DBText::class, |
||
73 | 'UrlExcludePatterns' => DBText::class, |
||
74 | 'ParseCSS' => DBBoolean::class, |
||
75 | 'AutoRunTask' => DBBoolean::class, |
||
76 | ]; |
||
77 | |||
78 | /** |
||
79 | * |
||
80 | * @var array |
||
81 | */ |
||
82 | private static $has_many = [ |
||
83 | "Schemas" => StaticSiteContentSourceImportSchema::class, |
||
84 | "Pages" => SiteTree::class, |
||
85 | "Files" => File::class, |
||
86 | ]; |
||
87 | |||
88 | /** |
||
89 | * |
||
90 | * @var array |
||
91 | */ |
||
92 | private static $export_columns = [ |
||
93 | "StaticSiteContentSourceImportSchema.DataType", |
||
94 | "StaticSiteContentSourceImportSchema.Order", |
||
95 | "StaticSiteContentSourceImportSchema.AppliesTo", |
||
96 | "StaticSiteContentSourceImportSchema.MimeTypes", |
||
97 | ]; |
||
98 | |||
99 | /** |
||
100 | * |
||
101 | * @var string |
||
102 | */ |
||
103 | public $absoluteURL = null; |
||
104 | |||
105 | /** |
||
106 | * Where do we store our items for caching? |
||
107 | * Also used by calling logic |
||
108 | * |
||
109 | * @var string |
||
110 | */ |
||
111 | public $cacheDir = null; |
||
112 | |||
113 | /** |
||
114 | * Holds the StaticSiteUtils object on construct |
||
115 | * |
||
116 | * @var StaticSiteUtils $utils |
||
117 | */ |
||
118 | protected $utils; |
||
119 | |||
/**
 * Sets up the record via the parent constructor, then prepares the two helpers
 * the rest of the class relies on: the shared StaticSiteUtils instance and the
 * per-record cache directory name.
 *
 * The cache directory name is CACHE_DIR_PREFIX with its trailing digits swapped
 * for this record's ID.
 *
 * @param array|null $record This will be null for a new database record.
 * @param bool $isSingleton
 * @param mixed $model Passed straight through to the parent constructor.
 */
public function __construct($record = null, $isSingleton = false, $model = null)
{
    parent::__construct($record, $isSingleton, $model);

    $this->utils = singleton(StaticSiteUtils::class);

    // Replace the numeric suffix of the default prefix with our own ID
    $this->cacheDir = preg_replace(
        '#[0-9]+$#',
        $this->ID,
        self::CACHE_DIR_PREFIX
    );
}
||
133 | |||
134 | /** |
||
135 | * Template method used to display the results of a successful crawl into the central |
||
136 | * column of the CMS. |
||
137 | * |
||
138 | * @return string |
||
139 | */ |
||
140 | public function listofCrawledItems(): string |
||
158 | } |
||
159 | |||
/**
 * Builds the CMS edit form for a migration profile. Starting from the parent's
 * scaffolded fields it arranges four areas:
 *
 *  - Root.Main: profile settings and the import-schema grid
 *  - Root.Crawl: crawl status, crawl button and (once complete) the crawled URL list
 *  - Root.Import: import intro and the "clear import metadata" controls
 *  - Root.Environment: PHP / webserver information
 *
 * @return FieldList
 * @throws \LogicException When the URL list reports a crawl status that is not one of
 *                         the StaticSiteUrlList::CRAWL_STATUS_* values handled below.
 */
public function getCMSFields()
{
    $fields = parent::getCMSFields();

    $fields->removeFieldsFromTab('Root', [
        'Pages',
        'Files',
        'ShowContentInMenu',
        'Name',
    ]);

    // Because we can't pass arrays to FieldList::insertBefore
    $profileIntroFields = [
        HeaderField::create('ProfileHeading', 'Migration Profile Configuration'),
        LiteralField::create('ProfileIntro', ''
            . '<p class="message notice">'
            . 'This where the basics of your migration profile are configured.'
            . '</p>'),
    ];

    foreach ($profileIntroFields as $introField) {
        $fields->insertBefore('BaseUrl', $introField);
    }

    // Processing Options: every StaticSiteUrlProcessor implementation, keyed by class name
    $processingOptions = ['' => "No Processing"];

    foreach (ClassInfo::implementorsOf(StaticSiteUrlProcessor::class) as $processor) {
        $processorObj = singleton($processor);
        $processingOptions[$processor] = $processorObj->getName();
    }

    $fields->addFieldsToTab(
        'Root.Main',
        [
            TextField::create("BaseUrl", "Base URL")
                ->setDescription('The base URL of the site to be crawled and imported.'),
            DropdownField::create("UrlProcessor", "URL Transformation", $processingOptions)
                ->setDescription('Select the way in which crawled URLs should be transformed and cleaned-up.'),
            CheckboxField::create("ParseCSS", "Fetch external CSS")
                ->setDescription("Fetch images defined as CSS <strong>background-image</strong> which are not ordinarily reachable by crawling alone."),
            CheckboxField::create("AutoRunTask", "Automatically rewrite links into Silverstripe-aware links")
                ->setDescription("This will run a link-rewrite task automatically once an import has completed."),
        ]
    );
    $fields->fieldByName('Root.Main')->setTitle('Profile');
    $fields->insertBefore('BaseUrl', TextField::create('Name', 'Name')
        ->setDescription('Allows you to differentiate between profiles.'));

    // Schema Gridfield
    $fields->addFieldToTab('Root.Main', HeaderField::create('ImportConfigHeader', 'Import Schema Configuration'));
    $importRules = $fields->dataFieldByName('Schemas');

    // dataFieldByName() yields null when no such field was scaffolded; only
    // reconfigure the grid when it is actually present.
    if ($importRules !== null) {
        $addNewButton = (new GridFieldAddNewButton('before'))->setButtonName("Add Schema");
        $importRules->getConfig()->removeComponentsByType(GridFieldAddNewButton::class);
        $importRules->getConfig()->addComponent($addNewButton);
    }

    $fields->removeFieldFromTab("Root", "Schemas");
    $fields->addFieldToTab('Root.Main', LiteralField::create(
        'SchemaIntro',
        ''
        . '<p class="message notice">Schema map MIME-Types to Silverstripe content classes and'
        . ' are related to one or more Import Rules. Each rule determines how content located at crawled URLs'
        . ' should be imported into a content classes\' fields with the use of CSS selectors.'
        . ' Where more than one schema exists for a field, they\'ll be processed in the order of Priority:'
        . ' The first Schema to match a URI Pattern will be the one used for that field.</p>'
    ));

    if ($importRules !== null) {
        $fields->addFieldToTab("Root.Main", $importRules);
    }

    // Read the crawl state once; it drives the button label, the status field
    // and whether the crawled-URL list is shown.
    $urlList = $this->urlList();
    $spiderStatus = $urlList->getSpiderStatus();

    switch ($spiderStatus) {
        case StaticSiteUrlList::CRAWL_STATUS_NOTSTARTED:
            $crawlButtonText = _t('StaticSiteContentSource.CRAWL_SITE', 'Crawl');
            break;
        case StaticSiteUrlList::CRAWL_STATUS_PARTIAL:
            $crawlButtonText = _t('StaticSiteContentSource.RESUME_CRAWLING', 'Resume Crawl');
            break;
        case StaticSiteUrlList::CRAWL_STATUS_COMPLETE:
            $crawlButtonText = _t('StaticSiteContentSource.RECRAWL_SITE', 'Re-Crawl');
            break;
        default:
            throw new \LogicException("Invalid getSpiderStatus() value '" . $spiderStatus . "'");
    }

    $crawlButton = FormAction::create('crawlsite', $crawlButtonText)
        ->setAttribute('data-icon', 'arrow-circle-double')
        ->setUseButtonTag(true)
        ->addExtraClass('btn action btn btn-primary tool-button font-icon-plus');
    $crawlMsg = '';

    // Disable crawl-button if assets dir isn't writable
    // TODO this will need to change if change the default location of crawl data. Like _why_ is it in assets?
    if (!file_exists(ASSETS_PATH) || !is_writable(ASSETS_PATH)) {
        $crawlMsg = '<p class="message warning">Warning: Assets directory is not writable.</p>';
        $crawlButton->setDisabled(true);
    }

    $fields->addFieldsToTab('Root.Crawl', [
        ReadonlyField::create("CrawlStatus", "Crawl Status", $spiderStatus),
        ReadonlyField::create("NumURIs", "Number of URIs Crawled", $urlList->getNumURIs()),
        // The warning (if any) is already a complete paragraph, so it is simply
        // placed ahead of the toolbar. The (disabled) button is always rendered.
        LiteralField::create(
            'CrawlActions',
            $crawlMsg . '<div class="btn-toolbar">' . $crawlButton->forTemplate() . '</div>'
        ),
    ]);

    // Because we can't pass arrays to FieldList::insertBefore
    $crawlIntroFields = [
        HeaderField::create('CrawlHeading', 'Source Site Crawling'),
        LiteralField::create('CrawlIntro', ''
            . '<p class="message notice">'
            . 'Before you can load any content into Silverstripe, all source URLs must first be crawled.'
            . ' Select the button below to start or resume a crawl as applicable.'
            . '</p>'),
    ];

    foreach ($crawlIntroFields as $introField) {
        $fields->insertBefore('CrawlStatus', $introField);
    }

    /*
     * @todo use customise() and arrange this using an includes .ss template fragment
     */
    if ($spiderStatus == StaticSiteUrlList::CRAWL_STATUS_COMPLETE) {
        // Two fields are being added, so the plural addFieldsToTab() is required;
        // addFieldToTab() would treat the second field as its "insert before" argument.
        $fields->addFieldsToTab('Root.Crawl', [
            LiteralField::create(
                'CrawlURLListUIntro',
                '<p class="message notice">Review the list of crawled URIs below. When you\'re happy with the import'
                . ' you can proceed to the "Import" tab and follow the instructions there.</p>'
            ),
            LiteralField::create('CrawlURLList', $this->listofCrawledItems()),
        ]);
    }

    $fields->dataFieldByName("ExtraCrawlUrls")
        ->setDescription("Add URIs that are not reachable via links when content scraping, eg: '/about/team'. One per line")
        ->setTitle('Additional URIs');
    $fields->dataFieldByName("UrlExcludePatterns")
        ->setDescription("URLs that should be excluded. (Supports regular expressions e.g. '/about/.*'). One per line")
        ->setTitle('Excluded URLs');

    // Build the "import ID => label" map offered in the clear-imports listbox
    $hasImports = DataObject::get(StaticSiteImportDataObject::class);
    $importOptions = [];

    foreach ($hasImports as $import) {
        $date = DBField::create_field(DBDatetime::class, $import->Created)->Time24();
        $importOptions[$import->ID] = $date . ' (Import #' . $import->ID . ')';
    }

    $fields->addFieldsToTab('Root.Import', [
        HeaderField::create('ImportHeading', 'Source Site Import'),
        LiteralField::create('ImportIntro', ''
            . '<p class="message notice">'
            . 'Use this area to configure where in the current IA imported page content should appear.'
            . ' The same goes for imported files and images.'
            . '</p>'),
    ]);

    // The clear-imports controls are only shown once at least one import exists
    if ($importCount = $hasImports->count()) {
        $clearImportButton = FormAction::create('clearimports', 'Clear selected imports')
            ->setAttribute('data-icon', 'arrow-circle-double')
            ->addExtraClass('btn action btn btn-primary tool-button font-icon-plus')
            ->setUseButtonTag(true);

        $clearImportField = ToggleCompositeField::create('ClearImports', 'Clear Import Metadata', [
            LiteralField::create('ImportCountText', '<p>Each time an import is run, some meta information is stored such as an import identifier and failed-link records.<br/><br/></p>'),
            LiteralField::create('ImportCount', '<p>Total imports: ' . $importCount . '</p>'),
            ListboxField::create('ShowImports', 'Select import(s) to clear:', $importOptions, '', null, true),
            CheckboxField::create('ClearAllImports', 'Clear all import meta-data', 0),
            LiteralField::create('ImportActions', '<div class="btn-toolbar">' . $clearImportButton->forTemplate() . '</div>'),
        ])->addExtraClass('clear-imports');

        $fields->addFieldToTab('Root.Import', $clearImportField);
    }

    $fields->addFieldsToTab('Root.Environment', [
        HeaderField::create('EnvHeading', 'Webserver Environment'),
        LiteralField::create('EnvIntro', ''
            . '<p class="message notice">'
            . 'Refer to this area for information related to the PHP and Webserver environment'
            . ' which may affect the proper function and performance of this tool.'
            . '</p>'),
        // PHP_VERSION is the engine constant; SERVER_SOFTWARE is absent outside a
        // webserver context, and memory_limit is shown verbatim because it may
        // carry a K/M/G suffix or be -1.
        LiteralField::create('EnvInfo', ''
            . '<ul>'
            . '<li>PHP Info: ' . PHP_VERSION . '</li>'
            . '<li>Webserver Info: ' . ($_SERVER['SERVER_SOFTWARE'] ?? 'Unknown') . '</li>'
            . '<li>max_execution_time: ' . sprintf('%s seconds', ini_get('max_execution_time')) . '</li>'
            . '<li>memory_limit: ' . ini_get('memory_limit') . '</li>'
            . '</ul>'),
    ]);

    return $fields;
}
||
354 | |||
/**
 * After a write, re-runs URL processing over an already-crawled URL list when
 * the "UrlProcessor" field was changed, so that stored URLs reflect the newly
 * selected processor (or no processor at all).
 *
 * @return void
 */
public function onAfterWrite()
{
    parent::onAfterWrite();

    $list = $this->urlList();

    // Nothing to do unless the processor changed AND there is crawl data to redo
    if (!$this->isChanged('UrlProcessor') || !$list->hasCrawled()) {
        return;
    }

    $processorClass = $this->UrlProcessor;
    $list->setUrlProcessor($processorClass ? $processorClass::create() : null);
    $list->reprocessUrls();
}
||
376 | |||
/**
 * Returns the StaticSiteUrlList for this profile, building and configuring it
 * on first use and re-using the stored instance afterwards.
 *
 * On creation the list is pointed at ASSETS_PATH/<cacheDir> and is given, when
 * set on this record: the selected URL processor, the extra crawl URLs and the
 * URL exclusion patterns (both split on whitespace).
 *
 * @return StaticSiteUrlList
 */
public function urlList()
{
    if ($this->urlList) {
        return $this->urlList;
    }

    $list = StaticSiteUrlList::create($this, ASSETS_PATH . "/{$this->cacheDir}");
    $this->urlList = $list;

    $processorClass = $this->UrlProcessor;
    if ($processorClass) {
        $list->setUrlProcessor($processorClass::create());
    }

    if ($this->ExtraCrawlUrls) {
        $list->setExtraCrawlUrls(
            preg_split('/\s+/', trim($this->ExtraCrawlUrls))
        );
    }

    if ($this->UrlExcludePatterns) {
        $list->setExcludePatterns(
            preg_split('/\s+/', trim($this->UrlExcludePatterns))
        );
    }

    return $this->urlList;
}
||
403 | |||
/**
 * Starts a crawl of the source site by delegating to the URL list.
 *
 * @param boolean $limit   Forwarded unchanged to StaticSiteUrlList::crawl()
 * @param boolean $verbose Forwarded unchanged to StaticSiteUrlList::crawl()
 * @return StaticSiteCrawler
 * @throws \LogicException If no "Base URL" has been set on this profile.
 */
public function crawl($limit = false, $verbose = false)
{
    if ($this->BaseUrl) {
        return $this->urlList()->crawl($limit, $verbose);
    }

    throw new \LogicException('Can\'t crawl a site until the "Base URL" field is set.');
}
||
420 | |||
/**
 * Picks the first schema, in ascending "Order", that applies to the given URL
 * and, when a Mime-Type is supplied, accepts that Mime-Type.
 *
 * Each schema's declared Mime-Types are extended with the configured
 * "undefined_mime_type" before matching. Every schema inspected is logged.
 *
 * @param string $absoluteURL
 * @param string $mimeType (Optional)
 * @return mixed The matching StaticSiteContentSourceImportSchema, or boolean false when none match
 */
public function getSchemaForURL($absoluteURL, $mimeType = null)
{
    $mimeType = StaticSiteMimeProcessor::cleanse($mimeType);

    // Ensure the "Order" (Priority) setting is respected
    $orderedSchemas = $this->Schemas()->sort('Order');

    foreach ($orderedSchemas as $index => $candidate) {
        $urlMatches = $this->schemaCanParseURL($candidate, $absoluteURL);
        $acceptedMimeTypes = StaticSiteMimeProcessor::get_mimetypes_from_text($candidate->MimeTypes);
        $mimeTypesForLog = implode(', ', $acceptedMimeTypes);

        $this->utils->log(' - Schema: ' . ($index + 1) . ', DataType: ' . $candidate->DataType . ', AppliesTo: ' . $candidate->AppliesTo . ' mimetypes: ' . $mimeTypesForLog);

        array_push($acceptedMimeTypes, StaticSiteUrlList::config()->get('undefined_mime_type'));

        if (!$urlMatches) {
            continue;
        }

        $mimeTypeRejected = $mimeType
            && $acceptedMimeTypes
            && !in_array($mimeType, $acceptedMimeTypes);

        if (!$mimeTypeRejected) {
            return $candidate;
        }
    }

    return false;
}
||
453 | |||
/**
 * Tests whether a schema's "AppliesTo" pattern matches the given URL.
 *
 * The base URL (minus any trailing slash) is stripped from $url and the
 * remainder is matched, case-insensitively and anchored at the start, against
 * the pattern. An empty "AppliesTo" falls back to the schema's
 * "default_applies_to" config value. Successful matches are logged.
 *
 * @param StaticSiteContentSourceImportSchema $schema
 * @param string $url
 * @return boolean
 */
public function schemaCanParseURL(StaticSiteContentSourceImportSchema $schema, $url)
{
    // Cast so that an unset (null) field is handled without handing null to
    // string functions, which is deprecated as of PHP 8.1.
    $appliesTo = (string) $schema->AppliesTo;
    if ($appliesTo === '') {
        $appliesTo = (string) $schema::config()->get('default_applies_to');
    }

    // Use (escaped) pipes for delimeters as pipes themselves are unlikely to appear in legit URLs
    $appliesTo = str_replace('|', '\|', $appliesTo);
    $urlToTest = str_replace(rtrim((string) $this->BaseUrl, '/'), '', $url);

    if (preg_match("|^$appliesTo|i", $urlToTest)) {
        $this->utils->log(' - ' . __FUNCTION__ . ' matched: ' . $appliesTo . ', Url: ' . $url);
        return true;
    }

    return false;
}
||
479 | |||
/**
 * Returns a StaticSiteContentItem for the given URL.
 * Relative URLs are used as the unique identifiers by this importer.
 *
 * An $id that does not start with "/" is first passed through decodeId(); if
 * the decoded value still does not start with "/", it is rejected.
 *
 * @param string $id The URL, relative to BaseURL, starting with "/" (or an encoded form of it).
 * @return StaticSiteContentItem
 * @throws \InvalidArgumentException If $id does not start with "/" even after decoding.
 */
public function getObject($id)
{
    // strncmp() is used rather than $id[0] so that an empty or non-string id
    // reaches the exception below instead of raising an offset warning.
    $id = (string) $id;

    if (strncmp($id, '/', 1) !== 0) {
        $id = (string) $this->decodeId($id);

        if (strncmp($id, '/', 1) !== 0) {
            throw new \InvalidArgumentException("\$id must start with /");
        }
    }

    return StaticSiteContentItem::create($this, $id);
}
||
498 | |||
/**
 * Returns the content item for the site root, i.e. the item with ID "/".
 *
 * @return StaticSiteContentItem
 */
public function getRoot()
{
    $rootItem = $this->getObject('/');

    return $rootItem;
}
||
507 | |||
/**
 * Signals external-content module that we wish to operate on `SiteTree` and `File` objects.
 *
 * @return array Map of target name => true, for the 'sitetree' and 'file' targets
 */
public function allowedImportTargets()
{
    return array_fill_keys(['sitetree', 'file'], true);
}
||
520 | |||
/**
 * Returns this source's direct children: the root node once the site has been
 * crawled, otherwise nothing.
 *
 * @param boolean $showAll Not used by this implementation
 * @return ArrayList A list containing the root node, or an empty list before any crawl
 */
public function stageChildren($showAll = false)
{
    $children = $this->urlList()->hasCrawled()
        ? [$this->getObject("/")]
        : [];

    return ArrayList::create($children);
}
||
537 | |||
/**
 * Supplies the importer used for this content source. The same importer type
 * is returned whatever the target.
 *
 * @param mixed $target Not used by this implementation
 * @return StaticSiteImporter
 */
public function getContentImporter($target = null)
{
    $importer = StaticSiteImporter::create();

    return $importer;
}
||
547 | |||
/**
 * A profile is considered valid as soon as its "BaseUrl" field holds a truthy value.
 *
 * @return boolean
 */
public function isValid()
{
    return $this->BaseUrl ? true : false;
}
||
556 | |||
/**
 * Importing is permitted whenever the profile is valid; neither argument is consulted.
 *
 * @param Member $member Not used by this implementation
 * @param array $context Not used by this implementation
 * @return boolean
 */
public function canImport($member = null, $context = [])
{
    $importable = $this->isValid();

    return $importable;
}
||
567 | |||
568 | /** |
||
569 | * |
||
570 | * @param Member $member |
||
571 | * @param array $context |
||
572 | * @return boolean |
||
573 | */ |
||
574 | public function canCreate($member = null, $context = []) |
||
577 | } |
||
578 | } |
||
579 |