Total Complexity | 49 |
Total Lines | 517 |
Duplicated Lines | 0 % |
Changes | 2 | ||
Bugs | 0 | Features | 0 |
Complex classes like StaticSiteContentSource often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use StaticSiteContentSource, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
43 | class StaticSiteContentSource extends ExternalContentSource |
||
44 | { |
||
45 | /** |
||
46 | * @var string Cache directory name template; the constructor replaces its trailing digits with the record ID. |
||
47 | */ |
||
48 | public const CACHE_DIR_PREFIX = 'static-site-0'; // Default (The zero-suffix is used by test-suite) |
||
49 | |||
50 | /** |
||
51 | * @var string |
||
52 | */ |
||
53 | private static $table_name = 'StaticSiteContentSource'; |
||
|
|||
54 | |||
55 | /** |
||
56 | * @var string Singular name of this record type. |
||
57 | */ |
||
58 | private static $singular_name = 'Migration Profile'; |
||
59 | |||
60 | /** |
||
61 | * @var string Plural name of this record type. |
||
62 | */ |
||
63 | private static $plural_name = 'Migration Profiles'; |
||
64 | |||
65 | /** |
||
66 | * Map of field name to the DB field class that stores it. |
||
67 | * @var array |
||
68 | */ |
||
69 | private static $db = [ |
||
70 | 'BaseUrl' => DBVarchar::class, |
||
71 | 'UrlProcessor' => DBVarchar::class, |
||
72 | 'ExtraCrawlUrls' => DBText::class, |
||
73 | 'UrlExcludePatterns' => DBText::class, |
||
74 | 'ParseCSS' => DBBoolean::class, |
||
75 | 'AutoRunTask' => DBBoolean::class, |
||
76 | ]; |
||
77 | |||
78 | /** |
||
79 | * Map of relation name to the related class. |
||
80 | * @var array |
||
81 | */ |
||
82 | private static $has_many = [ |
||
83 | "Schemas" => StaticSiteContentSourceImportSchema::class, |
||
84 | "Pages" => SiteTree::class, |
||
85 | "Files" => File::class, |
||
86 | ]; |
||
87 | |||
88 | /** |
||
89 | * Dotted "StaticSiteContentSourceImportSchema.<column>" names; presumably the columns offered for export - TODO confirm. |
||
90 | * @var array |
||
91 | */ |
||
92 | private static $export_columns = [ |
||
93 | "StaticSiteContentSourceImportSchema.DataType", |
||
94 | "StaticSiteContentSourceImportSchema.Order", |
||
95 | "StaticSiteContentSourceImportSchema.AppliesTo", |
||
96 | "StaticSiteContentSourceImportSchema.MimeTypes", |
||
97 | ]; |
||
98 | |||
99 | /** |
||
100 | * Not assigned anywhere in this class; presumably set by calling logic - TODO confirm. |
||
101 | * @var string|null |
||
102 | */ |
||
103 | public $absoluteURL = null; |
||
104 | |||
105 | /** |
||
106 | * Where do we store our items for caching? |
||
107 | * Also used by calling logic. Assigned in the constructor from CACHE_DIR_PREFIX and the record ID. |
||
108 | * |
||
109 | * @var string |
||
110 | */ |
||
111 | public $cacheDir = null; |
||
112 | |||
113 | /** |
||
114 | * Holds the StaticSiteUtils object on construct |
||
115 | * |
||
116 | * @var StaticSiteUtils $utils |
||
117 | */ |
||
118 | protected $utils; |
||
119 | |||
/**
 * Builds the record through the parent constructor, then derives the per-record
 * cache directory name and obtains the StaticSiteUtils singleton.
 *
 * The cache directory name is CACHE_DIR_PREFIX with its trailing digits replaced
 * by this record's ID.
 *
 * @param array|null $record This will be null for a new database record.
 * @param bool $isSingleton
 * @param DataModel $model
 */
public function __construct($record = null, $isSingleton = false, $model = null)
{
    parent::__construct($record, $isSingleton, $model);

    $trailingDigits = '#[0-9]+$#';
    $cacheDirName = preg_replace($trailingDigits, $this->ID, self::CACHE_DIR_PREFIX);

    $this->cacheDir = $cacheDirName;
    $this->utils = singleton(StaticSiteUtils::class);
}
||
133 | |||
/**
 * Template method used to display the results of a successful crawl in the central
 * column of the CMS.
 *
 * Returns an empty string unless the crawl status is "complete". Otherwise returns a
 * <ul> holding one <li> per unique processed URL; where URL processing changed the
 * URL, the original is appended as "(was: ...)".
 *
 * The URLs were harvested from an external site, so they are HTML-escaped before
 * being placed into the markup.
 *
 * @return string HTML fragment, or '' when the crawl has not completed.
 */
public function listofCrawledItems(): string
{
    $list = $this->urlList();

    if ($list->getSpiderStatus() !== StaticSiteUrlList::CRAWL_STATUS_COMPLETE) {
        return '';
    }

    $escape = static function ($url): string {
        return htmlspecialchars((string) $url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };

    $items = '';

    foreach (array_unique($list->getProcessedURLs()) as $raw => $processed) {
        // PHP casts integer-like array keys to int, so compare both sides as strings.
        if ((string) $raw !== (string) $processed) {
            $items .= '<li>' . sprintf('%s (was: %s)', $escape($processed), $escape($raw)) . '</li>';
        } else {
            $items .= '<li>' . $escape($processed) . '</li>';
        }
    }

    return '<ul>' . $items . '</ul>';
}
||
159 | |||
/**
 * Builds the CMS form for a migration profile.
 *
 * - "Root.Main" (retitled "Profile"): name, base URL, URL processor, CSS/link-rewrite
 *   options and the import-schema grid.
 * - "Root.Crawl": crawl status, URI count, the crawl button and, once a crawl has
 *   completed, the list of crawled URIs.
 * - "Root.Import": intro text plus, when import records exist, a panel for clearing
 *   import metadata.
 *
 * @return FieldList
 * @throws \LogicException If the URL list reports an unrecognised crawl status.
 */
public function getCMSFields()
{
    $fields = parent::getCMSFields();

    $fields->removeFieldsFromTab('Root', [
        'Pages',
        'Files',
        'ShowContentInMenu',
        'Name'
    ]);

    // Because we can't pass arrays to FieldList::insertBefore
    foreach ([
        HeaderField::create('ProfileHeading', 'Migration Profile Configuration'),
        LiteralField::create('ProfileIntro', ''
            . '<p class="message notice">'
            . 'This where the basics of your migration profile are configured.'
            . '</p>'
        )] as $introField) {
        $fields->insertBefore('BaseUrl', $introField);
    }

    // Processing Options
    $processingOptions = ['' => "No Processing"];

    foreach (ClassInfo::implementorsOf(StaticSiteUrlProcessor::class) as $processor) {
        $processorObj = singleton($processor);
        $processingOptions[$processor] = $processorObj->getName();
    }

    $fields->addFieldsToTab(
        'Root.Main',
        [
            TextField::create("BaseUrl", "Base URL")
                ->setDescription('The base URL of the site to be crawled and imported.'),
            DropdownField::create("UrlProcessor", "URL Transformation", $processingOptions)
                ->setDescription('Select the way in which crawled URLs should be transformed and cleaned-up.'),
            CheckboxField::create("ParseCSS", "Fetch external CSS")
                ->setDescription("Fetch images defined as CSS <strong>background-image</strong> which are not ordinarily reachable by crawling alone."),
            CheckboxField::create("AutoRunTask", "Automatically rewrite links into Silverstripe-aware links")
                ->setDescription("This will run a link-rewrite task automatically once an import has completed.")
        ]
    );
    $fields->fieldByName('Root.Main')->setTitle('Profile');
    $fields->insertBefore(
        'BaseUrl',
        TextField::create('Name', 'Name')
            ->setDescription('Allows you to differentiate between profiles.')
    );

    // Schema Gridfield
    $fields->addFieldToTab('Root.Main', HeaderField::create('ImportConfigHeader', 'Import Schema Configuration'));
    $addNewButton = (new GridFieldAddNewButton('before'))->setButtonName("Add Schema");
    $importRules = $fields->dataFieldByName('Schemas');
    $importRules->getConfig()->removeComponentsByType(GridFieldAddNewButton::class);
    $importRules->getConfig()->addComponent($addNewButton);
    $fields->removeFieldFromTab("Root", "Schemas");
    $fields->addFieldToTab('Root.Main', LiteralField::create(
        'SchemaIntro',
        ''
        . '<p class="message notice">Schema map MIME-Types to Silverstripe content classes and'
        . ' are related to one or more Import Rules. Each rule determines how content located at crawled URLs'
        . ' should be imported into a content classes\' fields with the use of CSS selectors.'
        . ' Where more than one schema exists for a field, they\'ll be processed in the order of Priority:'
        . ' The first Schema to match a URI Pattern will be the one used for that field.</p>'
    ));
    $fields->addFieldToTab("Root.Main", $importRules);

    // Read the crawl status once; it drives the button label, the status field and
    // whether the crawled-URI list is shown.
    $urlList = $this->urlList();
    $spiderStatus = $urlList->getSpiderStatus();

    switch ($spiderStatus) {
        case StaticSiteUrlList::CRAWL_STATUS_NOTSTARTED:
            $crawlButtonText = _t('StaticSiteContentSource.CRAWL_SITE', 'Crawl');
            break;
        case StaticSiteUrlList::CRAWL_STATUS_PARTIAL:
            $crawlButtonText = _t('StaticSiteContentSource.RESUME_CRAWLING', 'Resume Crawl');
            break;
        case StaticSiteUrlList::CRAWL_STATUS_COMPLETE:
            $crawlButtonText = _t('StaticSiteContentSource.RECRAWL_SITE', 'Re-Crawl');
            break;
        default:
            throw new \LogicException("Invalid getSpiderStatus() value '" . $spiderStatus . "'");
    }

    $crawlButton = FormAction::create('crawlsite', $crawlButtonText)
        ->setAttribute('data-icon', 'arrow-circle-double')
        ->setUseButtonTag(true)
        ->addExtraClass('btn action btn btn-primary tool-button font-icon-plus');
    $crawlMsg = '';

    // Disable crawl-button if assets dir isn't writable
    // TODO this will need to change if change the default location of crawl data. Like _why_ is it in assets?
    if (!file_exists(ASSETS_PATH) || !is_writable(ASSETS_PATH)) {
        $crawlMsg = '<p class="message warning">Warning: Assets directory is not writable.</p>';
        $crawlButton->setDisabled(true);
    }

    $fields->addFieldsToTab('Root.Crawl', [
        ReadonlyField::create("CrawlStatus", "Crawl Status", $spiderStatus),
        ReadonlyField::create("NumURIs", "Number of URIs Crawled", $urlList->getNumURIs()),
        LiteralField::create(
            'CrawlActions',
            // The warning (already a complete <p>, or '') is followed by the toolbar in
            // every case. Previously "?:" bound looser than ".", so the (disabled) button
            // vanished whenever the warning was shown.
            $crawlMsg . '<div class="btn-toolbar">' . $crawlButton->forTemplate() . '</div>'
        )
    ]);

    // Because we can't pass arrays to FieldList::insertBefore
    foreach ([
        HeaderField::create('CrawlHeading', 'Source Site Crawling'),
        LiteralField::create('CrawlIntro', ''
            . '<p class="message notice">'
            . 'Before you can load any content into Silverstripe, all source URLs must first be crawled.'
            . ' Select the button below to start or resume a crawl as applicable.'
            . '</p>'
        )] as $introField) {
        $fields->insertBefore('CrawlStatus', $introField);
    }

    /*
     * @todo use customise() and arrange this using an includes .ss template fragment
     */
    if ($spiderStatus === StaticSiteUrlList::CRAWL_STATUS_COMPLETE) {
        // addFieldsToTab() takes the list of fields; addFieldToTab()'s third argument is
        // the name of a field to insert before, not a second field.
        $fields->addFieldsToTab('Root.Crawl', [
            LiteralField::create(
                'CrawlURLListUIntro',
                '<p class="message notice">Review the list of crawled URIs below. When you\'re happy with the import'
                . ' you can proceed to the "Import" tab and follow the instructions there.</p>'
            ),
            LiteralField::create('CrawlURLList', $this->listofCrawledItems())
        ]);
    }

    $fields->dataFieldByName("ExtraCrawlUrls")
        ->setDescription("Add URIs that are not reachable via links when content scraping, eg: '/about/team'. One per line")
        ->setTitle('Additional URIs');
    $fields->dataFieldByName("UrlExcludePatterns")
        ->setDescription("URLs that should be excluded. (Supports regular expressions e.g. '/about/.*'). One per line")
        ->setTitle('Excluded URLs');

    // Build the "ID => label" options for the clear-imports listbox.
    $hasImports = DataObject::get(StaticSiteImportDataObject::class);
    $_source = [];

    foreach ($hasImports as $import) {
        $date = DBField::create_field(DBDatetime::class, $import->Created)->Time24();
        $_source[$import->ID] = $date . ' (Import #' . $import->ID . ')';
    }

    $fields->addFieldsToTab('Root.Import', [
        HeaderField::create('ImportHeading', 'Source Site Import'),
        LiteralField::create('ImportIntro', ''
            . '<p class="message notice">'
            . 'Use this area to configure where in the current IA imported page content should appear.'
            . ' The same goes for imported files and images.'
            . '</p>'
        )]);

    if ($importCount = $hasImports->count()) {
        $clearImportButton = FormAction::create('clearimports', 'Clear selected imports')
            ->setAttribute('data-icon', 'arrow-circle-double')
            ->addExtraClass('btn action btn btn-primary tool-button font-icon-plus')
            ->setUseButtonTag(true);

        $clearImportField = ToggleCompositeField::create('ClearImports', 'Clear Import Metadata', [
            LiteralField::create('ImportCountText', '<p>Each time an import is run, some meta information is stored such as an import identifier and failed-link records.<br/><br/></p>'),
            LiteralField::create('ImportCount', '<p>Total imports: ' . $importCount . '</p>'),
            ListboxField::create('ShowImports', 'Select import(s) to clear:', $_source, '', null, true),
            CheckboxField::create('ClearAllImports', 'Clear all import meta-data', 0),
            LiteralField::create('ImportActions', '<div class="btn-toolbar">' . $clearImportButton->forTemplate() . '</div>')
        ])->addExtraClass('clear-imports');

        $fields->addFieldToTab('Root.Import', $clearImportField);
    }

    return $fields;
}
||
337 | |||
/**
 * After a write, re-runs URL processing when the "UrlProcessor" field changed and the
 * site has already been crawled, so stored URLs reflect the newly selected processor
 * (or no processor at all when the field was cleared).
 *
 * @return void
 */
public function onAfterWrite()
{
    parent::onAfterWrite();

    $crawledUrls = $this->urlList();

    // Nothing to redo unless the processor changed on an already-crawled site.
    if (!$this->isChanged('UrlProcessor') || !$crawledUrls->hasCrawled()) {
        return;
    }

    $processorClass = $this->UrlProcessor;
    $crawledUrls->setUrlProcessor($processorClass ? $processorClass::create() : null);
    $crawledUrls->reprocessUrls();
}
||
359 | |||
/**
 * Returns the StaticSiteUrlList for this profile, creating and configuring it on
 * first use.
 *
 * The list is rooted at ASSETS_PATH/<cacheDir>. When set on this record, the URL
 * processor, the extra crawl URLs and the exclude patterns are applied to it; the
 * latter two are split on whitespace.
 *
 * @return StaticSiteUrlList
 */
public function urlList()
{
    if ($this->urlList) {
        return $this->urlList;
    }

    $this->urlList = StaticSiteUrlList::create($this, ASSETS_PATH . "/{$this->cacheDir}");

    $processorClass = $this->UrlProcessor;
    if ($processorClass) {
        $this->urlList->setUrlProcessor($processorClass::create());
    }

    if ($this->ExtraCrawlUrls) {
        $this->urlList->setExtraCrawlUrls(
            preg_split('/\s+/', trim($this->ExtraCrawlUrls))
        );
    }

    if ($this->UrlExcludePatterns) {
        $this->urlList->setExcludePatterns(
            preg_split('/\s+/', trim($this->UrlExcludePatterns))
        );
    }

    return $this->urlList;
}
||
386 | |||
/**
 * Crawls the target site by delegating to the URL list.
 *
 * @param boolean $limit   Passed straight through to StaticSiteUrlList::crawl().
 * @param boolean $verbose Passed straight through to StaticSiteUrlList::crawl().
 * @return StaticSiteCrawler
 * @throws \LogicException When no Base URL has been set on this profile.
 */
public function crawl($limit = false, $verbose = false)
{
    if ($this->BaseUrl) {
        return $this->urlList()->crawl($limit, $verbose);
    }

    throw new \LogicException('Can\'t crawl a site until the "Base URL" field is set.');
}
||
403 | |||
/**
 * Finds the first schema, in "Order" (priority) sequence, that applies to a URL and,
 * when one is given, to a Mime-Type.
 *
 * Every schema examined is logged. A schema whose AppliesTo pattern matches the URL is
 * returned unless a Mime-Type was supplied and it is neither one of the schema's
 * Mime-Types nor the configured "undefined" Mime-Type.
 *
 * @param string $absoluteURL
 * @param string $mimeType (Optional)
 * @return mixed StaticSiteContentSourceImportSchema $schema or boolean false if no schema matches are found
 */
public function getSchemaForURL($absoluteURL, $mimeType = null)
{
    $mimeType = StaticSiteMimeProcessor::cleanse($mimeType);

    // Sorting by "Order" makes the Priority setting decide which schema wins.
    foreach ($this->Schemas()->sort('Order') as $position => $candidate) {
        $urlMatches = $this->schemaCanParseURL($candidate, $absoluteURL);
        $candidateMimes = StaticSiteMimeProcessor::get_mimetypes_from_text($candidate->MimeTypes);

        $this->utils->log(
            ' - Schema: ' . ($position + 1)
            . ', DataType: ' . $candidate->DataType
            . ', AppliesTo: ' . $candidate->AppliesTo
            . ' mimetypes: ' . implode(', ', $candidateMimes)
        );

        // The "undefined" Mime-Type is always acceptable.
        array_push($candidateMimes, StaticSiteUrlList::config()->get('undefined_mime_type'));

        if (!$urlMatches) {
            continue;
        }

        $mimeRejected = $mimeType && $candidateMimes && !in_array($mimeType, $candidateMimes);
        if (!$mimeRejected) {
            return $candidate;
        }
    }

    return false;
}
||
436 | |||
/**
 * Tests whether a schema's AppliesTo pattern matches a URL, as used for each schema
 * visited by getSchemaForURL().
 *
 * An empty AppliesTo falls back to the schema's "default_applies_to" config value.
 * The Base URL (minus any trailing slash) is removed from the URL, and the pattern is
 * then matched case-insensitively, anchored at the start of what remains. A successful
 * match is logged.
 *
 * @param StaticSiteContentSourceImportSchema $schema
 * @param string $url
 * @return boolean
 */
public function schemaCanParseURL(StaticSiteContentSourceImportSchema $schema, $url)
{
    // Cast before the string functions: an unset field is null, and passing null to
    // strlen()/rtrim()/str_replace() is deprecated as of PHP 8.1.
    $appliesTo = (string) $schema->AppliesTo;
    if ($appliesTo === '') {
        $appliesTo = (string) $schema::config()->get('default_applies_to');
    }

    // Use (escaped) pipes for delimiters as pipes themselves are unlikely to appear in legit URLs
    $appliesTo = str_replace('|', '\|', $appliesTo);
    $urlToTest = str_replace(rtrim((string) $this->BaseUrl, '/'), '', (string) $url);

    if (!preg_match("|^$appliesTo|i", $urlToTest)) {
        return false;
    }

    $this->utils->log(' - ' . __FUNCTION__ . ' matched: ' . $appliesTo . ', Url: ' . $url);

    return true;
}
||
462 | |||
/**
 * Returns a StaticSiteContentItem for the given URL.
 * Relative URLs are used as the unique identifiers by this importer.
 *
 * An ID that does not start with "/" is passed through decodeId() first; if the
 * result still does not start with "/", the ID is rejected.
 *
 * @param string $id The URL, relative to BaseURL, starting with "/".
 * @return StaticSiteContentItem
 * @throws \InvalidArgumentException If neither $id nor its decoded form starts with "/".
 */
public function getObject($id)
{
    // isset() guards the offset: indexing an empty string raises an
    // "Uninitialized string offset" warning.
    $startsWithSlash = static function ($candidate): bool {
        return isset($candidate[0]) && $candidate[0] === '/';
    };

    if (!$startsWithSlash($id)) {
        $id = $this->decodeId($id);

        if (!$startsWithSlash($id)) {
            throw new \InvalidArgumentException("\$id must start with /");
        }
    }

    return StaticSiteContentItem::create($this, $id);
}
||
481 | |||
/**
 * Returns the content item for the root URL "/".
 *
 * @return StaticSiteContentItem
 */
public function getRoot()
{
    $rootItem = $this->getObject('/');

    return $rootItem;
}
||
490 | |||
/**
 * Signals external-content module that we wish to operate on `SiteTree` and `File` objects.
 *
 * @return array Map with the keys "sitetree" and "file", both set to true.
 */
public function allowedImportTargets()
{
    return array_fill_keys(['sitetree', 'file'], true);
}
||
503 | |||
/**
 * Returns the root node as the only child, or an empty list when the site has not
 * been crawled yet.
 *
 * @param boolean $showAll Not used by this implementation.
 * @return ArrayList A list containing the root node
 */
public function stageChildren($showAll = false)
{
    $children = $this->urlList()->hasCrawled()
        ? [$this->getObject("/")]
        : [];

    return ArrayList::create($children);
}
||
520 | |||
/**
 * Creates the importer for this content source.
 *
 * @param mixed $target Not used; a StaticSiteImporter is returned for every target.
 * @return StaticSiteImporter
 */
public function getContentImporter($target = null)
{
    $importer = StaticSiteImporter::create();

    return $importer;
}
||
530 | |||
/**
 * A profile is valid once its Base URL holds a truthy value.
 *
 * @return boolean
 */
public function isValid()
{
    return $this->BaseUrl ? true : false;
}
||
539 | |||
/**
 * Importing is permitted whenever the profile is valid; neither argument is consulted.
 *
 * @param Member $member
 * @param array $context
 * @return boolean
 */
public function canImport($member = null, $context = [])
{
    $profileIsValid = $this->isValid();

    return $profileIsValid;
}
||
550 | |||
551 | /** |
||
552 | * |
||
553 | * @param Member $member |
||
554 | * @param array $context |
||
555 | * @return boolean |
||
556 | */ |
||
557 | public function canCreate($member = null, $context = []) |
||
560 | } |
||
561 | } |
||
562 |