StaticSiteCrawler::handleDocumentInfo() (rated A)

Complexity

    Conditions   4
    Paths        4

Size

    Total Lines  28
    Code Lines   10

Duplication

    Lines        0
    Ratio        0 %

Importance

    Changes      1
    Bugs         0
    Features     0

Raw metrics (these appear to map onto the figures above: loc/eloc = Total/Code Lines, cc/nc = Conditions/Paths, c/b/f = Changes/Bugs/Features, dl = duplicated lines, nop = parameter count, with rs seemingly the rating score behind the "A" grade):

    Metric   Value
    eloc     10
    c        1
    b        0
    f        0
    dl       0
    loc      28
    rs       9.9332
    cc       4
    nc       4
    nop      1
<?php

namespace PhpTek\Exodus\Crawl;

use PHPCrawl\PHPCrawler;
use PHPCrawl\PHPCrawlerDocumentInfo;
use PHPCrawl\PHPCrawlerURLDescriptor;
use SilverStripe\Core\Injector\Injectable;
use SilverStripe\Core\Config\Configurable;
use PhpTek\Exodus\Tool\StaticSiteUrlList;
use PhpTek\Exodus\Tool\StaticSiteUtils;

/**
 * Extends PHPCrawler, essentially to override its handleDocumentInfo() method.
 *
 * @see {@link PHPCrawler}
 */
class StaticSiteCrawler extends PHPCrawler
{
    use Injectable;
    use Configurable;

    /**
     * @var StaticSiteUrlList
     */
    protected $urlList;

    /**
     * @var boolean
     */
    protected $verbose = false;

    /**
     * Holds the StaticSiteUtils object, set on construct.
     *
     * @var StaticSiteUtils
     */
    protected $utils;

    /**
     * Set this by using the yml config system.
     *
     * Example:
     * <code>
     * StaticSiteContentExtractor:
     *   log_file: ../logs/crawler-log.txt
     * </code>
     *
     * @var string
     */
    private static $log_file = null;
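    // NOTE: Static analysis flags $log_file as unused within this class.
    // With the Configurable trait applied, a value set in yml would normally
    // be read back via the Config API, e.g. (hypothetically):
    //   $logFile = static::config()->get('log_file');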

    /**
     * @param StaticSiteUrlList $urlList
     * @param number $limit (currently unused)
     * @param boolean $verbose
     */
    public function __construct(StaticSiteUrlList $urlList, $limit = false, $verbose = false)
    {
        parent::__construct();

        $this->urlList = $urlList;
        $this->verbose = $verbose;
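        // singleton() is SilverStripe's Injector shorthand, roughly equivalent
        // to Injector::inst()->get(StaticSiteUtils::class)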
        $this->utils = singleton(StaticSiteUtils::class);
    }

    /**
     * After checking raw status codes out of PHPCrawler, we continue by saving each URL to our cache file.
     *
     * $PageInfo gives us:
     *
     * $PageInfo->url
     * $PageInfo->http_status_code
     * $PageInfo->links_found_url_descriptors
     *
     * @param PHPCrawlerDocumentInfo $PageInfo
     * @return int
     * @todo Can we make use of PHPCrawlerDocumentInfo#error_occured instead of manually checking server codes?
     * @todo The comments below state that badly formatted URLs never make it to our caching logic. Wrong!
     *  - Pass the preg_replace() call for "fixing" $mossBracketRegex into StaticSiteUrlProcessor#postProcessUrl()
     * @todo Processor-specific logic (MOSS) should be ported into a dedicated class under the "Process" namespace
     */
    public function handleDocumentInfo(PHPCrawlerDocumentInfo $PageInfo): int
    {
        $info = $PageInfo; // upgraded phpcrawler compatibility
        /*
         * MOSS has many URLs with brackets, e.g. http://www.stuff.co.nz/news/cat-stuck-up-tree/(/
         * These result in 4xx response-codes being returned from curl requests for them, and they won't filter
         * down to our caching or URL Processor logic. We can "recover" these URLs by stripping the brackets and
         * replacing them with a trailing slash. This allows us to fetch all the URL's children, if any.
         */
        $isRecoverableUrl = (bool) preg_match('#(\(|%28)+(.+)?$#i', $info->url);
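        // For example, the pattern above matches a literal "(" or a URL-encoded
        // "%28" (case-insensitively) anywhere before the end of the URL, so a
        // URL like http://www.stuff.co.nz/news/cat-stuck-up-tree/(/ is flagged
        // as recoverable, while a bracket-free URL is not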
        // Ignore errors and redirects; they'll get logged for later analysis
        $badStatusCode = (($info->http_status_code < 200) || ($info->http_status_code > 299));
        /*
         * We're checking for a bad status code AND for "recoverability", because we might be able to recover the URL
         * when re-requesting it during the import stage, as long as we cache it correctly here.
         */
        if ($badStatusCode && !$isRecoverableUrl) {
            $message = $info->url . " skipped: we got a bad status-code and the URL was irrecoverable" . PHP_EOL;
            $this->utils->log($message);
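
            // Note: PHPCrawl aborts the whole crawl only when this handler
            // returns a negative value, so returning 1 skips this document
            // while the crawl itself continues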
            return 1;
        }

        // Continue building our cache
        $this->urlList->addAbsoluteURL($info->url, $info->content_type);
        $this->urlList->saveURLs();

        return 0;
    }

    /**
     * @return void
     * @throws \InvalidArgumentException
     */
    protected function initCrawlerProcess(): void
    {
        parent::initCrawlerProcess();

        // Add additional URLs to crawl to the crawler's LinkCache
        // NOTE: This is using an undocumented API
        if ($extraURLs = $this->urlList->getExtraCrawlURLs()) {
            foreach ($extraURLs as $extraURL) {
                $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL));
            }
        }

        // Prevent URLs that match the exclude patterns from being fetched
        if ($excludePatterns = $this->urlList->getExcludePatterns()) {
            foreach ($excludePatterns as $pattern) {
                $validRegExp = $this->addURLFilterRule('|' . str_replace('|', '\|', $pattern) . '|');
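                // The pattern is wrapped in "|" regex delimiters, so any
                // literal "|" inside it is escaped first; e.g. a pattern of
                // news/.*\.pdf becomes the filter rule |news/.*\.pdf|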

                if (!$validRegExp) {
                    throw new \InvalidArgumentException('Exclude url pattern "' . $pattern . '" is not a valid regular expression.');
                }
            }
        }
    }
}
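
For context, a minimal usage sketch follows, assuming a pre-built StaticSiteUrlList for the target site (its construction belongs to the wider module and isn't shown in this file). The site URL is hypothetical; setURL() and go() are PHPCrawler's standard entry points, and create() comes from the Injectable trait.

<?php

use PhpTek\Exodus\Crawl\StaticSiteCrawler;

// $urlList: a populated StaticSiteUrlList instance (assumed to exist)
$crawler = StaticSiteCrawler::create($urlList, false, true); // no limit, verbose

// Point the crawler at the site root and start crawling; each fetched
// document is routed through handleDocumentInfo() above
$crawler->setURL('http://www.stuff.co.nz');
$crawler->go();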