<?php

namespace PhpTek\Exodus\Crawl;

use PHPCrawl\PHPCrawler;
use PHPCrawl\PHPCrawlerDocumentInfo;
use PHPCrawl\PHPCrawlerURLDescriptor;
use SilverStripe\Core\Injector\Injectable;
use SilverStripe\Core\Config\Configurable;
use PhpTek\Exodus\Tool\StaticSiteUrlList;
use PhpTek\Exodus\Tool\StaticSiteUtils;

/**
 * Extends PHPCrawler essentially to override its handleDocumentInfo() method.
 *
 * @see {@link PHPCrawler}
 */
class StaticSiteCrawler extends PHPCrawler
{
    use Injectable;
    use Configurable;

    /**
     *
     * @var StaticSiteUrlList
     */
    protected $urlList;

    /**
     *
     * @var boolean
     */
    protected $verbose = false;

    /**
     * Holds the StaticSiteUtils object on construct
     *
     * @var StaticSiteUtils
     */
    protected $utils;

    /**
     * Set this by using the yml config system
     *
     * Example:
     * <code>
     * StaticSiteContentExtractor:
     *   log_file: ../logs/crawler-log.txt
     * </code>
     *
     * @var string
     */
    private static $log_file = null;

    /**
     *
     * @param StaticSiteUrlList $urlList
     * @param number $limit
     * @param boolean $verbose
     * @return void
     */
    public function __construct(StaticSiteUrlList $urlList, $limit = false, $verbose = false)
    {
        parent::__construct();

        $this->urlList = $urlList;
        $this->verbose = $verbose;
        $this->utils = singleton(StaticSiteUtils::class);
    }

    /**
     * After checking raw status codes out of PHPCrawler we continue to save each URL to our cache file.
     *
     * $PageInfo gives us:
     *
     * $PageInfo->url
     * $PageInfo->http_status_code
     * $PageInfo->links_found_url_descriptors
     *
     * @param PHPCrawlerDocumentInfo $PageInfo
     * @return int
     * @todo Can we make use of PHPCrawlerDocumentInfo#error_occured instead of manually checking server codes??
     * @todo The comments below state that badly formatted URLs never make it to our caching logic. Wrong!
     * - Pass the preg_replace() call for "fixing" $mossBracketRegex into StaticSiteUrlProcessor#postProcessUrl()
     * @todo Processor-specific logic (MOSS) should be ported into dedicated class under "Process" namespace
     */
    public function handleDocumentInfo(PHPCrawlerDocumentInfo $PageInfo): int
    {
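        // Note on return values (an assumption based on upstream PHPCrawl behaviour): only a
        // negative return from handleDocumentInfo() aborts the whole crawl, so both the
        // "skip" (1) and "cached" (0) paths below let the crawler continue with remaining URLs.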
        $info = $PageInfo; // upgraded phpcrawler compatibility
        /*
         * MOSS has many URLs with brackets, e.g. http://www.stuff.co.nz/news/cat-stuck-up-tree/(/
         * These result in 4xx response-codes returned from curl requests for them, and won't filter down to our
         * caching or URL Processor logic. We can "recover" these URLs by stripping the offending portion and
         * replacing it with a trailing slash. This allows us to fetch all the URL's children, if any.
         */
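        // Illustrative note: the check below simply looks for a literal "(" or its URL-encoded
        // form "%28" in the URL, so the example above (".../cat-stuck-up-tree/(/") is flagged
        // as recoverable even though the server responded with a 4xx status.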
        $isRecoverableUrl = (bool) preg_match('#(\(|%28)+(.+)?$#i', $info->url);
        // Ignore errors and redirects, they'll get logged for later analysis
        $badStatusCode = (($info->http_status_code < 200) || ($info->http_status_code > 299));
        /*
         * We're checking for a bad status code AND for "recoverability", because we might be able to recover the URL
         * when re-requesting it during the import stage, as long as we cache it correctly here.
         */
        if ($badStatusCode && !$isRecoverableUrl) {
            $message = $info->url . " Skipped. We got a bad status-code and URL was irrecoverable" . PHP_EOL;
            $this->utils->log($message);

            return 1;
        }

        // Continue building our cache
        $this->urlList->addAbsoluteURL($info->url, $info->content_type);
        $this->urlList->saveURLs();

        return 0;
    }

    /**
     *
     * @return void
     * @throws \InvalidArgumentException
     */
    protected function initCrawlerProcess(): void
    {
        parent::initCrawlerProcess();

        // Add additional URLs to crawl to the crawler's LinkCache
        // NOTE: This is using an undocumented API
        if ($extraURLs = $this->urlList->getExtraCrawlURLs()) {
            foreach ($extraURLs as $extraURL) {
                $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL));
            }
        }

        // Prevent URLs that match the exclude patterns from being fetched
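        // For example (hypothetical pattern): an exclude pattern of "/news/.*" is wrapped in pipe
        // delimiters below to form the PCRE "|/news/.*|", so any URL containing "/news/" is skipped
        // by the crawler. Literal pipes in a pattern are escaped first so they cannot terminate
        // the expression early.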
        if ($excludePatterns = $this->urlList->getExcludePatterns()) {
            foreach ($excludePatterns as $pattern) {
                $validRegExp = $this->addURLFilterRule('|' . str_replace('|', '\|', $pattern) . '|');

                if (!$validRegExp) {
                    throw new \InvalidArgumentException('Exclude url pattern "' . $pattern . '" is not a valid regular expression.');
                }
            }
        }
    }
}
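
/*
 * Minimal usage sketch (illustrative only: it assumes a pre-built StaticSiteUrlList
 * instance and the setURL()/go() methods inherited from the upstream PHPCrawl library):
 *
 *   $crawler = StaticSiteCrawler::create($urlList, 0, true);
 *   $crawler->setURL('http://www.example.org/');
 *   $crawler->go();
 *
 * The static create() factory comes from the Injectable trait.
 */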