SourceRevisitor   A
last analyzed

Complexity

Total Complexity 13

Size/Duplication

Total Lines 135
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 8

Test Coverage

Coverage 0%

Importance

Changes 0
Metric Value
wmc 13
lcom 1
cbo 8
dl 0
loc 135
ccs 0
cts 59
cp 0
rs 10
c 0
b 0
f 0

6 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 5 1
A revisit() 0 20 4
A revisitAfter() 0 9 1
A removeSource() 0 5 1
A createScraper() 0 17 4
A isFresh() 0 14 2
1
<?php
2
3
namespace TreeHouse\IoBundle\Scrape;
4
5
use TreeHouse\IoBundle\Entity\Scraper as ScraperEntity;
6
use TreeHouse\IoBundle\Event\SourceEvent;
7
use TreeHouse\IoBundle\Model\SourceInterface;
8
use TreeHouse\IoBundle\Scrape\Crawler\RateLimit\EnablingRateLimitInterface;
9
use TreeHouse\IoBundle\Scrape\Exception\NotFoundException;
10
use TreeHouse\IoBundle\Source\SourceManagerInterface;
11
12
/**
 * Revisits previously scraped sources, either right away or by dispatching a
 * revisit event so the work can be picked up later. Scraper instances are
 * created through the factory and cached per scraper entity id.
 */
class SourceRevisitor
{
    /**
     * Used to remove (and flush) sources whose original url no longer exists.
     *
     * @var SourceManagerInterface
     */
    protected $sourceManager;

    /**
     * Builds scraper instances from scraper entities.
     *
     * @var ScraperFactory
     */
    protected $factory;

    /**
     * Array of cached scrapers, keyed by scraper entity id.
     *
     * @var Scraper[]
     */
    protected $scrapers = [];

    /**
     * @param SourceManagerInterface $sourceManager
     * @param ScraperFactory         $factory
     */
    public function __construct(SourceManagerInterface $sourceManager, ScraperFactory $factory)
    {
        $this->sourceManager = $sourceManager;
        $this->factory = $factory;
    }

    /**
     * Revisits a source. This basically means doing a scrape operation on the
     * source origin, only this time the source will be removed if the original
     * url was not found. Nothing happens when the source is still fresh.
     *
     * @param SourceInterface $source       The source to revisit.
     * @param bool            $async        If true, makes the scrape action asynchronous.
     *                                      The revisit action will happen right away, but any
     *                                      consecutive scrape actions will be queued. Use this
     *                                      when calling the revisit action from an asynchronous
     *                                      context.
     * @param bool            $disableLimit Whether to disable the rate limit when revisiting.
     *
     * @throws \InvalidArgumentException When the source has no original url.
     */
    public function revisit(SourceInterface $source, $async = false, $disableLimit = false)
    {
        $originalUrl = $source->getOriginalUrl();
        if (!$originalUrl) {
            throw new \InvalidArgumentException('Source does not contain an original url');
        }

        // check if source is still fresh
        if ($this->isFresh($source)) {
            return;
        }

        $scraperEntity = $source->getScraper();

        $scraper = $this->createScraper($scraperEntity, $disableLimit);
        $scraper->setAsync($async);

        try {
            $scraper->scrape($scraperEntity, $originalUrl, false);
        } catch (NotFoundException $e) {
            // the origin is gone, so the source has no reason to exist anymore
            $this->removeSource($source);
        }
    }

    /**
     * Does a non-blocking revisit operation. Depending on the implementation,
     * this will mean adding a revisit job to some sort of queueing system.
     *
     * This method itself only dispatches ScraperEvents::SCRAPE_REVISIT_SOURCE
     * on the scraper's event dispatcher; it does not scrape anything.
     *
     * @param SourceInterface $source
     * @param \DateTime       $date
     */
    public function revisitAfter(SourceInterface $source, \DateTime $date)
    {
        $scraper = $this->createScraper($source->getScraper());

        // NOTE(review): static analysis reports that EventDispatcherInterface::dispatch()
        // accepts fewer arguments than are passed here (and expects null|string where the
        // SourceEvent is given). If that is the dispatcher in use, $date is silently
        // dropped and listeners never see it -- confirm against the dispatcher version
        // and, if needed, carry the date inside the event instead.
        $scraper->getEventDispatcher()->dispatch(
            ScraperEvents::SCRAPE_REVISIT_SOURCE,
            new SourceEvent($source),
            $date
        );
    }

    /**
     * Removes the source through the source manager and flushes that change.
     *
     * @param SourceInterface $source
     */
    protected function removeSource(SourceInterface $source)
    {
        $this->sourceManager->remove($source);
        $this->sourceManager->flush($source);
    }

    /**
     * Returns the scraper for the given entity, creating and caching it on
     * first use.
     *
     * When $disableLimit is true the rate limit is disabled on the returned
     * scraper, whether it was just created or came from the cache. Previously
     * the flag was ignored for cached scrapers, so a scraper created earlier
     * without the flag kept its limit enabled. A limit that has been disabled
     * is not re-enabled by a later call with $disableLimit set to false.
     *
     * @param ScraperEntity $scraperEntity
     * @param bool          $disableLimit
     *
     * @return ScraperInterface
     */
    protected function createScraper(ScraperEntity $scraperEntity, $disableLimit = false)
    {
        $id = $scraperEntity->getId();

        if (!array_key_exists($id, $this->scrapers)) {
            $this->scrapers[$id] = $this->factory->createScraper($scraperEntity);
        }

        $scraper = $this->scrapers[$id];

        if ($disableLimit) {
            // only rate limits that support toggling can be disabled
            // NOTE(review): assumes disable() is safe to call repeatedly -- confirm
            $limit = $scraper->getCrawler()->getRateLimit();
            if ($limit instanceof EnablingRateLimitInterface) {
                $limit->disable();
            }
        }

        return $scraper;
    }

    /**
     * Checks whether the given source is fresh, meaning it doesn't need revisiting right now.
     *
     * A source is fresh when its last visit plus the scraper's revisit
     * frequency (applied as a number of hours) still lies in the future.
     *
     * @param SourceInterface $source
     *
     * @return bool
     */
    protected function isFresh(SourceInterface $source)
    {
        $lastVisitDate = $source->getDatetimeLastVisited();

        // no previous visit date, consider it stale
        if (null === $lastVisitDate) {
            return false;
        }

        // work on a copy: modify() changes the object it is called on
        $revisitDate = clone $lastVisitDate;
        $revisitDate->modify(sprintf('+%d hours', $source->getScraper()->getRevisitFrequency()));

        return $revisitDate > new \DateTime();
    }
}
147