Crawler   A
last analyzed

Complexity

Total Complexity 7

Size/Duplication

Total Lines 90
Duplicated Lines 0 %

Importance

Changes 9
Bugs 1 Features 0
Metric Value
wmc 7
eloc 32
c 9
b 1
f 0
dl 0
loc 90
rs 10

5 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 2 1
A config() 0 2 1
A modules() 0 9 2
A crawl() 0 37 2
A getDocument() 0 5 1
1
<?php declare(strict_types=1);
2
3
namespace Goose;
4
5
use Goose\Utils\Helper;
6
use DOMWrap\Document;
7
8
/**
 * Crawler
 *
 * Builds an Article for a URL: obtains the raw HTML (downloading it when the
 * caller did not supply any), parses it into a DOM document and then runs the
 * configured "cleaners", "extractors" and "formatters" module chains over it.
 *
 * @package Goose
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class Crawler {
    /** @var Configuration Source of the HTTP request options and the module lists */
    protected $config;

    /**
     * @param Configuration $config Configuration shared with every module this crawler runs
     */
    public function __construct(Configuration $config) {
        $this->config = $config;
    }

    /**
     * Expose the configuration this crawler was constructed with.
     *
     * @return Configuration
     */
    public function config(): Configuration {
        return $this->config;
    }

    /**
     * Crawl a URL and return the processed article.
     *
     * When $rawHTML is empty the page is downloaded with Guzzle, using the
     * 'browser' configuration entry as request options, and the HTTP response
     * is stored on the article. Otherwise the supplied markup is used as-is
     * and no request is made.
     *
     * libxml is switched to internal error handling for the duration of the
     * call; the previous setting is always restored, even when the request,
     * the parsing or one of the modules throws. Such exceptions are not
     * caught here and propagate to the caller.
     *
     * @param string $url
     * @param string|null $rawHTML Markup to parse instead of fetching $url
     *
     * @return Article|null This implementation always returns an Article; the
     *                      nullable return type is kept for compatibility.
     */
    public function crawl(string $url, ?string $rawHTML = null): ?Article {
        $article = new Article();

        $parseCandidate = Helper::getCleanedUrl($url);

        // Remember the caller's libxml mode so it can be put back afterwards.
        $xmlInternalErrors = libxml_use_internal_errors(true);

        try {
            if (empty($rawHTML)) {
                $guzzle = new \GuzzleHttp\Client();
                $response = $guzzle->get($parseCandidate->url, $this->config()->get('browser'));
                $article->setRawResponse($response);
                $rawHTML = $response->getBody()->getContents();
            }

            // Generate document
            $doc = $this->getDocument($rawHTML);

            // Set core mutators
            $article->setFinalUrl($parseCandidate->url);
            $article->setDomain($parseCandidate->parts->host);
            $article->setLinkhash($parseCandidate->linkhash);
            $article->setRawHtml($rawHTML);
            $article->setDoc($doc);
            // Keep an untouched copy, since the modules below work on $doc.
            $article->setRawDoc(clone $doc);

            // Pre-extraction document cleaning
            $this->modules('cleaners', $article);

            // Extract content
            $this->modules('extractors', $article);

            // Post-extraction content formatting
            $this->modules('formatters', $article);
        } finally {
            // Restore the libxml setting on success and on failure alike, so
            // an exception cannot leave the process-wide flag modified.
            libxml_use_internal_errors($xmlInternalErrors);
        }

        return $article;
    }

    /**
     * Parse raw markup into a new DOM document.
     *
     * @param string $rawHTML
     *
     * @return Document
     */
    private function getDocument(string $rawHTML): Document {
        $doc = new Document();
        $doc->html($rawHTML);

        return $doc;
    }

    /**
     * Run every module registered under a category against an article.
     *
     * Each entry returned by the configuration for $category is instantiated
     * with this crawler's configuration and its run() method is called with
     * the article, in the order the configuration returns them.
     *
     * @param string $category Module list name, e.g. 'cleaners', 'extractors' or 'formatters'
     * @param Article $article Article passed to each module's run() method
     *
     * @return self
     */
    public function modules(string $category, Article $article): self {
        $modules = $this->config->getModules($category);

        foreach ($modules as $module) {
            $obj = new $module($this->config());
            $obj->run($article);
        }

        return $this;
    }
}
106
107