Scraper::scrape()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 17
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 1

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 1
eloc 8
c 2
b 0
f 0
nc 1
nop 1
dl 0
loc 17
ccs 9
cts 9
cp 1
crap 1
rs 10
1
<?php
2
3
namespace Pilipinews\Website\Sunstar;
4
5
use Pilipinews\Common\Article;
6
use Pilipinews\Common\Crawler as DomCrawler;
7
use Pilipinews\Common\Interfaces\ScraperInterface;
8
use Pilipinews\Common\Scraper as AbstractScraper;
9
10
/**
11
 * Sunstar News Scraper
12
 *
13
 * @package Pilipinews
14
 * @author  Rougin Gutib <[email protected]>
15
 */
16
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * CSS selectors whose HTML is joined, in this order, to form the body.
     *
     * @var string[]
     */
    protected $elements = array('.article-header', '.articleBody');

    /**
     * CSS selectors handed to remove() before the body is extracted.
     *
     * @var string[]
     */
    protected $removables = array('.subSection', '.titleArticle', '.pagingWrap', 'script', '#fb-root');

    /**
     * Literal strings handed to html() together with the article body.
     * NOTE(review): presumably these are stripped from the output - confirm
     * against the parent scraper.
     *
     * @var string[]
     */
    protected $texts = array("PHOTO: https://www.sunstar.com.ph/\n", 'Please refresh page for updates.', 'ARTICLE_MOBILE_AD_CODE');

    /**
     * Returns the contents of an article.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        // The page is requested through the lower-cased link, while the
        // returned article keeps the link exactly as it was given.
        $this->prepare(strtolower($link));

        $title = $this->title('title', ' - SUNSTAR');

        $this->remove((array) $this->removables);

        // Carousels are rewritten on the whole document first because
        // body() reads its sections from $this->crawler.
        $this->crawler = $this->carousel($this->crawler);

        $body = $this->body($this->elements);

        $body = $this->image($body);

        $body = $this->video($body);

        $html = $this->html($body, $this->texts);

        return new Article($title, $html, $link);
    }

    /**
     * Returns the article content based on the given element selectors.
     *
     * The last match of every selector is taken, its whitespace runs are
     * collapsed into single spaces, and the sections are joined with three
     * line breaks.
     *
     * @param  string|string[] $elements
     * @return \Pilipinews\Common\Crawler
     */
    protected function body($elements)
    {
        $sections = array();

        // The cast also wraps a single selector string into an array.
        foreach ((array) $elements as $key => $element)
        {
            $html = $this->crawler->filter($element)->last()->html();

            // Collapsing every whitespace run to one space already rules
            // out double spaces, so no further clean-up pass is required.
            $sections[$key] = trim(preg_replace('/\s+/', ' ', $html));
        }

        return new DomCrawler(implode('<br><br><br>', $sections));
    }

    /**
     * Converts carousel elements to readable string.
     *
     * Every image inside an ".owl-carousel" element becomes a
     * "PHOTO: <src> - <caption>" paragraph. The caption is looked up by
     * position among the ".img-caption" elements of the same carousel.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function carousel(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $captions = $crawler->filter('.img-caption');

            $convert = function ($image, $index) use ($captions)
            {
                $photo = $image->attr('src');

                $caption = $captions->eq($index);

                // An image without a caption at the same position yields an
                // empty node list, and reading text() from it fails. Such an
                // image is emitted without the caption suffix instead.
                if ($caption->count() > 0)
                {
                    $photo .= ' - ' . $caption->text();
                }

                return '<p>PHOTO: ' . $photo . '</p>';
            };

            $photos = $crawler->filter('img')->each($convert);

            return implode('<br><br>', $photos);
        };

        return $this->replace($crawler, '.owl-carousel', $callback);
    }

    /**
     * Converts image elements to readable string.
     *
     * The callback given for ".imgArticle" returns a "PHOTO: <src>" line
     * built from the first image found, surrounded by three line breaks.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function image(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $break = '<br><br><br>';

            $source = $crawler->filter('img')->first()->attr('src');

            return $break . 'PHOTO: ' . $source . $break;
        };

        return $this->replace($crawler, '.imgArticle', $callback);
    }

    /**
     * Initializes the crawler instance.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $response = Client::request($link);

        $this->crawler = new DomCrawler($response);
    }

    /**
     * Converts video elements to readable string.
     *
     * The callback given for ".fb-video" returns a "VIDEO: <data-href>"
     * line surrounded by three line breaks.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function video(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            // attr() may hand back null when "data-href" is missing; the
            // cast keeps trim() from receiving null (deprecated in PHP 8.1).
            $link = trim((string) $crawler->attr('data-href'));

            $break = '<br><br><br>';

            return $break . 'VIDEO: ' . $link . $break;
        };

        return $this->replace($crawler, '.fb-video', $callback);
    }
}
164