Passed
Push — master ( ec64c5...cc7b87 )
by Rougin
02:02
created

Scraper   A

Complexity

Total Complexity 10

Size/Duplication

Total Lines 150
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 41
dl 0
loc 150
ccs 55
cts 55
cp 1
rs 10
c 0
b 0
f 0
wmc 10

7 Methods

Rating   Name   Duplication   Size   Complexity  
A replace() 0 21 2
A prepare() 0 5 1
A body() 0 11 1
A html() 0 13 2
A remove() 0 10 2
A tweet() 0 13 1
A title() 0 11 1
1
<?php
2
3
namespace Pilipinews\Common;
4
5
use Symfony\Component\DomCrawler\Crawler;
6
7
/**
8
 * Abstract Scraper
9
 *
10
 * @package Pilipinews
11
 * @author  Rougin Royce Gutib <[email protected]>
12
 */
13
abstract class Scraper
14
{
15
    /**
16
     * @var \Symfony\Component\DomCrawler\Crawler
17
     */
18
    protected $crawler;
19
20
    /**
     * Returns the article content based on a given element.
     *
     * The markup of the first node matching the selector is transliterated
     * from UTF-8 to ISO-8859-1, every run of whitespace is collapsed into a
     * single space, and the result is loaded into a new crawler instance.
     *
     * @param  string $element
     * @return \Symfony\Component\DomCrawler\Crawler
     */
    protected function body($element)
    {
        $html = $this->crawler->filter($element)->first()->html();

        $charset = 'ISO-8859-1//TRANSLIT//IGNORE';

        $converted = iconv('UTF-8', $charset, $html);

        // iconv() returns false when the conversion fails. Keep the original
        // markup in that case instead of building a crawler from nothing.
        if ($converted !== false) {
            $html = $converted;
        }

        // Collapsing "\s+" into one space leaves no consecutive spaces behind,
        // so no additional double-space clean up is required afterwards.
        $html = trim(preg_replace('/\s+/', ' ', $html));

        return new Crawler($html);
    }
38
39
    /**
     * Returns the HTML format of the body from the crawler.
     *
     * The crawler's markup is passed through the converter. Each removable
     * keyword is then stripped from the result, and every removal is followed
     * by a clean up of triple line breaks. Lastly, any run of two or more
     * whitespace characters becomes a blank line and the text is trimmed.
     *
     * @param  \Symfony\Component\DomCrawler\Crawler $crawler
     * @param  string[]                              $removables
     * @return string
     */
    protected function html(Crawler $crawler, $removables = array())
    {
        $converter = new Converter;

        $output = $converter->convert($crawler->html());

        // The cast allows a single keyword to be given as a plain string.
        foreach ((array) $removables as $keyword) {
            $stripped = str_replace($keyword, '', $output);

            $output = str_replace("\n\n\n", '', $stripped);
        }

        $output = preg_replace('/\s\s+/', "\n\n", $output);

        return trim($output);
    }
60
61
    /**
     * Initializes the crawler instance.
     *
     * Requests the given link through the client and stores a crawler built
     * from the response in the "crawler" property.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $this->crawler = new Crawler(Client::request((string) $link));
    }
73
74
    /**
     * Removes specified HTML tags from body.
     *
     * Every node of the stored crawler that matches one of the given
     * selectors is detached from its parent node.
     *
     * @param  string[] $elements
     * @return void
     */
    protected function remove($elements)
    {
        $detach = function (Crawler $crawler) {
            $node = $crawler->getNode(0);

            $node->parentNode->removeChild($node);
        };

        // The cast allows a single selector to be given as a plain string.
        foreach ((array) $elements as $selector) {
            $this->crawler->filter($selector)->each($detach);
        }
    }
92
93
    /**
     * Replaces a specified HTML tag based from the given callback.
     *
     * The callback receives the crawler of each matched node together with
     * that node's serialized markup, and returns the text to put in its
     * place. The substitutions are applied, in document order, to the markup
     * of the given crawler, from which a new crawler is created.
     *
     * @param  \Symfony\Component\DomCrawler\Crawler $crawler
     * @param  string                                $element
     * @param  callable                              $callback
     * @return \Symfony\Component\DomCrawler\Crawler
     */
    protected function replace(Crawler $crawler, $element, $callback)
    {
        $mapper = function (Crawler $match) use ($callback) {
            $node = $match->getNode(0);

            $original = (string) $node->ownerDocument->saveHTML($node);

            $replacement = (string) $callback($match, $original);

            return array($original, $replacement);
        };

        $pairs = $crawler->filter($element)->each($mapper);

        $search = array();

        $replace = array();

        foreach ($pairs as $pair) {
            $search[] = $pair[0];

            $replace[] = $pair[1];
        }

        // With arrays, str_replace() applies the pairs one after another on
        // the running result, exactly like replacing them in a loop.
        $html = str_replace($search, $replace, (string) $crawler->html());

        return new Crawler($html);
    }
123
124
    /**
     * Returns the title text based from given HTML tag.
     *
     * Takes the markup of the first node matching the selector, strips the
     * removable text from it, and passes the remainder to the converter.
     *
     * @param  string $element
     * @param  string $removable
     * @return string
     */
    protected function title($element, $removable = '')
    {
        $converter = new Converter;

        $heading = $this->crawler->filter($element)->first();

        $cleaned = str_replace($removable, '', $heading->html());

        return $converter->convert((string) $cleaned);
    }
143
144
    /**
     * Parses embedded Twitter tweet in the HTML.
     *
     * Each ".twitter-tweet" element is replaced with a paragraph containing
     * "TWEET: " followed by the element's text, without the "📸: " marker.
     *
     * @param  \Symfony\Component\DomCrawler\Crawler $crawler
     * @return \Symfony\Component\DomCrawler\Crawler
     */
    protected function tweet(Crawler $crawler)
    {
        $callback = function (Crawler $crawler) {
            $text = str_replace('📸: ', '', (string) $crawler->text());

            // The text is plain and gets placed in markup that is parsed
            // again by replace(), so "<", ">" and "&" must be escaped to
            // prevent them from being read as tags or entities.
            $text = htmlspecialchars($text, ENT_NOQUOTES, 'UTF-8');

            return '<p>TWEET: ' . $text . '</p>';
        };

        return $this->replace($crawler, '.twitter-tweet', $callback);
    }
164
}
165