Scraper   A
last analyzed

Complexity

Total Complexity 10

Size/Duplication

Total Lines 154
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 7
Bugs 0 Features 0
Metric Value
eloc 40
c 7
b 0
f 0
dl 0
loc 154
ccs 55
cts 55
cp 1
rs 10
wmc 10

7 Methods

Rating   Name   Duplication   Size   Complexity  
A replace() 0 23 2
A prepare() 0 7 1
A body() 0 7 1
A remove() 0 12 2
A title() 0 11 1
A html() 0 14 2
A tweet() 0 14 1
1
<?php
2
3
namespace Pilipinews\Common;
4
5
/**
6
 * Scraper
7
 *
8
 * @package Pilipinews
9
 * @author  Rougin Gutib <[email protected]>
10
 */
11
abstract class Scraper
{
    /**
     * Crawler over the most recently fetched page; assigned by prepare().
     *
     * @var \Pilipinews\Common\Crawler
     */
    protected $crawler;

    /**
     * Returns the article content based on a given element.
     *
     * Takes the inner HTML of the first node matching $element, collapses
     * every run of whitespace into a single space, trims the result, and
     * wraps it in a new crawler.
     *
     * @param  string $element
     * @return \Pilipinews\Common\Crawler
     */
    protected function body($element)
    {
        $body = $this->crawler->filter($element)->first()->html();

        $body = trim(preg_replace('/\s+/', ' ', $body));

        return new Crawler(str_replace('  ', ' ', $body));
    }

    /**
     * Returns the converted, cleaned-up text of the crawler's HTML.
     *
     * The crawler's HTML is passed through Converter::convert(), every
     * keyword in $removables is deleted from the result, and the remaining
     * whitespace is normalized.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @param  string[]                   $removables
     * @return string
     */
    protected function html(Crawler $crawler, $removables = array())
    {
        $converter = new Converter;

        $html = trim($converter->convert($crawler->html()));

        foreach ((array) $removables as $keyword)
        {
            $html = str_replace($keyword, '', $html);
        }

        $html = str_replace('  ', ' ', (string) $html);

        // Any remaining run of two or more whitespace characters becomes
        // exactly one blank line between blocks of text.
        return trim(preg_replace('/\s\s+/', "\n\n", $html));
    }

    /**
     * Initializes the crawler instance.
     *
     * Requests the link through Client::request(), replaces every
     * "<strong> </strong>" in the response with a single space, and stores
     * a crawler over the result in $this->crawler.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $response = Client::request((string) $link);

        $response = str_replace('<strong> </strong>', ' ', $response);

        $this->crawler = new Crawler($response);
    }

    /**
     * Removes specified HTML tags from body.
     *
     * Every node of $this->crawler matching one of the given selectors is
     * detached from its parent node, which mutates the underlying document.
     *
     * @param  string[] $elements
     * @return void
     */
    protected function remove($elements)
    {
        $callback = function ($crawler)
        {
            // each() hands over a crawler wrapping a single matched node.
            $node = $crawler->getNode(0);

            $node->parentNode->removeChild($node);
        };

        foreach ((array) $elements as $removable)
        {
            $this->crawler->filter($removable)->each($callback);
        }
    }

    /**
     * Replaces a specified HTML tag based from the given callback.
     *
     * For each node matching $element, the callback is invoked with the
     * node's crawler and its serialized HTML, and must return the markup
     * that takes its place. The substitutions are applied as plain string
     * replacements on the crawler's HTML, so every occurrence of identical
     * markup is replaced. The given crawler itself is left unchanged.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @param  string                     $element
     * @param  callable                   $callback
     * @return \Pilipinews\Common\Crawler
     */
    protected function replace(Crawler $crawler, $element, $callback)
    {
        $function = function (Crawler $crawler) use ($callback)
        {
            $node = $crawler->getNode(0);

            // Serialize the node itself (not only its children) so the
            // whole element can be located in the parent HTML below.
            $html = $node->ownerDocument->saveHTML($node);

            $text = $callback($crawler, (string) $html);

            return array((string) $html, (string) $text);
        };

        $items = $crawler->filter($element)->each($function);

        $html = (string) $crawler->html();

        foreach ((array) $items as $item)
        {
            $html = str_replace($item[0], $item[1], $html);
        }

        return new Crawler((string) $html);
    }

    /**
     * Returns the title text based from given HTML tag.
     *
     * Takes the inner HTML of the first node matching $element, deletes
     * every occurrence of $removable from it, and returns the result of
     * Converter::convert() on what is left.
     *
     * @param  string $element
     * @param  string $removable
     * @return string
     */
    protected function title($element, $removable = '')
    {
        $converter = new Converter;

        $crawler = $this->crawler->filter($element);

        $html = $crawler->first()->html();

        $html = str_replace($removable, '', $html);

        return $converter->convert((string) $html);
    }

    /**
     * Parses embedded Twitter tweet in the HTML.
     *
     * Each ".twitter-tweet" element is replaced by a paragraph of the form
     * "<p>TWEET: ...</p>" containing the element's text with any "📸: "
     * marker removed.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function tweet(Crawler $crawler)
    {
        $callback = function (Crawler $crawler)
        {
            $parsed = (string) $crawler->text();

            $text = str_replace('📸: ', '', $parsed);

            return '<p>TWEET: ' . $text . '</p>';
        };

        $class = '.twitter-tweet';

        return $this->replace($crawler, $class, $callback);
    }
}
167