AbstractAdapter::getBodyHtml()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 13
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 13
rs 9.4285
c 0
b 0
f 0
nc 1
cc 1
eloc 7
nop 1
1
<?php
2
3
namespace Zrashwani\NewsScrapper\Adapters;
4
5
use Symfony\Component\DomCrawler\Crawler;
6
use Zrashwani\NewsScrapper\Selector;
7
8
/**
9
 * Base class that defines the skeleton every adapter implementation must follow
10
 *
11
 * @author Zeid Rashwani <zrashwani.com>
12
 */
13
abstract class AbstractAdapter
{
    /**
     * URL (or local file path) of the document currently being processed;
     * used as the base when relative links are made absolute.
     * @var string
     */
    public $currentUrl;

    abstract public function extractTitle(Crawler $crawler);

    abstract public function extractImage(Crawler $crawler);

    abstract public function extractDescription(Crawler $crawler);

    abstract public function extractKeywords(Crawler $crawler);

    abstract public function extractBody(Crawler $crawler);

    abstract public function extractPublishDate(Crawler $crawler);

    abstract public function extractAuthor(Crawler $crawler);

    /**
     * normalize link and turn it into absolute format, using $currentUrl as the base
     *
     * Links that already carry a scheme (http:, https:, mailto:, tel:, data: ...)
     * are returned untouched; every other link is resolved against the base.
     * Surrounding whitespace is trimmed before the link is inspected.
     *
     * @param string $link
     * @param boolean $remove_hashes if set true, hashes will be removed from url
     * @return string
     */
    public function normalizeLink($link, $remove_hashes = false)
    {
        $baseUrl = (string) $this->currentUrl;
        $link = trim((string) $link);

        // Anything that starts with "<scheme>:" is already absolute. Prefixing
        // the host to it (as was done for every non-http link) corrupts it.
        if (preg_match('@^[a-z][a-z0-9+.\-]*:@i', $link) !== 1) {
            $link = $this->resolveRelativeLink($link, $baseUrl);
        }

        if ($remove_hashes === true) {
            $link = preg_replace('@#.*$@', '', $link);
        }

        return $link;
    }

    /**
     * resolve a scheme-less link against the given base url or local path
     *
     * Note: "." and ".." segments of path-relative links are appended as-is;
     * they are not collapsed.
     *
     * @param string $link link without scheme (may be empty)
     * @param string $baseUrl absolute http(s) url, or a local file path
     * @return string
     */
    private function resolveRelativeLink($link, $baseUrl)
    {
        $baseIsHttp = preg_match('@^https?://@i', $baseUrl) === 1;
        $urlParts = $baseIsHttp === true ? parse_url($baseUrl) : false;
        if (is_array($urlParts) === false) { //local path or unparsable url
            $urlParts = [];
        }
        $scheme = isset($urlParts['scheme']) === true ? $urlParts['scheme'] : 'http';

        if (strpos($link, '//') === 0) { //begins with // (protocol-relative)
            return $scheme.':'.$link;
        }

        if ($link === '' || $link[0] === '#') { //points at the current document
            return substr($baseUrl, 0, strcspn($baseUrl, '#')).$link;
        }

        if ($link[0] === '?') { //same path, different query string
            return substr($baseUrl, 0, strcspn($baseUrl, '?#')).$link;
        }

        if ($baseIsHttp === false) { //local environment assumed here
            return pathinfo($baseUrl, PATHINFO_DIRNAME).'/'.$link;
        }

        $origin = $scheme.'://'.(isset($urlParts['host']) === true ? $urlParts['host'] : '');
        if (isset($urlParts['port']) === true) { //keep non-default ports
            $origin .= ':'.$urlParts['port'];
        }

        if ($link[0] === '/') { //begins with /
            return $origin.$link;
        }

        // Path-relative link: resolve against the directory of the base
        // document, i.e. everything up to and including the last slash.
        $path = isset($urlParts['path']) === true ? $urlParts['path'] : '/';
        $lastSlash = strrpos($path, '/');
        $directory = $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);

        return $origin.$directory.$link;
    }

    /**
     * parse an UTF-8 html string into a DOMDocument, silencing parser warnings
     *
     * The libxml error mode that was active before the call is restored
     * afterwards, so the rest of the application is not affected.
     *
     * @param string $html
     * @return \DOMDocument
     */
    private function loadHtmlDocument($html)
    {
        $document = new \DOMDocument();
        $previousMode = libxml_use_internal_errors(true);

        // Non-ASCII characters are turned into numeric entities so that the
        // parser does not have to guess the charset. This replaces
        // mb_convert_encoding(..., 'HTML-ENTITIES', ...), deprecated in PHP 8.2.
        $document->loadHTML(
            mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8')
        );

        libxml_clear_errors();
        libxml_use_internal_errors($previousMode);

        return $document;
    }

    /**
     * normalizing html scrapped by removing unwanted tags (ex. script, css)
     * and amending external resources paths
     *
     * Runs of two or more whitespace characters in the result are collapsed
     * into a single space.
     *
     * @param string $raw_html
     * @return string
     */
    public function normalizeHtml($raw_html)
    {
        if (empty($raw_html)) {
            return $raw_html;
        }

        $disallowed_tags = ['script', 'style', 'meta', 'form', 'aside'];

        $xmlDoc = $this->loadHtmlDocument($raw_html);
        $xpath = new \DOMXPath($xmlDoc);

        foreach ($disallowed_tags as $tag) {
            foreach ($xpath->query('//'.$tag) as $unwanted_elem) {
                if ($unwanted_elem->parentNode !== null) {
                    $unwanted_elem->parentNode->removeChild($unwanted_elem);
                }
            }
        }

        $html = $this->normalizeBodyLinks($xmlDoc->saveHTML());

        return preg_replace('@\s{2,}@', ' ', $html); //remove empty spaces from document
    }

    /**
     * convert all relative paths in html to absolute ones
     * (a href and img src), and make every link open in a new window
     *
     * Elements that have no href / src attribute are not given one.
     *
     * @param string $html
     * @return string inner html of the body element after rewriting
     */
    public function normalizeBodyLinks($html)
    {
        if (empty($html) === true) { //if html is empty, do nothing
            return $html;
        }

        $xmlDoc = $this->loadHtmlDocument($html);
        $xpath = new \DOMXPath($xmlDoc);

        foreach ($xpath->query('//a') as $entry) {
            if ($entry->hasAttribute('href') === true) {
                $entry->setAttribute('href', $this->normalizeLink($entry->getAttribute('href')));
            }
            $entry->setAttribute('target', '_blank');
        }

        foreach ($xpath->query('//img') as $entry) {
            if ($entry->hasAttribute('src') === true) {
                $entry->setAttribute('src', $this->normalizeLink($entry->getAttribute('src')));
            }
        }

        return $this->getBodyHtml($xmlDoc->saveHTML());
    }

    /**
     * normalize keywords by removing surrounding whitespace from each
     * (array keys are preserved)
     * @param array $keywords
     * @return array
     */
    public function normalizeKeywords(array $keywords)
    {
        return array_map('trim', $keywords);
    }

    /**
     * extract body content from html document
     * @param string $doc_html
     * @return string html of the last body element matched, empty string if none
     */
    protected function getBodyHtml($doc_html)
    {
        $html_crawler = new Crawler($doc_html);

        $ret = '';
        $html_crawler->filter('body')->each(
            function (Crawler $node) use (&$ret) {
                $ret = $node->html();
            }
        );

        return $ret;
    }

    /**
     * extract image source by selector
     *
     * When the selector matches several nodes, the src of the last one wins.
     *
     * @param  Crawler $crawler
     * @param  string $selector css selector or xpath expression
     * @return string|NULL absolute image url, NULL if nothing usable was found
     */
    protected function getSrcByImgSelector(Crawler $crawler, $selector)
    {
        $ret = null;
        $imgExtractClosure = function (Crawler $node) use (&$ret) {
            $ret = $node->attr('src');
        };

        if (Selector::isXPath($selector)) {
            $matches = $crawler->filterXPath($selector);
        } else {
            $matches = $crawler->filter($selector);
        }
        $matches->each($imgExtractClosure);

        if (empty($ret) === true) {
            return null;
        }

        return $this->normalizeLink($ret);
    }
}
198