AbstractAdapter::extractBody() - Code Metrics - zrashwani/news-scrapper - Measure and Improve Code Quality continuously with Scrutinizer

AbstractAdapter::extractBody()
last analyzed 2017-08-17 08:42 UTC

↳ Parent: AbstractAdapter

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
dl	0
loc	1
c	0
b	0
f	0
nc	1

<?php

namespace Zrashwani\NewsScrapper\Adapters;

use Symfony\Component\DomCrawler\Crawler;
use Zrashwani\NewsScrapper\Selector;

/**
 * Base class that defines the skeleton every adapter implementation follows.
 *
 * Besides the abstract extraction hooks, it offers shared helpers that turn
 * relative links into absolute ones (based on $currentUrl) and that clean up
 * scraped HTML fragments.
 *
 * @author Zeid Rashwani <zrashwani.com>
 */
abstract class AbstractAdapter
{
    /**
     * Base URL (or local file path) that relative links are resolved against.
     *
     * @var string
     */
    public $currentUrl;

    /*
     * Extraction hooks. Every concrete adapter implements these; each one
     * receives a Crawler instance. Their return types are decided by the
     * implementations and are not visible from this class.
     */

    abstract public function extractTitle(Crawler $crawler);

    abstract public function extractImage(Crawler $crawler);

    abstract public function extractDescription(Crawler $crawler);

    abstract public function extractKeywords(Crawler $crawler);

    abstract public function extractBody(Crawler $crawler);

    abstract public function extractPublishDate(Crawler $crawler);

    abstract public function extractAuthor(Crawler $crawler);

    /**
     * Normalize a link and turn it into absolute format.
     *
     * Links that already carry a scheme (http:, https:, mailto:, tel:,
     * javascript:, data: ...) are returned untouched. Other links are
     * prefixed with the scheme, host and port of $currentUrl, or with the
     * directory of $currentUrl when that is not an http(s) URL.
     *
     * @param string $link
     * @param boolean $remove_hashes if set true, hashes will be removed from url
     * @return string
     */
    public function normalizeLink($link, $remove_hashes = false)
    {
        $baseUrl = (string) $this->currentUrl;
        $link = (string) $link;

        // At least two scheme characters are required so that a Windows drive
        // letter ("C:/...") is not mistaken for a URL scheme.
        $isAbsolute = preg_match('@^[a-z][a-z0-9+.\-]+:@i', $link) === 1;

        if ($isAbsolute === false) {
            if (preg_match('@^https?://@i', $baseUrl) === 0) {
                // base is not an http(s) URL: local environment assumed here
                $link = pathinfo($baseUrl, PATHINFO_DIRNAME).'/'.$link;
            } else {
                $urlParts = parse_url($baseUrl);
                if (is_array($urlParts) === false) { // seriously malformed base url
                    $urlParts = [];
                }

                $scheme = isset($urlParts['scheme']) === true ? $urlParts['scheme'] : 'http';
                $authority = isset($urlParts['host']) === true ? $urlParts['host'] : '';
                if (isset($urlParts['port']) === true) {
                    // keep non-default ports, otherwise the link points to another server
                    $authority .= ':'.$urlParts['port'];
                }

                if (strpos($link, '//') === 0) { //protocol-relative: begins with //
                    $link = $scheme.':'.$link;
                } elseif (strpos($link, '/') === 0) { //root-relative: begins with /
                    $link = $scheme.'://'.$authority.$link;
                } else {
                    // NOTE(review): a path-relative link ("img/a.png") is attached
                    // to the host root, not to the directory of the current page
                    // as RFC 3986 prescribes. Kept as-is to preserve existing
                    // results - confirm whether callers rely on it.
                    $link = $scheme.'://'.$authority.'/'.$link;
                }
            }
        }

        if ($remove_hashes === true) {
            $link = preg_replace('@#.*$@', '', $link);
        }

        return $link;
    }

    /**
     * Normalize scraped html by removing unwanted tags (script, style, meta,
     * form, aside), amending link/image paths and collapsing whitespace runs.
     *
     * @param string $raw_html
     * @return string the cleaned body markup; empty input is returned unchanged
     */
    public function normalizeHtml($raw_html)
    {
        if (empty($raw_html)) {
            return $raw_html;
        }

        $disallowed_tags = ['script', 'style', 'meta', 'form', 'aside'];

        $xmlDoc = $this->loadHtmlDocument($raw_html);
        $xpath = new \DOMXPath($xmlDoc);
        foreach ($disallowed_tags as $tag) {
            foreach ($xpath->query('//'.$tag) as $unwanted_elem) {
                if ($unwanted_elem->parentNode !== null) {
                    $unwanted_elem->parentNode->removeChild($unwanted_elem);
                }
            }
        }

        $html = $this->normalizeBodyLinks($xmlDoc->saveHTML());

        // collapse every run of two or more whitespace characters into one space
        return preg_replace('@\s{2,}@', ' ', $html);
    }

    /**
     * Convert relative paths in html to absolute ones (a href and img src)
     * and make every rewritten anchor open in a new window.
     *
     * @param string $html
     * @return string inner html of the resulting body element
     */
    public function normalizeBodyLinks($html)
    {
        if (empty($html) === true) { //if html is empty, do nothing
            return $html;
        }

        $xmlDoc = $this->loadHtmlDocument($html);
        $xpath = new \DOMXPath($xmlDoc);

        foreach ($xpath->query('//a') as $entry) {
            $href = $entry->getAttribute('href');
            if ($href === '') {
                // named anchors / placeholders: do not invent a link for them
                continue;
            }

            $entry->setAttribute('href', $this->normalizeLink($href));
            $entry->setAttribute('target', '_blank');
        }

        foreach ($xpath->query('//img') as $entry) {
            $src = $entry->getAttribute('src');
            if ($src === '') {
                // e.g. lazy-loaded images without src: leave them untouched
                continue;
            }

            $entry->setAttribute('src', $this->normalizeLink($src));
        }

        return $this->getBodyHtml($xmlDoc->saveHTML());
    }

    /**
     * Normalize keywords by trimming surrounding whitespace from each one.
     * Array keys are preserved.
     *
     * @param array $keywords
     * @return array
     */
    public function normalizeKeywords(array $keywords)
    {
        return array_map('trim', $keywords);
    }

    /**
     * Extract body content from html document.
     *
     * @param string $doc_html
     * @return string inner html of the (last) body element, '' when there is none
     */
    protected function getBodyHtml($doc_html)
    {
        $html_crawler = new Crawler($doc_html);
        $body_nodes = $html_crawler->filter('body');

        if ($body_nodes->count() === 0) {
            return '';
        }

        return $body_nodes->last()->html();
    }

    /**
     * Extract image source by selector. When several nodes match, the src of
     * the last one is used.
     *
     * @param  Crawler $crawler
     * @param  string $selector css selector or xpath expression
     * @return string|NULL absolute image url, or null when no src was found
     */
    protected function getSrcByImgSelector(Crawler $crawler, $selector)
    {
        $matches = Selector::isXPath($selector)
            ? $crawler->filterXPath($selector)
            : $crawler->filter($selector);

        $src = $matches->count() > 0 ? $matches->last()->attr('src') : null;
        if (empty($src) === true) {
            return null;
        }

        return $this->normalizeLink($src);
    }

    /**
     * Parse an html string (assumed to be UTF-8) into a DOMDocument.
     *
     * Non-ASCII characters are turned into numeric entities before parsing so
     * that libxml does not misread them. Parser warnings are suppressed and
     * the previous libxml error mode is restored afterwards.
     *
     * @param string $html non-empty html markup
     * @return \DOMDocument
     */
    private function loadHtmlDocument($html)
    {
        $xmlDoc = new \DOMDocument();

        $previousMode = libxml_use_internal_errors(true);
        $xmlDoc->loadHTML(
            mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8')
        );
        libxml_clear_errors();
        libxml_use_internal_errors($previousMode);

        return $xmlDoc;
    }
}


1			<?php
2
3			namespace Zrashwani\NewsScrapper\Adapters;
4
5			use Symfony\Component\DomCrawler\Crawler;
6			use Zrashwani\NewsScrapper\Selector;
7
8			/**
9			* Base class that defines the skeleton of any adapter implementation
10			*
11			* @author Zeid Rashwani <zrashwani.com>
12			*/
13			abstract class AbstractAdapter
14			{
15			public $currentUrl;
16
17			abstract public function extractTitle(Crawler $crawler);
18
19			abstract public function extractImage(Crawler $crawler);
20
21			abstract public function extractDescription(Crawler $crawler);
22
23			abstract public function extractKeywords(Crawler $crawler);
24
25			abstract public function extractBody(Crawler $crawler);
26
27			abstract public function extractPublishDate(Crawler $crawler);
28
29			abstract public function extractAuthor(Crawler $crawler);
30
31			/**
32			* normalize link and turn it into absolute format
33			* @param string $link
34			* @param boolean $remove_hashes if set true, hashes will be removed from url
35			* @return string
36			*/
37			public function normalizeLink($link, $remove_hashes = false)
38			{
39			$baseUrl = $this->currentUrl;
40			if (preg_match('@^http(s?)://.*$@', $baseUrl) === 0 && //local environment assumed here
41			preg_match('@^http(s?)://.*$@', $link) === 0) {
42			$link = pathinfo($baseUrl, PATHINFO_DIRNAME).'/'.$link;
43			} elseif (preg_match('@^http(s?)://.*$@', $link) === 0) { //is not absolute
44			$urlParts = parse_url($baseUrl);
45			$scheme = isset($urlParts['scheme']) === true ? $urlParts['scheme'] : 'http';
46			$host = isset($urlParts['host']) === true ? $urlParts['host'] : '';
47			if (strpos($link, '//') === 0) { //begins with //
48			$link = $scheme.':'.$link;
49			} elseif (strpos($link, '/') === 0) { //begins with /
50			$link = $scheme.'://'.$host.$link;
51			} else {
52			$link = $scheme.'://'.$host.'/'.$link;
53			}
54			}
55
56			if ($remove_hashes === true) {
57			$link = preg_replace('@#.*$@', '', $link);
58			}
59			return $link;
60			}
61
62			/**
63			* normalizing html scrapped by removing unwanted tags (ex. script, css)
64			* and amending external resources paths
65			* @param string $raw_html
66			* @return string
67			*/
68			public function normalizeHtml($raw_html)
69			{
70			if (empty($raw_html)) {
71			return $raw_html;
72			}
73
74			$disallowed_tags = ['script', 'style', 'meta', 'form', 'aside'];
75
76			$xmlDoc = new \DOMDocument();
77			libxml_use_internal_errors(true);
78			$xmlDoc->loadHTML(mb_convert_encoding($raw_html, 'HTML-ENTITIES', 'UTF-8'));
79			libxml_clear_errors();
80
81			$xpath = new \DOMXPath($xmlDoc);
82			foreach ($disallowed_tags as $tag) {
83			$unwanted_entries = $xpath->query('//'.$tag);
84			foreach ($unwanted_entries as $unwanted_elem) {
85			$unwanted_elem->parentNode->removeChild($unwanted_elem);
86			}
87			}
88
89			$html = $this->normalizeBodyLinks($xmlDoc->saveHTML());
90			$html2 = preg_replace('@\s{2,}@', ' ', $html); //remove empty spaces from document
91
92			return $html2;
93			}
94
95			/**
96			* convert all relative paths in html to absolute ones
97			* including: img src, a href ...etc.
98			* @param string $html
99			* @return string
100			*/
101			public function normalizeBodyLinks($html)
102			{
103			if (empty($html) === true) { //if html is empty, do nothing
104			return $html;
105			}
106
107			$xmlDoc = new \DOMDocument();
108			libxml_use_internal_errors(true);
109			$xmlDoc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
110			libxml_clear_errors();
111
112			$xpath = new \DOMXPath($xmlDoc);
113			$lnk_entries = $xpath->query('//a');
114
115			foreach ($lnk_entries as $entry) {
116			$href = $entry->getAttribute('href');
117			$normalized_href = $this->normalizeLink($href);
118
119			$entry->setAttribute('href', $normalized_href);
120			$entry->setAttribute('target', '_blank');
121			}
122
123			$img_entries = $xpath->query('//img');
124
125			foreach ($img_entries as $entry) {
126			$src = $entry->getAttribute('src');
127			$normalized_src = $this->normalizeLink($src);
128			$entry->setAttribute('src', $normalized_src);
129			}
130
131			$final_html = $xmlDoc->saveHTML();
132
133			return $this->getBodyHtml($final_html);
134			}
135
136
137			/**
138			* normalize keywords by removing spaces from each
139			* @param array $keywords
140			* @return array
141			*/
142			public function normalizeKeywords(array $keywords)
143			{
144			foreach ($keywords as $k => $word) {
145			$keywords[$k] = trim($word);
146			}
147
148			return $keywords;
149			}
150
151			/**
152			* extract body content from html document
153			* @param string $doc_html
154			* @return string
155			*/
156			protected function getBodyHtml($doc_html)
157			{
158			$html_crawler = new Crawler($doc_html);
159
160			$ret = '';
161			$html_crawler->filter('body')->each(
162			function (Crawler $node) use (&$ret) {
163			$ret = $node->html();
164			}
165			);
166
167			return $ret;
168			}
169
170
171			/**
172			* extract image source by selector
173			* @param Crawler $crawler
174			* @param string $selector
175			* @return string|NULL
176			*/
177			protected function getSrcByImgSelector(Crawler $crawler, $selector)
178			{
179			$ret = null;
180			$imgExtractClosure = function (Crawler $node) use (&$ret) {
181			$ret = $node->attr('src');
182			};
183			if (Selector::isXPath($selector)) {
184			$crawler->filterXPath($selector)
185			->each($imgExtractClosure);
186			} else {
187			$crawler->filter($selector)
188			->each($imgExtractClosure);
189			}
190
191			if (empty($ret) === false) {
192			return $this->normalizeLink($ret);
193			} else {
194			return null;
195			}
196			}
197			}
198

zrashwani / news-scrapper

AbstractAdapter::extractBody() last analyzed 2017-08-17 08:42 UTC

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

AbstractAdapter::extractBody()
last analyzed 2017-08-17 08:42 UTC