Inspection of "update readme package version" - zrashwani/news-scrapper - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (9824e2)

by Zeid

created 2015-11-28 14:03 UTC

Spacing: +7 added lines, -7 removed lines (patch controls: patch added / patch discarded / remove patch)

@@ -42,10 +42,10 @@  discard block
 block discarded – undo
                 $link = pathinfo($baseUrl, PATHINFO_DIRNAME).'/'.$link;
         } elseif (preg_match('@^http(s?)://.*$@', $link) === 0) { //is not absolute
             $urlParts = parse_url($baseUrl);
-            $scheme = isset($urlParts['scheme'])===true?$urlParts['scheme']:'http';
-            $host = isset($urlParts['host'])===true?$urlParts['host']:'';
+            $scheme = isset($urlParts['scheme']) === true ? $urlParts['scheme'] : 'http';
+            $host = isset($urlParts['host']) === true ? $urlParts['host'] : '';
             if (strpos($link, '//') === 0) { //begins with //
-                $link = $scheme . ':' . $link;
+                $link = $scheme.':'.$link;
             } elseif (strpos($link, '/') === 0) { //begins with /
                 $link = $scheme.'://'.$host.$link;
             } else {
@@ -71,7 +71,7 @@  discard block
 block discarded – undo
             return $raw_html;
         }
         
-        $disallowed_tags = ['script', 'style', 'meta','form','aside'];
+        $disallowed_tags = ['script', 'style', 'meta', 'form', 'aside'];
                 
         $xmlDoc = new \DOMDocument();
         libxml_use_internal_errors(true);
@@ -100,7 +100,7 @@  discard block
 block discarded – undo
      */
     public function normalizeBodyLinks($html)
     {
-        if (empty($html)===true) { //if html is empty, do nothing
+        if (empty($html) === true) { //if html is empty, do nothing
             return $html;
         }
         
@@ -159,7 +159,7 @@  discard block
 block discarded – undo
         
         $ret = '';
         $html_crawler->filter('body')->each(
-            function (Crawler $node) use (&$ret) {
+            function(Crawler $node) use (&$ret) {
                 $ret = $node->html();
             }
         );
@@ -177,7 +177,7 @@  discard block
 block discarded – undo
     protected function getSrcByImgSelector(Crawler $crawler, $selector)
     {
         $ret = null;
-        $imgExtractClosure = function (Crawler $node) use (&$ret) {
+        $imgExtractClosure = function(Crawler $node) use (&$ret) {
             $ret = $node->attr('src');
         };
         if (Selector::isXPath($selector)) {

Please login to merge, or discard this patch.

src/NewsScrapper/Adapters/CustomAdapter.php 1 patch

Spacing: +1 added lines, -1 removed lines (patch controls: patch added / patch discarded / remove patch)

@@ -158,7 +158,7 @@
 block discarded – undo
 
         $ret = null;
         if ($extractClosure === null) {
-            $extractClosure = function (Crawler $node) use (&$ret) {
+            $extractClosure = function(Crawler $node) use (&$ret) {
                 $ret = $node->html();
             };
         }

Please login to merge, or discard this patch.

src/NewsScrapper/Adapters/DefaultAdapter.php 1 patch

Spacing: +7 added lines, -7 removed lines (patch controls: patch added / patch discarded / remove patch)

@@ -22,7 +22,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//head/title')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                             $ret = $node->text();
                 }
             );
@@ -42,7 +42,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//img')
             ->each(
-                function (Crawler $node) use (&$ret, $theAdapter) {
+                function(Crawler $node) use (&$ret, $theAdapter) {
                         $img_src = $theAdapter->normalizeLink($node->attr('src'));
                         $width_org = $height_org = 0;
                     
@@ -77,7 +77,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath("//head/meta[@name='description']")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                             $ret = $node->attr('content');
                 }
             );
@@ -96,7 +96,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath("//head/meta[@name='keywords']")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                             $node_txt = trim($node->attr('content'));
                     if (!empty($node_txt)) {
                         $ret = explode(',', $node_txt);
@@ -119,7 +119,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath("//article")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
 
                             $node_txt = $node->text();
                     if (strlen($node_txt) > strlen($ret)) {
@@ -142,7 +142,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath("//meta[@name='pubdate']")
             ->each(
-                function (Crawler $node) use (&$date_str) {
+                function(Crawler $node) use (&$date_str) {
                     if (empty($date_str) === true) {
                         $date_str = $node->attr('content');
                     }
@@ -172,7 +172,7 @@  discard block
 block discarded – undo
         $ret = null;
         $crawler->filterXPath("//head/meta[@name='author']")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                             $ret = $node->attr('content');
                 }
             );

Please login to merge, or discard this patch.

src/NewsScrapper/Adapters/HAtomAdapter.php 1 patch

Spacing: +6 added lines, -6 removed lines (patch controls: patch added / patch discarded / remove patch)

@@ -17,7 +17,7 @@  discard block
 block discarded – undo
 
         $crawler->filter('.hentry .entry-title')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->text();
                 }
             );
@@ -38,7 +38,7 @@  discard block
 block discarded – undo
 
         $crawler->filter('.hentry .entry-summary')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->text();
                 }
             );
@@ -52,7 +52,7 @@  discard block
 block discarded – undo
 
         $crawler->filter('.hentry a[rel="tag"]')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret[] = $node->text();
                 }
             );
@@ -65,7 +65,7 @@  discard block
 block discarded – undo
         $ret = null;
         $crawler->filter(".hentry .entry-content")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $this->normalizeHtml($node->html());
                 }
             );
@@ -79,7 +79,7 @@  discard block
 block discarded – undo
 
         $crawler->filter('time.published, .hentry .entry-date')
             ->each(
-                function (Crawler $node) use (&$date_str) {
+                function(Crawler $node) use (&$date_str) {
                         $date_str = $node->attr('datetime');
                 }
             );
@@ -95,7 +95,7 @@  discard block
 block discarded – undo
         $ret = null;
         $crawler->filter('.hentry .author.vcard')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->text();
                 }
             );

Please login to merge, or discard this patch.

src/NewsScrapper/Adapters/JsonLDAdapter.php 1 patch

Spacing: +5 added lines, -5 removed lines (patch controls: patch added / patch discarded / remove patch)

@@ -17,7 +17,7 @@  discard block
 block discarded – undo
     public function extractTitle(Crawler $crawler)
     {
         $article_data = $this->getJsonData($crawler);
-        $ret = isset($article_data['headline'])?$article_data['headline']:null;
+        $ret = isset($article_data['headline']) ? $article_data['headline'] : null;
 
         return $ret;
     }
@@ -39,7 +39,7 @@  discard block
 block discarded – undo
     public function extractDescription(Crawler $crawler)
     {
         $article_data = $this->getJsonData($crawler);
-        $ret = isset($article_data['description'])?$article_data['description']:null;
+        $ret = isset($article_data['description']) ? $article_data['description'] : null;
         
         return $ret;
     }
@@ -47,7 +47,7 @@  discard block
 block discarded – undo
     public function extractKeywords(Crawler $crawler)
     {
         $article_data = $this->getJsonData($crawler);
-        $ret = isset($article_data['keywords'])?$article_data['keywords']:array();
+        $ret = isset($article_data['keywords']) ? $article_data['keywords'] : array();
         
         if (!is_array($ret)) {
             $ret = explode(',', $ret);
@@ -137,7 +137,7 @@  discard block
 block discarded – undo
         
         $ret = array();
         $crawler->filterXPath('//script[@type="application/ld+json"]')
-                ->each(function (Crawler $node) use (&$ret) {
+                ->each(function(Crawler $node) use (&$ret) {
                     $json_content = trim($node->text());
                     if (empty($json_content) === true && $node->attr('src')) {
                         $script_path = $this->normalizeLink($node->attr('src'));
@@ -171,7 +171,7 @@  discard block
 block discarded – undo
                 'APIReference'];
         
         if (isset($article_data['@context']) &&
-                $article_data['@context']=='http://schema.org' &&
+                $article_data['@context'] == 'http://schema.org' &&
                 isset($article_data['@type']) &&
                 in_array($article_data['@type'], $article_types)) {
             return true;

Please login to merge, or discard this patch.

src/NewsScrapper/Adapters/MicrodataAdapter.php 1 patch

Spacing: +8 added lines, -8 removed lines (patch controls: patch added / patch discarded / remove patch)

@@ -22,7 +22,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//*[@itemprop="headline"]')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                             $ret = trim($node->text());
                 }
             );
@@ -44,7 +44,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//*[@itemprop="description"]')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                     if ($node->nodeName() === 'meta') {
                         $ret = trim($node->attr('content'));
                     } else {
@@ -67,7 +67,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//*[@itemprop="keywords"]')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                     if ($node->nodeName() === 'meta') {
                         $keyword_txt = trim($node->attr('content'));
                     } else {
@@ -89,7 +89,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//*[@itemprop="articleBody"]')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret .= $node->html();
                 }
             );
@@ -106,7 +106,7 @@  discard block
 block discarded – undo
                     "//*[@itemtype='http://schema.org/$article_type']"
                 )
                     ->each(
-                        function (Crawler $node) use (&$ret) {
+                        function(Crawler $node) use (&$ret) {
                                     $ret .= $node->html();
                         }
                     );
@@ -128,7 +128,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//*[@itemprop="datePublished"]')
             ->each(
-                function (Crawler $node) use (&$date_str) {
+                function(Crawler $node) use (&$date_str) {
                     if ($node->nodeName() === 'meta') {
                         $date_str = $node->attr('content');
                     } elseif ($node->attr('datetime')) {
@@ -157,7 +157,7 @@  discard block
 block discarded – undo
             'and @itemtype="http://schema.org/Person"]//*[@itemprop="name"]'
         )
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                             $ret = $node->text();
                 }
             );
@@ -165,7 +165,7 @@  discard block
 block discarded – undo
         if (is_null($ret)) {
             $crawler->filterXPath('//*[@itemprop="author"]')
                 ->each(
-                    function (Crawler $node) use (&$ret) {
+                    function(Crawler $node) use (&$ret) {
                         if ($node->nodeName() === 'meta') {
                                 $ret = $node->attr('content');
                         } else {

Please login to merge, or discard this patch.

src/NewsScrapper/Adapters/OpenGraphAdapter.php 1 patch

Spacing: +9 added lines, -9 removed lines (patch controls: patch added / patch discarded / remove patch)

@@ -22,7 +22,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath("//head/meta[@property='og:title']")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->attr('content');
                 }
             );
@@ -31,7 +31,7 @@  discard block
 block discarded – undo
         if (empty($ret) === true) {
             $crawler->filterXPath('//h1')
                 ->each(
-                    function (Crawler $node) use (&$ret) {
+                    function(Crawler $node) use (&$ret) {
                             $ret = $node->text();
                     }
                 );
@@ -40,7 +40,7 @@  discard block
 block discarded – undo
         if (empty($ret) === true) {
             $crawler->filterXPath('//head/title')
                 ->each(
-                    function (Crawler $node) use (&$ret) {
+                    function(Crawler $node) use (&$ret) {
                             $ret = $node->text();
                     }
                 );
@@ -61,7 +61,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath("//head/meta[@property='og:image']")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->attr('content');
                 }
             );
@@ -69,7 +69,7 @@  discard block
 block discarded – undo
         if (empty($ret) === true) {
             $crawler->filterXPath('//img')
                 ->each(
-                    function (Crawler $node) use (&$ret, $theAdapter) {
+                    function(Crawler $node) use (&$ret, $theAdapter) {
                         $img_src = $theAdapter->normalizeLink($node->attr('src'));
                         $width_org = $height_org = 0;
                     
@@ -106,7 +106,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath("//head/meta[@property='og:description']")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->attr('content');
                 }
             );
@@ -125,7 +125,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath("//head/meta[@property='og:keywords']")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                 
                         $node_txt = trim($node->attr('content'));
                     if (!empty($node_txt)) {
@@ -150,7 +150,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath("//head/meta[@property='article:published_time']")
             ->each(
-                function (Crawler $node) use (&$date_str) {
+                function(Crawler $node) use (&$date_str) {
                         $date_str = $node->attr('content');
                 }
             );
@@ -168,7 +168,7 @@  discard block
 block discarded – undo
         $ret = null;
         $crawler->filterXPath("//head/meta[@property='article:author']")
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->attr('content');
                 }
             );

Please login to merge, or discard this patch.

src/NewsScrapper/Adapters/ParselyAdapter.php 1 patch

Spacing: +5 added lines, -5 removed lines (patch controls: patch added / patch discarded / remove patch)

@@ -18,7 +18,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//meta[@name="parsely-title"]')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->attr('content');
                 }
             );
@@ -33,7 +33,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//meta[@name="parsely-image-url"]')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->attr('content');
                 }
             );
@@ -55,7 +55,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//meta[@name="parsely-tags"]')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = explode(',', $node->attr('content'));
                 }
             );
@@ -74,7 +74,7 @@  discard block
 block discarded – undo
 
         $crawler->filterXPath('//meta[@name="parsely-pub-date"]')
             ->each(
-                function (Crawler $node) use (&$date_str) {
+                function(Crawler $node) use (&$date_str) {
                         $date_str = $node->attr('content');
                 }
             );
@@ -90,7 +90,7 @@  discard block
 block discarded – undo
         $ret = null;
         $crawler->filterXPath('//meta[@name="parsely-author"]')
             ->each(
-                function (Crawler $node) use (&$ret) {
+                function(Crawler $node) use (&$ret) {
                         $ret = $node->attr('content');
                 }
             );

Please login to merge, or discard this patch.

src/NewsScrapper/Client.php 1 patch

Spacing: +19 added lines, -19 removed lines (patch controls: patch added / patch discarded / remove patch)

@@ -32,12 +32,12 @@  discard block
 block discarded – undo
         
         $this->scrapClient->followRedirects();
         $this->scrapClient->getClient()->setDefaultOption(
-            'config/curl/' .
+            'config/curl/'.
             CURLOPT_SSL_VERIFYHOST,
             false
         );
         $this->scrapClient->getClient()->setDefaultOption(
-            'config/curl/' .
+            'config/curl/'.
             CURLOPT_SSL_VERIFYPEER,
             false
         );
@@ -61,7 +61,7 @@  discard block
 block discarded – undo
      */
     public function setAdapter($adapter_name)
     {
-        $adapterClass = "\Zrashwani\NewsScrapper\Adapters\\" . $adapter_name . "Adapter";
+        $adapterClass = "\Zrashwani\NewsScrapper\Adapters\\".$adapter_name."Adapter";
         if (class_exists($adapterClass)) {
             $this->adapter = new $adapterClass();
         } else {
@@ -88,11 +88,11 @@  discard block
 block discarded – undo
         $theAdapter->currentUrl = $baseUrl;
 
         $isXpath = Selector::isXPath($linkSelector);
-        $method = ($isXpath ===false)?'filter':'filterXPath';
+        $method = ($isXpath === false) ? 'filter' : 'filterXPath';
         
         $crawler->$method($linkSelector)
             ->each(
-                function (Crawler $link_node) use (&$scrap_result, $theAdapter, &$limit) {
+                function(Crawler $link_node) use (&$scrap_result, $theAdapter, &$limit) {
                     if (!is_null($limit) && count($scrap_result) >= $limit) {
                         return;
                     }
@@ -147,21 +147,21 @@  discard block
 block discarded – undo
     ) {
         $adapter->currentUrl = $article_info->url; //associate link url to adapter
         
-        $article_info->title = empty($article_info->title) === true?
-                    $adapter->extractTitle($pageCrawler):$article_info->title;
-        $article_info->image = empty($article_info->image) === true?
-                $adapter->extractImage($pageCrawler, $article_info->url):$article_info->image;
-        $article_info->description = empty($article_info->description) === true?
-                $adapter->extractDescription($pageCrawler):$article_info->description;
-        $article_info->keywords = !isset($article_info->keywords) || count($article_info->keywords) === 0?
-                $adapter->extractKeywords($pageCrawler):$article_info->keywords;
+        $article_info->title = empty($article_info->title) === true ?
+                    $adapter->extractTitle($pageCrawler) : $article_info->title;
+        $article_info->image = empty($article_info->image) === true ?
+                $adapter->extractImage($pageCrawler, $article_info->url) : $article_info->image;
+        $article_info->description = empty($article_info->description) === true ?
+                $adapter->extractDescription($pageCrawler) : $article_info->description;
+        $article_info->keywords = !isset($article_info->keywords) || count($article_info->keywords) === 0 ?
+                $adapter->extractKeywords($pageCrawler) : $article_info->keywords;
         
-        $article_info->author = empty($article_info->author) === true?
-                $adapter->extractAuthor($pageCrawler):$article_info->author;
-        $article_info->publishDate = empty($article_info->publishDate) === true?
-                $adapter->extractPublishDate($pageCrawler):$article_info->publishDate;
-        $article_info->body = empty($article_info->body) === true?
-                $adapter->extractBody($pageCrawler):$article_info->body;
+        $article_info->author = empty($article_info->author) === true ?
+                $adapter->extractAuthor($pageCrawler) : $article_info->author;
+        $article_info->publishDate = empty($article_info->publishDate) === true ?
+                $adapter->extractPublishDate($pageCrawler) : $article_info->publishDate;
+        $article_info->body = empty($article_info->body) === true ?
+                $adapter->extractBody($pageCrawler) : $article_info->body;
         
     }
 }

Please login to merge, or discard this patch.

		@@ -42,10 +42,10 @@ discard block
		block discarded – undo
42	42	$link = pathinfo($baseUrl, PATHINFO_DIRNAME).'/'.$link;
43	43	} elseif (preg_match('@^http(s?)://.*$@', $link) === 0) { //is not absolute
44	44	$urlParts = parse_url($baseUrl);
45		- $scheme = isset($urlParts['scheme'])===true?$urlParts['scheme']:'http';
46		- $host = isset($urlParts['host'])===true?$urlParts['host']:'';
	45	+ $scheme = isset($urlParts['scheme']) === true ? $urlParts['scheme'] : 'http';
	46	+ $host = isset($urlParts['host']) === true ? $urlParts['host'] : '';
47	47	if (strpos($link, '//') === 0) { //begins with //
48		- $link = $scheme . ':' . $link;
	48	+ $link = $scheme.':'.$link;
49	49	} elseif (strpos($link, '/') === 0) { //begins with /
50	50	$link = $scheme.'://'.$host.$link;
51	51	} else {
		@@ -71,7 +71,7 @@ discard block
		block discarded – undo
71	71	return $raw_html;
72	72	}
73	73
74		- $disallowed_tags = ['script', 'style', 'meta','form','aside'];
	74	+ $disallowed_tags = ['script', 'style', 'meta', 'form', 'aside'];
75	75
76	76	$xmlDoc = new \DOMDocument();
77	77	libxml_use_internal_errors(true);
		@@ -100,7 +100,7 @@ discard block
		block discarded – undo
100	100	*/
101	101	public function normalizeBodyLinks($html)
102	102	{
103		- if (empty($html)===true) { //if html is empty, do nothing
	103	+ if (empty($html) === true) { //if html is empty, do nothing
104	104	return $html;
105	105	}
106	106
		@@ -159,7 +159,7 @@ discard block
		block discarded – undo
159	159
160	160	$ret = '';
161	161	$html_crawler->filter('body')->each(
162		- function (Crawler $node) use (&$ret) {
	162	+ function(Crawler $node) use (&$ret) {
163	163	$ret = $node->html();
164	164	}
165	165	);
		@@ -177,7 +177,7 @@ discard block
		block discarded – undo
177	177	protected function getSrcByImgSelector(Crawler $crawler, $selector)
178	178	{
179	179	$ret = null;
180		- $imgExtractClosure = function (Crawler $node) use (&$ret) {
	180	+ $imgExtractClosure = function(Crawler $node) use (&$ret) {
181	181	$ret = $node->attr('src');
182	182	};
183	183	if (Selector::isXPath($selector)) {

		@@ -158,7 +158,7 @@
		block discarded – undo
158	158
159	159	$ret = null;
160	160	if ($extractClosure === null) {
161		- $extractClosure = function (Crawler $node) use (&$ret) {
	161	+ $extractClosure = function(Crawler $node) use (&$ret) {
162	162	$ret = $node->html();
163	163	};
164	164	}

		@@ -22,7 +22,7 @@ discard block
		block discarded – undo
22	22
23	23	$crawler->filterXPath('//head/title')
24	24	->each(
25		- function (Crawler $node) use (&$ret) {
	25	+ function(Crawler $node) use (&$ret) {
26	26	$ret = $node->text();
27	27	}
28	28	);
		@@ -42,7 +42,7 @@ discard block
		block discarded – undo
42	42
43	43	$crawler->filterXPath('//img')
44	44	->each(
45		- function (Crawler $node) use (&$ret, $theAdapter) {
	45	+ function(Crawler $node) use (&$ret, $theAdapter) {
46	46	$img_src = $theAdapter->normalizeLink($node->attr('src'));
47	47	$width_org = $height_org = 0;
48	48
		@@ -77,7 +77,7 @@ discard block
		block discarded – undo
77	77
78	78	$crawler->filterXPath("//head/meta[@name='description']")
79	79	->each(
80		- function (Crawler $node) use (&$ret) {
	80	+ function(Crawler $node) use (&$ret) {
81	81	$ret = $node->attr('content');
82	82	}
83	83	);
		@@ -96,7 +96,7 @@ discard block
		block discarded – undo
96	96
97	97	$crawler->filterXPath("//head/meta[@name='keywords']")
98	98	->each(
99		- function (Crawler $node) use (&$ret) {
	99	+ function(Crawler $node) use (&$ret) {
100	100	$node_txt = trim($node->attr('content'));
101	101	if (!empty($node_txt)) {
102	102	$ret = explode(',', $node_txt);
		@@ -119,7 +119,7 @@ discard block
		block discarded – undo
119	119
120	120	$crawler->filterXPath("//article")
121	121	->each(
122		- function (Crawler $node) use (&$ret) {
	122	+ function(Crawler $node) use (&$ret) {
123	123
124	124	$node_txt = $node->text();
125	125	if (strlen($node_txt) > strlen($ret)) {
		@@ -142,7 +142,7 @@ discard block
		block discarded – undo
142	142
143	143	$crawler->filterXPath("//meta[@name='pubdate']")
144	144	->each(
145		- function (Crawler $node) use (&$date_str) {
	145	+ function(Crawler $node) use (&$date_str) {
146	146	if (empty($date_str) === true) {
147	147	$date_str = $node->attr('content');
148	148	}
		@@ -172,7 +172,7 @@ discard block
		block discarded – undo
172	172	$ret = null;
173	173	$crawler->filterXPath("//head/meta[@name='author']")
174	174	->each(
175		- function (Crawler $node) use (&$ret) {
	175	+ function(Crawler $node) use (&$ret) {
176	176	$ret = $node->attr('content');
177	177	}
178	178	);

		@@ -17,7 +17,7 @@ discard block
		block discarded – undo
17	17
18	18	$crawler->filter('.hentry .entry-title')
19	19	->each(
20		- function (Crawler $node) use (&$ret) {
	20	+ function(Crawler $node) use (&$ret) {
21	21	$ret = $node->text();
22	22	}
23	23	);
		@@ -38,7 +38,7 @@ discard block
		block discarded – undo
38	38
39	39	$crawler->filter('.hentry .entry-summary')
40	40	->each(
41		- function (Crawler $node) use (&$ret) {
	41	+ function(Crawler $node) use (&$ret) {
42	42	$ret = $node->text();
43	43	}
44	44	);
		@@ -52,7 +52,7 @@ discard block
		block discarded – undo
52	52
53	53	$crawler->filter('.hentry a[rel="tag"]')
54	54	->each(
55		- function (Crawler $node) use (&$ret) {
	55	+ function(Crawler $node) use (&$ret) {
56	56	$ret[] = $node->text();
57	57	}
58	58	);
		@@ -65,7 +65,7 @@ discard block
		block discarded – undo
65	65	$ret = null;
66	66	$crawler->filter(".hentry .entry-content")
67	67	->each(
68		- function (Crawler $node) use (&$ret) {
	68	+ function(Crawler $node) use (&$ret) {
69	69	$ret = $this->normalizeHtml($node->html());
70	70	}
71	71	);
		@@ -79,7 +79,7 @@ discard block
		block discarded – undo
79	79
80	80	$crawler->filter('time.published, .hentry .entry-date')
81	81	->each(
82		- function (Crawler $node) use (&$date_str) {
	82	+ function(Crawler $node) use (&$date_str) {
83	83	$date_str = $node->attr('datetime');
84	84	}
85	85	);
		@@ -95,7 +95,7 @@ discard block
		block discarded – undo
95	95	$ret = null;
96	96	$crawler->filter('.hentry .author.vcard')
97	97	->each(
98		- function (Crawler $node) use (&$ret) {
	98	+ function(Crawler $node) use (&$ret) {
99	99	$ret = $node->text();
100	100	}
101	101	);

		@@ -17,7 +17,7 @@ discard block
		block discarded – undo
17	17	public function extractTitle(Crawler $crawler)
18	18	{
19	19	$article_data = $this->getJsonData($crawler);
20		- $ret = isset($article_data['headline'])?$article_data['headline']:null;
	20	+ $ret = isset($article_data['headline']) ? $article_data['headline'] : null;
21	21
22	22	return $ret;
23	23	}
		@@ -39,7 +39,7 @@ discard block
		block discarded – undo
39	39	public function extractDescription(Crawler $crawler)
40	40	{
41	41	$article_data = $this->getJsonData($crawler);
42		- $ret = isset($article_data['description'])?$article_data['description']:null;
	42	+ $ret = isset($article_data['description']) ? $article_data['description'] : null;
43	43
44	44	return $ret;
45	45	}
		@@ -47,7 +47,7 @@ discard block
		block discarded – undo
47	47	public function extractKeywords(Crawler $crawler)
48	48	{
49	49	$article_data = $this->getJsonData($crawler);
50		- $ret = isset($article_data['keywords'])?$article_data['keywords']:array();
	50	+ $ret = isset($article_data['keywords']) ? $article_data['keywords'] : array();
51	51
52	52	if (!is_array($ret)) {
53	53	$ret = explode(',', $ret);
		@@ -137,7 +137,7 @@ discard block
		block discarded – undo
137	137
138	138	$ret = array();
139	139	$crawler->filterXPath('//script[@type="application/ld+json"]')
140		- ->each(function (Crawler $node) use (&$ret) {
	140	+ ->each(function(Crawler $node) use (&$ret) {
141	141	$json_content = trim($node->text());
142	142	if (empty($json_content) === true && $node->attr('src')) {
143	143	$script_path = $this->normalizeLink($node->attr('src'));
		@@ -171,7 +171,7 @@ discard block
		block discarded – undo
171	171	'APIReference'];
172	172
173	173	if (isset($article_data['@context']) &&
174		- $article_data['@context']=='http://schema.org' &&
	174	+ $article_data['@context'] == 'http://schema.org' &&
175	175	isset($article_data['@type']) &&
176	176	in_array($article_data['@type'], $article_types)) {
177	177	return true;

		@@ -22,7 +22,7 @@ discard block
		block discarded – undo
22	22
23	23	$crawler->filterXPath('//*[@itemprop="headline"]')
24	24	->each(
25		- function (Crawler $node) use (&$ret) {
	25	+ function(Crawler $node) use (&$ret) {
26	26	$ret = trim($node->text());
27	27	}
28	28	);
		@@ -44,7 +44,7 @@ discard block
		block discarded – undo
44	44
45	45	$crawler->filterXPath('//*[@itemprop="description"]')
46	46	->each(
47		- function (Crawler $node) use (&$ret) {
	47	+ function(Crawler $node) use (&$ret) {
48	48	if ($node->nodeName() === 'meta') {
49	49	$ret = trim($node->attr('content'));
50	50	} else {
		@@ -67,7 +67,7 @@ discard block
		block discarded – undo
67	67
68	68	$crawler->filterXPath('//*[@itemprop="keywords"]')
69	69	->each(
70		- function (Crawler $node) use (&$ret) {
	70	+ function(Crawler $node) use (&$ret) {
71	71	if ($node->nodeName() === 'meta') {
72	72	$keyword_txt = trim($node->attr('content'));
73	73	} else {
		@@ -89,7 +89,7 @@ discard block
		block discarded – undo
89	89
90	90	$crawler->filterXPath('//*[@itemprop="articleBody"]')
91	91	->each(
92		- function (Crawler $node) use (&$ret) {
	92	+ function(Crawler $node) use (&$ret) {
93	93	$ret .= $node->html();
94	94	}
95	95	);
		@@ -106,7 +106,7 @@ discard block
		block discarded – undo
106	106	"//*[@itemtype='http://schema.org/$article_type']"
107	107	)
108	108	->each(
109		- function (Crawler $node) use (&$ret) {
	109	+ function(Crawler $node) use (&$ret) {
110	110	$ret .= $node->html();
111	111	}
112	112	);
		@@ -128,7 +128,7 @@ discard block
		block discarded – undo
128	128
129	129	$crawler->filterXPath('//*[@itemprop="datePublished"]')
130	130	->each(
131		- function (Crawler $node) use (&$date_str) {
	131	+ function(Crawler $node) use (&$date_str) {
132	132	if ($node->nodeName() === 'meta') {
133	133	$date_str = $node->attr('content');
134	134	} elseif ($node->attr('datetime')) {
		@@ -157,7 +157,7 @@ discard block
		block discarded – undo
157	157	'and @itemtype="http://schema.org/Person"]//*[@itemprop="name"]'
158	158	)
159	159	->each(
160		- function (Crawler $node) use (&$ret) {
	160	+ function(Crawler $node) use (&$ret) {
161	161	$ret = $node->text();
162	162	}
163	163	);
		@@ -165,7 +165,7 @@ discard block
		block discarded – undo
165	165	if (is_null($ret)) {
166	166	$crawler->filterXPath('//*[@itemprop="author"]')
167	167	->each(
168		- function (Crawler $node) use (&$ret) {
	168	+ function(Crawler $node) use (&$ret) {
169	169	if ($node->nodeName() === 'meta') {
170	170	$ret = $node->attr('content');
171	171	} else {

		@@ -22,7 +22,7 @@ discard block
		block discarded – undo
22	22
23	23	$crawler->filterXPath("//head/meta[@property='og:title']")
24	24	->each(
25		- function (Crawler $node) use (&$ret) {
	25	+ function(Crawler $node) use (&$ret) {
26	26	$ret = $node->attr('content');
27	27	}
28	28	);
		@@ -31,7 +31,7 @@ discard block
		block discarded – undo
31	31	if (empty($ret) === true) {
32	32	$crawler->filterXPath('//h1')
33	33	->each(
34		- function (Crawler $node) use (&$ret) {
	34	+ function(Crawler $node) use (&$ret) {
35	35	$ret = $node->text();
36	36	}
37	37	);
		@@ -40,7 +40,7 @@ discard block
		block discarded – undo
40	40	if (empty($ret) === true) {
41	41	$crawler->filterXPath('//head/title')
42	42	->each(
43		- function (Crawler $node) use (&$ret) {
	43	+ function(Crawler $node) use (&$ret) {
44	44	$ret = $node->text();
45	45	}
46	46	);
		@@ -61,7 +61,7 @@ discard block
		block discarded – undo
61	61
62	62	$crawler->filterXPath("//head/meta[@property='og:image']")
63	63	->each(
64		- function (Crawler $node) use (&$ret) {
	64	+ function(Crawler $node) use (&$ret) {
65	65	$ret = $node->attr('content');
66	66	}
67	67	);
		@@ -69,7 +69,7 @@ discard block
		block discarded – undo
69	69	if (empty($ret) === true) {
70	70	$crawler->filterXPath('//img')
71	71	->each(
72		- function (Crawler $node) use (&$ret, $theAdapter) {
	72	+ function(Crawler $node) use (&$ret, $theAdapter) {
73	73	$img_src = $theAdapter->normalizeLink($node->attr('src'));
74	74	$width_org = $height_org = 0;
75	75
		@@ -106,7 +106,7 @@ discard block
		block discarded – undo
106	106
107	107	$crawler->filterXPath("//head/meta[@property='og:description']")
108	108	->each(
109		- function (Crawler $node) use (&$ret) {
	109	+ function(Crawler $node) use (&$ret) {
110	110	$ret = $node->attr('content');
111	111	}
112	112	);
		@@ -125,7 +125,7 @@ discard block
		block discarded – undo
125	125
126	126	$crawler->filterXPath("//head/meta[@property='og:keywords']")
127	127	->each(
128		- function (Crawler $node) use (&$ret) {
	128	+ function(Crawler $node) use (&$ret) {
129	129
130	130	$node_txt = trim($node->attr('content'));
131	131	if (!empty($node_txt)) {
		@@ -150,7 +150,7 @@ discard block
		block discarded – undo
150	150
151	151	$crawler->filterXPath("//head/meta[@property='article:published_time']")
152	152	->each(
153		- function (Crawler $node) use (&$date_str) {
	153	+ function(Crawler $node) use (&$date_str) {
154	154	$date_str = $node->attr('content');
155	155	}
156	156	);
		@@ -168,7 +168,7 @@ discard block
		block discarded – undo
168	168	$ret = null;
169	169	$crawler->filterXPath("//head/meta[@property='article:author']")
170	170	->each(
171		- function (Crawler $node) use (&$ret) {
	171	+ function(Crawler $node) use (&$ret) {
172	172	$ret = $node->attr('content');
173	173	}
174	174	);

		@@ -18,7 +18,7 @@ discard block
		block discarded – undo
18	18
19	19	$crawler->filterXPath('//meta[@name="parsely-title"]')
20	20	->each(
21		- function (Crawler $node) use (&$ret) {
	21	+ function(Crawler $node) use (&$ret) {
22	22	$ret = $node->attr('content');
23	23	}
24	24	);
		@@ -33,7 +33,7 @@ discard block
		block discarded – undo
33	33
34	34	$crawler->filterXPath('//meta[@name="parsely-image-url"]')
35	35	->each(
36		- function (Crawler $node) use (&$ret) {
	36	+ function(Crawler $node) use (&$ret) {
37	37	$ret = $node->attr('content');
38	38	}
39	39	);
		@@ -55,7 +55,7 @@ discard block
		block discarded – undo
55	55
56	56	$crawler->filterXPath('//meta[@name="parsely-tags"]')
57	57	->each(
58		- function (Crawler $node) use (&$ret) {
	58	+ function(Crawler $node) use (&$ret) {
59	59	$ret = explode(',', $node->attr('content'));
60	60	}
61	61	);
		@@ -74,7 +74,7 @@ discard block
		block discarded – undo
74	74
75	75	$crawler->filterXPath('//meta[@name="parsely-pub-date"]')
76	76	->each(
77		- function (Crawler $node) use (&$date_str) {
	77	+ function(Crawler $node) use (&$date_str) {
78	78	$date_str = $node->attr('content');
79	79	}
80	80	);
		@@ -90,7 +90,7 @@ discard block
		block discarded – undo
90	90	$ret = null;
91	91	$crawler->filterXPath('//meta[@name="parsely-author"]')
92	92	->each(
93		- function (Crawler $node) use (&$ret) {
	93	+ function(Crawler $node) use (&$ret) {
94	94	$ret = $node->attr('content');
95	95	}
96	96	);

		@@ -32,12 +32,12 @@ discard block
		block discarded – undo
32	32
33	33	$this->scrapClient->followRedirects();
34	34	$this->scrapClient->getClient()->setDefaultOption(
35		- 'config/curl/' .
	35	+ 'config/curl/'.
36	36	CURLOPT_SSL_VERIFYHOST,
37	37	false
38	38	);
39	39	$this->scrapClient->getClient()->setDefaultOption(
40		- 'config/curl/' .
	40	+ 'config/curl/'.
41	41	CURLOPT_SSL_VERIFYPEER,
42	42	false
43	43	);
		@@ -61,7 +61,7 @@ discard block
		block discarded – undo
61	61	*/
62	62	public function setAdapter($adapter_name)
63	63	{
64		- $adapterClass = "\Zrashwani\NewsScrapper\Adapters\\" . $adapter_name . "Adapter";
	64	+ $adapterClass = "\Zrashwani\NewsScrapper\Adapters\\".$adapter_name."Adapter";
65	65	if (class_exists($adapterClass)) {
66	66	$this->adapter = new $adapterClass();
67	67	} else {
		@@ -88,11 +88,11 @@ discard block
		block discarded – undo
88	88	$theAdapter->currentUrl = $baseUrl;
89	89
90	90	$isXpath = Selector::isXPath($linkSelector);
91		- $method = ($isXpath ===false)?'filter':'filterXPath';
	91	+ $method = ($isXpath === false) ? 'filter' : 'filterXPath';
92	92
93	93	$crawler->$method($linkSelector)
94	94	->each(
95		- function (Crawler $link_node) use (&$scrap_result, $theAdapter, &$limit) {
	95	+ function(Crawler $link_node) use (&$scrap_result, $theAdapter, &$limit) {
96	96	if (!is_null($limit) && count($scrap_result) >= $limit) {
97	97	return;
98	98	}
		@@ -147,21 +147,21 @@ discard block
		block discarded – undo
147	147	) {
148	148	$adapter->currentUrl = $article_info->url; //associate link url to adapter
149	149
150		- $article_info->title = empty($article_info->title) === true?
151		- $adapter->extractTitle($pageCrawler):$article_info->title;
152		- $article_info->image = empty($article_info->image) === true?
153		- $adapter->extractImage($pageCrawler, $article_info->url):$article_info->image;
154		- $article_info->description = empty($article_info->description) === true?
155		- $adapter->extractDescription($pageCrawler):$article_info->description;
156		- $article_info->keywords = !isset($article_info->keywords) || count($article_info->keywords) === 0?
157		- $adapter->extractKeywords($pageCrawler):$article_info->keywords;
	150	+ $article_info->title = empty($article_info->title) === true ?
	151	+ $adapter->extractTitle($pageCrawler) : $article_info->title;
	152	+ $article_info->image = empty($article_info->image) === true ?
	153	+ $adapter->extractImage($pageCrawler, $article_info->url) : $article_info->image;
	154	+ $article_info->description = empty($article_info->description) === true ?
	155	+ $adapter->extractDescription($pageCrawler) : $article_info->description;
	156	+ $article_info->keywords = !isset($article_info->keywords) \|\| count($article_info->keywords) === 0 ?
	157	+ $adapter->extractKeywords($pageCrawler) : $article_info->keywords;
158	158
159		- $article_info->author = empty($article_info->author) === true?
160		- $adapter->extractAuthor($pageCrawler):$article_info->author;
161		- $article_info->publishDate = empty($article_info->publishDate) === true?
162		- $adapter->extractPublishDate($pageCrawler):$article_info->publishDate;
163		- $article_info->body = empty($article_info->body) === true?
164		- $adapter->extractBody($pageCrawler):$article_info->body;
	159	+ $article_info->author = empty($article_info->author) === true ?
	160	+ $adapter->extractAuthor($pageCrawler) : $article_info->author;
	161	+ $article_info->publishDate = empty($article_info->publishDate) === true ?
	162	+ $adapter->extractPublishDate($pageCrawler) : $article_info->publishDate;
	163	+ $article_info->body = empty($article_info->body) === true ?
	164	+ $adapter->extractBody($pageCrawler) : $article_info->body;
165	165
166	166	}
167	167	}

zrashwani / news-scrapper

Branch — master (9824e2)

Category

Spacing +7 added lines, -7 removed lines patch added patch discarded remove patch

Spacing +1 added lines, -1 removed lines patch added patch discarded remove patch

Spacing +7 added lines, -7 removed lines patch added patch discarded remove patch

Spacing +6 added lines, -6 removed lines patch added patch discarded remove patch

Spacing +5 added lines, -5 removed lines patch added patch discarded remove patch

Spacing +8 added lines, -8 removed lines patch added patch discarded remove patch

Spacing +9 added lines, -9 removed lines patch added patch discarded remove patch

Spacing +5 added lines, -5 removed lines patch added patch discarded remove patch

Spacing +19 added lines, -19 removed lines patch added patch discarded remove patch