MetaExtractor::getOpenGraph() - Code Metrics - Inspection of "Added additional publish date search functions to..." - scotteh/php-goose - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 140b40...7cb46a )

by Andrew

created 2016-06-02 11:27 UTC

MetaExtractor::getOpenGraph() B

↳ Parent: MetaExtractor

Complexity

Conditions	4
Paths	4

Size

Total Lines	24
Code Lines	12

Duplication

Lines	10
Ratio	41.67 %

Importance

Changes	2
Bugs	0	Features	1

Metric	Value
dl	10
loc	24
c	2
b	0
f	1
rs	8.6845
cc	4
eloc	12
nc	4
nop	0

<?php

namespace Goose\Modules\Extractors;

use Goose\Article;
use Goose\Utils\Helper;
use Goose\Traits\ArticleMutatorTrait;
use Goose\Modules\AbstractModule;
use Goose\Modules\ModuleInterface;
use DOMWrap\Element;
use DOMWrap\Document;

/**
 * Meta Extractor
 *
 * Populates an Article with document-level metadata: OpenGraph values, title,
 * meta description, meta keywords, canonical link, tags and language. When the
 * article has a top node, embedded videos, links and popular words are
 * extracted as well.
 *
 * @package Goose\Modules\Extractors
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class MetaExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait;

    /** @var string[] Separator tokens removed from the start/end of a title */
    protected static $SPLITTER_CHARS = [
        '|', '-', '»', ':',
    ];

    /** @var string CSS selector used to locate tag links */
    protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";

    /** @var string[] Regex fragments (already escaped) of accepted video hosts */
    protected static $VIDEO_PROVIDERS = [
        'youtube\.com',
        'youtu\.be',
        'vimeo\.com',
        'blip\.tv',
        'dailymotion\.com',
        'dai\.ly',
        'flickr\.com',
        'flic\.kr',
    ];

    /**
     * Extract all meta information and store it on the article.
     *
     * OpenGraph data is stored first because the title extraction reads it
     * back from the article. The detected language is also written to the
     * configuration under the 'language' key.
     *
     * @param Article $article
     */
    public function run(Article $article) {
        $this->article($article);

        $article->setOpenGraph($this->getOpenGraph());
        $article->setTitle($this->getTitle());
        $article->setMetaDescription($this->getMetaDescription());
        $article->setMetaKeywords($this->getMetaKeywords());
        $article->setCanonicalLink($this->getCanonicalLink());
        $article->setTags($this->getTags());

        // These extractors walk the DOM around the top node, so they need one
        if ($this->article()->getTopNode() instanceof Element) {
            $article->setVideos($this->getVideos());
            $article->setLinks($this->getLinks());
            $article->setPopularWords($this->getPopularWords());
        }

        $article->setLanguage($this->getMetaLanguage());

        $this->config()->set('language', $article->getLanguage());
    }

    /**
     * Retrieve all OpenGraph meta data
     *
     * Keys are the property name without its leading namespace, e.g.
     * "og:title" => "title" and "og:image:width" => "image:width".
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * @return string[]
     */
    private function getOpenGraph() {
        $results = $this->getMetaPropertiesByPrefix('og');

        // Additionally retrieve type values based on provided og:type (http://ogp.me/#types)
        if (isset($results['type'])) {
            // array_replace() keeps keys untouched; later values win, like direct assignment
            $results = array_replace($results, $this->getMetaPropertiesByPrefix($results['type']));
        }

        return $results;
    }

    /**
     * Collect the content of every <meta> whose property starts with "<prefix>:".
     *
     * Everything after the first ':' is used as the key, so structured
     * properties (e.g. "og:image:width") get their own entry instead of
     * overwriting the parent property ("og:image").
     *
     * @param string $prefix Property namespace without the trailing colon
     *
     * @return string[] Map of property name (without prefix) => content
     */
    private function getMetaPropertiesByPrefix($prefix) {
        $values = [];

        $nodes = $this->article()->getDoc()->find('meta[property^="' . $prefix . ':"]');

        foreach ($nodes as $node) {
            $parts = explode(':', (string)$node->attr('property'), 2);

            // Ignore malformed properties that have nothing after the namespace
            if (!isset($parts[1]) || $parts[1] === '') {
                continue;
            }

            $values[$parts[1]] = $node->attr('content');
        }

        return $values;
    }

    /**
     * Clean title text
     *
     * Removes the OpenGraph site name and the article domain from the title,
     * then strips a leading and/or trailing separator token.
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * @param string $title
     *
     * @return string
     */
    private function cleanTitle($title) {
        $title = (string)$title;
        $openGraph = $this->article()->getOpenGraph();

        // Check if we have the site name in OpenGraph data
        if (isset($openGraph['site_name'])) {
            $title = str_replace($openGraph['site_name'], '', $title);
        }

        // Try to remove the domain from URL
        $domain = $this->article()->getDomain();
        if ($domain) {
            $title = str_ireplace($domain, '', $title);
        }

        // Split the title in words
        // TechCrunch | my wonderful article
        // my wonderful article | TechCrunch
        $titleWords = preg_split('@[\s]+@', trim($title), -1, PREG_SPLIT_NO_EMPTY);

        // Check for an empty title (or a failed split)
        if (empty($titleWords)) {
            return '';
        }

        // Drop a trailing separator token
        if (in_array(end($titleWords), self::$SPLITTER_CHARS, true)) {
            array_pop($titleWords);
        }

        // Drop a leading separator token
        if (isset($titleWords[0]) && in_array($titleWords[0], self::$SPLITTER_CHARS, true)) {
            array_shift($titleWords);
        }

        // Rebuild the title
        return trim(implode(' ', $titleWords));
    }

    /**
     * Get article title
     *
     * Sources are tried in order: OpenGraph title, <meta name="headline">,
     * then the document <title>.
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * @return string Cleaned title, or an empty string when none is found
     */
    private function getTitle() {
        $openGraph = $this->article()->getOpenGraph();

        // Rely on OpenGraph in case we have the data
        if (isset($openGraph['title'])) {
            return $this->cleanTitle($openGraph['title']);
        }

        $nodes = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), 'meta', 'name', 'headline');
        if ($nodes->count()) {
            return $this->cleanTitle($nodes->first()->attr('content'));
        }

        $nodes = $this->article()->getDoc()->find('html > head > title');
        if ($nodes->count()) {
            return $this->cleanTitle(Helper::textNormalise($nodes->first()->text()));
        }

        return '';
    }

    /**
     * Find elements whose attribute, lower-cased (ASCII only), equals $value.
     *
     * The arguments are concatenated into the XPath expression unescaped, so
     * they must be trusted values; $value must already be lower-case.
     *
     * @param Document $doc
     * @param string $tag Element name to search for
     * @param string $property Attribute name to compare
     * @param string $value Expected lower-case attribute value
     *
     * @return \DOMWrap\NodeList
     */
    private function getNodesByLowercasePropertyValue(Document $doc, $tag, $property, $value) {
        return $doc->findXPath("descendant::".$tag."[translate(@".$property.", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='".$value."']");
    }

    /**
     * Read an attribute from the first <meta> matching property/value.
     *
     * @param Document $doc
     * @param string $property Attribute name used to identify the meta tag
     * @param string $value Expected lower-case value of that attribute
     * @param string $attr Attribute to read from the matched tag
     *
     * @return string Trimmed attribute value, or an empty string when absent
     */
    private function getMetaContent(Document $doc, $property, $value, $attr = 'content') {
        $nodes = $this->getNodesByLowercasePropertyValue($doc, 'meta', $property, $value);

        if (!$nodes->count()) {
            return '';
        }

        // The attribute may be missing on the matched tag; normalise to a string
        return trim((string)$nodes->first()->attr($attr));
    }

    /**
     * If the article has meta language set in the source, use that
     *
     * Looks at <html lang>, then the content-language and lang meta tags.
     * A bare two-letter code is accepted, as is a tag with a two-letter
     * primary subtag such as "en-US" or "pt_BR" (the subtag is returned).
     *
     * @return string Lower-case two-letter code, or an empty string
     */
    private function getMetaLanguage() {
        $lang = '';

        $el = $this->article()->getDoc()->find('html[lang]');

        if ($el->count()) {
            $lang = $el->first()->attr('lang');
        }

        if (empty($lang)) {
            $selectors = [
                'html > head > meta[http-equiv=content-language]',
                'html > head > meta[name=lang]',
            ];

            foreach ($selectors as $selector) {
                $el = $this->article()->getDoc()->find($selector);

                if ($el->count()) {
                    $lang = $el->first()->attr('content');
                    break;
                }
            }
        }

        // Capture the primary subtag; anything after '-' or '_' (region, script) is ignored
        if (preg_match('@^([A-Za-z]{2})(?:[-_][A-Za-z0-9_-]*)?$@', trim((string)$lang), $matches)) {
            return strtolower($matches[1]);
        }

        return '';
    }

    /**
     * If the article has meta description set in the source, use that
     *
     * Falls back from the standard description to the OpenGraph and then the
     * Twitter description.
     *
     * @return string
     */
    private function getMetaDescription() {
        $sources = [
            ['name', 'description'],
            ['property', 'og:description'],
            ['name', 'twitter:description'],
        ];

        $desc = '';

        foreach ($sources as $source) {
            $desc = $this->getMetaContent($this->article()->getDoc(), $source[0], $source[1]);

            if (!empty($desc)) {
                break;
            }
        }

        // getMetaContent() already returns a trimmed string
        return $desc;
    }

    /**
     * If the article has meta keywords set in the source, use that
     *
     * @return string
     */
    private function getMetaKeywords() {
        return $this->getMetaContent($this->article()->getDoc(), 'name', 'keywords');
    }

    /**
     * If the article has meta canonical link set in the url
     *
     * The first matching element among <link rel="canonical">, og:url and
     * twitter:url is used; otherwise the article's final URL is returned.
     *
     * @return string
     */
    private function getCanonicalLink() {
        // tag, identifying attribute, expected value, attribute holding the URL
        $sources = [
            ['link', 'rel', 'canonical', 'href'],
            ['meta', 'property', 'og:url', 'content'],
            ['meta', 'name', 'twitter:url', 'content'],
        ];

        foreach ($sources as $source) {
            $nodes = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), $source[0], $source[1], $source[2]);

            if ($nodes->count()) {
                return trim((string)$nodes->first()->attr($source[3]));
            }
        }

        return $this->article()->getFinalUrl();
    }

    /**
     * Collect the normalised text of every tag link in the document.
     *
     * @return string[]
     */
    private function getTags() {
        $nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);

        $tags = [];

        foreach ($nodes as $node) {
            $tags[] = Helper::textNormalise($node->text());
        }

        return $tags;
    }

    /**
     * Pulls out videos we like
     *
     * Searches embed/object/iframe elements under the top node's parent and
     * keeps the URLs that point at a known video provider.
     *
     * @return string[]
     */
    private function getVideos() {
        $videos = [];

        $nodes = $this->article()->getTopNode()->parent()->find('embed, object, iframe');

        foreach ($nodes as $node) {
            // <object> elements carry their URL in "data" rather than "src"
            $src = $node->hasAttribute('src') ? $node->attr('src') : $node->attr('data');

            if ($this->isSupportedVideoUrl($src)) {
                $videos[] = $src;
            }
        }

        return $videos;
    }

    /**
     * Check whether a URL is an http(s) URL hosted by a known video provider.
     *
     * The host must equal a provider domain or be a sub-domain of it; a mere
     * suffix match (e.g. "notyoutube.com") is rejected.
     *
     * @param string|null $src
     *
     * @return bool
     */
    private function isSupportedVideoUrl($src) {
        if (!is_string($src) || $src === '') {
            return false;
        }

        $host = parse_url($src, PHP_URL_HOST);
        $scheme = parse_url($src, PHP_URL_SCHEME);

        if (!is_string($host) || !is_string($scheme)) {
            return false;
        }

        if (!in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        foreach (self::$VIDEO_PROVIDERS as $domain) {
            // Anchor on start-of-host or a dot so only whole domain labels match
            if (preg_match('@(?:^|\.)' . $domain . '$@i', $host)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Pulls out links we like
     *
     * Collects anchors under the top node's parent, skipping blank hrefs and
     * bare "#" placeholders.
     *
     * @return array[] List of ['url' => string, 'text' => string] entries
     */
    private function getLinks() {
        $goodLinks = [];

        $candidates = $this->article()->getTopNode()->parent()->find('a[href]');

        foreach ($candidates as $el) {
            $href = (string)$el->attr('href');

            if ($href === '#' || trim($href) === '') {
                continue;
            }

            $goodLinks[] = [
                'url' => $href,
                'text' => Helper::textNormalise($el->text()),
            ];
        }

        return $goodLinks;
    }

    /**
     * Determine the most frequent non-stop-words of the article.
     *
     * The text is built from the title, the meta description and the cleaned
     * article text.
     *
     * @return int[] Map of word => occurrence count, most frequent first
     */
    private function getPopularWords() {
        $limit = 5;
        $minimumFrequency = 1;
        $stopWords = $this->config()->getStopWords()->getCurrentStopWords();

        $text = $this->article()->getTitle();
        $text .= ' ' . $this->article()->getMetaDescription();

        if ($this->article()->getTopNode()) {
            $text .= ' ' . $this->article()->getCleanedArticleText();
        }

        // Decode and split words by white-space
        $text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
        $words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

        // Remove stop words from $words
        $words = array_diff($words, $stopWords);

        // Count and sort $words
        $words = array_count_values($words);
        arsort($words);

        // Limit $words; preserve keys so numeric words (e.g. "2016") are not
        // renumbered to 0, 1, ... by array_slice()
        $words = array_slice($words, 0, $limit, true);

        // Filter out words below the minimum frequency
        $words = array_filter($words, function($count) use ($minimumFrequency) {
            return $count >= $minimumFrequency;
        });

        return $words;
    }
}


1		<?php
2
3		namespace Goose\Modules\Extractors;
4
5		use Goose\Article;
6		use Goose\Utils\Helper;
7		use Goose\Traits\ArticleMutatorTrait;
8		use Goose\Modules\AbstractModule;
9		use Goose\Modules\ModuleInterface;
10		use DOMWrap\Element;
11		use DOMWrap\Document;
12
13		/**
14		* Content Extractor
15		*
16		* @package Goose\Modules\Extractors
17		* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
18		*/
19		class MetaExtractor extends AbstractModule implements ModuleInterface {
20		use ArticleMutatorTrait;
21
22		/** @var string[] */
23		protected static $SPLITTER_CHARS = [
24		'\|', '-', '»', ':',
25		];
26
27		/** @var string */
28		protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";
29
30		/** @var string[] */
31		protected static $VIDEO_PROVIDERS = [
32		'youtube\.com',
33		'youtu\.be',
34		'vimeo\.com',
35		'blip\.tv',
36		'dailymotion\.com',
37		'dai\.ly',
38		'flickr\.com',
39		'flic\.kr',
40		];
41
42		/**
43		* @param Article $article
44		*/
45		public function run(Article $article) {
46		$this->article($article);
47
48		$article->setOpenGraph($this->getOpenGraph());
49		$article->setTitle($this->getTitle());
50		$article->setMetaDescription($this->getMetaDescription());
51		$article->setMetaKeywords($this->getMetaKeywords());
52		$article->setCanonicalLink($this->getCanonicalLink());
53		$article->setTags($this->getTags());
54
55		if ($this->article()->getTopNode() instanceof Element) {
56		$article->setVideos($this->getVideos());
57		$article->setLinks($this->getLinks());
58		$article->setPopularWords($this->getPopularWords());
59		}
60
61		$article->setLanguage($this->getMetaLanguage());
62
63		$this->config()->set('language', $article->getLanguage());
64		}
65
66		/**
67		* Retrieve all OpenGraph meta data
68		*
69		* Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
70		*
71		* @return string[]
72		*/
73		private function getOpenGraph() {
74		$results = array();
75
76		$nodes = $this->article()->getDoc()->find('meta[property^="og:"]');
77
78	View Code Duplication	foreach ($nodes as $node) {
		0 ignored issues – show Duplication introduced 2016-06-01 00:26 UTC by Report Bug Copy Issue Report This code seems to be duplicated across your project. Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation. You can also find more detailed suggestions in the “Code” section of your repository. Loading history...
79		$property = explode(':', $node->attr('property'));
80
81		$results[$property[1]] = $node->attr('content');
82		}
83
84		// Additionally retrieve type values based on provided og:type (http://ogp.me/#types)
85		if (isset($results['type'])) {
86		$nodes = $this->article()->getDoc()->find('meta[property^="' . $results['type'] .':"]');
87
88	View Code Duplication	foreach ($nodes as $node) {
		0 ignored issues – show Duplication introduced 2016-06-01 00:26 UTC by Report Bug Copy Issue Report This code seems to be duplicated across your project. Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation. You can also find more detailed suggestions in the “Code” section of your repository. Loading history...
89		$property = explode(':', $node->attr('property'));
90
91		$results[$property[1]] = $node->attr('content');
92		}
93		}
94
95		return $results;
96		}
97
98		/**
99		* Clean title text
100		*
101		* Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
102		*
103		* @param string $title
104		*
105		* @return string
106		*/
107		private function cleanTitle($title) {
108		$openGraph = $this->article()->getOpenGraph();
109
110		// Check if we have the site name in OpenGraph data
111		if (isset($openGraph['site_name'])) {
112		$title = str_replace($openGraph['site_name'], '', $title);
113		}
114
115		// Try to remove the domain from URL
116		if ($this->article()->getDomain()) {
117		$title = str_ireplace($this->article()->getDomain(), '', $title);
118		}
119
120		// Split the title in words
121		// TechCrunch \| my wonderfull article
122		// my wonderfull article \| TechCrunch
123		$titleWords = preg_split('@[\s]+@', trim($title));
124
125		// Check for an empty title
126		if (empty($titleWords)) {
127		return '';
128		}
129
130		// Check if last letter is in self::$SPLITTER_CHARS
131		// if so remove it
132		if (in_array($titleWords[count($titleWords) - 1], self::$SPLITTER_CHARS)) {
133		array_pop($titleWords);
134		}
135
136		// Check if first letter is in self::$SPLITTER_CHARS
137		// if so remove it
138		if (isset($titleWords[0]) && in_array($titleWords[0], self::$SPLITTER_CHARS)) {
139		array_shift($titleWords);
140		}
141
142		// Rebuild the title
143		$title = trim(implode(' ', $titleWords));
144
145		return $title;
146		}
147
148		/**
149		* Get article title
150		*
151		* Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
152		*
153		* @return string
154		*/
155		private function getTitle() {
156		$openGraph = $this->article()->getOpenGraph();
157
158		// Rely on OpenGraph in case we have the data
159		if (isset($openGraph['title'])) {
160		return $this->cleanTitle($openGraph['title']);
161		}
162
163		$nodes = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), 'meta', 'name', 'headline');
164		if ($nodes->count()) {
165		return $this->cleanTitle($nodes->first()->attr('content'));
166		}
167
168		$nodes = $this->article()->getDoc()->find('html > head > title');
169		if ($nodes->count()) {
170		return $this->cleanTitle(Helper::textNormalise($nodes->first()->text()));
171		}
172
173		return '';
174		}
175
176		/**
177		* @param Document $doc
178		* @param string $tag
179		* @param string $property
180		* @param string $value
181		*
182		* @return \DOMWrap\NodeList
183		*/
184		private function getNodesByLowercasePropertyValue(Document $doc, $tag, $property, $value) {
185		return $doc->findXPath("descendant::".$tag."[translate(@".$property.", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='".$value."']");
186		}
187
188		/**
189		* @param Document $doc
190		* @param string $property
191		* @param string $value
192		* @param string $attr
193		*
194		* @return string
195		*/
196		private function getMetaContent(Document $doc, $property, $value, $attr = 'content') {
197		$nodes = $this->getNodesByLowercasePropertyValue($doc, 'meta', $property, $value);
198
199		if (!$nodes->count()) {
200		return '';
201		}
202
203		$content = $nodes->first()->attr($attr);
204		$content = trim($content);
205
206		return $content;
207		}
208
209		/**
210		* If the article has meta language set in the source, use that
211		*
212		* @return string
213		*/
214		private function getMetaLanguage() {
215		$lang = '';
216
217		$el = $this->article()->getDoc()->find('html[lang]');
218
219		if ($el->count()) {
220		$lang = $el->first()->attr('lang');
221		}
222
223		if (empty($lang)) {
224		$selectors = [
225		'html > head > meta[http-equiv=content-language]',
226		'html > head > meta[name=lang]',
227		];
228
229		foreach ($selectors as $selector) {
230		$el = $this->article()->getDoc()->find($selector);
231
232		if ($el->count()) {
233		$lang = $el->first()->attr('content');
234		break;
235		}
236		}
237		}
238
239		if (preg_match('@^[A-Za-z]{2}$@', $lang)) {
240		return strtolower($lang);
241		}
242
243		return '';
244		}
245
246		/**
247		* If the article has meta description set in the source, use that
248		*
249		* @return string
250		*/
251		private function getMetaDescription() {
252		$desc = $this->getMetaContent($this->article()->getDoc(), 'name', 'description');
253
254		if (empty($desc)) {
255		$desc = $this->getMetaContent($this->article()->getDoc(), 'property', 'og:description');
256		}
257
258		if (empty($desc)) {
259		$desc = $this->getMetaContent($this->article()->getDoc(), 'name', 'twitter:description');
260		}
261
262		return trim($desc);
263		}
264
265		/**
266		* If the article has meta keywords set in the source, use that
267		*
268		* @return string
269		*/
270		private function getMetaKeywords() {
271		return $this->getMetaContent($this->article()->getDoc(), 'name', 'keywords');
272		}
273
274		/**
275		* If the article has meta canonical link set in the url
276		*
277		* @return string
278		*/
279		private function getCanonicalLink() {
280		$nodes = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), 'link', 'rel', 'canonical');
281
282		if ($nodes->count()) {
283		return trim($nodes->first()->attr('href'));
284		}
285
286		$nodes = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), 'meta', 'property', 'og:url');
287
288		if ($nodes->count()) {
289		return trim($nodes->first()->attr('content'));
290		}
291
292		$nodes = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), 'meta', 'name', 'twitter:url');
293
294		if ($nodes->count()) {
295		return trim($nodes->first()->attr('content'));
296		}
297
298		return $this->article()->getFinalUrl();
299		}
300
301		/**
302		* @return string[]
303		*/
304		private function getTags() {
305		$nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);
306
307		$tags = [];
308
309		foreach ($nodes as $node) {
310		$tags[] = Helper::textNormalise($node->text());
311		}
312
313		return $tags;
314		}
315
316		/**
317		* Pulls out videos we like
318		*
319		* @return string[]
320		*/
321		private function getVideos() {
322		$videos = [];
323
324		$nodes = $this->article()->getTopNode()->parent()->find('embed, object, iframe');
325
326		foreach ($nodes as $node) {
327		if ($node->hasAttribute('src')) {
328		$src = $node->attr('src');
329		} else {
330		$src = $node->attr('data');
331		}
332
333		$match = array_reduce(self::$VIDEO_PROVIDERS, function($match, $domain) use ($src) {
334		$srcHost = parse_url($src, PHP_URL_HOST);
335		$srcScheme = parse_url($src, PHP_URL_SCHEME);
336
337		return $match \|\| preg_match('@' . $domain . '$@i', $srcHost) && in_array($srcScheme, ['http', 'https']);
338		});
339
340		if ($match) {
341		$videos[] = $src;
342		}
343		}
344
345		return $videos;
346		}
347
348		/**
349		* Pulls out links we like
350		*
351		* @return string[]
352		*/
353		private function getLinks() {
354		$goodLinks = [];
355
356		$candidates = $this->article()->getTopNode()->parent()->find('a[href]');
357
358		foreach ($candidates as $el) {
359		if ($el->attr('href') != '#' && trim($el->attr('href')) != '') {
360		$goodLinks[] = [
361		'url' => $el->attr('href'),
362		'text' => Helper::textNormalise($el->text()),
363		];
364		}
365		}
366
367		return $goodLinks;
368		}
369
370		/**
371		* @return string[]
372		*/
373		private function getPopularWords() {
374		$limit = 5;
375		$minimumFrequency = 1;
376		$stopWords = $this->config()->getStopWords()->getCurrentStopWords();
377
378		$text = $this->article()->getTitle();
379		$text .= ' ' . $this->article()->getMetaDescription();
380
381		if ($this->article()->getTopNode()) {
382		$text .= ' ' . $this->article()->getCleanedArticleText();
383		}
384
385		// Decode and split words by white-space
386		$text = html_entity_decode($text, ENT_COMPAT \| ENT_HTML5, 'UTF-8');
387		$words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);
388
389		// Determine stop words currently in $words
390		$ignoreWords = array_intersect($words, $stopWords);
391		// Remove ignored words from $words
392		$words = array_diff($words, $ignoreWords);
393
394		// Count and sort $words
395		$words = array_count_values($words);
396		arsort($words);
397
398		// Limit and filter $words
399		$words = array_slice($words, 0, $limit);
400		$words = array_filter($words, function($value) use ($minimumFrequency){
401		return !($value < $minimumFrequency);
402		});
403
404		return $words;
405		}
406		}
407

scotteh / php-goose

Push — master ( 140b40...7cb46a )

MetaExtractor::getOpenGraph() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like