AdditionalDataExtractor::getLinks() - Code Metrics - Inspection of "Merge branch 'master' of github.com:scotteh/php-go..." - scotteh/php-goose - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 9c6482...84b1b9 )

by Andrew

created 2017-11-15 10:12 UTC

AdditionalDataExtractor::getLinks() B

↳ Parent: AdditionalDataExtractor

Complexity

Conditions	5
Paths	2

Size

Total Lines	20
Code Lines	11

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
dl	0
loc	20
rs	8.8571
c	0
b	0
f	0
cc	5
eloc	11
nc	2
nop	0

<?php

namespace Goose\Modules\Extractors;

use Goose\Article;
use Goose\Utils\Helper;
use Goose\Traits\ArticleMutatorTrait;
use Goose\Modules\AbstractModule;
use Goose\Modules\ModuleInterface;
use DOMWrap\Element;

/**
 * Additional Data Extractor
 *
 * Populates an Article with supplementary data: tags found anywhere in the
 * document and, when a top node has been selected, the embedded videos,
 * links and most frequent words found around that node.
 *
 * @package Goose\Modules\Extractors
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class AdditionalDataExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait;

    /** @var string CSS selector for anchors that are treated as tag links */
    protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";

    /** @var string[] Regex fragments (dots already escaped) of accepted video hosts */
    protected static $VIDEO_PROVIDERS = [
        'youtube\.com',
        'youtu\.be',
        'vimeo\.com',
        'blip\.tv',
        'dailymotion\.com',
        'dai\.ly',
        'flickr\.com',
        'flic\.kr',
    ];

    /**
     * Extracts the additional data and stores it on the article.
     *
     * Tags are always set. Videos, links and popular words are only set when
     * the article's top node is an Element.
     *
     * @param Article $article
     */
    public function run(Article $article) {
        $this->article($article);

        $article->setTags($this->getTags());

        if ($this->article()->getTopNode() instanceof Element) {
            $article->setVideos($this->getVideos());
            $article->setLinks($this->getLinks());
            $article->setPopularWords($this->getPopularWords());
        }
    }

    /**
     * Collects the normalised text of every anchor in the document that
     * matches the tag selector.
     *
     * @return string[]
     */
    private function getTags() {
        $nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);

        $tags = [];

        foreach ($nodes as $node) {
            $tags[] = Helper::textNormalise($node->text());
        }

        return $tags;
    }

    /**
     * Pulls out videos we like
     *
     * Looks at embed, object and iframe elements below the top node's parent
     * and keeps the source URL of those served by a known video provider.
     * The URL is read from "src" when present, otherwise from "data".
     *
     * @return string[] Source URLs exactly as they appear in the markup
     */
    private function getVideos() {
        $videos = [];

        $parentNode = $this->article()->getTopNode()->parent();

        if (!$parentNode instanceof Element) {
            return $videos;
        }

        foreach ($parentNode->find('embed, object, iframe') as $node) {
            $src = $node->hasAttribute('src') ? $node->attr('src') : $node->attr('data');

            if ($this->isVideoProviderUrl($src)) {
                $videos[] = $src;
            }
        }

        return $videos;
    }

    /**
     * Tells whether a URL is an http(s) URL hosted by one of the video providers.
     *
     * The host must be the provider domain itself or one of its subdomains;
     * a host that merely ends with the same characters (e.g. "notyoutube.com")
     * is rejected.
     *
     * @param mixed $url Candidate URL; anything but a non-empty string is rejected
     *
     * @return bool
     */
    private function isVideoProviderUrl($url) {
        if (!is_string($url) || $url === '') {
            return false;
        }

        // Parse once per URL rather than once per provider.
        $scheme = parse_url($url, PHP_URL_SCHEME);
        $host = parse_url($url, PHP_URL_HOST);

        // parse_url() yields null for a missing component and false for a malformed URL.
        if (!is_string($scheme) || !is_string($host)) {
            return false;
        }

        // URI schemes are case-insensitive, parse_url() does not normalise them.
        if (!in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        foreach (self::$VIDEO_PROVIDERS as $domain) {
            // "(^|\.)" pins the match to a domain label boundary.
            if (preg_match('@(^|\.)' . $domain . '$@i', $host)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Pulls out links we like
     *
     * Collects every anchor with an href below the top node's parent, skipping
     * anchors whose href is blank or just "#" once surrounding whitespace is
     * ignored.
     *
     * @return array[] List of ['url' => href as found in the markup, 'text' => normalised anchor text]
     */
    private function getLinks() {
        $goodLinks = [];

        $parentNode = $this->article()->getTopNode()->parent();

        if (!$parentNode instanceof Element) {
            return $goodLinks;
        }

        foreach ($parentNode->find('a[href]') as $el) {
            $href = $el->attr('href');
            $trimmedHref = trim((string)$href);

            if ($trimmedHref === '' || $trimmedHref === '#') {
                continue;
            }

            $goodLinks[] = [
                'url' => $href,
                'text' => Helper::textNormalise($el->text()),
            ];
        }

        return $goodLinks;
    }

    /**
     * Counts the words of the title, meta description and cleaned article
     * text, ignoring stop words, and returns the most frequent ones.
     *
     * @return array Map of word => number of occurrences, most frequent first, at most 5 entries
     */
    private function getPopularWords() {
        $limit = 5;
        $minimumFrequency = 1;
        $stopWords = $this->config()->getStopWords()->getCurrentStopWords();

        $text = $this->article()->getTitle();
        $text .= ' ' . $this->article()->getMetaDescription();

        if ($this->article()->getTopNode()) {
            $text .= ' ' . $this->article()->getCleanedArticleText();
        }

        // Decode and split words by white-space
        $text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
        $words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false when the text is not valid UTF-8 (required by the "u" modifier)
        if ($words === false) {
            return [];
        }

        // Remove stop words regardless of letter case, so that "The" is
        // dropped even when the lower-case "the" never occurs in the text
        $words = array_udiff($words, $stopWords, 'strcasecmp');

        // Count and sort $words
        $words = array_count_values($words);
        arsort($words);

        // Limit and filter $words; keys are preserved because purely numeric
        // words are integer keys, which array_slice() would otherwise renumber
        $words = array_slice($words, 0, $limit, true);
        $words = array_filter($words, function($value) use ($minimumFrequency) {
            return $value >= $minimumFrequency;
        });

        return $words;
    }
}


1			<?php
2
3			namespace Goose\Modules\Extractors;
4
5			use Goose\Article;
6			use Goose\Utils\Helper;
7			use Goose\Traits\ArticleMutatorTrait;
8			use Goose\Modules\AbstractModule;
9			use Goose\Modules\ModuleInterface;
10			use DOMWrap\Element;
11
12			/**
13			* Additional Data Extractor
14			*
15			* @package Goose\Modules\Extractors
16			* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
17			*/
18			class AdditionalDataExtractor extends AbstractModule implements ModuleInterface {
19			use ArticleMutatorTrait;
20
21			/** @var string */
22			protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";
23
24			/** @var string[] */
25			protected static $VIDEO_PROVIDERS = [
26			'youtube\.com',
27			'youtu\.be',
28			'vimeo\.com',
29			'blip\.tv',
30			'dailymotion\.com',
31			'dai\.ly',
32			'flickr\.com',
33			'flic\.kr',
34			];
35
36			/**
37			* @param Article $article
38			*/
39			public function run(Article $article) {
40			$this->article($article);
41
42			$article->setTags($this->getTags());
43
44			if ($this->article()->getTopNode() instanceof Element) {
45			$article->setVideos($this->getVideos());
46			$article->setLinks($this->getLinks());
47			$article->setPopularWords($this->getPopularWords());
48			}
49			}
50
51			/**
52			* @return string[]
53			*/
54			private function getTags() {
55			$nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);
56
57			$tags = [];
58
59			foreach ($nodes as $node) {
60			$tags[] = Helper::textNormalise($node->text());
61			}
62
63			return $tags;
64			}
65
66			/**
67			* Pulls out videos we like
68			*
69			* @return string[]
70			*/
71			private function getVideos() {
72			$videos = [];
73
74			$parentNode = $this->article()->getTopNode()->parent();
75
76			if ($parentNode instanceof Element) {
77			$nodes = $parentNode->find('embed, object, iframe');
78
79			foreach ($nodes as $node) {
80			if ($node->hasAttribute('src')) {
81			$src = $node->attr('src');
82			} else {
83			$src = $node->attr('data');
84			}
85
86			$match = array_reduce(self::$VIDEO_PROVIDERS, function($match, $domain) use ($src) {
87			$srcHost = parse_url($src, PHP_URL_HOST);
88			$srcScheme = parse_url($src, PHP_URL_SCHEME);
89
90			return $match || preg_match('@' . $domain . '$@i', $srcHost) && in_array($srcScheme, ['http', 'https']);
91			});
92
93			if ($match) {
94			$videos[] = $src;
95			}
96			}
97			}
98
99			return $videos;
100			}
101
102			/**
103			* Pulls out links we like
104			*
105			* @return string[]
106			*/
107			private function getLinks() {
108			$goodLinks = [];
109
110			$parentNode = $this->article()->getTopNode()->parent();
111
112			if ($parentNode instanceof Element) {
113			$candidates = $parentNode->find('a[href]');
114
115			foreach ($candidates as $el) {
116			if ($el->attr('href') != '#' && trim($el->attr('href')) != '') {
117			$goodLinks[] = [
118			'url' => $el->attr('href'),
119			'text' => Helper::textNormalise($el->text()),
120			];
121			}
122			}
123			}
124
125			return $goodLinks;
126			}
127
128			/**
129			* @return string[]
130			*/
131			private function getPopularWords() {
132			$limit = 5;
133			$minimumFrequency = 1;
134			$stopWords = $this->config()->getStopWords()->getCurrentStopWords();
135
136			$text = $this->article()->getTitle();
137			$text .= ' ' . $this->article()->getMetaDescription();
138
139			if ($this->article()->getTopNode()) {
140			$text .= ' ' . $this->article()->getCleanedArticleText();
141			}
142
143			// Decode and split words by white-space
144			$text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
145			$words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);
146
147			// Determine stop words currently in $words
148			$ignoreWords = array_intersect($words, $stopWords);
149			// Remove ignored words from $words
150			$words = array_udiff($words, $ignoreWords, 'strcasecmp');
151
152			// Count and sort $words
153			$words = array_count_values($words);
154			arsort($words);
155
156			// Limit and filter $words
157			$words = array_slice($words, 0, $limit);
158			$words = array_filter($words, function($value) use ($minimumFrequency){
159			return !($value < $minimumFrequency);
160			});
161
162			return $words;
163			}
164			}
165

scotteh / php-goose

Push — master ( 9c6482...84b1b9 )

AdditionalDataExtractor::getLinks() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like