AdditionalDataExtractor::getTags() - Code Metrics - Inspection of "Move methods that rely on getTopNode() to Addition..." - scotteh/php-goose - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( c7a267...479701 )

by Andrew

created 2016-06-07 12:20 UTC

AdditionalDataExtractor::getTags() A

↳ Parent: AdditionalDataExtractor

Complexity

Conditions	2
Paths	2

Size

Total Lines	11
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
c	1
b	0
f	0
dl	0
loc	11
rs	9.4285
cc	2
eloc	6
nc	2
nop	0

<?php

namespace Goose\Modules\Extractors;

use Goose\Article;
use Goose\Utils\Helper;
use Goose\Traits\ArticleMutatorTrait;
use Goose\Modules\AbstractModule;
use Goose\Modules\ModuleInterface;
use DOMWrap\Element;

/**
 * Additional Data Extractor
 *
 * @package Goose\Modules\Extractors
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class AdditionalDataExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait;

    /** @var string CSS selector used to locate tag links anywhere in the document */
    protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";

    /**
     * Host names of accepted video providers, written as regular expression
     * fragments (dots are already escaped).
     *
     * @var string[]
     */
    protected static $VIDEO_PROVIDERS = [
        'youtube\.com',
        'youtu\.be',
        'vimeo\.com',
        'blip\.tv',
        'dailymotion\.com',
        'dai\.ly',
        'flickr\.com',
        'flic\.kr',
    ];

    /**
     * Populates the article with tags and, when a top node was found,
     * with videos, links and popular words.
     *
     * @param Article $article
     */
    public function run(Article $article) {
        $this->article($article);

        // Tags are looked up on the whole document, so they do not need a top node.
        $article->setTags($this->getTags());

        // Everything below reads from the top node and is skipped without one.
        if ($this->article()->getTopNode() instanceof Element) {
            $article->setVideos($this->getVideos());
            $article->setLinks($this->getLinks());
            $article->setPopularWords($this->getPopularWords());
        }
    }

    /**
     * Collects the normalised text of every link matching the tag selector.
     *
     * @return string[]
     */
    private function getTags() {
        $nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);

        $tags = [];

        foreach ($nodes as $node) {
            $tags[] = Helper::textNormalise($node->text());
        }

        return $tags;
    }

    /**
     * Pulls out videos we like
     *
     * Scans embed, object and iframe elements below the top node's parent and
     * keeps the URLs that point at one of the known video providers.
     *
     * @return string[]
     */
    private function getVideos() {
        $videos = [];

        $nodes = $this->article()->getTopNode()->parent()->find('embed, object, iframe');

        foreach ($nodes as $node) {
            // Fall back to the "data" attribute when the element has no "src".
            if ($node->hasAttribute('src')) {
                $src = $node->attr('src');
            } else {
                $src = $node->attr('data');
            }

            if ($this->isVideoProviderUrl($src)) {
                $videos[] = $src;
            }
        }

        return $videos;
    }

    /**
     * Tells whether a URL is an http(s) URL hosted by a known video provider.
     *
     * @param mixed $url Candidate URL; anything but a non-empty string is rejected
     *
     * @return bool
     */
    private function isVideoProviderUrl($url) {
        if (!is_string($url) || $url === '') {
            return false;
        }

        // Parse once per URL rather than once per provider.
        $scheme = parse_url($url, PHP_URL_SCHEME);
        $host = parse_url($url, PHP_URL_HOST);

        // parse_url() yields null/false for missing components or malformed URLs.
        if (!is_string($host) || !in_array($scheme, ['http', 'https'], true)) {
            return false;
        }

        foreach (self::$VIDEO_PROVIDERS as $domain) {
            // Anchor on the start of the host or on a dot so that only the
            // provider itself and its sub-domains match; a bare end anchor
            // would also accept hosts such as "notyoutube.com".
            if (preg_match('@(^|\.)' . $domain . '$@i', $host)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Pulls out links we like
     *
     * Collects every anchor with an href below the top node's parent, skipping
     * empty hrefs and bare "#" placeholders.
     *
     * @return array[] List of ['url' => string, 'text' => string] entries
     */
    private function getLinks() {
        $goodLinks = [];

        $candidates = $this->article()->getTopNode()->parent()->find('a[href]');

        foreach ($candidates as $el) {
            $href = $el->attr('href');
            // Trim once so that both checks see the same value.
            $trimmed = trim((string)$href);

            if ($trimmed !== '' && $trimmed !== '#') {
                $goodLinks[] = [
                    'url' => $href,
                    'text' => Helper::textNormalise($el->text()),
                ];
            }
        }

        return $goodLinks;
    }

    /**
     * Counts the most frequent non stop-words in the title, the meta
     * description and (when a top node exists) the cleaned article text.
     *
     * @return int[] Occurrence counts keyed by word, most frequent first
     */
    private function getPopularWords() {
        $limit = 5;
        $minimumFrequency = 1;
        $stopWords = $this->config()->getStopWords()->getCurrentStopWords();

        $text = $this->article()->getTitle();
        $text .= ' ' . $this->article()->getMetaDescription();

        if ($this->article()->getTopNode()) {
            $text .= ' ' . $this->article()->getCleanedArticleText();
        }

        // Decode and split words by white-space
        $text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
        $words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on failure, e.g. invalid UTF-8 with the "u" modifier
        if ($words === false) {
            return [];
        }

        // Determine stop words currently in $words
        $ignoreWords = array_intersect($words, $stopWords);
        // Remove ignored words from $words
        $words = array_diff($words, $ignoreWords);

        // Count and sort $words
        $words = array_count_values($words);
        arsort($words);

        // Limit and filter $words. Keys must be preserved: numeric words such as
        // "2016" are integer keys and array_slice() would otherwise renumber them.
        $words = array_slice($words, 0, $limit, true);
        $words = array_filter($words, function($value) use ($minimumFrequency) {
            return $value >= $minimumFrequency;
        });

        return $words;
    }
}


1			<?php
2
3			namespace Goose\Modules\Extractors;
4
5			use Goose\Article;
6			use Goose\Utils\Helper;
7			use Goose\Traits\ArticleMutatorTrait;
8			use Goose\Modules\AbstractModule;
9			use Goose\Modules\ModuleInterface;
10			use DOMWrap\Element;
11
12			/**
13			* Additional Data Extractor
14			*
15			* @package Goose\Modules\Extractors
16			* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
17			*/
18			class AdditionalDataExtractor extends AbstractModule implements ModuleInterface {
19			use ArticleMutatorTrait;
20
21			/** @var string */
22			protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";
23
24			/** @var string[] */
25			protected static $VIDEO_PROVIDERS = [
26			'youtube\.com',
27			'youtu\.be',
28			'vimeo\.com',
29			'blip\.tv',
30			'dailymotion\.com',
31			'dai\.ly',
32			'flickr\.com',
33			'flic\.kr',
34			];
35
36			/**
37			* @param Article $article
38			*/
39			public function run(Article $article) {
40			$this->article($article);
41
42			$article->setTags($this->getTags());
43
44			if ($this->article()->getTopNode() instanceof Element) {
45			$article->setVideos($this->getVideos());
46			$article->setLinks($this->getLinks());
47			$article->setPopularWords($this->getPopularWords());
48			}
49			}
50
51			/**
52			* @return string[]
53			*/
54			private function getTags() {
55			$nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);
56
57			$tags = [];
58
59			foreach ($nodes as $node) {
60			$tags[] = Helper::textNormalise($node->text());
61			}
62
63			return $tags;
64			}
65
66			/**
67			* Pulls out videos we like
68			*
69			* @return string[]
70			*/
71			private function getVideos() {
72			$videos = [];
73
74			$nodes = $this->article()->getTopNode()->parent()->find('embed, object, iframe');
75
76			foreach ($nodes as $node) {
77			if ($node->hasAttribute('src')) {
78			$src = $node->attr('src');
79			} else {
80			$src = $node->attr('data');
81			}
82
83			$match = array_reduce(self::$VIDEO_PROVIDERS, function($match, $domain) use ($src) {
84			$srcHost = parse_url($src, PHP_URL_HOST);
85			$srcScheme = parse_url($src, PHP_URL_SCHEME);
86
87			return $match || preg_match('@' . $domain . '$@i', $srcHost) && in_array($srcScheme, ['http', 'https']);
88			});
89
90			if ($match) {
91			$videos[] = $src;
92			}
93			}
94
95			return $videos;
96			}
97
98			/**
99			* Pulls out links we like
100			*
101			* @return string[]
102			*/
103			private function getLinks() {
104			$goodLinks = [];
105
106			$candidates = $this->article()->getTopNode()->parent()->find('a[href]');
107
108			foreach ($candidates as $el) {
109			if ($el->attr('href') != '#' && trim($el->attr('href')) != '') {
110			$goodLinks[] = [
111			'url' => $el->attr('href'),
112			'text' => Helper::textNormalise($el->text()),
113			];
114			}
115			}
116
117			return $goodLinks;
118			}
119
120			/**
121			* @return string[]
122			*/
123			private function getPopularWords() {
124			$limit = 5;
125			$minimumFrequency = 1;
126			$stopWords = $this->config()->getStopWords()->getCurrentStopWords();
127
128			$text = $this->article()->getTitle();
129			$text .= ' ' . $this->article()->getMetaDescription();
130
131			if ($this->article()->getTopNode()) {
132			$text .= ' ' . $this->article()->getCleanedArticleText();
133			}
134
135			// Decode and split words by white-space
136			$text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
137			$words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);
138
139			// Determine stop words currently in $words
140			$ignoreWords = array_intersect($words, $stopWords);
141			// Remove ignored words from $words
142			$words = array_diff($words, $ignoreWords);
143
144			// Count and sort $words
145			$words = array_count_values($words);
146			arsort($words);
147
148			// Limit and filter $words
149			$words = array_slice($words, 0, $limit);
150			$words = array_filter($words, function($value) use ($minimumFrequency){
151			return !($value < $minimumFrequency);
152			});
153
154			return $words;
155			}
156			}
157

scotteh / php-goose

Push — master ( c7a267...479701 )

AdditionalDataExtractor::getTags() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like