AdditionalDataExtractor - Code Metrics - scotteh/php-goose - Measure and Improve Code Quality continuously with Scrutinizer

AdditionalDataExtractor A
last analyzed 2023-09-05 13:38 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	169
Duplicated Lines	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
wmc	21
eloc	83
c	1
b	0
f	0
dl	0
loc	169
rs	10

5 Methods

Rating	Name	Size	Complexity
A	getTags()	10	2
A	getPopularWords()	36	3
A	getLinks()	19	5
B	getVideos()	36	9
A	run()	12	2

<?php declare(strict_types=1);

namespace Goose\Modules\Extractors;

use Goose\Article;
use Goose\Utils\Helper;
use Goose\Traits\ArticleMutatorTrait;
use Goose\Modules\{AbstractModule, ModuleInterface};
use DOMWrap\Element;

/**
 * Additional Data Extractor
 *
 * Enriches an Article with secondary data: tag links found anywhere in the
 * document and, when a top node has been identified, the videos and links
 * located under the top node's parent plus the most frequent non-stop-words
 * of the article text.
 *
 * @package Goose\Modules\Extractors
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class AdditionalDataExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait;

    /** @var string CSS selector matching anchors that look like tag links */
    protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";

    /** @var string[] Regex fragments (dots already escaped) for known video hosts */
    protected static $VIDEO_PROVIDERS = [
        'youtube\.com',
        'youtu\.be',
        'vimeo\.com',
        'blip\.tv',
        'dailymotion\.com',
        'dai\.ly',
        'flickr\.com',
        'flic\.kr',
    ];

    /** @var string[] Lower-case file extensions treated as video files */
    protected static $VIDEO_EXTENSIONS = [
        'mpg',
        'mp4',
        'avi',
        'flv',
        'mov',
        'wmv',
        'ogv',
        'gifv',
        'mpeg',
    ];

    /**
     * Extracts the additional data and stores it on the article.
     *
     * Tags are always set. Videos, links and popular words are only set when
     * the article's top node is an Element.
     *
     * @inheritdoc
     */
    public function run(Article $article): self {
        $this->article($article);

        $article->setTags($this->getTags());

        if ($this->article()->getTopNode() instanceof Element) {
            $article->setVideos($this->getVideos());
            $article->setLinks($this->getLinks());
            $article->setPopularWords($this->getPopularWords());
        }

        return $this;
    }

    /**
     * Collects the normalised text of every tag-like anchor in the document.
     *
     * @return string[]
     */
    private function getTags(): array {
        $nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);

        $tags = [];

        foreach ($nodes as $node) {
            $tags[] = Helper::textNormalise($node->text());
        }

        return $tags;
    }

    /**
     * Pulls out videos we like
     *
     * Looks at embed/object/iframe/video elements under the top node's parent
     * and keeps the source URL of each one that either points at a known
     * video provider or ends in a known video file extension.
     *
     * @return string[]
     */
    private function getVideos(): array {
        $videos = [];

        $topNode = $this->article()->getTopNode();

        if (!($topNode instanceof Element) || !($topNode->parent() instanceof Element)) {
            return $videos;
        }

        $nodes = $topNode->parent()->find('embed, object, iframe, video');

        foreach ($nodes as $node) {
            // Prefer "src"; fall back to "data" when there is no "src" attribute.
            // Cast so parse_url() always receives a string under strict_types.
            $src = (string)($node->hasAttribute('src') ? $node->attr('src') : $node->attr('data'));

            if ($this->isVideoProviderUrl($src) || $this->hasVideoExtension($src)) {
                $videos[] = $src;
            }
        }

        return $videos;
    }

    /**
     * Whether the URL is an http(s) URL hosted on one of the known video providers.
     *
     * @param string $src
     *
     * @return bool
     */
    private function isVideoProviderUrl(string $src): bool {
        $scheme = (string)parse_url($src, PHP_URL_SCHEME);

        if (!in_array($scheme, ['http', 'https'], true)) {
            return false;
        }

        $host = (string)parse_url($src, PHP_URL_HOST);

        // Anchor on a label boundary (start of host or a preceding dot) so that
        // e.g. "notyoutube.com" is not mistaken for "youtube.com".
        $pattern = '@(?:^|\.)(?:' . implode('|', self::$VIDEO_PROVIDERS) . ')$@i';

        return preg_match($pattern, $host) === 1;
    }

    /**
     * Whether the URL's path ends in one of the known video file extensions.
     *
     * @param string $src
     *
     * @return bool
     */
    private function hasVideoExtension(string $src): bool {
        // Lower-case first: the extension list is lower-case.
        $path = (string)parse_url(strtolower($src), PHP_URL_PATH);
        $extension = pathinfo($path, PATHINFO_EXTENSION);

        return in_array($extension, self::$VIDEO_EXTENSIONS, true);
    }

    /**
     * Pulls out links we like
     *
     * Returns every anchor under the top node's parent whose href is neither
     * "#" nor blank, as a list of ['url' => ..., 'text' => ...] entries.
     *
     * @return array
     */
    private function getLinks(): array {
        $goodLinks = [];

        $topNode = $this->article()->getTopNode();

        // Guard here as well so this method does not depend on the caller's check.
        if (!($topNode instanceof Element)) {
            return $goodLinks;
        }

        $parentNode = $topNode->parent();

        if (!($parentNode instanceof Element)) {
            return $goodLinks;
        }

        foreach ($parentNode->find('a[href]') as $el) {
            $href = (string)$el->attr('href');

            if ($href === '#' || trim($href) === '') {
                continue;
            }

            $goodLinks[] = [
                'url' => $href,
                'text' => Helper::textNormalise($el->text()),
            ];
        }

        return $goodLinks;
    }

    /**
     * Determines the most frequent words of the article.
     *
     * The title, meta description and (when a top node exists) the cleaned
     * article text are split on white-space, stop words are removed
     * case-insensitively, and the remaining words are counted.
     *
     * @return int[] Up to five entries mapping word => number of occurrences,
     *               most frequent first. Purely numeric words appear as integer keys.
     */
    private function getPopularWords(): array {
        $limit = 5;
        $minimumFrequency = 1;
        $stopWords = $this->config()->getStopWords()->getWordList();

        $text = (string)$this->article()->getTitle();
        $text .= ' ' . $this->article()->getMetaDescription();

        if ($this->article()->getTopNode()) {
            $text .= ' ' . $this->article()->getCleanedArticleText();
        }

        // Decode and split words by white-space
        $text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
        $words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on failure (e.g. malformed UTF-8 with the "u" modifier)
        if ($words === false) {
            return [];
        }

        // Remove stop words from $words, ignoring case on both sides
        $words = array_udiff($words, $stopWords, 'strcasecmp');

        // Count and sort $words
        $counts = array_count_values($words);
        arsort($counts);

        // Limit $words; preserve keys so numeric words are not renumbered to 0..n
        $counts = array_slice($counts, 0, $limit, true);

        // Drop anything below the minimum frequency
        return array_filter($counts, static function($count) use ($minimumFrequency) {
            return $count >= $minimumFrequency;
        });
    }
}


1			<?php declare(strict_types=1);
2
3			namespace Goose\Modules\Extractors;
4
5			use Goose\Article;
6			use Goose\Utils\Helper;
7			use Goose\Traits\ArticleMutatorTrait;
8			use Goose\Modules\{AbstractModule, ModuleInterface};
9			use DOMWrap\Element;
10
11			/**
12			* Additional Data Extractor
13			*
14			* @package Goose\Modules\Extractors
15			* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
16			*/
17			class AdditionalDataExtractor extends AbstractModule implements ModuleInterface {
18			use ArticleMutatorTrait;
19
20			/** @var string */
21			protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";
22
23			/** @var string[] */
24			protected static $VIDEO_PROVIDERS = [
25			'youtube\.com',
26			'youtu\.be',
27			'vimeo\.com',
28			'blip\.tv',
29			'dailymotion\.com',
30			'dai\.ly',
31			'flickr\.com',
32			'flic\.kr',
33			];
34
35			/** @var string[] */
36			protected static $VIDEO_EXTENSIONS = [
37			'mpg',
38			'mp4',
39			'avi',
40			'flv',
41			'mov',
42			'wmv',
43			'ogv',
44			'gifv',
45			'mpeg',
46			];
47
48			/** @inheritdoc */
49			public function run(Article $article): self {
50			$this->article($article);
51
52			$article->setTags($this->getTags());
53
54			if ($this->article()->getTopNode() instanceof Element) {
55			$article->setVideos($this->getVideos());
56			$article->setLinks($this->getLinks());
57			$article->setPopularWords($this->getPopularWords());
58			}
59
60			return $this;
61			}
62
63			/**
64			* @return string[]
65			*/
66			private function getTags(): array {
67			$nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);
68
69			$tags = [];
70
71			foreach ($nodes as $node) {
72			$tags[] = Helper::textNormalise($node->text());
73			}
74
75			return $tags;
76			}
77
78			/**
79			* Pulls out videos we like
80			*
81			* @return string[]
82			*/
83			private function getVideos(): array {
84			$videos = [];
85
86			$topNode = $this->article()->getTopNode();
87
88			if ($topNode instanceof Element && $topNode->parent() instanceof Element) {
89			$nodes = $topNode->parent()->find('embed, object, iframe, video');
90
91			foreach ($nodes as $node) {
92			if ($node->hasAttribute('src')) {
93			$src = $node->attr('src');
94			} else {
95			$src = $node->attr('data');
96			}
97
98			$match = array_reduce(self::$VIDEO_PROVIDERS, function($match, $domain) use ($src) {
99			$srcHost = (string)parse_url($src, PHP_URL_HOST);
100			$srcScheme = (string)parse_url($src, PHP_URL_SCHEME);
101
102			return $match \|\| preg_match('@' . $domain . '$@i', $srcHost) && in_array($srcScheme, ['http', 'https']);
103			});
104
105			if (!$match) {
106			$srcPath = parse_url(strtolower($src), PHP_URL_PATH);
107			$srcExtension = pathinfo((string)$srcPath, PATHINFO_EXTENSION);
108
109			$match = in_array($srcExtension, self::$VIDEO_EXTENSIONS);
110			}
111
112			if ($match) {
113			$videos[] = $src;
114			}
115			}
116			}
117
118			return $videos;
119			}
120
121			/**
122			* Pulls out links we like
123			*
124			* @return array
125			*/
126			private function getLinks(): array {
127			$goodLinks = [];
128
129			$parentNode = $this->article()->getTopNode()->parent();
130
131			if ($parentNode instanceof Element) {
132			$candidates = $parentNode->find('a[href]');
133
134			foreach ($candidates as $el) {
135			if ($el->attr('href') != '#' && trim($el->attr('href')) != '') {
136			$goodLinks[] = [
137			'url' => $el->attr('href'),
138			'text' => Helper::textNormalise($el->text()),
139			];
140			}
141			}
142			}
143
144			return $goodLinks;
145			}
146
147			/**
148			* @return string[]
149			*/
150			private function getPopularWords(): array {
151			$limit = 5;
152			$minimumFrequency = 1;
153			$stopWords = $this->config()->getStopWords()->getWordList();
154
155			$text = $this->article()->getTitle();
156			$text .= ' ' . $this->article()->getMetaDescription();
157
158			if ($this->article()->getTopNode()) {
159			$text .= ' ' . $this->article()->getCleanedArticleText();
160			}
161
162			// Decode and split words by white-space
163			$text = html_entity_decode($text, ENT_COMPAT \| ENT_HTML5, 'UTF-8');
164			$words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);
165
166			if (!is_array($words)) {
			Issue (introduced 2021-01-24 06:28 UTC): The condition `is_array($words)` is always `true`.
167			return array();
168			}
169
170			// Determine stop words currently in $words
171			$ignoreWords = array_intersect($words, $stopWords);
172			// Remove ignored words from $words
173			$words = array_udiff($words, $ignoreWords, 'strcasecmp');
174
175			// Count and sort $words
176			$words = array_count_values($words);
177			arsort($words);
178
179			// Limit and filter $words
180			$words = array_slice($words, 0, $limit);
181			$words = array_filter($words, function($value) use ($minimumFrequency){
182			return !($value < $minimumFrequency);
183			});
184
185			return $words;
186			}
187			}
188

scotteh / php-goose

AdditionalDataExtractor A last analyzed 2023-09-05 13:38 UTC

Complexity

Size/Duplication

Importance

5 Methods

Duplication Side-by-Side

Filter issues like

AdditionalDataExtractor A
last analyzed 2023-09-05 13:38 UTC