AbstractCrawler::crawl() - Code Metrics - Inspection of "[MINOR] Added new crawler for events" - core23/lastfm-php-api - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f85b82...dc3bec )

by Christian

created 2019-04-19 06:46 UTC

AbstractCrawler::crawl() A

↳ Parent: AbstractCrawler

Complexity

Conditions	2
Paths	2

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
dl	0
loc	8
rs	10
c	0
b	0
f	0
cc	2
nc	2
nop	1

<?php

declare(strict_types=1);

/*
 * (c) Christian Gripp <[email protected]>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Core23\LastFm\Crawler;

use Core23\LastFm\Connection\ConnectionInterface;
use Core23\LastFm\Exception\CrawlException;
use Core23\LastFm\Model\Event;
use Core23\LastFm\Model\Image;
use DateTime;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Base class for the HTML crawlers.
 *
 * Bundles the page download (through the injected connection) with a set of
 * small helpers that turn DOM nodes into urls, strings, dates, images and
 * events.
 */
abstract class AbstractCrawler
{
    public const URL_PREFIX = 'http://last.fm';

    public const NEWLINE = "\n";

    /**
     * Connection used to fetch the page bodies.
     *
     * @var ConnectionInterface
     */
    private $connection;

    /**
     * @param ConnectionInterface $connection
     */
    public function __construct(ConnectionInterface $connection)
    {
        $this->connection = $connection;
    }

    /**
     * Crawls all event sections found below the given node.
     *
     * Every `.page-content section` must contain a `.group-heading` whose text
     * is parsed as the date shared by all events of that section.
     *
     * @param Crawler $node
     *
     * @throws CrawlException if an event of a section cannot be parsed
     *
     * @return Event[]
     */
    final protected function crawlEventList(Crawler $node): array
    {
        $groups = $node->filter('.page-content section')->each(function (Crawler $section): array {
            $headingNode = $section->filter('.group-heading');

            $datetime = new DateTime(trim($headingNode->text()));

            return $this->crawlEventListGroup($section, $datetime);
        });

        // Merge once instead of per section; the leading empty array keeps the
        // call valid when no section matched at all.
        return array_merge([], ...$groups);
    }

    /**
     * Crawls the events of a single section.
     *
     * @param Crawler  $node     section node containing `.events-list-item` entries
     * @param DateTime $datetime date that is assigned to every event of the section
     *
     * @throws CrawlException if an event has no url or no numeric id inside its url
     *
     * @return Event[]
     */
    protected function crawlEventListGroup(Crawler $node, DateTime $datetime): array
    {
        return $node->filter('.events-list-item')->each(
            function (Crawler $node) use ($datetime): Event {
                $eventNode = $node->filter('.events-list-item-event--title a');

                $url = $this->parseUrl($eventNode);

                if (null === $url) {
                    throw new CrawlException('Error parsing event id.');
                }

                // Reduce the url to the digits following its last "/<digits>" part.
                // A url without such a part is returned unchanged by preg_replace,
                // which normally casts to 0 and is rejected below.
                $id = (int) preg_replace('/.*\/(\d+).*/', '$1', $url);

                if (0 === $id) {
                    throw new CrawlException('Error parsing event id.');
                }

                return new Event(
                    $id,
                    $this->parseString($eventNode) ?? '',
                    $datetime,
                    $url
                );
            }
        );
    }

    /**
     * Crawls a url.
     *
     * @param string $url
     *
     * @return Crawler|null a crawler for the page body, or null if the connection returned no (truthy) body
     */
    final protected function crawl(string $url): ?Crawler
    {
        if ($content = $this->connection->getPageBody($url)) {
            return new Crawler($content);
        }

        return null;
    }

    /**
     * Parses a url node.
     *
     * Urls starting with a slash are prefixed with URL_PREFIX.
     *
     * @param Crawler $node
     * @param string  $attr name of the attribute holding the url
     *
     * @return string|null the url, or null if the node is empty or the attribute is missing/empty
     */
    final protected function parseUrl(Crawler $node, string $attr = 'href'): ?string
    {
        if (0 === $node->count()) {
            return null;
        }

        if ($url = $node->attr($attr)) {
            return preg_replace('/^\//', static::URL_PREFIX.'/', $url);
        }

        return null;
    }

    /**
     * Parses an image node.
     *
     * @param Crawler $node
     *
     * @return Image|null an image built from the `src` attribute, or null if no url could be parsed
     */
    final protected function parseImage(Crawler $node): ?Image
    {
        $src = $this->parseUrl($node, 'src');

        // parseUrl() yields either null or a non-empty string, so the strict
        // check covers every "no image" case.
        if (null === $src) {
            return null;
        }

        return new Image($src);
    }

    /**
     * Parses a string node.
     *
     * The `content` attribute wins when it is present; otherwise the node text
     * is used. In multiline mode the node html is used instead, with paragraph
     * ends and line breaks converted to NEWLINE before the tags are stripped.
     *
     * @param Crawler $node
     * @param bool    $multiline
     *
     * @return string|null the trimmed, tag-free string, or null if the node is empty
     */
    final protected function parseString(Crawler $node, bool $multiline = false): ?string
    {
        if (0 === $node->count()) {
            return null;
        }

        $content = $node->attr('content');

        if (null === $content) {
            if ($multiline) {
                $content = $node->html();
                $content = (string) preg_replace('/<p[^>]*?>/', '', $content);
                $content = str_replace('</p>', static::NEWLINE, $content);
                $content = (string) preg_replace('/<br\s?\/?>/i', static::NEWLINE, $content);
            } else {
                $content = $node->text();
            }
        }

        return trim(strip_tags($content));
    }

    /**
     * Parses a date node.
     *
     * NOTE(review): an empty node string is passed to DateTime as-is, which
     * resolves to the current time - confirm this is intended.
     *
     * @param Crawler $node
     *
     * @return DateTime|null the parsed date, or null if the node is empty
     */
    final protected function parseDate(Crawler $node): ?DateTime
    {
        $content = $this->parseString($node);

        if (null !== $content) {
            return new DateTime($content);
        }

        return null;
    }
}


1			<?php
2
3			declare(strict_types=1);
4
5			/*
6			* (c) Christian Gripp <[email protected]>
7			*
8			* For the full copyright and license information, please view the LICENSE
9			* file that was distributed with this source code.
10			*/
11
12			namespace Core23\LastFm\Crawler;
13
14			use Core23\LastFm\Connection\ConnectionInterface;
15			use Core23\LastFm\Exception\CrawlException;
16			use Core23\LastFm\Model\Event;
17			use Core23\LastFm\Model\Image;
18			use DateTime;
19			use Symfony\Component\DomCrawler\Crawler;
20
21			abstract class AbstractCrawler
22			{
23			public const URL_PREFIX = 'http://last.fm';
24
25			public const NEWLINE = "\n";
26
27			/**
28			* @var ConnectionInterface
29			*/
30			private $connection;
31
32			/**
33			* @param ConnectionInterface $connection
34			*/
35			public function __construct(ConnectionInterface $connection)
36			{
37			$this->connection = $connection;
38			}
39
40			/**
41			* @param Crawler|null $node
42			*
43			* @return array
44			*/
45			final protected function crawlEventList(Crawler $node): array
46			{
47			$resultList = [];
48
49			$node->filter('.page-content section')->each(function (Crawler $node) use (&$resultList) {
50			$headingNode = $node->filter('.group-heading');
51
52			$datetime = new DateTime(trim($headingNode->text()));
53
54			$resultList = array_merge($resultList, $this->crawlEventListGroup($node, $datetime));
55			});
56
57			return $resultList;
58			}
59
60			/**
61			* @param Crawler $node
62			* @param DateTime $datetime
63			*
64			* @return array
65			*/
66			protected function crawlEventListGroup(Crawler $node, DateTime $datetime): array
67			{
68			return $node->filter('.events-list-item')->each(
69			function (Crawler $node) use ($datetime): Event {
			0 ignored issues – show Coding Style introduced 2019-04-19 06:48 UTC by Report Bug Copy Issue Report Expected 1 space after closing parenthesis; found 0 Loading history...
70			$eventNode = $node->filter('.events-list-item-event--title a');
71
72			$url = $this->parseUrl($eventNode);
73
74			if (null === $url) {
75			throw new CrawlException('Error parsing event id.');
76			}
77
78			$id = (int) preg_replace('/.*\/(\d+)+.*/', '$1', $url);
79
80			if (0 === $id) {
81			throw new CrawlException('Error parsing event id.');
82			}
83
84			return new Event(
85			$id,
86			$this->parseString($eventNode) ?? '',
87			$datetime,
88			$url
89			);
90			}
91			);
92			}
93
94			/**
95			* Crawles a url.
96			*
97			* @param string $url
98			*
99			* @return Crawler|null
100			*/
101			final protected function crawl(string $url): ?Crawler
102			{
103			if ($content = $this->connection->getPageBody($url)) {
104			return new Crawler($content);
105			}
106
107			return null;
108			}
109
110			/**
111			* Parses a url node.
112			*
113			* @param Crawler $node
114			* @param string $attr
115			*
116			* @return string|null
117			*/
118			final protected function parseUrl(Crawler $node, string $attr = 'href'): ?string
119			{
120			if (0 === $node->count()) {
121			return null;
122			}
123
124			if ($url = $node->attr($attr)) {
125			return preg_replace('/^\//', static::URL_PREFIX.'/', $url);
126			}
127
128			return null;
129			}
130
131			/**
132			* Parses an image node.
133			*
134			* @param Crawler $node
135			*
136			* @return Image|null
137			*/
138			final protected function parseImage(Crawler $node): ?Image
139			{
140			$src = $this->parseUrl($node, 'src');
141
142			if (!$src) {
			0 ignored issues – show Bug Best Practice introduced 2018-03-31 15:29 UTC by Report Bug Copy Issue Report The expression `$src` of type `null\|string` is loosely compared to `false`; this is ambiguous if the string can be empty. You might want to explicitly use `=== null` instead. In PHP, under loose comparison (like `==`, or `!=`, or `switch` conditions), values of different types might be equal. For `string` values, the empty string `''` is a special case, in particular the following results might be unexpected: '' == false // true '' == null // true 'ab' == false // false 'ab' == null // false // It is often better to use strict comparison '' === false // false '' === null // false Loading history...
143			return null;
144			}
145
146			return new Image($src);
147			}
148
149			/**
150			* Parses a string node.
151			*
152			* @param Crawler $node
153			* @param bool $multiline
154			*
155			* @return string|null
156			*/
157			final protected function parseString(Crawler $node, bool $multiline = false): ?string
158			{
159			if (0 === $node->count()) {
160			return null;
161			}
162
163			$content = $node->attr('content');
164
165			if (null === $content) {
166			if ($multiline) {
167			$content = $node->html();
168			$content = (string) preg_replace('/<p[^>]*?>/', '', $content);
169			$content = str_replace('</p>', static::NEWLINE, $content);
170			$content = (string) preg_replace('/<br\s?\/?>/i', static::NEWLINE, $content);
171			} else {
172			$content = $node->text();
173			}
174			}
175
176			return trim(strip_tags($content));
177			}
178
179			/**
180			* Parses a date note.
181			*
182			* @param Crawler $node
183			*
184			* @return \DateTime|null
185			*/
186			final protected function parseDate(Crawler $node): ?\DateTime
187			{
188			$content = $this->parseString($node);
189
190			if (null !== $content) {
191			return new \DateTime($content);
192			}
193
194			return null;
195			}
196			}
197

core23 / lastfm-php-api

GitHub Access Token became invalid

Push — master ( f85b82...dc3bec )

AbstractCrawler::crawl() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like