ExtractBreadcrumb::findBreadcrumb()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 2
c 1
b 0
f 0
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
cc 2
nc 2
nop 0
crap 2
1
<?php
2
3
/**
4
 * Entity.
5
 */
6
7
namespace PiedWeb\UrlHarvester;
8
9
/**
 * Extracts a breadcrumb trail from the HTML content of a harvested page.
 *
 * Notes:
 * - A breadcrumb does not contain the current element: collection stops at
 *   the first segment without a usable link, or whose link is the page's own URL.
 */
class ExtractBreadcrumb
{
    /**
     * Items collected by the most recent extraction, in document order.
     *
     * @var BreadcrumbItem[]
     */
    protected $breadcrumb = [];

    /**
     * Harvested page whose response content and URL are inspected.
     *
     * @var Harvest
     */
    protected $parentDoc;

    /**
     * Locates the breadcrumb container: a div/p/nav/ul whose id or class is
     * "breadcrumb(s)" or a "fil d'ariane" spelling, closed by the same tag name.
     * Capture group 4 holds the container's inner HTML.
     * Flags: dot matches newlines (s), case-insensitive (i), ungreedy (U).
     */
    public const BC_RGX = '#<(div|p|nav|ul)[^>]*(id|class)="?(breadcrumbs?|fil_?d?arian?ne)"?[^>]*>(.*)<\/(\1)>#siU';

    /**
     * Regex fragments tried, in this order, to split the breadcrumb HTML into
     * segments. The first one that yields more than one segment wins.
     */
    public const BC_DIVIDER = [
        'class="?navigation-pipe"?',
        '&gt;',
        'class="?divider"?',
        '›',
        '</li>',
    ];

    /**
     * Builds an extractor for the given page and runs the extraction.
     *
     * @param Harvest $parent Harvested page providing the HTML content and the current URL
     *
     * @return array|null The breadcrumb items, or null when no breadcrumb could be found or split
     */
    public static function get(Harvest $parent)
    {
        $self = new self();

        $self->parentDoc = $parent;

        return $self->extractBreadcrumb();
    }

    /**
     * Instances are only created through {@see get()}.
     */
    protected function __construct()
    {
    }

    /**
     * Finds the breadcrumb markup, splits it with the first divider that
     * applies and collects the linked segments.
     *
     * @return array|null The collected items (possibly empty), or null when no
     *                    breadcrumb markup is found or no divider splits it
     */
    public function extractBreadcrumb()
    {
        // Start from a clean slate so calling this method twice on the same
        // instance does not append the same items a second time.
        $this->breadcrumb = [];

        $breadcrumb = $this->findBreadcrumb();
        if (null === $breadcrumb) {
            return null;
        }

        foreach (self::BC_DIVIDER as $divider) {
            $exploded = $this->divideBreadcrumb($breadcrumb, $divider);
            if (false !== $exploded) {
                $this->extractBreadcrumbData($exploded);

                return $this->breadcrumb;
            }
        }

        return null;
    }

    /**
     * Searches the page content for the breadcrumb container.
     *
     * @return string|null Inner HTML of the first matching container, or null when none matches
     */
    protected function findBreadcrumb()
    {
        if (preg_match(self::BC_RGX, $this->parentDoc->getResponse()->getContent(), $match)) {
            return $match[4];
        }

        return null;
    }

    /**
     * Splits the breadcrumb HTML on a divider.
     *
     * @param string $breadcrumb Inner HTML of the breadcrumb container
     * @param string $divider    Regex fragment; "/" is escaped here because it is the delimiter
     *
     * @return array|false The segments when the divider produced more than one, false otherwise
     */
    protected function divideBreadcrumb($breadcrumb, $divider)
    {
        $exploded = preg_split('/'.str_replace('/', '\/', $divider).'/si', $breadcrumb);

        return false !== $exploded && \count($exploded) > 1 ? $exploded : false;
    }

    /**
     * Tries to extract the URL and the anchor text of each segment, appending
     * one BreadcrumbItem per linked segment to $this->breadcrumb.
     *
     * @param array $array HTML segments of the breadcrumb, in document order
     */
    protected function extractBreadcrumbData($array)
    {
        foreach ($array as $a) {
            $link = $this->extractHref($a);
            // A segment without a link, or pointing at the page itself, is
            // treated as the current element: the trail ends here.
            // NOTE(review): loose comparison kept on purpose, the return type
            // of resolve() is not visible here; confirm before using ===.
            if (null === $link || $link == $this->parentDoc->getUrl()->get()) {
                break;
            }
            $this->breadcrumb[] = new BreadcrumbItem(
                $link,
                $this->extractAnchor($a)
            );
        }
    }

    /**
     * Turns a segment into its anchor text: plain text, lower-cased, with
     * surrounding ">" characters and spaces removed.
     *
     * @param string $str HTML segment
     *
     * @return string
     */
    protected function extractAnchor($str)
    {
        return trim(strtolower(Helper::htmlToPlainText($str)), '> ');
    }

    /**
     * Extracts the first web link found in a segment and resolves it against
     * the page URL.
     *
     * @param string $str HTML segment
     *
     * @return mixed|null Whatever the page URL's resolve() returns for the link,
     *                    or null when the segment holds no web link
     */
    protected function extractHref($str)
    {
        $regex = [
            'href="([^"]*)"',
            'href=\'([^\']*)\'',
            // Unquoted value: stops at whitespace, ">" or end of string, so
            // `<a href=/foo>` is captured as "/foo" and never as "/foo>Foo".
            'href=([^\s"\'>]+)(?:[\s>]|$)',
        ];
        foreach ($regex as $r) {
            if (preg_match('/'.$r.'/siU', $str, $match)) {
                if (ExtractLinks::isWebLink($match[1])) {
                    return $this->parentDoc->getUrl()->resolve($match[1]);
                }
            }
        }

        return null;
    }
}
119