Passed
Push — master ( 92084b...91cbe9 )
by Dev
13:57 queued 12:21
created

ExtractBreadcrumb   A

Complexity

Total Complexity 20

Size/Duplication

Total Lines 100
Duplicated Lines 0 %

Test Coverage

Coverage 94.87%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
wmc 20
eloc 40
c 1
b 0
f 0
dl 0
loc 100
ccs 37
cts 39
cp 0.9487
rs 10

8 Methods

Rating   Name   Duplication   Size   Complexity  
A extractAnchor() 0 3 1
A extractBreadcrumb() 0 10 4
A findBreadcrumb() 0 4 2
A __construct() 0 2 1
A divideBreadcrumb() 0 5 3
A extractHref() 0 11 4
A extractBreadcrumbData() 0 10 4
A get() 0 7 1
1
<?php
2
3
/**
4
 * Entity.
5
 */
6
7
namespace PiedWeb\UrlHarvester;
8
9
/**
 * Extracts the breadcrumb trail from the HTML content of a harvested page.
 *
 * Note: a breadcrumb does not contain the current element.
 */
class ExtractBreadcrumb
{
    /**
     * Items collected by extractBreadcrumbData(), in document order.
     *
     * @var BreadcrumbItem[]
     */
    protected $breadcrumb = [];

    /**
     * The harvested document the breadcrumb is extracted from (assigned in get()).
     *
     * @var Harvest
     */
    protected $parentDoc;

    /**
     * Matches a div/p/nav/ul element whose id or class value starts with
     * "breadcrumb(s)" or a "fil d'ariane" variant. Capture group 4 holds the
     * inner HTML. The pattern is ungreedy (U), so it stops at the first
     * closing tag with the same name as the opening one.
     */
    const BC_RGX = '#<(div|p|nav|ul)[^>]*(id|class)="?(breadcrumbs?|fil_?d?arian?ne)"?[^>]*>(.*)<\/(\1)>#siU';

    /**
     * Regex fragments used to split the breadcrumb HTML into items. They are
     * tried in order; the first one producing more than one piece is used.
     */
    const BC_DIVIDER = [
        'class="?navigation-pipe"?',
        '&gt;',
        'class="?divider"?',
        '›',
        '</li>',
    ];

    /**
     * Builds an extractor for the given document and runs it.
     *
     * @param Harvest $parent The harvested page: provides the HTML content to
     *                        scan and the URL used to resolve relative links
     *
     * @return array|null The BreadcrumbItem list (possibly empty), or null when
     *                    no breadcrumb could be located or split
     */
    public static function get(Harvest $parent)
    {
        $self = new self();

        $self->parentDoc = $parent;

        return $self->extractBreadcrumb();
    }

    /**
     * Instances are only created through get().
     */
    protected function __construct()
    {
    }

    /**
     * Locates the breadcrumb markup, splits it with the first divider that
     * applies and collects the resulting items.
     *
     * @return array|null The collected BreadcrumbItem list, or null when no
     *                    breadcrumb markup was found or no divider split it
     */
    public function extractBreadcrumb()
    {
        $breadcrumb = $this->findBreadcrumb();
        if (null === $breadcrumb) {
            return null;
        }

        foreach (self::BC_DIVIDER as $divider) {
            $exploded = $this->divideBreadcrumb($breadcrumb, $divider);
            if (false !== $exploded) {
                $this->extractBreadcrumbData($exploded);

                // Only the first divider that splits the markup is used.
                return $this->breadcrumb;
            }
        }

        return null;
    }

    /**
     * Searches the response content of the parent document for breadcrumb markup.
     *
     * @return string|null Inner HTML of the matched element, or null when BC_RGX does not match
     */
    protected function findBreadcrumb()
    {
        if (preg_match(self::BC_RGX, $this->parentDoc->getResponse()->getContent(), $match)) {
            return $match[4];
        }

        return null;
    }

    /**
     * Splits the breadcrumb HTML on a divider.
     *
     * @param string $breadcrumb Inner HTML of the breadcrumb element
     * @param string $divider    Regex fragment (one of BC_DIVIDER); only "/" is
     *                           escaped, the rest is interpreted as a pattern
     *
     * @return array|false The pieces when the divider produced more than one, false otherwise
     */
    protected function divideBreadcrumb($breadcrumb, $divider)
    {
        $exploded = preg_split('/'.str_replace('/', '\/', $divider).'/si', $breadcrumb);

        return false !== $exploded && count($exploded) > 1 ? $exploded : false;
    }

    /**
     * Tries to extract the URL and the anchor text of each breadcrumb fragment.
     *
     * Collection stops at the first fragment that has no web link or whose
     * link equals the URL of the parent document.
     *
     * @param array $array HTML fragments produced by divideBreadcrumb()
     */
    protected function extractBreadcrumbData($array)
    {
        foreach ($array as $a) {
            $link = $this->extractHref($a);
            // Loose comparison kept on purpose: the type returned by resolve()
            // is not visible from this class.
            if (null === $link || $link == $this->parentDoc->getUrl()->get()) {
                break;
            }
            $this->breadcrumb[] = new BreadcrumbItem(
                $link,
                $this->extractAnchor($a)
            );
        }
    }

    /**
     * Converts a fragment to lower-cased plain text, stripped of surrounding
     * ">" characters and spaces.
     *
     * NOTE(review): strtolower() works on bytes; confirm whether non-ASCII
     * anchor text needs a multibyte-aware conversion.
     *
     * @param string $str HTML fragment
     *
     * @return string
     */
    protected function extractAnchor($str)
    {
        return trim(strtolower(Helper::htmlToPlainText($str)), '> ');
    }

    /**
     * Finds the first href in the fragment that is a web link and resolves it
     * against the URL of the parent document.
     *
     * @param string $str HTML fragment
     *
     * @return mixed|null The value returned by resolve(), or null when no web link was found
     */
    protected function extractHref($str)
    {
        $regex = [
            'href="([^"]*)"',
            'href=\'([^\']*)\'',
            // Unquoted value: runs up to whitespace, ">" or the end of the
            // string. The lookahead is required because the U modifier makes
            // "+" lazy. The previous form demanded a trailing space and let
            // ">" leak into the captured URL (e.g. <a href=/foo>Home</a>).
            'href=([^\s"\'>]+)(?=[\s>]|$)',
        ];
        foreach ($regex as $r) {
            if (preg_match('/'.$r.'/siU', $str, $match)) {
                if (ExtractLinks::isWebLink($match[1])) {
                    return $this->parentDoc->getUrl()->resolve($match[1]);
                }
            }
        }

        return null;
    }
}
118