1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PiedWeb\UrlHarvester; |
4
|
|
|
|
5
|
|
|
use phpUri; |
6
|
|
|
|
7
|
|
|
/**
 * Locates a breadcrumb container in a page's HTML and turns its links into
 * a list of BreadcrumbItem objects.
 *
 * Notes:
 *  - A breadcrumb does not contain the current element: extraction stops as
 *    soon as a link pointing to the current URL is met.
 */
class ExtractBreadcrumb
{
    /** @var string HTML code of the page being analysed */
    protected $source;

    /** @var array BreadcrumbItem objects collected by extractBreadcrumbData() */
    protected $breadcrumb = [];

    /** @var string Base used to resolve the hrefs found in the breadcrumb */
    protected $baseUrl;

    /** @var string URL of the current page; a link equal to it ends the breadcrumb */
    protected $currentUrl;

    /**
     * Matches a div/p/nav/ul element whose id or class attribute value starts
     * with "breadcrumb(s)" or a "fil d'ariane" spelling variant.
     * Capture group 4 is the element's inner HTML. The pattern is ungreedy
     * (U flag), so the capture ends at the first closing tag of the same name.
     */
    const BC_RGX = '#<(div|p|nav|ul)[^>]*(id|class)="?(breadcrumbs?|fil_?d?arian?ne)"?[^>]*>(.*)<\/(\1)>#siU';

    /**
     * Regex fragments tried, in this order, to split the breadcrumb markup
     * into one fragment per item.
     *
     * NOTE(review): the bare '>' entry also matches the closing bracket of
     * every HTML tag; confirm whether the entity '&gt;' was intended.
     */
    const BC_DIVIDER = [
        'class="?navigation-pipe"?',
        '>',
        'class="?divider"?',
        '›',
        '</li>',
    ];

    /**
     * Static entry point: builds an extractor and runs it on the given page.
     *
     * @param string      $source  HTML code from the page
     * @param string      $baseUrl To get absolute urls
     * @param string|null $current The current url. If not set, it is assumed
     *                             to be the same as $baseUrl
     *
     * @return array|null The extracted BreadcrumbItem list (possibly empty),
     *                    or null when no breadcrumb container is found or
     *                    none of the dividers splits it
     */
    public static function get(string $source, string $baseUrl, $current = null)
    {
        $self = new self();

        $self->source = $source;
        $self->baseUrl = $baseUrl;
        $self->currentUrl = null === $current ? $baseUrl : $current;

        return $self->extractBreadcrumb();
    }

    /**
     * Protected so that instances are only created through get().
     */
    protected function __construct()
    {
    }

    /**
     * Finds the breadcrumb container, splits it with the first divider that
     * applies and collects the items of the resulting fragments.
     *
     * The first divider producing more than one fragment wins, even when no
     * item could be extracted from those fragments (an empty array is then
     * returned and the remaining dividers are not tried).
     *
     * @return array|null See get()
     */
    public function extractBreadcrumb()
    {
        $breadcrumb = $this->findBreadcrumb();
        if (null === $breadcrumb) {
            return null;
        }

        foreach (self::BC_DIVIDER as $divider) {
            $exploded = $this->divideBreadcrumb($breadcrumb, $divider);
            if (false !== $exploded) {
                $this->extractBreadcrumbData($exploded);

                return $this->breadcrumb;
            }
        }

        // A container was found but no known divider splits it.
        return null;
    }

    /**
     * Returns the inner HTML of the first element matching BC_RGX.
     *
     * @return string|null Null when the source contains no such element
     */
    protected function findBreadcrumb()
    {
        if (preg_match(self::BC_RGX, $this->source, $match)) {
            return $match[4];
        }

        return null;
    }

    /**
     * Splits the breadcrumb markup on a divider.
     *
     * @param string $breadcrumb Inner HTML of the breadcrumb container
     * @param string $divider    Regex fragment (not a literal): only "/" is
     *                           escaped before it is used as a pattern
     *
     * @return array|false The fragments, or false when the split failed or
     *                     the divider does not occur (a single fragment)
     */
    protected function divideBreadcrumb($breadcrumb, $divider)
    {
        $exploded = preg_split('/'.str_replace('/', '\/', $divider).'/si', $breadcrumb);

        return false !== $exploded && count($exploded) > 1 ? $exploded : false;
    }

    /**
     * Tries to extract the URL and the anchor text from each fragment and
     * appends them to $this->breadcrumb.
     *
     * Stops at the first fragment that has no link or whose link is the
     * current URL, so the current page and anything after it are left out.
     *
     * @param array $array Fragments returned by divideBreadcrumb()
     */
    protected function extractBreadcrumbData($array)
    {
        foreach ($array as $a) {
            $link = $this->extractHref($a);
            if (null === $link || $link == $this->currentUrl) {
                break;
            }
            $this->breadcrumb[] = new BreadcrumbItem(
                $link,
                $this->extractAnchor($a)
            );
        }
    }

    /**
     * Builds the anchor text of a fragment: the result of
     * Helper::htmlToPlainText(), lowercased, with leading/trailing ">" and
     * spaces removed.
     *
     * @param string $str HTML fragment
     *
     * @return string
     */
    protected function extractAnchor($str)
    {
        return trim(strtolower(Helper::htmlToPlainText($str)), '> ');
    }

    /**
     * Extracts the first href of a fragment and resolves it against the
     * base URL.
     *
     * Double-quoted, single-quoted and unquoted attribute values are tried
     * in that order.
     *
     * @param string $str HTML fragment
     *
     * @return string|null Value returned by phpUri's join(), or null when
     *                     the fragment has no href
     */
    protected function extractHref($str)
    {
        $regex = [
            'href="([^"]*)"',
            'href=\'([^\']*)\'',
            // Unquoted value: runs up to the next whitespace, ">" or the end
            // of the fragment. The lookahead is required because the U flag
            // makes "+" ungreedy; without it a single character would match.
            'href=([^\s>"\']+)(?=[\s>]|$)',
        ];
        foreach ($regex as $r) {
            if (preg_match('/'.$r.'/siU', $str, $match)) {
                return phpUri::parse($this->baseUrl)->join($match[1]);
            }
        }

        return null;
    }
}
118
|
|
|
|