1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* Entity. |
5
|
|
|
*/ |
6
|
|
|
|
7
|
|
|
namespace PiedWeb\UrlHarvester; |
8
|
|
|
|
9
|
|
|
/**
 * Extracts a breadcrumb trail from the HTML of a harvested page.
 *
 * A few notes:
 * - A breadcrumb does not contain the current element.
 */
class ExtractBreadcrumb
{
    /**
     * Breadcrumb items collected so far, in the order they were found.
     */
    protected $breadcrumb = [];

    /**
     * Harvest whose response content is scanned and whose URL is used to resolve links.
     */
    protected $parentDoc;

    /**
     * Matches a div, p, nav or ul whose id/class is "breadcrumb(s)" or a
     * "fil d'ariane"-style spelling. Capture group 4 is the element's inner HTML.
     * The pattern is ungreedy, so the capture stops at the first closing tag of
     * the same name.
     */
    public const BC_RGX = '#<(div|p|nav|ul)[^>]*(id|class)="?(breadcrumbs?|fil_?d?arian?ne)"?[^>]*>(.*)<\/(\1)>#siU';

    /**
     * Separator patterns, tried in order. Each entry is embedded in a
     * case-insensitive regular expression, so regex syntax is allowed.
     *
     * The textual ">" separator is matched as the entity "&gt;": a bare ">"
     * would match the end of every tag, cutting any breadcrumb that contains
     * markup into tag fragments and preventing the later dividers from ever
     * being tried.
     */
    public const BC_DIVIDER = [
        'class="?navigation-pipe"?',
        '&gt;',
        'class="?divider"?',
        '›',
        '</li>',
    ];

    /**
     * Builds an extractor for the given harvest and runs it.
     *
     * @param Harvest $parent Harvested page; supplies the HTML content and the URL used to resolve links
     *
     * @return array|null Breadcrumb items, or null when no breadcrumb was found or none of the dividers split it
     */
    public static function get(Harvest $parent)
    {
        $self = new self();
        $self->parentDoc = $parent;

        return $self->extractBreadcrumb();
    }

    /**
     * Not public: get() is the entry point.
     */
    protected function __construct()
    {
    }

    /**
     * Locates the breadcrumb markup, splits it with the first divider that
     * applies and collects the items.
     *
     * @return array|null Collected items (possibly empty), or null when no breadcrumb
     *                    markup was found or no divider produced more than one part
     */
    public function extractBreadcrumb()
    {
        $breadcrumb = $this->findBreadcrumb();
        if (null === $breadcrumb) {
            return null;
        }

        foreach (self::BC_DIVIDER as $divider) {
            $exploded = $this->divideBreadcrumb($breadcrumb, $divider);
            if (false !== $exploded) {
                // The first divider that splits the markup wins, even if it yields no item.
                $this->extractBreadcrumbData($exploded);

                return $this->breadcrumb;
            }
        }

        return null;
    }

    /**
     * Searches the harvested content for a breadcrumb container.
     *
     * @return string|null Inner HTML of the first matching container, or null when none matches
     */
    protected function findBreadcrumb()
    {
        if (preg_match(self::BC_RGX, $this->parentDoc->getResponse()->getContent(), $match)) {
            return $match[4];
        }

        return null;
    }

    /**
     * Splits the breadcrumb markup on a divider pattern.
     *
     * @param string $breadcrumb Inner HTML of the breadcrumb container
     * @param string $divider    Regex fragment; "/" is escaped here because it is the delimiter
     *
     * @return array|false The parts when the divider produced at least two of them, false otherwise
     */
    protected function divideBreadcrumb($breadcrumb, $divider)
    {
        $exploded = preg_split('/'.str_replace('/', '\/', $divider).'/si', $breadcrumb);

        return false !== $exploded && \count($exploded) > 1 ? $exploded : false;
    }

    /**
     * Tries to extract the URL and the anchor text of each part.
     *
     * Collection stops at the first part that has no usable link or that links
     * to the current page, since a breadcrumb does not contain the current element.
     *
     * @param array $array Parts returned by divideBreadcrumb()
     */
    protected function extractBreadcrumbData($array)
    {
        foreach ($array as $a) {
            $link = $this->extractHref($a);
            // NOTE(review): loose comparison kept on purpose; the return type of resolve() is not visible here.
            if (null === $link || $link == $this->parentDoc->getUrl()->get()) {
                break;
            }

            $this->breadcrumb[] = new BreadcrumbItem(
                $link,
                $this->extractAnchor($a)
            );
        }
    }

    /**
     * Returns the lower-cased plain text of a part, without surrounding ">" and spaces.
     *
     * @param string $str HTML fragment of one breadcrumb part
     */
    protected function extractAnchor($str)
    {
        return trim(strtolower(Helper::htmlToPlainText($str)), '> ');
    }

    /**
     * Finds the first web link in an HTML fragment and resolves it against the page URL.
     *
     * Double-quoted, single-quoted and unquoted href attributes are tried in that order.
     *
     * @param string $str HTML fragment of one breadcrumb part
     *
     * @return mixed|null Result of resolving the href against the page URL, or null when no web link is found
     */
    protected function extractHref($str)
    {
        $regex = [
            'href="([^"]*)"',
            'href=\'([^\']*)\'',
            'href=(\S+) ',
        ];

        foreach ($regex as $r) {
            if (preg_match('/'.$r.'/siU', $str, $match)) {
                if (ExtractLinks::isWebLink($match[1])) {
                    return $this->parentDoc->getUrl()->resolve($match[1]);
                }
            }
        }

        return null;
    }
}
119
|
|
|
|