|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
/** |
|
4
|
|
|
* Entity. |
|
5
|
|
|
*/ |
|
6
|
|
|
|
|
7
|
|
|
namespace PiedWeb\UrlHarvester; |
|
8
|
|
|
|
|
9
|
|
|
/**
 * Locates a breadcrumb trail in the HTML of a harvested page and turns it
 * into a list of BreadcrumbItem objects (resolved link + anchor text).
 *
 * A few notes:
 * - A breadcrumb does not contain the current element.
 */
class ExtractBreadcrumb
{
    /**
     * Items collected so far by extractBreadcrumbData().
     */
    protected $breadcrumb = [];

    /**
     * The harvested document handed to get(); it supplies the response
     * content and the page url.
     */
    protected $parentDoc;

    /**
     * Matches a div, p, nav or ul element whose id or class is
     * "breadcrumb(s)" or a spelling variant of the French "fil d'ariane".
     * Capture group 4 holds the inner HTML of that element.
     * Flags: s (dot matches newlines), i (case-insensitive), U (ungreedy).
     */
    public const BC_RGX = '#<(div|p|nav|ul)[^>]*(id|class)="?(breadcrumbs?|fil_?d?arian?ne)"?[^>]*>(.*)<\/(\1)>#siU';

    /**
     * Candidate separators, tried in this order. Each entry is used as a
     * regular-expression fragment by divideBreadcrumb().
     *
     * NOTE(review): the bare '>' entry also matches the end of any HTML tag;
     * it may have been meant as the '&gt;' entity - confirm against upstream.
     */
    public const BC_DIVIDER = [
        'class="?navigation-pipe"?',
        '>',
        'class="?divider"?',
        '›',
        '</li>',
    ];

    /**
     * Static entry point: builds an extractor bound to the given harvest
     * and runs the extraction.
     *
     * @param Harvest $parent The harvested page to inspect
     *
     * @return array|null The BreadcrumbItem list, or null when no breadcrumb
     *                    could be found or split
     */
    public static function get(Harvest $parent)
    {
        $extractor = new self();
        $extractor->parentDoc = $parent;

        return $extractor->extractBreadcrumb();
    }

    /**
     * Instances are only created through get().
     */
    protected function __construct()
    {
    }

    /**
     * Finds the breadcrumb markup, splits it with the first divider that
     * yields more than one piece, then collects the items.
     *
     * @return array|null The collected items, or null when no breadcrumb
     *                    markup is found or no divider splits it
     */
    public function extractBreadcrumb()
    {
        $markup = $this->findBreadcrumb();
        if (null === $markup) {
            return null;
        }

        foreach (self::BC_DIVIDER as $separator) {
            $pieces = $this->divideBreadcrumb($markup, $separator);
            if (false === $pieces) {
                continue;
            }

            // The first divider that works wins; later ones are not tried.
            $this->extractBreadcrumbData($pieces);

            return $this->breadcrumb;
        }

        return null;
    }

    /**
     * Runs BC_RGX against the response content of the parent document.
     *
     * @return string|null Inner HTML of the matched element, null otherwise
     */
    protected function findBreadcrumb()
    {
        $html = $this->parentDoc->getResponse()->getContent();

        return preg_match(self::BC_RGX, $html, $found) ? $found[4] : null;
    }

    /**
     * Splits the breadcrumb markup on a divider.
     *
     * @param string $breadcrumb Markup to split
     * @param string $divider    Regex fragment; only '/' is escaped before
     *                           it is wrapped in '/.../si'
     *
     * @return array|false The pieces, or false when the split fails or
     *                     produces a single piece
     */
    protected function divideBreadcrumb($breadcrumb, $divider)
    {
        $pattern = '/'.str_replace('/', '\/', $divider).'/si';
        $pieces = preg_split($pattern, $breadcrumb);

        if (false === $pieces || \count($pieces) <= 1) {
            return false;
        }

        return $pieces;
    }

    /**
     * Tries to extract the url and the anchor of each piece and appends
     * a BreadcrumbItem for it.
     *
     * Collection stops at the first piece that has no usable link or whose
     * link equals the url of the parent document.
     *
     * @param array $array Pieces returned by divideBreadcrumb()
     */
    protected function extractBreadcrumbData($array)
    {
        foreach ($array as $fragment) {
            $href = $this->extractHref($fragment);
            if (null === $href) {
                break;
            }

            // The page url is only looked up once a link has been found.
            if ($href == $this->parentDoc->getUrl()->get()) {
                break;
            }

            $this->breadcrumb[] = new BreadcrumbItem($href, $this->extractAnchor($fragment));
        }
    }

    /**
     * Converts a piece to plain text, lower-cases it and strips
     * surrounding '>' and space characters.
     *
     * @param string $str Breadcrumb piece (HTML)
     *
     * @return string
     */
    protected function extractAnchor($str)
    {
        $plain = Helper::htmlToPlainText($str);

        return trim(strtolower($plain), '> ');
    }

    /**
     * Looks for an href attribute (double-quoted, single-quoted, then
     * unquoted followed by a space) and resolves the first value accepted
     * by ExtractLinks::isWebLink() against the url of the parent document.
     *
     * @param string $str Breadcrumb piece (HTML)
     *
     * @return mixed|null The value returned by resolve(), or null when no
     *                    web link is found
     */
    protected function extractHref($str)
    {
        $patterns = [
            'href="([^"]*)"',
            'href=\'([^\']*)\'',
            'href=(\S+) ',
        ];

        foreach ($patterns as $pattern) {
            if (!preg_match('/'.$pattern.'/siU', $str, $found)) {
                continue;
            }

            if (ExtractLinks::isWebLink($found[1])) {
                return $this->parentDoc->getUrl()->resolve($found[1]);
            }
        }

        return null;
    }
}
|
119
|
|
|
|