1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* Entity. |
5
|
|
|
*/ |
6
|
|
|
|
7
|
|
|
namespace PiedWeb\UrlHarvester; |
8
|
|
|
|
9
|
|
|
/**
 * Extracts a breadcrumb trail from the HTML content of a harvested page.
 *
 * A few notes:
 * - A breadcrumb does not contain the current element.
 */
class ExtractBreadcrumb
{
    /** Items collected so far; filled with BreadcrumbItem instances by extractBreadcrumbData(). */
    protected $breadcrumb = [];

    /** The Harvest handed to get(); supplies the page content and the page URL. */
    protected $parentDoc;

    /**
     * Locates the breadcrumb container: a div/p/nav/ul tag whose id or class
     * is "breadcrumb(s)" or a "fil d'ariane" spelling. Capture group 4 holds
     * the inner HTML of that container.
     */
    const BC_RGX = '#<(div|p|nav|ul)[^>]*(id|class)="?(breadcrumbs?|fil_?d?arian?ne)"?[^>]*>(.*)<\/(\1)>#siU';

    /**
     * Regex fragments tried, in this order, to cut the breadcrumb HTML into
     * pieces. The first one producing more than one piece is used.
     *
     * NOTE(review): the bare '>' also matches the closing bracket of any HTML
     * tag, so it splits inside markup as well - confirm this is intended.
     */
    const BC_DIVIDER = [
        'class="?navigation-pipe"?',
        '>',
        'class="?divider"?',
        '›',
        '</li>',
    ];

    /**
     * Builds an extractor for the given harvest and runs the extraction.
     *
     * @param Harvest $parent Harvested page; its response content is searched
     *                        and its URL is used to resolve and compare links
     *
     * @return array|null The collected BreadcrumbItem list, or null when no
     *                    breadcrumb container is found or no divider splits it
     */
    public static function get(Harvest $parent)
    {
        $extractor = new self();
        $extractor->parentDoc = $parent;

        return $extractor->extractBreadcrumb();
    }

    /**
     * Instances are only created through get().
     */
    protected function __construct()
    {
    }

    /**
     * Finds the breadcrumb markup, splits it with the first divider that
     * works and collects the items from the resulting pieces.
     *
     * @return array|null
     */
    public function extractBreadcrumb()
    {
        $html = $this->findBreadcrumb();
        if (null === $html) {
            return null;
        }

        foreach (self::BC_DIVIDER as $pattern) {
            $pieces = $this->divideBreadcrumb($html, $pattern);
            if (false === $pieces) {
                // This divider did not split the markup; try the next one.
                continue;
            }

            $this->extractBreadcrumbData($pieces);

            return $this->breadcrumb;
        }

        return null;
    }

    /**
     * Searches the page content for the breadcrumb container.
     *
     * @return string|null Inner HTML of the container, or null when not found
     */
    protected function findBreadcrumb()
    {
        $content = $this->parentDoc->getResponse()->getContent();
        $found = preg_match(self::BC_RGX, $content, $groups);

        return $found ? $groups[4] : null;
    }

    /**
     * Splits the breadcrumb markup on a divider given as a regex fragment.
     *
     * @param string $breadcrumb Breadcrumb inner HTML
     * @param string $divider    Regex fragment; only "/" is escaped, so any
     *                           other regex syntax in it stays active
     *
     * @return array|false The pieces, or false when the split failed or
     *                     yielded a single piece
     */
    protected function divideBreadcrumb($breadcrumb, $divider)
    {
        $pattern = '/'.str_replace('/', '\/', $divider).'/si';
        $parts = preg_split($pattern, $breadcrumb);

        if (false === $parts || count($parts) <= 1) {
            return false;
        }

        return $parts;
    }

    /**
     * Tries to extract the URL and the anchor text from each piece.
     *
     * Collection stops at the first piece that has no usable link or whose
     * link equals the URL of the current page.
     *
     * @param array $array Pieces returned by divideBreadcrumb()
     */
    protected function extractBreadcrumbData($array)
    {
        foreach ($array as $fragment) {
            $href = $this->extractHref($fragment);
            if (null === $href) {
                return;
            }

            if ($href == $this->parentDoc->getUrl()->get()) {
                return;
            }

            $anchor = $this->extractAnchor($fragment);
            $this->breadcrumb[] = new BreadcrumbItem($href, $anchor);
        }
    }

    /**
     * Turns a piece of markup into its anchor text: converted to plain text,
     * lower-cased, then stripped of surrounding ">" and spaces.
     *
     * @param string $str
     *
     * @return string
     */
    protected function extractAnchor($str)
    {
        $plain = Helper::htmlToPlainText($str);
        $lowered = strtolower($plain);

        return trim($lowered, '> ');
    }

    /**
     * Looks for an href attribute (double-quoted, single-quoted, then
     * unquoted followed by a space) and resolves it against the page URL.
     *
     * @param string $str Piece of breadcrumb markup
     *
     * @return mixed The value returned by the page URL's resolve() for the
     *               first href accepted by ExtractLinks::isWebLink(), or null
     *               when none qualifies
     */
    protected function extractHref($str)
    {
        $patterns = [
            'href="([^"]*)"',
            'href=\'([^\']*)\'',
            'href=(\S+) ',
        ];

        foreach ($patterns as $attribute) {
            if (!preg_match('/'.$attribute.'/siU', $str, $found)) {
                continue;
            }

            if (ExtractLinks::isWebLink($found[1])) {
                return $this->parentDoc->getUrl()->resolve($found[1]);
            }
        }

        return null;
    }
}
118
|
|
|
|