|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace Colligator\Scrapers; |
|
4
|
|
|
|
|
5
|
|
|
use Symfony\Component\DomCrawler\Crawler; |
|
6
|
|
|
|
|
7
|
|
|
/**
 * Scraper that extracts a description text (and its source attribution, when
 * present) from the children of the `#accordion` element of a page.
 *
 * It claims URLs that contain `bibsys.no`.
 */
class BsScraper extends Scraper implements ScraperInterface
{
    /**
     * Tell whether this scraper handles the given URL.
     *
     * @param string $url
     *
     * @return bool True when the URL contains "bibsys.no".
     */
    public function recognizes($url)
    {
        // strpos() returns the integer offset of the match, which is 0 (falsy)
        // when the URL starts with the needle, so compare against false
        // explicitly instead of returning the raw result.
        return strpos($url, 'bibsys.no') !== false;
    }

    /**
     * Pair each known section heading with the text that follows it.
     *
     * @param string[] $texts Texts in document order, headings and bodies mixed.
     *
     * @return array<string, string> Section bodies keyed by heading, in order
     *                               of preference. Sections that were not
     *                               found keep an empty string.
     */
    public function getSections($texts)
    {
        // Section list ordered by preference
        $sections = [
            'Beskrivelse fra forlaget (lang)' => '',
            'Publisher\'s description (full)' => '',
            'Beskrivelse fra Forlagssentralen' => '',
            'Beskrivelse fra forlaget (kort)' => '',
            'Publisher\'s description (brief)' => '',
            'Innholdsfortegnelse' => '',
        ];

        $heading = '';
        foreach ($texts as $t) {
            if (isset($sections[$t])) {
                // It's a section heading. Check this first so that a heading
                // directly following another heading is not stored as the
                // body of the previous (empty) section.
                $heading = $t;
                continue;
            }

            if ($heading !== '') {
                // First text after a heading is that section's body.
                $sections[$heading] = $t;
                $heading = '';
            }
        }

        return $sections;
    }

    /**
     * Pick the most preferred non-empty section.
     *
     * A section body is split on "©": the part before it is the text, the
     * part after it is the source. The first section that has both wins.
     * If no section carries a source, the first non-empty section is returned
     * with an empty source.
     *
     * @param array<string, string> $sections Section bodies in order of preference.
     *
     * @return string[] Two elements: [text, source]. Both are empty strings
     *                  when no section has any text.
     */
    public function getFirstNonEmpty($sections)
    {
        $fallback = '';

        foreach ($sections as $body) {
            $parts = explode('©', $body);

            if ($parts[0] === '') {
                continue;
            }

            if (count($parts) > 1) {
                // Text with a source attribution: best possible match.
                return [$parts[0], $parts[1]];
            }

            if ($fallback === '') {
                // Remember only the first unattributed text, so a less
                // preferred section never replaces a more preferred one.
                $fallback = $parts[0];
            }
        }

        return [$fallback, ''];
    }

    /**
     * Extract the description text and source from a crawled page.
     *
     * @param Crawler $crawler Crawler positioned on the page to scrape.
     *
     * @return mixed Whatever returnResult() produces for the text and source;
     *               that method is not defined in this class (presumably
     *               inherited from Scraper).
     */
    public function scrape(Crawler $crawler)
    {
        // Collect the text of every direct child of the #accordion element.
        $texts = $crawler->filter('#accordion > *')->each(function (Crawler $node) {
            return $node->text();
        });

        $sections = $this->getSections($texts);
        list($text, $source) = $this->getFirstNonEmpty($sections);

        return $this->returnResult($text, $source);
    }
}
|
72
|
|
|
|